struct A { char s, t, u, v; short a; };
// CHECK: %a = alloca %struct.A, align 2
// CHECK: %b = alloca %struct.A, align 2
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{.*}}, i32 2, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{.*}} align 2 {{.*}} align 2 {{.*}}, i1 false)
void q() { struct A a, b; a = b; }
// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
struct A { short s; short t; int i; };
// CHECK: %a = alloca %struct.A, align 4
-// CHECK: call void @llvm.memset.p0i8.{{.*}}i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.{{.*}} align 4 {{.*}}, i1 false)
void q() { struct A a = {0}; }
// CHECK: [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
// CHECK: [[RES:%.*]] = alloca [[STRUCT]], align 1
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5
+// CHECK: call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
// CHECK: [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
// CHECK: [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align 1 [[CASTRET]], i8* align 1 [[CASTRES]], i64 5
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 1
// CHECK: [[RET:%.*]] = alloca [[UNION:%.*]], align 8
// CHECK: [[RES:%.*]] = alloca [[UNION]], align 8
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CAST]]
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align 8 [[CAST]]
// CHECK: [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
// CHECK: [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]]
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align 8 [[CASTRET]], i8* align 8 [[CASTRES]]
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 8
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP5]]
uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP5]]
int8x16x2_t test_vld2q_s8(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld2q_s16(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld2q_s32(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_s64(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld2q_f16(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld2q_f32(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_f64(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP5]]
poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x2_t [[TMP5]]
uint8x8x2_t test_vld2_u8(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_u16(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_u32(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_u64(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x2_t [[TMP5]]
int8x8x2_t test_vld2_s8(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld2_s16(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld2_s32(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld2_s64(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld2_f16(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld2_f32(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_f64(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
poly8x8x2_t test_vld2_p8(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_p16(poly16_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP5]]
int8x16x3_t test_vld3q_s8(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x3_t [[TMP6]]
int16x8x3_t test_vld3q_s16(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x3_t [[TMP6]]
int32x4x3_t test_vld3q_s32(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld3q_s64(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x3_t [[TMP6]]
float16x8x3_t test_vld3q_f16(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x3_t [[TMP6]]
float32x4x3_t test_vld3q_f32(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld3q_f64(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 48, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x3_t [[TMP5]]
poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x3_t [[TMP5]]
uint8x8x3_t test_vld3_u8(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
uint16x4x3_t test_vld3_u16(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
uint32x2x3_t test_vld3_u32(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
uint64x1x3_t test_vld3_u64(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x3_t [[TMP5]]
int8x8x3_t test_vld3_s8(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x3_t [[TMP6]]
int16x4x3_t test_vld3_s16(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x3_t [[TMP6]]
int32x2x3_t test_vld3_s32(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x3_t [[TMP6]]
int64x1x3_t test_vld3_s64(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x3_t [[TMP6]]
float16x4x3_t test_vld3_f16(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x3_t [[TMP6]]
float32x2x3_t test_vld3_f32(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld3_f64(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 24, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x3_t [[TMP5]]
poly8x8x3_t test_vld3_p8(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
poly16x4x3_t test_vld3_p16(poly16_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x4_t [[TMP5]]
uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x4_t [[TMP5]]
int8x16x4_t test_vld4q_s8(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld4q_s16(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld4q_s32(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld4q_s64(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld4q_f16(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld4q_f32(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld4q_f64(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 64, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x4_t [[TMP5]]
poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x4_t [[TMP5]]
uint8x8x4_t test_vld4_u8(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_u16(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_u32(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_u64(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x4_t [[TMP5]]
int8x8x4_t test_vld4_s8(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_s16(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_s32(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_s64(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_f16(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_f32(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld4_f64(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 32, i1 false)
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x4_t [[TMP5]]
poly8x8x4_t test_vld4_p8(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_p16(poly16_t const *a) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP4]]
int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP4]]
poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x2_t [[TMP4]]
int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP4]]
uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP4]]
int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x3_t [[TMP6]]
int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x3_t [[TMP6]]
int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x3_t [[TMP6]]
float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x3_t [[TMP6]]
float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x3_t [[TMP4]]
poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x3_t [[TMP4]]
uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x3_t [[TMP4]]
int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x3_t [[TMP6]]
int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x3_t [[TMP6]]
int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x3_t [[TMP6]]
int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x3_t [[TMP6]]
float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x3_t [[TMP6]]
float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x3_t [[TMP4]]
poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x4_t [[TMP4]]
uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x4_t [[TMP4]]
int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD1XN]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x4_t [[TMP4]]
poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x4_t [[TMP4]]
uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x4_t [[TMP4]]
int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD1XN]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x4_t [[TMP4]]
poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP4]]
uint8x16x2_t test_vld2q_dup_u8(uint8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
uint16x8x2_t test_vld2q_dup_u16(uint16_t *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
uint32x4x2_t test_vld2q_dup_u32(uint32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP4]]
int8x16x2_t test_vld2q_dup_s8(int8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x2_t [[TMP6]]
int16x8x2_t test_vld2q_dup_s16(int16_t *a) {
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x2_t [[TMP6]]
int32x4x2_t test_vld2q_dup_s32(int32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP6]]
int64x2x2_t test_vld2q_dup_s64(int64_t *a) {
// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], { <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x2_t [[TMP6]]
float16x8x2_t test_vld2q_dup_f16(float16_t *a) {
// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x2_t [[TMP6]]
float32x4x2_t test_vld2q_dup_f32(float32_t *a) {
// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP6]]
float64x2x2_t test_vld2q_dup_f64(float64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP4]]
poly8x16x2_t test_vld2q_dup_p8(poly8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
poly16x8x2_t test_vld2q_dup_p16(poly16_t *a) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x2_t [[TMP4]]
uint8x8x2_t test_vld2_dup_u8(uint8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
uint16x4x2_t test_vld2_dup_u16(uint16_t *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
uint32x2x2_t test_vld2_dup_u32(uint32_t *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
uint64x1x2_t test_vld2_dup_u64(uint64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x2_t [[TMP4]]
int8x8x2_t test_vld2_dup_s8(int8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x2_t [[TMP6]]
int16x4x2_t test_vld2_dup_s16(int16_t *a) {
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x2_t [[TMP6]]
int32x2x2_t test_vld2_dup_s32(int32_t *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x2_t [[TMP6]]
int64x1x2_t test_vld2_dup_s64(int64_t *a) {
// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], { <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x2_t [[TMP6]]
float16x4x2_t test_vld2_dup_f16(float16_t *a) {
// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x2_t [[TMP6]]
float32x2x2_t test_vld2_dup_f32(float32_t *a) {
// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP6]]
float64x1x2_t test_vld2_dup_f64(float64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 16, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x2_t [[TMP4]]
poly8x8x2_t test_vld2_dup_p8(poly8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
poly16x4x2_t test_vld2_dup_p16(poly16_t *a) {
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_dup_p64(poly64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP4]]
uint8x16x3_t test_vld3q_dup_u8(uint8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
uint16x8x3_t test_vld3q_dup_u16(uint16_t *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
uint32x4x3_t test_vld3q_dup_u32(uint32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP4]]
int8x16x3_t test_vld3q_dup_s8(int8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x3_t [[TMP6]]
int16x8x3_t test_vld3q_dup_s16(int16_t *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x3_t [[TMP6]]
int32x4x3_t test_vld3q_dup_s32(int32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP6]]
int64x2x3_t test_vld3q_dup_s64(int64_t *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x3_t [[TMP6]]
float16x8x3_t test_vld3q_dup_f16(float16_t *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x3_t [[TMP6]]
float32x4x3_t test_vld3q_dup_f32(float32_t *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP6]]
float64x2x3_t test_vld3q_dup_f64(float64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 48, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x3_t [[TMP4]]
poly8x16x3_t test_vld3q_dup_p8(poly8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
poly16x8x3_t test_vld3q_dup_p16(poly16_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x3_t [[TMP4]]
uint8x8x3_t test_vld3_dup_u8(uint8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
uint16x4x3_t test_vld3_dup_u16(uint16_t *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
uint32x2x3_t test_vld3_dup_u32(uint32_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
uint64x1x3_t test_vld3_dup_u64(uint64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x3_t [[TMP4]]
int8x8x3_t test_vld3_dup_s8(int8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x3_t [[TMP6]]
int16x4x3_t test_vld3_dup_s16(int16_t *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x3_t [[TMP6]]
int32x2x3_t test_vld3_dup_s32(int32_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x3_t [[TMP6]]
int64x1x3_t test_vld3_dup_s64(int64_t *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x3_t [[TMP6]]
float16x4x3_t test_vld3_dup_f16(float16_t *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x3_t [[TMP6]]
float32x2x3_t test_vld3_dup_f32(float32_t *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP6]]
float64x1x3_t test_vld3_dup_f64(float64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 24, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x3_t [[TMP4]]
poly8x8x3_t test_vld3_dup_p8(poly8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
poly16x4x3_t test_vld3_dup_p16(poly16_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x4_t [[TMP4]]
uint8x16x4_t test_vld4q_dup_u8(uint8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
uint16x8x4_t test_vld4q_dup_u16(uint16_t *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
uint32x4x4_t test_vld4q_dup_u32(uint32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x4_t [[TMP4]]
int8x16x4_t test_vld4q_dup_s8(int8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x4_t [[TMP6]]
int16x8x4_t test_vld4q_dup_s16(int16_t *a) {
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x4_t [[TMP6]]
int32x4x4_t test_vld4q_dup_s32(int32_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP6]]
int64x2x4_t test_vld4q_dup_s64(int64_t *a) {
// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x4_t [[TMP6]]
float16x8x4_t test_vld4q_dup_f16(float16_t *a) {
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x4_t [[TMP6]]
float32x4x4_t test_vld4q_dup_f32(float32_t *a) {
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP6]]
float64x2x4_t test_vld4q_dup_f64(float64_t *a) {
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP2]], i8* align 16 [[TMP3]], i64 64, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x4_t [[TMP4]]
poly8x16x4_t test_vld4q_dup_p8(poly8_t *a) {
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
poly16x8x4_t test_vld4q_dup_p16(poly16_t *a) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x4_t [[TMP4]]
uint8x8x4_t test_vld4_dup_u8(uint8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
uint16x4x4_t test_vld4_dup_u16(uint16_t *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
uint32x2x4_t test_vld4_dup_u32(uint32_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
uint64x1x4_t test_vld4_dup_u64(uint64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x4_t [[TMP4]]
int8x8x4_t test_vld4_dup_s8(int8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x4_t [[TMP6]]
int16x4x4_t test_vld4_dup_s16(int16_t *a) {
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x4_t [[TMP6]]
int32x2x4_t test_vld4_dup_s32(int32_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x4_t [[TMP6]]
int64x1x4_t test_vld4_dup_s64(int64_t *a) {
// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x4_t [[TMP6]]
float16x4x4_t test_vld4_dup_f16(float16_t *a) {
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x4_t [[TMP6]]
float32x2x4_t test_vld4_dup_f32(float32_t *a) {
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP6]]
float64x1x4_t test_vld4_dup_f64(float64_t *a) {
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP2]], i8* align 8 [[TMP3]], i64 32, i1 false)
// CHECK: [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x4_t [[TMP4]]
poly8x8x4_t test_vld4_dup_p8(poly8_t *a) {
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
poly16x4x4_t test_vld4_dup_p16(poly16_t *a) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_dup_p64(poly64_t *a) {
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x2_t [[TMP8]]
int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x3_t [[TMP9]]
int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x2_t [[TMP13]]
uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x2_t [[TMP13]]
uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x2_t [[TMP13]]
uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x2_t [[TMP13]]
int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x2_t [[TMP13]]
int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x2_t [[TMP13]]
int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], { <8 x half>, <8 x half> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x2_t [[TMP13]]
float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x2_t [[TMP13]]
float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x2_t [[TMP13]]
float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x2_t [[TMP13]]
poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP13]]
poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x2_t [[TMP13]]
uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x2_t [[TMP13]]
uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x2_t [[TMP13]]
uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x2_t [[TMP8]]
int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x2_t [[TMP13]]
int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x2_t [[TMP13]]
int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x2_t [[TMP13]]
int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], { <4 x half>, <4 x half> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x2_t [[TMP13]]
float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x2_t [[TMP13]]
float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x2_t [[TMP13]]
float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x2_t [[TMP13]]
poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
// CHECK: [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP13]]
poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x3_t [[TMP16]]
uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x3_t [[TMP16]]
uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x3_t [[TMP16]]
uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x3_t [[TMP16]]
int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x3_t [[TMP16]]
int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x3_t [[TMP16]]
int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x3_t [[TMP16]]
float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x3_t [[TMP16]]
float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x3_t [[TMP16]]
float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x3_t [[TMP9]]
poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x3_t [[TMP16]]
poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP16]]
poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x3_t [[TMP9]]
uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x3_t [[TMP16]]
uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x3_t [[TMP16]]
uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x3_t [[TMP16]]
uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x3_t [[TMP9]]
int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x3_t [[TMP16]]
int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x3_t [[TMP16]]
int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x3_t [[TMP16]]
int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x3_t [[TMP16]]
float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x3_t [[TMP16]]
float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x3_t [[TMP16]]
float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
// CHECK: [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x3_t [[TMP9]]
poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x3_t [[TMP16]]
poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
// CHECK: [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP16]]
poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint8x16x4_t [[TMP10]]
uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint16x8x4_t [[TMP19]]
uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint32x4x4_t [[TMP19]]
uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.uint64x2x4_t [[TMP19]]
uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int8x16x4_t [[TMP10]]
int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int16x8x4_t [[TMP19]]
int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int32x4x4_t [[TMP19]]
int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.int64x2x4_t [[TMP19]]
int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float16x8x4_t [[TMP19]]
float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float32x4x4_t [[TMP19]]
float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.float64x2x4_t [[TMP19]]
float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly8x16x4_t [[TMP10]]
poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly16x8x4_t [[TMP19]]
poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP19]]
poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint8x8x4_t [[TMP10]]
uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint16x4x4_t [[TMP19]]
uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint32x2x4_t [[TMP19]]
uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.uint64x1x4_t [[TMP19]]
uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int8x8x4_t [[TMP10]]
int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int16x4x4_t [[TMP19]]
int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int32x2x4_t [[TMP19]]
int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.int64x1x4_t [[TMP19]]
int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float16x4x4_t [[TMP19]]
float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float32x2x4_t [[TMP19]]
float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.float64x1x4_t [[TMP19]]
float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
// CHECK: [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly8x8x4_t [[TMP10]]
poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly16x4x4_t [[TMP19]]
poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
// CHECK: [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP19]]
poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) {
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast double* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
// CHECK: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i64 16, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i64 32, i1 false) #2
// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false) #2
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false) #2
// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
// CHECK: [[TMP8:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
// CHECK: store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
// CHECK: [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
// CHECK: [[TMP2:%.*]] = bitcast i64* %ptr to i8*
// CHECK: [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
return vzip_f16(a, b);
}
// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
return vzipq_f16(a, b);
}
// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
return vuzp_f16(a, b);
}
// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
return vuzpq_f16(a, b);
}
// CHECK: store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i64 16, i1 false)
float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
return vtrn_f16(a, b);
}
// CHECK: store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
// CHECK: [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP5]], i8* [[TMP6]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i64 32, i1 false)
float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
return vtrnq_f16(a, b);
}
// APCS-GNU: %[[a:.*]] = alloca %struct.s35, align 16
// APCS-GNU: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
// APCS-GNU: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
-// APCS-GNU: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// APCS-GNU: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align {{[0-9]+}} %[[b]], i8* align {{[0-9]+}} %[[c]]
// APCS-GNU: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
// APCS-GNU: load <4 x float>, <4 x float>* %[[d]], align 16
// AAPCS-LABEL: define arm_aapcscc <4 x float> @f35(i32 %i, %struct.s35* byval align 8, %struct.s35* byval align 8)
// AAPCS: %[[a:.*]] = alloca %struct.s35, align 16
// AAPCS: %[[b:.*]] = bitcast %struct.s35* %[[a]] to i8*
// AAPCS: %[[c:.*]] = bitcast %struct.s35* %0 to i8*
-// AAPCS: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[b]], i8* %[[c]]
+// AAPCS: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %[[b]], i8* align 8 %[[c]]
// AAPCS: %[[d:.*]] = bitcast %struct.s35* %[[a]] to <4 x float>*
// AAPCS: load <4 x float>, <4 x float>* %[[d]], align 16
// IR: callee_b0f(i64 [[ARG:%.*]])
// IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8
// IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8*
-// IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4
+// IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 4 [[BITCAST]], i64 4
return bp11.b2;
}
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
// CHECK: [[TMP4:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 16, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 48, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 24, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i64* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP1]], i8* align 16 [[TMP2]], i32 64, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i32* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast half* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast float* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
// CHECK: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
// CHECK: store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
// CHECK: [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
// CHECK: [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP1]], i8* align 8 [[TMP2]], i32 32, i1 false)
// CHECK: [[TMP3:%.*]] = bitcast i16* %a to i8*
// CHECK: [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !3
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
return vtrn_s8(a, b);
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !6
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
return vtrn_s16(a, b);
// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !9
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
return vtrn_s32(a, b);
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !12
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
return vtrn_u8(a, b);
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !15
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
return vtrn_u16(a, b);
// CHECK: store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP4]], !noalias !18
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
return vtrn_u32(a, b);
// CHECK: store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP4]], !noalias !21
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
return vtrn_f32(a, b);
// CHECK: store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]], !noalias !24
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
return vtrn_p8(a, b);
// CHECK: store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP4]], !noalias !27
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
return vtrn_p16(a, b);
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
return vtrnq_s8(a, b);
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
return vtrnq_s16(a, b);
// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
return vtrnq_s32(a, b);
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
return vtrnq_u8(a, b);
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
return vtrnq_u16(a, b);
// CHECK: store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
return vtrnq_u32(a, b);
// CHECK: store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
return vtrnq_f32(a, b);
// CHECK: store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
return vtrnq_p8(a, b);
// CHECK: store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
return vtrnq_p16(a, b);
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !57
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
return vuzp_s8(a, b);
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !60
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
return vuzp_s16(a, b);
// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !63
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
return vuzp_s32(a, b);
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !66
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
return vuzp_u8(a, b);
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !69
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
return vuzp_u16(a, b);
// CHECK: store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP4]], !noalias !72
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
return vuzp_u32(a, b);
// CHECK: store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP4]], !noalias !75
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
return vuzp_f32(a, b);
// CHECK: store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]], !noalias !78
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
return vuzp_p8(a, b);
// CHECK: store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP4]], !noalias !81
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
return vuzp_p16(a, b);
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
return vuzpq_s8(a, b);
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
return vuzpq_s16(a, b);
// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
return vuzpq_s32(a, b);
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
return vuzpq_u8(a, b);
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
return vuzpq_u16(a, b);
// CHECK: store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
return vuzpq_u32(a, b);
// CHECK: store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
return vuzpq_f32(a, b);
// CHECK: store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
return vuzpq_p8(a, b);
// CHECK: store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
return vuzpq_p16(a, b);
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !111
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
return vzip_s8(a, b);
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !114
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
return vzip_s16(a, b);
// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !117
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
return vzip_s32(a, b);
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !120
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
return vzip_u8(a, b);
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !123
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
return vzip_u16(a, b);
// CHECK: store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP4]], !noalias !126
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
return vzip_u32(a, b);
// CHECK: store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP4]], !noalias !129
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
return vzip_f32(a, b);
// CHECK: store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]], !noalias !132
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP3]], i8* align 8 [[TMP4]], i32 16, i1 false)
// CHECK: ret void
poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
return vzip_p8(a, b);
// CHECK: store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP4]], !noalias !135
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[TMP5]], i8* align 8 [[TMP6]], i32 16, i1 false)
// CHECK: ret void
poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
return vzip_p16(a, b);
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138
// CHECK: [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
return vzipq_s8(a, b);
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141
// CHECK: [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
return vzipq_s16(a, b);
// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144
// CHECK: [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
return vzipq_s32(a, b);
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147
// CHECK: [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
return vzipq_u8(a, b);
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
return vzipq_u16(a, b);
// CHECK: store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153
// CHECK: [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
return vzipq_u32(a, b);
// CHECK: store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156
// CHECK: [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
return vzipq_f32(a, b);
// CHECK: store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159
// CHECK: [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
// CHECK: [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP3]], i8* align 16 [[TMP4]], i32 32, i1 false)
// CHECK: ret void
poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
return vzipq_p8(a, b);
// CHECK: store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162
// CHECK: [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
// CHECK: [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[TMP5]], i8* align 16 [[TMP6]], i32 32, i1 false)
// CHECK: ret void
poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
return vzipq_p16(a, b);
// CHECK: [[TEMP:%.*]] = alloca [[QUAD_T:%.*]], align 8
// CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i8*
// CHECK-NEXT: [[T1:%.*]] = bitcast [[QUAD_T]]* {{%.*}} to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 32, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T0]], i8* align 8 [[T1]], i64 32, i1 false)
// CHECK-NEXT: [[T0:%.*]] = bitcast [[QUAD_T]]* [[TEMP]] to i256*
// CHECK-NEXT: [[T1:%.*]] = bitcast i256* [[T0]] to i8*
// CHECK-NEXT: call void @__atomic_store(i64 32, i8* bitcast ([[QUAD_T]]* @a_pointer_quad to i8*), i8* [[T1]], i32 5)
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[BYREF]], [[BYREF]]* [[T0]], i32 0, i32 4
// CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
// Verify that there's nothing else significant in the function.
// CHECK-NEXT: [[T0:%.*]] = bitcast [[BYREF]]* [[A]] to i8*
// CHECK-NEXT: call void @_Block_object_dispose(i8* [[T0]], i32 8)
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[B_BYREF]], [[B_BYREF]]* [[T0]], i32 0, i32 4
// CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
// Then for 'a':
// CHECK-NEXT: [[A_FORWARDING:%.*]] = getelementptr inbounds [[A_BYREF]], [[A_BYREF]]* [[A]], i32 0, i32 1
// CHECK-NEXT: [[T0:%.*]] = load [[A_BYREF]]*, [[A_BYREF]]** [[A_FORWARDING]]
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[A_BYREF]], [[A_BYREF]]* [[T0]], i32 0, i32 4
// CHECK-NEXT: [[T2:%.*]] = bitcast [[AGG]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[AGG]]* [[TEMP]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T2]], i8* [[T3]], i64 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T2]], i8* align 4 [[T3]], i64 4, i1 false)
// Verify that there's nothing else significant in the function.
// CHECK-NEXT: [[T0:%.*]] = bitcast [[B_BYREF]]* [[B]] to i8*
// CHECK-NEXT: call void @_Block_object_dispose(i8* [[T0]], i32 8)
int test7(int *p) {
struct snd_pcm_hw_params_t* hwparams; // incomplete type.
- // CHECK: call void @llvm.memset{{.*}}256, i32 4, i1 false)
+ // CHECK: call void @llvm.memset{{.*}} align 4 {{.*}}256, i1 false)
__builtin_memset(p, 0, 256); // Should be alignment = 4
- // CHECK: call void @llvm.memset{{.*}}256, i32 1, i1 false)
+ // CHECK: call void @llvm.memset{{.*}} align 1 {{.*}}256, i1 false)
__builtin_memset((char*)p, 0, 256); // Should be alignment = 1
__builtin_memset(hwparams, 0, 256); // No crash alignment = 1
- // CHECK: call void @llvm.memset{{.*}}256, i32 1, i1 false)
+ // CHECK: call void @llvm.memset{{.*}} align 1{{.*}}256, i1 false)
}
// <rdar://problem/11314941>
struct PS ps;
void test8(int *arg) {
// CHECK: @test8
- // CHECK: call void @llvm.memcpy{{.*}} 16, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}} align 1 {{.*}} align 1 {{.*}} 16, i1 false)
__builtin_memcpy(arg, ps.modes, sizeof(struct PS));
}
__attribute((aligned(16))) int x[4], y[4];
void test9() {
// CHECK: @test9
- // CHECK: call void @llvm.memcpy{{.*}} 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}} align 16 {{.*}} align 16 {{.*}} 16, i1 false)
__builtin_memcpy(x, y, sizeof(y));
}
// CHECK-NEXT: [[T0:%.*]] = load [[S]]*, [[S]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 2 [[T2]], i32 8, i1 false)
// CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[TMP0]] to i64*
// CHECK-NEXT: [[T4:%.*]] = load i64, i64* [[T3]], align 8
// CHECK-NEXT: [[T5:%.*]] = bitcast [[S]]* [[T0]] to i64*
// CHECK-NEXT: [[P:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[P]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
__c11_atomic_init(fp, (PS){1,2,3});
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T0]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T0]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[X]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP0]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 2 [[T2]], i32 6, i1 false)
PS f = *fp;
// CHECK-NEXT: [[T0:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[TMP1]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T1]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T1]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP1]], i32 0, i32 0
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T2]], i8* align 2 [[T3]], i32 6, i1 false)
// CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[TMP1]] to i64*
// CHECK-NEXT: [[T5:%.*]] = load i64, i64* [[T4]], align 8
// CHECK-NEXT: [[T6:%.*]] = bitcast [[APS]]* [[T0]] to i64*
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 2 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_load(addr, 5);
}
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 8
// CHECK: store atomic i64 [[VAL64]], i64* [[ADDR64]] seq_cst, align 8
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[ATOMIC_RES64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_RES]] to i64*
// CHECK: [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 8
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 2 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_exchange(addr, *val, 5);
}
// CHECK: [[NEW:%.*]] = load %struct.PS*, %struct.PS** [[NEW_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[NEW8:%.*]] = bitcast %struct.PS* [[NEW]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[NEW8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[NEW8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_DESIRED8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED:%.*]] to i8*
// CHECK: [[DESIRED8:%.*]] = bitcast %struct.PS* [[DESIRED]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_DESIRED8]], i8* [[DESIRED8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_DESIRED8]], i8* align 2 [[DESIRED8]], i64 6, i1 false)
// CHECK: [[ATOMIC_DESIRED64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED:%.*]] to i64*
// CHECK: [[ATOMIC_NEW8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_NEW8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_NEW8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_NEW64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i64*
// CHECK: [[ATOMIC_DESIRED_VAL64:%.*]] = load i64, i64* [[ATOMIC_DESIRED64]], align 8
// CHECK: [[ATOMIC_NEW_VAL64:%.*]] = load i64, i64* [[ATOMIC_NEW64]], align 8
// CHECK-NEXT: [[T0:%.*]] = load [[S]]*, [[S]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[S]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 8, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 2 [[T2]], i32 8, i1 false)
// CHECK-NEXT: [[T3:%.*]] = bitcast [[S]]* [[T0]] to i8*
// CHECK-NEXT: [[T4:%.*]] = bitcast [[S]]* [[TMP0]] to i8*
// CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T3]], i8* [[T4]], i32 5)
// CHECK-NEXT: [[P:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[P]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[P]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
__c11_atomic_init(fp, (PS){1,2,3});
// CHECK-NEXT: [[T0:%.*]] = bitcast [[APS]]* [[X]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T0]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T0]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[X]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[PS]], [[PS]]* [[T0]], i32 0, i32 0
// CHECK-NEXT: store i16 1, i16* [[T1]], align 8
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP0]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = bitcast [[PS]]* [[F]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 2 [[T2]], i32 6, i1 false)
PS f = *fp;
// CHECK-NEXT: [[T0:%.*]] = load [[APS]]*, [[APS]]** [[FP]]
// CHECK-NEXT: [[T1:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[TMP1]] to i8*
-// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* [[T1]], i8 0, i32 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* align 8 [[T1]], i8 0, i32 8, i1 false)
// CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP1]], i32 0, i32 0
// CHECK-NEXT: [[T2:%.*]] = bitcast [[PS]]* [[T1]] to i8*
// CHECK-NEXT: [[T3:%.*]] = bitcast [[PS]]* [[F]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T2]], i8* [[T3]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T2]], i8* align 2 [[T3]], i32 6, i1 false)
// CHECK-NEXT: [[T4:%.*]] = bitcast [[APS]]* [[T0]] to i8*
// CHECK-NEXT: [[T5:%.*]] = bitcast [[APS]]* [[TMP1]] to i8*
// CHECK-NEXT: call arm_aapcscc void @__atomic_store(i32 8, i8* [[T4]], i8* [[T5]], i32 5)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[APS]], [[APS]]* [[TMP3]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = bitcast %struct.PS* [[TMP2]] to i8*
// CHECK-NEXT: [[T2:%.*]] = bitcast %struct.PS* [[T0]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[T1]], i8* [[T2]], i32 6, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[T1]], i8* align 2 [[T2]], i32 6, i1 false)
// CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds %struct.PS, %struct.PS* [[TMP2]], i32 0, i32 0
// CHECK-NEXT: [[T1:%.*]] = load i16, i16* [[T0]], align 2
// CHECK-NEXT: [[T2:%.*]] = sext i16 [[T1]] to i32
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 2 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_load(addr, 5);
}
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
// CHECK: [[VAL64:%.*]] = load i64, i64* [[ATOMIC_VAL64]], align 2
// CHECK: [[VAL:%.*]] = load %struct.PS*, %struct.PS** [[VAL_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[VAL8:%.*]] = bitcast %struct.PS* [[VAL]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[VAL8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[VAL8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_VAL8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_VAL8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_VAL8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_VAL64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_VAL]] to i64*
// CHECK: [[ATOMIC_RES64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_RES]] to i64*
// CHECK: [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
// CHECK: [[ATOMIC_RES_STRUCT:%.*]] = bitcast i64* [[ATOMIC_RES64]] to %struct.PS*
// CHECK: [[AGG_RESULT8:%.*]] = bitcast %struct.PS* %agg.result to i8*
// CHECK: [[ATOMIC_RES8:%.*]] = bitcast %struct.PS* [[ATOMIC_RES_STRUCT]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[AGG_RESULT8]], i8* [[ATOMIC_RES8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[AGG_RESULT8]], i8* align 2 [[ATOMIC_RES8]], i32 6, i1 false)
return __c11_atomic_exchange(addr, *val, 5);
}
// CHECK: [[NEW:%.*]] = load %struct.PS*, %struct.PS** [[NEW_ARG]], align 4
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
// CHECK: [[NEW8:%.*]] = bitcast %struct.PS* [[NEW]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[NONATOMIC_TMP8]], i8* [[NEW8]], i32 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[NONATOMIC_TMP8]], i8* align 2 [[NEW8]], i32 6, i1 false)
// CHECK: [[ADDR64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ADDR]] to i64*
// CHECK: [[ATOMIC_DESIRED8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED]] to i8*
// CHECK: [[DESIRED8:%.*]] = bitcast %struct.PS* [[DESIRED]]to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_DESIRED8]], i8* [[DESIRED8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_DESIRED8]], i8* align 2 [[DESIRED8]], i64 6, i1 false)
// CHECK: [[ATOMIC_DESIRED64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_DESIRED]] to i64*
// CHECK: [[ATOMIC_NEW8:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i8*
// CHECK: [[NONATOMIC_TMP8:%.*]] = bitcast %struct.PS* [[NONATOMIC_TMP]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[ATOMIC_NEW8]], i8* [[NONATOMIC_TMP8]], i64 6, i32 2, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[ATOMIC_NEW8]], i8* align 2 [[NONATOMIC_TMP8]], i64 6, i1 false)
// CHECK: [[ATOMIC_NEW64:%.*]] = bitcast { %struct.PS, [2 x i8] }* [[ATOMIC_NEW]] to i64*
// CHECK: [[ADDR8:%.*]] = bitcast i64* [[ADDR64]] to i8*
// CHECK: [[ATOMIC_DESIRED8:%.*]] = bitcast i64* [[ATOMIC_DESIRED64]] to i8*
// CHECK-NEXT: store i32 [[TMP]], i32* [[CY]]
// CHECK-NEXT: [[SI8:%[a-zA-Z0-9.]+]] = bitcast [[STRUCT]]* [[S]] to i8*
// CHECK-NEXT: [[COMPOUNDLITI8:%[a-zA-Z0-9.]+]] = bitcast [[STRUCT]]* [[COMPOUNDLIT]] to i8*
- // CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* [[SI8]], i8* [[COMPOUNDLITI8]]
+ // CHECK-NEXT: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[SI8]], i8* align {{[0-9]+}} [[COMPOUNDLITI8]]
s = (S){s.y,s.x};
// CHECK-NEXT: ret void
}
// CHECK-NEXT: [[T0:%.*]] = bitcast i48* [[COERCE_TEMP]] to i8*
// CHECK-NEXT: [[T1:%.*]] = bitcast [[G]]* [[RESULT]] to i8*
- // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 6
+ // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} [[T0]], i8* align {{[0-9]+}} [[T1]], i64 6
// CHECK-NEXT: [[T0:%.*]] = load i48, i48* [[COERCE_TEMP]]
// CHECK-NEXT: ret i48 [[T0]]
}
// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo{{$}}
// CHECK: store %struct.Foo [[RESULT]], %struct.Foo* [[LOC:%[a-z_0-9]+]]
// CHECK: [[LOC2:%[a-z_0-9]+]] = bitcast {{.*}} [[LOC]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[LOC2]]
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* align {{[0-9]+}} [[LOC2]]
void skip_struct(va_list *args) {
va_arg(*args, struct Foo);
}
// CHECK-I386: define{{.*}}void @test__stosb
-// CHECK-I386: tail call void @llvm.memset.p0i8.i32(i8* %Dest, i8 %Data, i32 %Count, i32 1, i1 true)
+// CHECK-I386: tail call void @llvm.memset.p0i8.i32(i8* align 1 %Dest, i8 %Data, i32 %Count, i1 true)
// CHECK-I386: ret void
// CHECK-I386: }
// CHECK-X64: define{{.*}}void @test__stosb
-// CHECK-X64: tail call void @llvm.memset.p0i8.i64(i8* %Dest, i8 %Data, i64 %Count, i32 1, i1 true)
+// CHECK-X64: tail call void @llvm.memset.p0i8.i64(i8* align 1 %Dest, i8 %Data, i64 %Count, i1 true)
// CHECK-X64: ret void
// CHECK-X64: }
// CHECK: %[[LS:.*]] = alloca %struct.s, align 4
// CHECK-NEXT: %[[ZERO:.*]] = bitcast %struct.s* %[[LS]] to i8*
// CHECK-NEXT: %[[ONE:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[ZERO]], i8* %[[ONE]], i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[ZERO]], i8* align 4 %[[ONE]], i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
// CHECK-NEXT: %[[TWO:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[TWO]], i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[TWO]], i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
struct s1 {
s.y = gs;
}
// CHECK-LABEL: define void @fee()
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
// <rdar://problem/10463337>
struct X test1() {
// CHECK: @test1
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
return g.y;
}
struct X test2() {
// CHECK: @test2
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
struct X a = g.y;
return a;
}
void test3(struct X a) {
// CHECK: @test3
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* {{.*}}, i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* {{.*}}, i64 24, i1 false)
g.y = a;
}
// <rdar://problem/10530444>
void test4() {
// CHECK: @test4
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i64 24, i1 false)
f(g.y);
}
// <rdar://problem/11220251>
void test6() {
// CHECK: @test6
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* %{{.*}}, i64 24, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 bitcast (%struct.X* getelementptr inbounds (%struct.Y, %struct.Y* @g, i32 0, i32 1) to i8*), i8* align 1 %{{.*}}, i64 24, i1 false)
g.y = foo();
}
// CHECK-FUNCTIONS: ret i32 [[s0_load_y]]
int s0_load_y(struct s0 *a) { return a->y; }
// CHECK-FUNCTIONS-LABEL: define void @s0_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 4, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 {{.*}}, i8* align 4 {{.*}}, i64 8, i1 false)
void s0_copy(struct s0 *a, struct s0 *b) { *b = *a; }
//
// CHECK-FUNCTIONS: ret i32 [[s1_load_y]]
int s1_load_y(struct s1 *a) { return a->y; }
// CHECK-FUNCTIONS-LABEL: define void @s1_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 1, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i1 false)
void s1_copy(struct s1 *a, struct s1 *b) { *b = *a; }
//
// CHECK-FUNCTIONS: ret i32 [[s2_load_y]]
int s2_load_y(struct s2 *a) { return a->y; }
// CHECK-FUNCTIONS-LABEL: define void @s2_copy
-// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 2, i1 false)
+// CHECK-FUNCTIONS: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 {{.*}}, i8* align 2 {{.*}}, i64 8, i1 false)
void s2_copy(struct s2 *a, struct s2 *b) { *b = *a; }
struct __attribute__((packed, aligned)) s3 {
// CHECK-LABEL: test1
void test1(void)
{
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
// CHECK: store i8 120, i8* %
struct LP1 l = { .p1 = g1, .p1.x[2] = 'x' };
// CHECK-LABEL: test2
void test2(void)
{
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
// CHECK: store i8 114, i8* %
struct LP1 l = { .p1 = g1, .p1.x[1] = 'r' };
// CHECK-LABEL: test3
void test3(void)
{
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P2* @g2{{.*}}i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P2* @g2{{.*}}i64 12, i1 false)
// CHECK: store i32 10, i32* %
struct LP2 l = { .p2 = g2, .p2.b = 10 };
// CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get123()
// CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9]+]]
// CHECK: [[TMP1:%[a-z0-9]+]] = bitcast {{.*}}[[TMP0]]
- // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i1 false)
// CHECK: store i32 100, i32* %
struct LUP2 { union UP2 up; } var = { get123(), .up.p2.a = 100 };
void test5(void)
{
// .l3 = g3
- // CHECK: call void @llvm.memcpy{{.*}}%struct.LP3, %struct.LP3* @g3{{.*}}i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.LP3, %struct.LP3* @g3{{.*}}i64 12, i1 false)
// .l3.p1 = { [0] = g1 } implicitly sets [1] to zero
- // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}%struct.P1, %struct.P1* @g1{{.*}}i64 6, i1 false)
// CHECK: getelementptr{{.*}}%struct.P1, %struct.P1*{{.*}}i64 1
- // CHECK: call void @llvm.memset{{.*}}i8 0, i64 6, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memset{{.*}}i8 0, i64 6, i1 false)
// .l3.p1[1].x[1] = 'x'
// CHECK: store i8 120, i8* %
// CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get235()
// CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9]+]]
// CHECK: [[TMP1:%[a-z0-9]+]] = bitcast {{.*}}[[TMP0]]
- // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i32 {{[0-9]}}, i1 false)
+ // CHECK: call void @llvm.memcpy{{.*}}[[TMP1]], i64 12, i1 false)
// CHECK: store i32 10, i32* %
// CHECK-PPC-NEXT: [[AGGR:%[a-z0-9]+]] = load %struct.x*, %struct.x** [[VAARG_ADDR]]
// CHECK-PPC-NEXT: [[DEST:%[0-9]+]] = bitcast %struct.x* %t to i8*
// CHECK-PPC-NEXT: [[SRC:%.+]] = bitcast %struct.x* [[AGGR]] to i8*
-// CHECK-PPC-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[DEST]], i8* [[SRC]], i32 16, i32 8, i1 false)
+// CHECK-PPC-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i1 false)
int v = va_arg (ap, int);
// CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test1*
// CHECK: [[DEST:%.*]] = bitcast %struct.test1* %y to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test1* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 8, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i64 8, i1 false)
struct test1 test1va (int x, ...)
{
struct test1 y;
// CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test2*
// CHECK: [[DEST:%.*]] = bitcast %struct.test2* %y to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test2* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
struct test2 test2va (int x, ...)
{
struct test2 y;
// CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test3*
// CHECK: [[DEST:%.*]] = bitcast %struct.test3* %y to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test3* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 32, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 32, i1 false)
struct test3 test3va (int x, ...)
{
struct test3 y;
// CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test4*
// CHECK: [[DEST:%.*]] = bitcast %struct.test4* %y to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test4* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i64 12, i1 false)
struct test4 test4va (int x, ...)
{
struct test4 y;
// CHECK: [[T0:%.*]] = bitcast i8* %[[CUR]] to %struct.test_longdouble*
// CHECK: [[DEST:%.*]] = bitcast %struct.test_longdouble* %y to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test_longdouble* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i64 16, i1 false)
struct test_longdouble { long double x; };
struct test_longdouble testva_longdouble (int x, ...)
{
// CHECK: [[T0:%.*]] = bitcast i8* %[[ALIGN]] to %struct.test_vector*
// CHECK: [[DEST:%.*]] = bitcast %struct.test_vector* %y to i8*
// CHECK: [[SRC:%.*]] = bitcast %struct.test_vector* [[T0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DEST]], i8* [[SRC]], i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[DEST]], i8* align 16 [[SRC]], i64 16, i1 false)
struct test_vector { vector int x; };
struct test_vector testva_vector (int x, ...)
{
// CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f3, align 4
// CHECK: %[[TMP1:[^ ]+]] = alloca [2 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f3* @global_f3 to i8*), i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[TMP2]], i8* align 4 bitcast (%struct.f3* @global_f3 to i8*), i64 12, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP1]]
// CHECK-LE: call { i64, i64 } @func_f3([2 x i64] %[[TMP3]])
// CHECK-BE: call void @func_f3(%struct.f3* sret %[[TMP0]], [2 x i64] %[[TMP3]])
// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f5, align 4
// CHECK: %[[TMP1:[^ ]+]] = alloca [3 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [3 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f5* @global_f5 to i8*), i64 20, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[TMP2]], i8* align 4 bitcast (%struct.f5* @global_f5 to i8*), i64 20, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [3 x i64], [3 x i64]* %[[TMP1]]
// CHECK: call void @func_f5(%struct.f5* sret %[[TMP0]], [3 x i64] %[[TMP3]])
struct f5 global_f5;
// CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f7, align 4
// CHECK: %[[TMP1:[^ ]+]] = alloca [4 x i64], align 8
// CHECK: %[[TMP2:[^ ]+]] = bitcast [4 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f7* @global_f7 to i8*), i64 28, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[TMP2]], i8* align 4 bitcast (%struct.f7* @global_f7 to i8*), i64 28, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [4 x i64], [4 x i64]* %[[TMP1]], align 8
// CHECK: call void @func_f7(%struct.f7* sret %[[TMP0]], [4 x i64] %[[TMP3]])
struct f7 global_f7;
// CHECK-LABEL: @call_f9
// CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]]
// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]])
struct f9 global_f9;
// CHECK-BE: %[[TMPX:[^ ]+]] = alloca %struct.fabc, align 4
// CHECK: %[[TMP0:[^ ]+]] = alloca [2 x i64], align 8
// CHECK: %[[TMP2:[^ ]+]] = bitcast [2 x i64]* %[[TMP0]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[TMP2]], i8* align 4 bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP0]], align 8
// CHECK-LE: %call = call { i64, i64 } @func_fabc([2 x i64] %[[TMP3]])
// CHECK-BE: call void @func_fabc(%struct.fabc* sret %[[TMPX]], [2 x i64] %[[TMP3]])
// CHECK-LABEL: @call_f9
// CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64]
// CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false)
// CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]]
// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]])
struct f9 global_f9;
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR4]] to %struct.tiny*
// CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.tiny* [[TS]] to i8*
// CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.tiny* [[TMP5]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 4, i32 1, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 [[TMP6]], i8* align 1 [[TMP7]], i32 4, i1 false)
// CHECK-NEXT: [[ARGP_CUR6:%.*]] = load i8*, i8** [[VA]], align 4
// CHECK-NEXT: [[ARGP_NEXT7:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR6]], i32 8
// CHECK-NEXT: store i8* [[ARGP_NEXT7]], i8** [[VA]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[ARGP_CUR6]] to %struct.small*
// CHECK-NEXT: [[TMP9:%.*]] = bitcast %struct.small* [[SS]] to i8*
// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct.small* [[TMP8]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 8, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP9]], i8* align 4 [[TMP10]], i32 8, i1 false)
// CHECK-NEXT: [[ARGP_CUR8:%.*]] = load i8*, i8** [[VA]], align 4
// CHECK-NEXT: [[ARGP_NEXT9:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR8]], i32 4
// CHECK-NEXT: store i8* [[ARGP_NEXT9]], i8** [[VA]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load %struct.large*, %struct.large** [[TMP11]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = bitcast %struct.large* [[LS]] to i8*
// CHECK-NEXT: [[TMP14:%.*]] = bitcast %struct.large* [[TMP12]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP13]], i8* [[TMP14]], i32 16, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[TMP13]], i8* align 4 [[TMP14]], i32 16, i1 false)
// CHECK-NEXT: [[VA10:%.*]] = bitcast i8** [[VA]] to i8*
// CHECK-NEXT: call void @llvm.va_end(i8* [[VA10]])
int f_va_4(char *fmt, ...) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARGP_CUR2]] to %struct.tiny*
// CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.tiny* [[TS]] to i8*
// CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.tiny* [[TMP2]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 8, i32 2, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP3]], i8* align 2 [[TMP4]], i64 8, i1 false)
// CHECK-NEXT: [[ARGP_CUR4:%.*]] = load i8*, i8** [[VA]], align 8
// CHECK-NEXT: [[ARGP_NEXT5:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR4]], i64 16
// CHECK-NEXT: store i8* [[ARGP_NEXT5]], i8** [[VA]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[ARGP_CUR4]] to %struct.small*
// CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.small* [[SS]] to i8*
// CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.small* [[TMP5]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
// CHECK-NEXT: [[ARGP_CUR6:%.*]] = load i8*, i8** [[VA]], align 8
// CHECK-NEXT: [[ARGP_NEXT7:%.*]] = getelementptr inbounds i8, i8* [[ARGP_CUR6]], i64 8
// CHECK-NEXT: store i8* [[ARGP_NEXT7]], i8** [[VA]], align 8
// CHECK-NEXT: [[TMP9:%.*]] = load %struct.large*, %struct.large** [[TMP8]], align 8
// CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct.large* [[LS]] to i8*
// CHECK-NEXT: [[TMP11:%.*]] = bitcast %struct.large* [[TMP9]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP10]], i8* [[TMP11]], i64 32, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP10]], i8* align 8 [[TMP11]], i64 32, i1 false)
// CHECK-NEXT: [[VA8:%.*]] = bitcast i8** [[VA]] to i8*
// CHECK-NEXT: call void @llvm.va_end(i8* [[VA8]])
// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_TINY]], %struct.tiny* [[TS]], i32 0, i32 0
// CHECK-LABEL: define void @get_struct
// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo*{{$}}
// CHECK: [[RESULT2:%[a-z_0-9]+]] = bitcast {{.*}} [[RESULT]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[RESULT2]]
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* align {{[0-9]+}} [[RESULT2]]
void get_struct(va_list *args) {
dest = va_arg(*args, struct Foo);
}
*a = *b;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 16, i32 4, i1 false), !tbaa.struct [[TS:!.*]]
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 16, i1 false), !tbaa.struct [[TS:!.*]]
struct B {
char c1;
*a = *b;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 24, i32 4, i1 false), !tbaa.struct [[TS2:!.*]]
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 24, i1 false), !tbaa.struct [[TS2:!.*]]
typedef _Complex int T2;
typedef _Complex char T5;
*a = *b;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 12, i32 4, i1 false), !tbaa.struct [[TS3:!.*]]
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i64 12, i1 false), !tbaa.struct [[TS3:!.*]]
// Make sure that zero-length bitfield works.
#define ATTR __attribute__ ((ms_struct))
void copy4(struct five *a, struct five *b) {
*a = *b;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 3, i32 1, i1 false), !tbaa.struct [[TS4:!.*]]
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 3, i1 false), !tbaa.struct [[TS4:!.*]]
struct six {
char a;
void copy5(struct six *a, struct six *b) {
*a = *b;
}
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 6, i32 1, i1 false), !tbaa.struct [[TS5:!.*]]
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 6, i1 false), !tbaa.struct [[TS5:!.*]]
// CHECK: [[TS]] = !{i64 0, i64 2, !{{.*}}, i64 4, i64 4, !{{.*}}, i64 8, i64 1, !{{.*}}, i64 12, i64 4, !{{.*}}}
// CHECK: [[CHAR:!.*]] = !{!"omnipotent char", !{{.*}}}
// CHECK: store volatile i32 {{.*}}, i32* @vtS
(void)vF2;
// From vF2 to a temporary
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* %{{.*}}, i8* {{.*}} @vF2 {{.*}}, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* {{.*}} @vF2 {{.*}}, i1 true)
vF2 = vF2;
// vF2 to itself
// CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
// CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
vF2 = (vF2, vF2);
// vF2 to a temporary, then vF2 to itself
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* %{{.*}}, i8* {{.*@vF2.*}}, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* {{.*@vF2.*}}, i1 true)
// CHECK: call void @llvm.memcpy.{{.*}}(i8* {{.*@vF2.*}}, i8* {{.*@vF2.*}}, i1 true)
}
// CHECK: [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to [[STRUCT_S]]*
// CHECK: [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
// CHECK: [[R5:%[^,=]+]] = bitcast [[STRUCT_S]]* [[R3]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R4]], i8* [[R5]], i32 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[R4]], i8* align 4 [[R5]], i32 12, i1 false)
// CHECK: [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
// CHECK: call void @llvm.va_end(i8* [[VA2]])
// CHECK: [[R6:%[^,=]+]] = bitcast [[STRUCT_S]]* %agg.result to i8*
// CHECK: [[R7:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R6]], i8* [[R7]], i32 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[R6]], i8* align 4 [[R7]], i32 12, i1 false)
// CHECK: ret void
// CHECK: }
// CHECK: [[RET:%.*]] = alloca [[STRUCT:%.*]], align 1
// CHECK: [[RES:%.*]] = alloca [[STRUCT]], align 1
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memset{{.*}}(i8* [[CAST]], i8 0, i64 5
+// CHECK: call void @llvm.memset{{.*}}(i8* align 1 [[CAST]], i8 0, i64 5
// CHECK: [[CASTRET:%.*]] = bitcast [[STRUCT]]* [[RET]] to i8*
// CHECK: [[CASTRES:%.*]] = bitcast [[STRUCT]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]], i64 5
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CASTRET]], i8* align {{[0-9]+}} [[CASTRES]], i64 5
// CHECK: [[CAST:%.*]] = bitcast [[STRUCT]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 1
// CHECK: [[RET:%.*]] = alloca [[UNION:%.*]], align 8
// CHECK: [[RES:%.*]] = alloca [[UNION]], align 8
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CAST]]
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CAST]]
// CHECK: [[CASTRET:%.*]] = bitcast [[UNION]]* [[RET]] to i8*
// CHECK: [[CASTRES:%.*]] = bitcast [[UNION]]* [[RES]] to i8*
-// CHECK: call void @llvm.memcpy{{.*}}(i8* [[CASTRET]], i8* [[CASTRES]]
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} [[CASTRET]], i8* align {{[0-9]+}} [[CASTRES]]
// CHECK: [[CAST:%.*]] = bitcast [[UNION]]* [[RET]] to { i64 }*
// CHECK: [[GEP:%.*]] = getelementptr inbounds { i64 }, { i64 }* [[CAST]], i32 0, i32 0
// CHECK: [[R0:%.*]] = load i64, i64* [[GEP]], align 8
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[INC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[SUB_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
// CHECK: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 8
// CHECK: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 8
// CHECK: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 16
// CHECK: [[STORE_TEMP_INT_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i128*
// CHECK: [[STORE_TEMP_INT:%.+]] = load i128, i128* [[STORE_TEMP_INT_PTR]], align 16
// CHECK32: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 4
// CHECK32: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 4
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 4
// CHECK32: [[ADDR_VOID:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i8*
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[INC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[ORIG_LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[DEC_VALUE:%.+]] = fadd x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[DEC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
// CHECK: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK: [[SUB_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 16
// CHECK: [[OLD_INT_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i128*
// CHECK: [[OLD_INT:%.+]] = load i128, i128* [[OLD_INT_ADDR]], align 16
// CHECK: [[NEW_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[NEW_VALUE_VOID_ADDR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[SUB_VALUE]], x86_fp80* [[NEW_VALUE_ADDR]], align 16
// CHECK: [[NEW_INT_ADDR:%.+]] = bitcast x86_fp80* [[NEW_VALUE_ADDR]] to i128*
// CHECK: [[NEW_INT:%.+]] = load i128, i128* [[NEW_INT_ADDR]], align 16
// CHECK32: [[OLD_VALUE:%.+]] = phi x86_fp80 [ [[LD_VALUE]], %{{.+}} ], [ [[LD_VALUE:%.+]], %[[ATOMIC_OP]] ]
// CHECK32: [[INC_VALUE:%.+]] = fsub x86_fp80 [[OLD_VALUE]],
// CHECK32: [[OLD_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[OLD_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[OLD_VALUE]], x86_fp80* [[OLD_VALUE_ADDR]], align 4
// CHECK32: [[DESIRED_VALUE_VOID_ADDR:%.+]] = bitcast x86_fp80* [[DESIRED_VALUE_ADDR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[DESIRED_VALUE_VOID_ADDR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 [[INC_VALUE]], x86_fp80* [[DESIRED_VALUE_ADDR]], align 4
// CHECK32: [[OBJ:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[EXPECTED:%.+]] = bitcast x86_fp80* [[OLD_VALUE_ADDR]] to i8*
// CHECK: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 8
// CHECK: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 8
// CHECK: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[STORE_TEMP_VOID_PTR]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 16
// CHECK: [[STORE_TEMP_INT_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i128*
// CHECK: [[STORE_TEMP_INT:%.+]] = load i128, i128* [[STORE_TEMP_INT_PTR]], align 16
// CHECK32: store x86_fp80* %{{.+}}, x86_fp80** [[ADDR_ADDR:%.+]], align 4
// CHECK32: [[ADDR:%.+]] = load x86_fp80*, x86_fp80** [[ADDR_ADDR]], align 4
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR:%.+]] to i8*
- // CHECK32: call void @llvm.memset.p0i8.i64(i8* [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i32 4, i1 false)
+ // CHECK32: call void @llvm.memset.p0i8.i64(i8* align 4 [[STORE_TEMP_VOID_PTR]], i8 0, i64 12, i1 false)
// CHECK32: store x86_fp80 {{.+}}, x86_fp80* [[STORE_TEMP_PTR]], align 4
// CHECK32: [[ADDR_VOID:%.+]] = bitcast x86_fp80* [[ADDR]] to i8*
// CHECK32: [[STORE_TEMP_VOID_PTR:%.+]] = bitcast x86_fp80* [[STORE_TEMP_PTR]] to i8*
// RUN: FileCheck < %t %s
// CHECK-LABEL: define void @f0(%struct.s0* byval align 4)
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* %{{.*}}, i32 16, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i32 16, i1 false)
// CHECK: }
struct s0 { long double a; };
void f0(struct s0 a0) {
// CHECK-NEXT: [[CASTED_VALUE_ADDR:%.*]] = bitcast i8* [[VALUE_ADDR]] to [[STRUCT_TEST51]]
// CHECK-NEXT: [[CASTED_TMP_ADDR:%.*]] = bitcast [[STRUCT_TEST51]]* [[TMP_ADDR]] to i8*
// CHECK-NEXT: [[RECASTED_VALUE_ADDR:%.*]] = bitcast [[STRUCT_TEST51]]* [[CASTED_VALUE_ADDR]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[CASTED_TMP_ADDR]], i8* [[RECASTED_VALUE_ADDR]], i64 16, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[CASTED_TMP_ADDR]], i8* align 8 [[RECASTED_VALUE_ADDR]], i64 16, i1 false)
// CHECK-NEXT: add i32 {{.*}}, 16
// CHECK-NEXT: store i32 {{.*}}, i32* {{.*}}
// CHECK-NEXT: br label
// CHECK: store i8* [[IN]], i8** [[AP]]
// CHECK: [[V1:%[a-z0-9]+]] = bitcast %struct.x* [[V:%[a-z0-9]+]] to i8*
// CHECK: [[P1:%[a-z0-9]+]] = bitcast %struct.x* [[P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[V1]], i8* [[P1]], i32 20, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[V1]], i8* align 4 [[P1]], i32 20, i1 false)
// CHECK: [[V2:%[a-z0-9]+]] = bitcast %struct.x* [[V]] to i8*
// CHECK: call void @f(i8* [[V2]])
// CHECK: store i8* [[IN]], i8** [[AP]]
// CHECK: [[V1:%[a-z0-9]+]] = bitcast [4 x i32]* [[V0:%[a-z0-9]+]] to i8*
// CHECK: [[P1:%[a-z0-9]+]] = bitcast [4 x i32]* [[P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[V1]], i8* [[P1]], i32 16, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[V1]], i8* align 4 [[P1]], i32 16, i1 false)
// CHECK: [[V2:%[a-z0-9]+]] = getelementptr inbounds [4 x i32], [4 x i32]* [[V0]], i32 0, i32 0
// CHECK: store i32* [[V2]], i32** [[V:%[a-z0-9]+]], align 4
// CHECK: [[V3:%[a-z0-9]+]] = load i32*, i32** [[V]], align 4
// CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[A]], [[A]]* [[A_P]], i32 0, i32 0
// CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
// CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
AlignedArray result = a.aArray;
}
// CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[A]], [[A]]* [[A_P]], i32 0, i32 0
// CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
// CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
AlignedArray result = b.aArray;
}
// CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
// CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
// CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 8, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T0]], i8* align 8 [[T1]], i64 16, i1 false)
AlignedArray result = b.bArray;
}
// CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
// CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
// CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 8, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T0]], i8* align 8 [[T1]], i64 16, i1 false)
AlignedArray result = b->bArray;
}
// CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
// CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
// CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
B b;
AlignedArray result = b.bArray;
}
// CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[A]], [[A]]* [[A_P]], i32 0, i32 0
// CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
// CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 16, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[T0]], i8* align 16 [[T1]], i64 16, i1 false)
D d;
AlignedArray result = d.aArray;
}
// CHECK: [[ARRAY_P:%.*]] = getelementptr inbounds [[B]], [[B]]* [[B_P]], i32 0, i32 2
// CHECK: [[T0:%.*]] = bitcast [[ARRAY]]* [[RESULT]] to i8*
// CHECK: [[T1:%.*]] = bitcast [[ARRAY]]* [[ARRAY_P]] to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 16, i32 8, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T0]], i8* align 8 [[T1]], i64 16, i1 false)
D d;
AlignedArray result = d.bArray;
}
-// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -emit-llvm -o - -std=c++11 %s -DPOD | FileCheck %s -check-prefix=CHECK-POD
+/// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -emit-llvm -o - -std=c++11 %s -DPOD | FileCheck %s -check-prefix=CHECK-POD
// RUN: %clang_cc1 -triple x86_64-apple-darwin12 -emit-llvm -o - -std=c++11 %s | FileCheck %s -check-prefix=CHECK-NONPOD
// Declare the reserved placement operators.
foo *test1(void *f, const foo &x) {
return new (f) foo(x);
// CHECK-POD: test1
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
// CHECK-NONPOD: test1
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
}
foo *test2(const foo &x) {
return new foo(x);
// CHECK-POD: test2
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
// CHECK-NONPOD: test2
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
}
foo test3(const foo &x) {
foo f = x;
return f;
// CHECK-POD: test3
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
// CHECK-NONPOD: test3
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
}
foo *test4(foo &&x) {
return new foo(x);
// CHECK-POD: test4
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
// CHECK-NONPOD: test4
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
}
void test5(foo &f, const foo &x) {
f = x;
// CHECK-POD: test5
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
// CHECK-NONPOD: test5
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 17, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 17
}
extern foo globtest;
void test6(foo &&x) {
globtest = move(x);
// CHECK-POD: test6
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
// CHECK-NONPOD: test6
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 17, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 17
}
void byval(foo f);
void test7(const foo &x) {
byval(x);
// CHECK-POD: test7
-// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-POD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
// CHECK-NONPOD: test7
-// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 24, i32 8
+// CHECK-NONPOD: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 24
}
Test1 var;
var.Test1::Test1();
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* %{{.*}}, i32 4, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i32 4, i1 false)
var.Test1::Test1(var);
}
// CHECK-NEXT: call x86_thiscallcc void @_ZN5Test2C1Ev(%class.Test2* %var)
var.Test2::Test2();
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* %{{.*}}, i32 8, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %{{.*}}, i8* align 4 %{{.*}}, i32 8, i1 false)
var.Test2::Test2(var);
}
// Make sure that we obey the destination's alignment requirements when emitting
// the copy.
// CHECK-LABEL: define {{.*}} @f(
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}({{.*}}, i8* bitcast (%struct.V* @gv1 to i8*), {{i64|i32}} 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i64|i32}}({{.*}}align 4{{.*}}, i8* align 4 bitcast (%struct.V* @gv1 to i8*), {{i64|i32}} 4, i1 false)
PR23373 pr23373_b(pr23373_a);
// CHECK-LABEL: define {{.*}} @__cxx_global_var_init(
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i32|i64}}({{.*}} @pr23373_b{{.*}}, {{.*}} @pr23373_a{{.*}}, [[W:i32|i64]] 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.{{i32|i64}}({{.*}}align 4{{.*}}@pr23373_b{{.*}}, {{.*}}align 4{{.*}} @pr23373_a{{.*}}, [[W:i32|i64]] 4, i1 false)
PR23373 pr23373_f() { return pr23373_a; }
// CHECK-LABEL: define {{.*}} @_Z9pr23373_fv(
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}, [[W]] 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}align 4{{.*}}align 4{{.*}}, [[W]] 4, i1 false)
void pr23373_g(PR23373 &a, PR23373 &b) { a = b; }
// CHECK-LABEL: define {{.*}} @_Z9pr23373_g
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}, [[W]] 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.[[W]]({{.*}}align 4{{.*}}align 4{{.*}}, [[W]] 4, i1 false)
struct A { virtual void a(); };
A x(A& y) { return y; }
// CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [[A]], [[A]]* [[OTHER]], i32 0, i32 1
// CHECK-NEXT: [[T4:%.*]] = bitcast i16* [[T0]] to i8*
// CHECK-NEXT: [[T5:%.*]] = bitcast i16* [[T2]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T4]], i8* [[T5]], i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T4]], i8* align 8 [[T5]], i64 8, i1 false)
// CHECK-NEXT: ret [[A]]* [[THIS]]
// CHECK-LABEL: define linkonce_odr void @_ZN6PR66281BC2ERKS0_(%"struct.PR6628::B"* %this, %"struct.PR6628::B"* dereferenceable({{[0-9]+}})) unnamed_addr
// CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [[A]], [[A]]* [[OTHER]], i32 0, i32 1
// CHECK-NEXT: [[T4:%.*]] = bitcast i16* [[T0]] to i8*
// CHECK-NEXT: [[T5:%.*]] = bitcast i16* [[T2]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T4]], i8* [[T5]], i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[T4]], i8* align 8 [[T5]], i64 8, i1 false)
// CHECK-NEXT: ret void
}
X::X(int) : X() {}
}
// CHECK: define {{.*}} @_ZN7PR128901XC1Ei(%"class.PR12890::X"* %this, i32)
-// CHECK: call void @llvm.memset.p0i8.{{i32|i64}}(i8* {{.*}}, i8 0, {{i32|i64}} 4, i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.{{i32|i64}}(i8* align 4 {{.*}}, i8 0, {{i32|i64}} 4, i1 false)
namespace PR14588 {
void other();
// CHECK: store i32 -1,
Agg2 b = { n };
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %{{.*}}, i8* bitcast ([3 x i32]* @[[THREE_NULL_MEMPTRS]] to i8*), i32 12, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %{{.*}}, i8* align 4 bitcast ([3 x i32]* @[[THREE_NULL_MEMPTRS]] to i8*), i32 12, i1 false)
}
// Test dynamic initialization.
//
// CHECK: %[[SIZE:.*]] = sub i64 %{{.*}}, 24
// CHECK: %[[REST:.*]] = bitcast %[[T]]* %[[T_2_AS_T]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* %[[REST]], i8 0, i64 %[[SIZE]], i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 %[[REST]], i8 0, i64 %[[SIZE]], i1 false)
//
// CHECK: }
//CHECK: %[[I1:.+]] = bitcast %struct.A* %[[I0]] to i8*
//CHECK: %[[I2:.+]] = bitcast %struct.A* %this1 to i8*
// copy the contents ...
-//CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[I1]], i8* %[[I2]], i32 8, i32 8, i1 false)
+//CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 %[[I1]], i8* align 8 %[[I2]], i32 8, i1 false)
struct B {
double b = 222;
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin -std=c++11 -emit-llvm %s -o %t.ll
-// RUN: FileCheck --input-file=%t.ll %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-apple-darwin -std=c++11 -emit-llvm %s -o - \
+// RUN: | FileCheck %s
struct test1_D {
double d;
// CHECK: [[EXNOBJ:%.*]] = call i8* @__cxa_allocate_exception(i64 8)
// CHECK-NEXT: [[EXN:%.*]] = bitcast i8* [[EXNOBJ]] to [[DSTAR:%[^*]*\*]]
// CHECK-NEXT: [[EXN2:%.*]] = bitcast [[DSTAR]] [[EXN]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[EXN2]], i8* bitcast ([[DSTAR]] @d1 to i8*), i64 8, i32 8, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[EXN2]], i8* align 8 bitcast ([[DSTAR]] @d1 to i8*), i64 8, i1 false)
// CHECK-NEXT: call void @__cxa_throw(i8* [[EXNOBJ]], i8* bitcast ({ i8*, i8* }* @_ZTI7test1_D to i8*), i8* null) [[NR:#[0-9]+]]
// CHECK-NEXT: unreachable
// CHECK: [[T0:%.*]] = call i8* @__cxa_allocate_exception(i64 16)
// CHECK-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to %"class.test17::DerivedException"*
// CHECK-NEXT: [[T2:%.*]] = bitcast %"class.test17::DerivedException"* [[T1]] to i8*
- // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T2]], i8 0, i64 16, i32 16, i1 false)
+ // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[T2]], i8 0, i64 16, i1 false)
}
}
S1<_Float16> s1 = { 132.f16 };
// CHECK-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { half 0xH5820 }, align 2
// CHECK-DAG: [[S1:%[0-9]+]] = bitcast %struct.S1* %{{.*}} to i8*
-// CHECK-DAG: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[S1]], i8* bitcast (%struct.S1* @_ZZ4mainE2s1 to i8*), i64 2, i32 2, i1 false)
+// CHECK-DAG: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[S1]], i8* align 2 bitcast (%struct.S1* @_ZZ4mainE2s1 to i8*), i64 2, i1 false)
_Float16 f4l = func1n(f1l) + func1f(f2l) + c1.func1c(f3l) + c1.func2c(f1l) +
func1t(f1l) + s1.mem2 - f1n + f2n;
// WIN32: %[[argmem:[^ ]*]] = alloca inalloca [[argmem_ty]]
// WIN32: %[[gep1:[^ ]*]] = getelementptr inbounds [[argmem_ty]], [[argmem_ty]]* %[[argmem]], i32 0, i32 1
// WIN32: %[[bc1:[^ ]*]] = bitcast %struct.SmallWithDtor* %[[gep1]] to i8*
-// WIN32: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[bc1]], i8* {{.*}}, i32 4, i32 4, i1 false)
+// WIN32: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %[[bc1]], i8* align 4 {{.*}}, i32 4, i1 false)
// WIN32: %[[a2:[^ ]*]] = load void [[dst_ty]], void [[dst_ty]]* %[[a1]], align 4
// WIN32: %[[gep2:[^ ]*]] = getelementptr inbounds [[argmem_ty]], [[argmem_ty]]* %[[argmem]], i32 0, i32 0
// WIN32: %[[addr:[^ ]*]] = bitcast {}** %[[gep2]] to void [[dst_ty]]*
// CHECK: %[[B:.*]] = bitcast %"struct.test5::C"* %[[THIS]] to %"struct.test5::B"*
// CHECK: %[[B_i8:.*]] = bitcast %"struct.test5::B"* %[[B]] to i8*
// CHECK: %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[B_i8]], i32 4
-// CHECK: call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i32(i8* align 4 %[[FIELD]], i8 0, i32 4, i1 false)
}
namespace pr27621 {
// CHECK: %[[C:.*]] = bitcast %"class.test6::D"* %[[THIS]] to %"class.test6::C"*
// CHECK: %[[C_i8:.*]] = bitcast %"class.test6::C"* %[[C]] to i8*
// CHECK: %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[C_i8]], i32 8
-// CHECK: call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i32(i8* align 4 %[[FIELD]], i8 0, i32 4, i1 false)
}
// CHECK: @_GUID_87654321_4321_4321_4321_ba0987654321 = linkonce_odr constant { i32, i16, i16, [8 x i8] } { i32 -2023406815, i16 17185, i16 17185, [8 x i8] c"C!\BA\09\87eC!" }, comdat
// The static initializer for thing.
-// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @thing to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 16, i32 4, i1 false)
-// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @thing to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 4, i32 4, i1 false)
+// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @thing to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 16, i1 false)
+// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @thing to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ac to i8*), i32 4, i1 false)
// The static initializer for g.
-// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @g to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
-// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct._GUID* @g to i8*), i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+// CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @g to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+// CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 bitcast (%struct._GUID* @g to i8*), i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
#ifdef DEFINE_GUID
void fun() {
// CHECK-DEFINE-GUID: [[U1:%.+]] = bitcast %struct._GUID* %s1_1 to i8*
// CHECK-DEFINE-WRONG-GUID: [[U1:%.+]] = bitcast %struct._GUID* %s1_1 to i8*
- // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U1]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
- // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U1]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+ // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U1]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+ // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U1]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
GUID s1_1 = __uuidof(S1);
// CHECK-DEFINE-GUID: [[U2:%.+]] = bitcast %struct._GUID* %s1_2 to i8*
// CHECK-DEFINE-WRONG-GUID: [[U2:%.+]] = bitcast %struct._GUID* %s1_2 to i8*
- // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U2]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
- // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U2]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+ // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U2]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+ // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U2]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
GUID s1_2 = __uuidof(S1);
// CHECK-DEFINE-GUID: [[U3:%.+]] = bitcast %struct._GUID* %s1_3 to i8*
// CHECK-DEFINE-WRONG-GUID: [[U3:%.+]] = bitcast %struct._GUID* %s1_3 to i8*
- // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U3]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i32 4, i1 false)
- // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[U3]], i8* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i32 4, i1 false)
+ // CHECK-DEFINE-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U3]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 16, i1 false)
+ // CHECK-DEFINE-WRONG-GUID: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 [[U3]], i8* align 4 bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to i8*), i32 4, i1 false)
GUID s1_3 = __uuidof(s1);
}
#endif
// FIXME: Conditionally throw an exception rather than passing -1 to alloc function
// CHECK: select
// CHECK: %[[PTR:.*]] = call i8* @_Zna{{.}}(i{{32|64}}
- // CHECK: call void @llvm.memcpy{{.*}}(i8* %[[PTR]], i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
+ // CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %[[PTR]], i8* align {{[0-9]+}} getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
// CHECK: %[[REST:.*]] = getelementptr inbounds i8, i8* %[[PTR]], i32 4
// CHECK: %[[RESTSIZE:.*]] = sub {{.*}}, 4
- // CHECK: call void @llvm.memset{{.*}}(i8* %[[REST]], i8 0, i{{32|64}} %[[RESTSIZE]],
+ // CHECK: call void @llvm.memset{{.*}}(i8* align {{[0-9]+}} %[[REST]], i8 0, i{{32|64}} %[[RESTSIZE]],
new char[n] { "abc" };
}
void string_exact() {
// CHECK-NOT: icmp
// CHECK: %[[PTR:.*]] = call i8* @_Zna{{.}}(i{{32|64}} 4)
- // CHECK: call void @llvm.memcpy{{.*}}(i8* %[[PTR]], i8* getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
+ // CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %[[PTR]], i8* align {{[0-9]+}} getelementptr inbounds ([4 x i8], [4 x i8]* @[[ABC4]], i32 0, i32 0), i32 4,
// CHECK-NOT: memset
new char[4] { "abc" };
}
// CHECK-NOT: icmp
// CHECK: %[[PTR:.*]] = call i8* @_Zna{{.}}(i{{32|64}} 15)
// FIXME: For very large arrays, it would be preferable to emit a small copy and a memset.
- // CHECK: call void @llvm.memcpy{{.*}}(i8* %[[PTR]], i8* getelementptr inbounds ([15 x i8], [15 x i8]* @[[ABC15]], i32 0, i32 0), i32 15,
+ // CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %[[PTR]], i8* align {{[0-9]+}} getelementptr inbounds ([15 x i8], [15 x i8]* @[[ABC15]], i32 0, i32 0), i32 15,
// CHECK-NOT: memset
new char[15] { "abc" };
}
// CHECK: %[[PTR2:.*]] = getelementptr inbounds %[[AGGR]], %[[AGGR]]* %[[PTR1]], i32 1{{$}}
// CHECK: %[[REMAIN:.*]] = sub i32 {{.*}}, 16
// CHECK: %[[MEM:.*]] = bitcast %[[AGGR]]* %[[PTR2]] to i8*
- // CHECK: call void @llvm.memset{{.*}}(i8* %[[MEM]], i8 0, i32 %[[REMAIN]],
+ // CHECK: call void @llvm.memset{{.*}}(i8* align {{[0-9]+}} %[[MEM]], i8 0, i32 %[[REMAIN]],
struct Aggr { int a, b; };
new Aggr[n] { 1, 2, 3 };
}
// CHECK: %[[LS:.*]] = alloca %struct.s, align 4
// CHECK-NEXT: %[[ZERO:.*]] = bitcast %struct.s* %[[LS]] to i8*
// CHECK-NEXT: %[[ONE:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[ZERO]], i8* %[[ONE]], i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[ZERO]], i8* align 4 %[[ONE]], i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
// CHECK-NEXT: %[[TWO:.*]] = bitcast %struct.s* %[[LS]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* %[[TWO]], i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 %[[TWO]], i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
struct s1 {
s.y = gs;
}
// CHECK-LABEL: define void @_Z3feev()
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
-// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i64 132, i1 true)
+// CHECK-NEXT: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.s1, %struct.s1* @s, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.s, %struct.s* @gs, i32 0, i32 0, i32 0), i64 132, i1 true)
struct d : s1 {
};
gd = gd;
}
// CHECK-LABEL: define void @_Z4gorfv()
-// CHECK: call void @llvm.memcpy.{{.*}}(i8* getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i8* getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i64 132, i32 4, i1 true)
+// CHECK: call void @llvm.memcpy.{{.*}}(i8* align 4 getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i8* align 4 getelementptr inbounds (%struct.d, %struct.d* @gd, i32 0, i32 0, i32 0, i32 0, i32 0), i64 132, i1 true)
// Basic copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.Basic* @_ZN5BasicaSERKS_(%struct.Basic* %this, %struct.Basic* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret %struct.Basic*
// PODMember copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.PODMember* @_ZN9PODMemberaSERKS_(%struct.PODMember* %this, %struct.PODMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret %struct.PODMember*
// PODLikeMember copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.PODLikeMember* @_ZN13PODLikeMemberaSERKS_(%struct.PODLikeMember* %this, %struct.PODLikeMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret %struct.PODLikeMember*
// ArrayMember copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.ArrayMember* @_ZN11ArrayMemberaSERKS_(%struct.ArrayMember* %this, %struct.ArrayMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
// CHECK: ret %struct.ArrayMember*
// VolatileMember copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.VolatileMember* @_ZN14VolatileMemberaSERKS_(%struct.VolatileMember* %this, %struct.VolatileMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: load volatile i32, i32* {{.*}}, align 4
// CHECK: store volatile i32 {{.*}}, align 4
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret %struct.VolatileMember*
// BitfieldMember copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.BitfieldMember* @_ZN14BitfieldMemberaSERKS_(%struct.BitfieldMember* %this, %struct.BitfieldMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 3, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 3, i1 {{.*}})
// CHECK: ret %struct.BitfieldMember*
// InnerClass copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.InnerClassMember* @_ZN16InnerClassMemberaSERKS_(%struct.InnerClassMember* %this, %struct.InnerClassMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret %struct.InnerClassMember*
// PackedMembers copy-assignment:
// CHECK-LABEL: define linkonce_odr dereferenceable({{[0-9]+}}) %struct.PackedMembers* @_ZN13PackedMembersaSERKS_(%struct.PackedMembers* %this, %struct.PackedMembers* dereferenceable({{[0-9]+}}))
// CHECK: call dereferenceable({{[0-9]+}}) %struct.NonPOD* @_ZN6NonPODaSERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret %struct.PackedMembers*
// COPY-CONSTRUCTORS:
// PackedMembers copy-assignment:
// CHECK-LABEL: define linkonce_odr void @_ZN13PackedMembersC2ERKS_(%struct.PackedMembers* %this, %struct.PackedMembers* dereferenceable({{[0-9]+}}))
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret void
CALL_CC(BitfieldMember2)
// BitfieldMember2 copy-constructor:
// CHECK-2-LABEL: define linkonce_odr void @_ZN15BitfieldMember2C2ERKS_(%struct.BitfieldMember2* %this, %struct.BitfieldMember2* dereferenceable({{[0-9]+}}))
-// CHECK-2: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4, i1 false)
+// CHECK-2: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 false)
// CHECK-2: call void @_ZN6NonPODC1ERKS_
// CHECK-2: ret void
CALL_CC(BitfieldMember3)
// BitfieldMember3 copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN15BitfieldMember3C2ERKS_(%struct.BitfieldMember3* %this, %struct.BitfieldMember3* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 8, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 8, i1 false)
// CHECK: ret void
CALL_CC(ReferenceMember)
// ReferenceMember copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN15ReferenceMemberC2ERKS_(%struct.ReferenceMember* %this, %struct.ReferenceMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 8{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 16, i1 {{.*}})
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 8{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 8 {{.*}} align 8 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret void
CALL_CC(InnerClassMember)
// InnerClass copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN16InnerClassMemberC2ERKS_(%struct.InnerClassMember* %this, %struct.InnerClassMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret void
CALL_CC(BitfieldMember)
// BitfieldMember copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN14BitfieldMemberC2ERKS_(%struct.BitfieldMember* %this, %struct.BitfieldMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 3, i32 1{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 3, i1 {{.*}})
// CHECK: ret void
CALL_CC(VolatileMember)
// VolatileMember copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN14VolatileMemberC2ERKS_(%struct.VolatileMember* %this, %struct.VolatileMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: load volatile i32, i32* {{.*}}, align 4
// CHECK: store volatile i32 {{.*}}, align 4
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret void
CALL_CC(ArrayMember)
// ArrayMember copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN11ArrayMemberC2ERKS_(%struct.ArrayMember* %this, %struct.ArrayMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 64, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 64, i1 {{.*}})
// CHECK: ret void
CALL_CC(PODLikeMember)
// PODLikeMember copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN13PODLikeMemberC2ERKS_(%struct.PODLikeMember* %this, %struct.PODLikeMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
// CHECK: invoke void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret void
// CHECK: landingpad
// CHECK: invoke void @_ZN7PODLikeD1Ev
CALL_CC(PODMember)
// PODMember copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN9PODMemberC2ERKS_(%struct.PODMember* %this, %struct.PODMember* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 32, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 32, i1 {{.*}})
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret void
CALL_CC(Basic)
// Basic copy-constructor:
// CHECK-LABEL: define linkonce_odr void @_ZN5BasicC2ERKS_(%struct.Basic* %this, %struct.Basic* dereferenceable({{[0-9]+}}))
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: call void @_ZN6NonPODC1ERKS_
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}}i64 16, i32 4{{.*}})
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}})
// CHECK: ret void
// CHECK: %[[dest_a_gep:.*]] = getelementptr inbounds %struct.Derived, %struct.Derived* %[[this]], i32 0, i32 1
// CHECK-NEXT: %[[src_load:.*]] = load %struct.Derived*, %struct.Derived** {{.*}}
// CHECK-NEXT: %[[src_a_gep:.*]] = getelementptr inbounds %struct.Derived, %struct.Derived* %[[src_load:.*]], i32 0, i32 1
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[dest_a_gep]], i8* %[[src_a_gep]], i64 1, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %[[dest_a_gep]], i8* align 4 %[[src_a_gep]], i64 1, i1 false)
// CHECK-NEXT: %[[dest_this:.*]] = load %struct.Derived*, %struct.Derived** %[[retval]]
// CHECK-NEXT: ret %struct.Derived* %[[dest_this]]
bool a : 1;
// CHECK-NEXT: %[[src_a_gep:.*]] = getelementptr inbounds %struct.Derived2, %struct.Derived2* %[[src_load:.*]], i32 0, i32 1
// CHECK-NEXT: %[[dest_a_bitcast:.*]] = bitcast [1 x i32]* %[[dest_a_gep]] to i8*
// CHECK-NEXT: %[[src_a_bitcast:.*]] = bitcast [1 x i32]* %[[src_a_gep]] to i8*
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[dest_a_bitcast]], i8* %[[src_a_bitcast]], i32 4, i32 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %[[dest_a_bitcast]], i8* align 4 %[[src_a_bitcast]], i32 4, i1 false)
// CHECK-NEXT: %[[dest_this:.*]] = load %struct.Derived2*, %struct.Derived2** %[[retval]]
// CHECK-NEXT: ret %struct.Derived2* %[[dest_this]]
int Array[1];
struct C : B { C(); };
C::C() : A(3), B() {}
// CHECK-LABEL: define void @_ZN7PR111241CC1Ev
- // CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 12, i32 8, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 {{.*}}, i8 0, i64 12, i1 false)
// CHECK-NEXT: call void @_ZN7PR111241BC2Ev
// Make sure C::C doesn't overwrite parts of A while it is zero-initializing B
struct C2 : B2 { C2(); };
C2::C2() : A(3), B2() {}
// CHECK-LABEL: define void @_ZN7PR111242C2C1Ev
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* {{.*}}, i64 16, i32 8, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %{{.*}}, i8* align 8 {{.*}}, i64 16, i1 false)
// CHECK-NEXT: call void @_ZN7PR111242B2C2Ev
}
// CHECK-NEXT: [[TMP:%.*]] = alloca [[A]], align 4
// CHECK-NEXT: [[T0:%.*]] = bitcast [[A]]* [[TMP]] to i8*
// CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[X]] to i8*
- // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[T0]], i8* [[T1]], i64 8, i32 4, i1 false)
+ // CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[T0]], i8* align 4 [[T1]], i64 8, i1 false)
// CHECK-NEXT: [[T0:%.*]] = bitcast [[A]]* [[TMP]] to i64*
// CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[T0]], align 4
// CHECK-NEXT: call void (...) @_ZN5test13fooEz(i64 [[T1]])
// Initialize the fast enumaration state.
// CHECK-LP64-NEXT: [[T0:%.*]] = bitcast [[STATE_T]]* [[STATE]] to i8*
-// CHECK-LP64-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 64, i32 8, i1 false)
+// CHECK-LP64-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 64, i1 false)
// Evaluate the collection expression and retain.
// CHECK-LP64-NEXT: [[T0:%.*]] = load [[ARRAY_T]]*, [[ARRAY_T]]** [[ARRAY]], align 8
// CHECK: [[X:%.*]] = alloca [5 x i8*], align 16
// CHECK: call void @llvm.lifetime.start
// CHECK-NEXT: [[T0:%.*]] = bitcast [5 x i8*]* [[X]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 40, i32 16, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[T0]], i8 0, i64 40, i1 false)
id x[5];
extern id test19_helper(void);
// Zero-initialize.
// CHECK-NEXT: [[T0:%.*]] = bitcast i8** [[VLA]] to i8*
// CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[DIM]], 8
- // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 [[T1]], i32 16, i1 false)
+ // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[T0]], i8 0, i64 [[T1]], i1 false)
// Destroy.
// CHECK-NEXT: [[END:%.*]] = getelementptr inbounds i8*, i8** [[VLA]], i64 [[DIM]]
// CHECK-NEXT: [[T0:%.*]] = bitcast [3 x i8*]* [[VLA]] to i8*
// CHECK-NEXT: [[T1:%.*]] = mul nuw i64 2, [[DIM]]
// CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[T1]], 24
- // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 [[T2]], i32 16, i1 false)
+ // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[T0]], i8 0, i64 [[T2]], i1 false)
// Destroy.
// CHECK-NEXT: [[T0:%.*]] = mul nuw i64 2, [[DIM]]
// PR13697
void test1(int *a, id b) {
// CHECK: @test1
- // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i32 1, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i1 false)
memcpy(a, b, 8);
}
// CHECK: call {{.*}} @objc_msgSend_stret to
// CHECK-NEXT: br label
// CHECK: [[T0:%.*]] = bitcast [[POINT_T]]* [[POINT]] to i8*
- // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 48, i32 4, i1 false)
+ // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[T0]], i8 0, i64 48, i1 false)
// CHECK-NEXT: br label
// CHECK-NF: [[X:%.*]] = alloca [[A]]*
// CHECK-NF: call {{.*}} @objc_msgSend_stret to
// CHECK-NF-NEXT: br label
// CHECK-NF: [[T0:%.*]] = bitcast [[POINT_T]]* [[POINT]] to i8*
- // CHECK-NF-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 48, i32 4, i1 false)
+ // CHECK-NF-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[T0]], i8 0, i64 48, i1 false)
// CHECK-NF-NEXT: br label
MyPoint point = [x returnAPoint];
}
s = [(id)(argc&~255) method];
// CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T0:%[^,]+]]
// CHECK: [[T0P:%.*]] = bitcast %struct.stret* [[T0]] to i8*
- // CHECK: call void @llvm.memset.p0i8.i64(i8* [[T0P]], i8 0, i64 400, i32 4, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 [[T0P]], i8 0, i64 400, i1 false)
s = [Test method];
// CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T1:%[^,]+]]
// Construct array.
// CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[A]], [[A]]* [[THIS]], i32 0, i32 1
// CHECK-NEXT: [[T0:%.*]] = bitcast [2 x [3 x i8*]]* [[ARRAY]] to i8*
- // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T0]], i8 0, i64 48, i32 8, i1 false)
+ // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[T0]], i8 0, i64 48, i1 false)
// throw 0;
// CHECK: invoke void @__cxa_throw(
// Landing pad from throw site:
// CHECK-LABEL: define void @func3()
// CHECK: %a = alloca [16 x [1 x float]], align 4, addrspace(5)
// CHECK: %[[CAST:.+]] = bitcast [16 x [1 x float]] addrspace(5)* %a to i8 addrspace(5)*
-// CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* %[[CAST]], i8 0, i64 64, i32 4, i1 false)
+// CHECK: call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 4 %[[CAST]], i8 0, i64 64, i1 false)
void func3(void) {
float a[16][1] = {{0.}};
}
// NOOPT: store i8* null, i8** %sp3, align 4
// NOOPT: store i8* null, i8** %sp4, align 4
// NOOPT: %[[SS1:.*]] = bitcast %struct.StructTy1* %SS1 to i8*
-// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_private.SS1 to i8 addrspace(2)*), i64 32, i32 8, i1 false)
+// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* align 8 %[[SS1]], i8 addrspace(2)* align 8 bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_private.SS1 to i8 addrspace(2)*), i64 32, i1 false)
// NOOPT: %[[SS2:.*]] = bitcast %struct.StructTy2* %SS2 to i8*
-// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, i64 24, i32 8, i1 false)
+// NOOPT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SS2]], i8 0, i64 24, i1 false)
void test_func_scope_var_private(void) {
private char *sp1 = 0;
private char *sp2 = NULL;
// NOOPT: store i8 addrspace(3)* null, i8 addrspace(3)** %sp3, align 4
// NOOPT: store i8 addrspace(3)* null, i8 addrspace(3)** %sp4, align 4
// NOOPT: %[[SS1:.*]] = bitcast %struct.StructTy1* %SS1 to i8*
-// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* %[[SS1]], i8 addrspace(2)* bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_local.SS1 to i8 addrspace(2)*), i64 32, i32 8, i1 false)
+// NOOPT: call void @llvm.memcpy.p0i8.p2i8.i64(i8* align 8 %[[SS1]], i8 addrspace(2)* align 8 bitcast (%struct.StructTy1 addrspace(2)* @test_func_scope_var_local.SS1 to i8 addrspace(2)*), i64 32, i1 false)
// NOOPT: %[[SS2:.*]] = bitcast %struct.StructTy2* %SS2 to i8*
-// NOOPT: call void @llvm.memset.p0i8.i64(i8* %[[SS2]], i8 0, i64 24, i32 8, i1 false)
+// NOOPT: call void @llvm.memset.p0i8.i64(i8* align 8 %[[SS2]], i8 0, i64 24, i1 false)
void test_func_scope_var_local(void) {
local char *sp1 = 0;
local char *sp2 = NULL;
} StructTy3;
// CHECK-LABEL: test_memset_private
-// CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull {{.*}}, i8 0, i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 {{.*}}, i8 0, i64 40, i1 false)
void test_memset_private(private StructTy3 *ptr) {
StructTy3 S3 = {0, 0, 0, 0, 0};
*ptr = S3;
// CHECK: %[[V2:.*]] = alloca <4 x i32>, align 16
// CHECK: %[[v0:.*]] = bitcast [6 x [6 x float]]* %A to i8*
- // CHECK: call void @llvm.memset.p0i8.i32(i8* %[[v0]], i8 0, i32 144, i32 4, i1 false)
+ // CHECK: call void @llvm.memset.p0i8.i32(i8* align 4 %[[v0]], i8 0, i32 144, i1 false)
// CHECK: %[[v1:.*]] = bitcast i8* %[[v0]] to [6 x [6 x float]]*
// CHECK: %[[v2:.*]] = getelementptr [6 x [6 x float]], [6 x [6 x float]]* %[[v1]], i32 0, i32 0
// CHECK: %[[v3:.*]] = getelementptr [6 x float], [6 x float]* %[[v2]], i32 0, i32 0
float A[6][6] = {1.0f, 2.0f};
// CHECK: %[[v5:.*]] = bitcast %struct.StrucTy* %S to i8*
- // CHECK: call void @llvm.memcpy.p0i8.p2i8.i32(i8* %[[v5]], i8 addrspace(2)* bitcast (%struct.StrucTy addrspace(2)* @f.S to i8 addrspace(2)*), i32 12, i32 4, i1 false)
+ // CHECK: call void @llvm.memcpy.p0i8.p2i8.i32(i8* align 4 %[[v5]], i8 addrspace(2)* align 4 bitcast (%struct.StrucTy addrspace(2)* @f.S to i8 addrspace(2)*), i32 12, i1 false)
StrucTy S = {1, 2};
// CHECK: store <2 x i32> <i32 1, i32 2>, <2 x i32>* %[[compoundliteral1]], align 8
void test() {
__private int arr[] = {1, 2, 3};
// CHECK: %[[arr_i8_ptr:[0-9]+]] = bitcast [3 x i32]* %arr to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p2i8.i32(i8* %[[arr_i8_ptr]], i8 addrspace(2)* bitcast ([3 x i32] addrspace(2)* @test.arr to i8 addrspace(2)*), i32 12, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p2i8.i32(i8* align 4 %[[arr_i8_ptr]], i8 addrspace(2)* align 4 bitcast ([3 x i32] addrspace(2)* @test.arr to i8 addrspace(2)*), i32 12, i1 false)
}
// CHECK: %[[l:.*]] = alloca %[[ListInt:[^ ]*]], align 8
// CHECK: %[[r:.*]] = alloca %[[ListInt]], align 8
- // CHECK: call {{.*}}memcpy{{.*}}(i8* %{{.*}}, i8* bitcast ({{.*}}* @_ZZ15testMixedStructvE1l to i8*), i64 16,
+ // CHECK: call {{.*}}memcpy{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* align {{[0-9]+}} bitcast ({{.*}}* @_ZZ15testMixedStructvE1l to i8*), i64 16,
ListInt_left l{0, 1};
- // CHECK: call {{.*}}memcpy{{.*}}(i8* %{{.*}}, i8* bitcast ({{.*}}* @_ZZ15testMixedStructvE1r to i8*), i64 16,
+ // CHECK: call {{.*}}memcpy{{.*}}(i8* align {{[0-9]+}} %{{.*}}, i8* align {{[0-9]+}} bitcast ({{.*}}* @_ZZ15testMixedStructvE1r to i8*), i64 16,
ListInt_right r{0, 2};
// CHECK: call void @_Z10useListIntR4ListIiE(%[[ListInt]]* dereferenceable({{[0-9]+}}) %[[l]])
dx = dv;
// CHECK: [[LD:%.+]] = load x86_fp80, x86_fp80*
// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[LDTEMP:%.*]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[BITCAST]], i8 0, i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[BITCAST]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[LD]], x86_fp80* [[LDTEMP]]
// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[LDTEMP:%.*]] to i128*
// CHECK: [[LD:%.+]] = load i128, i128* [[BITCAST]]
// CHECK: load i64, i64*
// CHECK: [[VAL:%.+]] = uitofp i64 %{{.+}} to x86_fp80
// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP:%.+]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[BITCAST]], i8 0, i64 16, i32 16, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 [[BITCAST]], i8 0, i64 16, i1 false)
// CHECK: store x86_fp80 [[VAL]], x86_fp80* [[TEMP]]
// CHECK: [[BITCAST:%.+]] = bitcast x86_fp80* [[TEMP]] to i128*
// CHECK: [[VAL:%.+]] = load i128, i128* [[BITCAST]]
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
// init s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
// CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
// CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
// CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to{{.+}}
// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
// CHECK-DAG: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
// init svar
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
// init s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
// CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
// CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
// CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to{{.+}}
// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
// CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}},
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
#endif
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// svar
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// CHECK: call void @__kmpc_for_static_init_4(
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// CHECK: call void @__kmpc_for_static_init_4(
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// CHECK: call void @__kmpc_for_static_init_4(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],{{.+}})
// CHECK: ret void
// outlined function for 'parallel for'
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
#endif
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// svar
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// CHECK: call void @__kmpc_for_static_init_4(
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[TMP_REF:%.+]] = load {{.+}}*, {{.+}}* [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[TMP_REF_BCAST:%.+]] = bitcast {{.+}}* [[TMP_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_REF_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_REF_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// CHECK: call void @__kmpc_for_static_init_4(
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast {{.+}} [[VEC_PRIV]] to
// CHECK-DAG: [[VEC_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VEC_ADDR_VAL]] to
-// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* [[VEC_PRIV_BCAST]], {{.+}}* [[VEC_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy{{.+}}({{.+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VEC_ADDR_BCAST]],
// s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load {{.+}}*, {{.+}}** [[S_ARR_ADDR]],
// CHECK-DAG: [[VAR_ADDR_REF:%.+]] = load {{.+}}*, {{.+}}* [[VAR_ADDR]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast {{.+}}* [[VAR_PRIV]] to
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}}* [[VAR_ADDR_REF]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK-DAG: store {{.+}}* [[VAR_PRIV]], {{.+}}** [[TMP_PRIV]],
// CHECK: call void @__kmpc_for_static_init_4(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],{{.+}})
// CHECK: ret void
// outlined function for 'parallel for'
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
// CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
// init s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[S_ARR_ADDR]],
// CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
// CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
// CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to{{.+}}
// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
// CHECK-DAG: store [[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]** [[TMP_PRIV]],
// init svar
// CHECK-DAG: [[VEC_ADDR_VAL:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** [[VEC_ADDR]],
// CHECK-DAG: [[VEC_ADDR_VAL_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i{{[0-9]+}}*
// CHECK-DAG: [[VEC_PRIV_BCAST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i{{[0-9]+}}*
-// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* [[VEC_PRIV_BCAST]], i{{[0-9]+}}* [[VEC_ADDR_VAL_BCAST]],{{.+}})
+// CHECK-DAG: call void @llvm.memcpy.{{.*}}(i{{[0-9]+}}* align {{[0-9]+}} [[VEC_PRIV_BCAST]], i{{[0-9]+}}* align {{[0-9]+}} [[VEC_ADDR_VAL_BCAST]],{{.+}})
// init s_arr
// CHECK-DAG: [[S_ARR_ADDR_VAL:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[S_ARR_ADDR]],
// CHECK-DAG: [[S_ARR_DST_PAST:%.+]] = phi{{.+}} [ [[S_ARR_PRIV_BGN]],{{.+}} ], [ [[S_ARR_DST:%.+]],{{.+}} ]
// CHECK-DAG: [[S_ARR_SRC_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_SRC_PAST]] to{{.+}}
// CHECK-DAG: [[S_ARR_DST_BCAST:%.+]] = bitcast{{.+}} [[S_ARR_DST_PAST]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[S_ARR_DST_BCAST]], {{.+}}* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[S_ARR_DST_BCAST]], {{.+}}* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK-DAG: [[S_ARR_SRC]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_DST]] = getelementptr{{.+}}
// CHECK-DAG: [[S_ARR_CPY_FIN:%.+]] = icmp{{.+}} [[S_ARR_DST]], [[S_ARR_PRIV_NEXT]]
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP]],
// CHECK-DAG: [[VAR_PRIV_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to{{.+}}
// CHECK-DAG: [[TMP_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL]] to{{.+}}
-// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* [[VAR_PRIV_BCAST]], {{.+}}* [[TMP_BCAST]],{{.+}})
+// CHECK-DAG: call{{.+}} @llvm.memcpy.{{.+}}({{.+}}* align {{[0-9]+}} [[VAR_PRIV_BCAST]], {{.+}}* align {{[0-9]+}} [[TMP_BCAST]],{{.+}})
// CHECK-DAG: store [[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: store i{{[0-9]+}} 0, i{{[0-9]+}}* %.omp{{.+}},
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
// CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
// firstprivate vec(vec)
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
// firstprivate s_arr(s_arr)
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// firstprivate vec(vec)
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// firstprivate s_arr(s_arr)
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// original vec[]=private_vec[];
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// original s_arr[]=private_s_arr[];
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// original vec[]=private_vec[];
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// original s_arr[]=private_s_arr[];
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1 = var1.operator &&(var1_reduction);
// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
// CHECK: call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[T_VAR1_VAL:%.+]] = load float, float* [[T_VAR1_REF]],
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// var1 = var1.operator &&(var1_reduction);
// CHECK: call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_LHS]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_RHS]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1_lhs = var1_lhs.operator &&(var1_rhs);
// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_LHS]])
// CHECK: call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load float, float* [[T_VAR1_LHS]],
// CHECK: phi [[S_FLOAT_TY]]*
// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
// CHECK: br i1 [[DONE]],
// CHECK: call void @__kmpc_critical(
// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
// CHECK: br i1 [[DONE]],
// CHECK: phi [[S_FLOAT_TY]]*
// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
// CHECK: br i1 [[DONE]],
// CHECK: phi [[S_FLOAT_TY]]*
// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
// CHECK: br i1 [[DONE]],
// CHECK: call void @__kmpc_critical(
// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
// CHECK: br i1 [[DONE]],
// CHECK: phi [[S_FLOAT_TY]]*
// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 [[BITCAST]], i64 4, i1 false)
// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
// CHECK: br i1 [[DONE]],
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1 = var1.operator &&(var1_reduction);
// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// var1 = var1.operator &&(var1_reduction);
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1_lhs = var1_lhs.operator &&(var1_rhs);
// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
// firstprivate(b): memcpy(b_priv,b_in)
// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
// TCHECK: [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_ADDR_REF_BCAST]], {{.+}})
// firstprivate(c)
// TCHECK: [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
// TCHECK: [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}})
// firstprivate(d)
// TCHECK: [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
// TCHECK: [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[D_PRIV_BCAST]], i8* align {{[0-9]+}} [[D_IN_BCAST]],{{.+}})
// TCHECK: load i16, i16* [[A2_ADDR]],
// firstprivate(b)
// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
// TCHECK: [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
// TCHECK: ret void
// firstprivate(b)
// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
// TCHECK: [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
// TCHECK: ret void
// CHECK: icmp
// CHECK-NEXT: br i1 %
// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 [[CAST]], i8 0, i64 24, i1 false)
// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
// CHECK: store i64 %{{.+}}, i64* %
// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
// CHECK: icmp
// CHECK-NEXT: br i1 %
// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
-// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 8 [[CAST]], i8 0, i64 24, i1 false)
// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
// CHECK: store i64 %{{.+}}, i64* %
// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
// CHECK-DEBUG: [[LOC_2_ADDR:%.+]] = alloca %ident_t
// CHECK-DEBUG: [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[LOC_2_ADDR]] to i8*
// CHECK-DEBUG-NEXT: [[KMPC_DEFAULT_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[DEF_LOC_2]] to i8*
-// CHECK-DEBUG-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i32 8, i1 false)
+// CHECK-DEBUG-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[KMPC_LOC_VOIDPTR]], i8* align 8 [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i1 false)
// CHECK-DEBUG: store i32 %argc, i32* [[ARGC_ADDR:%.+]],
// CHECK-DEBUG: [[VLA:%.+]] = alloca i32, i64 [[VLA_SIZE:%[^,]+]],
// CHECK-DEBUG: [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
// CHECK-DEBUG-DAG: [[LOC_2_ADDR:%.+]] = alloca %ident_t
// CHECK-DEBUG: [[KMPC_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[LOC_2_ADDR]] to i8*
// CHECK-DEBUG-NEXT: [[KMPC_DEFAULT_LOC_VOIDPTR:%.+]] = bitcast %ident_t* [[DEF_LOC_2]] to i8*
-// CHECK-DEBUG-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[KMPC_LOC_VOIDPTR]], i8* [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i32 8, i1 false)
+// CHECK-DEBUG-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[KMPC_LOC_VOIDPTR]], i8* align 8 [[KMPC_DEFAULT_LOC_VOIDPTR]], i64 24, i1 false)
// CHECK-DEBUG-NEXT: store i8** %argc, i8*** [[ARGC_ADDR:%.+]],
// CHECK-DEBUG: [[KMPC_LOC_PSOURCE_REF:%.+]] = getelementptr inbounds %ident_t, %ident_t* [[LOC_2_ADDR]], i32 0, i32 4
// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.+}} x i8], [{{.+}} x i8]* [[LOC2]], i32 0, i32 0), i8** [[KMPC_LOC_PSOURCE_REF]]
// threadprivate_vec = vec;
// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[VEC]]
-// CHECK: call void @llvm.memcpy{{.*}}(i8* %{{.+}}, i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
// TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x i32]* [[MASTER_REF2]] to i8*
-// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*), i8* [[MASTER_CAST]]
+// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*), i8* align {{[0-9]+}} [[MASTER_CAST]]
// threadprivate_s_arr = s_arr;
// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[S_ARR]]
// threadprivate_vec = vec;
// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_VEC]]
-// CHECK: call {{.*}}void @llvm.memcpy{{.*}}(i8* %{{.+}}, i8* bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*),
+// CHECK: call {{.*}}void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} %{{.+}}, i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*),
// TLS-CHECK: [[MASTER_CAST:%.+]] = bitcast [2 x i32]* [[MASTER_REF1]] to i8*
-// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*), i8* [[MASTER_CAST]]
+// TLS-CHECK: call void @llvm.memcpy{{.*}}(i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[TMAIN_VEC]] to i8*), i8* align {{[0-9]+}} [[MASTER_CAST]]
// threadprivate_s_arr = s_arr;
// CHECK: call {{.*}}i8* @__kmpc_threadprivate_cached({{.+}} [[TMAIN_S_ARR]]
static St s[2];
// ARRAY: @__kmpc_fork_call(
// ARRAY: call i8* @__kmpc_threadprivate_cached(
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* bitcast ([2 x i32]* @{{.+}} to i8*), i64 8, i32 4, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %{{.+}}, i8* align 4 bitcast ([2 x i32]* @{{.+}} to i8*), i64 8, i1 false)
// ARRAY: call dereferenceable(8) %struct.St* @{{.+}}(%struct.St* %{{.+}}, %struct.St* dereferenceable(8) %{{.+}})
// TLS-ARRAY: @__kmpc_fork_call(
// TLS-ARRAY: [[REFT:%.+]] = load [2 x i32]*, [2 x i32]** [[ADDR:%.+]],
// TLS-ARRAY: [[REF:%.+]] = bitcast [2 x i32]* [[REFT]] to i8*
-// TLS-ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([2 x i32]* @{{.+}} to i8*), i8* [[REF]], i64 8, i32 4, i1 false)
+// TLS-ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 bitcast ([2 x i32]* @{{.+}} to i8*), i8* align 4 [[REF]], i64 8, i1 false)
// TLS-ARRAY: call dereferenceable(8) %struct.St* @{{.+}}(%struct.St* %{{.+}}, %struct.St* dereferenceable(8) %{{.+}})
#pragma omp threadprivate(a, s)
// CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], align 128
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}} 128,
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align 128 [[VEC_DEST]], i8* align 128 [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i1
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
// ARRAY: call i8* @llvm.stacksave()
// ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* %{{.+}}, i64 [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i64 [[SIZE]], i1 false)
#pragma omp parallel firstprivate(a, s, vla1, vla2)
s[0].St_func(s, n, vla1);
;
// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
// ARRAY: call i8* @llvm.stacksave()
// ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* %{{.+}}, i64 [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i64 [[SIZE]], i1 false)
#endif
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1 = var1.operator &&(var1_reduction);
// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_REF]])
// CHECK: call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[T_VAR1_VAL:%.+]] = load float, float* [[T_VAR1_REF]],
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_REF]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// var1 = var1.operator &&(var1_reduction);
// CHECK: call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @{{.+}}([[S_FLOAT_TY]]* [[VAR_LHS]], [[S_FLOAT_TY]]* dereferenceable(4) [[VAR_RHS]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1_lhs = var1_lhs.operator &&(var1_rhs);
// CHECK: [[TO_FLOAT:%.+]] = call float @{{.+}}([[S_FLOAT_TY]]* [[VAR1_LHS]])
// CHECK: call void @{{.+}}([[S_FLOAT_TY]]* [[COND_LVALUE:%.+]], float [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_FLOAT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load float, float* [[T_VAR1_LHS]],
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1 = var1.operator &&(var1_reduction);
// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// var1 = var1.operator &&(var1_reduction);
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1_lhs = var1_lhs.operator &&(var1_rhs);
// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
// firstprivate vec(vec)
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
// firstprivate s_arr(s_arr)
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// firstprivate vec(vec)
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// firstprivate s_arr(s_arr)
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// original vec[]=private_vec[];
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// original s_arr[]=private_s_arr[];
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1 = var1.operator &&(var1_reduction);
// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_REF]])
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[T_VAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_REF]],
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_REF]], [[S_INT_TY]]* dereferenceable(4) [[VAR_PRIV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// var1 = var1.operator &&(var1_reduction);
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_REF]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// CHECK: call void @__kmpc_end_critical(
// t_var1 = min(t_var1, t_var1_reduction);
// CHECK: [[UP:%.+]] = call dereferenceable(4) [[S_INT_TY]]* @{{.+}}([[S_INT_TY]]* [[VAR_LHS]], [[S_INT_TY]]* dereferenceable(4) [[VAR_RHS]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[UP]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// var1_lhs = var1_lhs.operator &&(var1_rhs);
// CHECK: [[TO_INT:%.+]] = call i{{[0-9]+}} @{{.+}}([[S_INT_TY]]* [[VAR1_LHS]])
// CHECK: call void @{{.+}}([[S_INT_TY]]* [[COND_LVALUE:%.+]], i32 [[CONV]])
// CHECK: [[BC1:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_LHS]] to i8*
// CHECK: [[BC2:%.+]] = bitcast [[S_INT_TY]]* [[COND_LVALUE]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[BC1]], i8* [[BC2]], i64 4, i32 4, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[BC1]], i8* align 4 [[BC2]], i64 4, i1 false)
// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
// CHECK: [[T_VAR1_LHS_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR1_LHS]],
// CHECK: [[DST_A2_ADDR:%.+]] = load i8*, i8** [[DST_A2_ADDR_REF]],
// CHECK: [[SRC_A2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[SRC_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
// CHECK: [[SRC_A2_ADDR:%.+]] = load i8*, i8** [[SRC_A2_ADDR_REF]],
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[DST_A2_ADDR]], i8* [[SRC_A2_ADDR]], i{{[0-9]+}} 2, i{{[0-9]+}} 1, i1 false)
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align 1 [[DST_A2_ADDR]], i8* align 1 [[SRC_A2_ADDR]], i{{[0-9]+}} 2, i1 false)
// CHECK: [[DST_TC2_ADDR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[DST_ADDR]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
// CHECK: [[DST_TC2_ADDR_VOID_PTR:%.+]] = load i8*, i8** [[DST_TC2_ADDR_REF]],
// CHECK: [[DST_TC2_ADDR:%.+]] = bitcast i8* [[DST_TC2_ADDR_VOID_PTR]] to [[TEST_CLASS_TY]]*
// firstprivate vec(vec)
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
// firstprivate s_arr(s_arr)
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// firstprivate vec(vec)
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// firstprivate s_arr(s_arr)
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
// CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
// CK1: [[BC1:%.+]] = bitcast %struct.anon* [[CAPTURES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 4, i32 4, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 4 [[SHAREDS_REF]], i8* align 4 [[BC1]], i[[sz]] 4, i1 false)
// CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* [[BC]], i32 0, i32 1
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
// CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
// CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
// CK1: [[BC1:%.+]] = bitcast %struct.anon{{.+}}* [[CAPTURES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 1, i32 1, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 1 [[SHAREDS_REF]], i8* align 1 [[BC1]], i[[sz]] 1, i1 false)
// CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates{{.+}}, %struct.kmp_task_t_with_privates{{.+}}* [[RES_BC]], i32 0, i32 1
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
// CK1: [[BC_SIZES:%.+]] = bitcast i[[sz]]* [[GEPS0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* [[BC_SIZES]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} [[BC_SIZES]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint float* %{{.+}} to i[[sz]]
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{16|8}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [2 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [5 x %struct.kmp_depend_info], [5 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint double* %{{.+}} to i[[sz]]
// CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
// CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
// CK1: [[BC1:%.+]] = bitcast %struct.anon* [[CAPTURES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 4, i32 4, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 4 [[SHAREDS_REF]], i8* align 4 [[BC1]], i[[sz]] 4, i1 false)
// CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* [[BC]], i32 0, i32 1
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
// CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
// CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
// CK1: [[BC1:%.+]] = bitcast %struct.anon{{.+}}* [[CAPTURES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 1, i32 1, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 1 [[SHAREDS_REF]], i8* align 1 [[BC1]], i[[sz]] 1, i1 false)
// CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates{{.+}}, %struct.kmp_task_t_with_privates{{.+}}* [[RES_BC]], i32 0, i32 1
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
// CK1: [[BC_SIZES:%.+]] = bitcast i[[sz]]* [[GEPS0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* [[BC_SIZES]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} [[BC_SIZES]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint float* %{{.+}} to i[[sz]]
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{16|8}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [2 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [5 x %struct.kmp_depend_info], [5 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint double* %{{.+}} to i[[sz]]
// firstprivate(b): memcpy(b_priv,b_in)
// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
// TCHECK: [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_ADDR_REF_BCAST]], {{.+}})
// TCHECK: [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
// TCHECK: store i8* [[RET_STACK]], i8** [[SSTACK]],
// TCHECK: [[BN_COPY_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[BN_SZ_VAL]], 4
// TCHECK: [[BN_PRIV__BCAST:%.+]] = bitcast float* [[BN_PRIV]] to i8*
// TCHECK: [[BN_REF_IN_BCAST:%.+]] = bitcast float* [[BN_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[BN_PRIV__BCAST]], i8* [[BN_REF_IN_BCAST]], i{{[0-9]+}} [[BN_COPY_SZ]],{{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BN_PRIV__BCAST]], i8* align {{[0-9]+}} [[BN_REF_IN_BCAST]], i{{[0-9]+}} [[BN_COPY_SZ]],{{.+}})
// firstprivate(c)
// TCHECK: [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
// TCHECK: [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}})
// firstprivate(cn)
// TCHECK: [[CN_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]]
// TCHECK: [[CN_SZ2_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ2]], 8
// TCHECK: [[CN_PRIV_BCAST:%.+]] = bitcast double* [[CN_PRIV]] to i8*
// TCHECK: [[CN_IN_BCAST:%.+]] = bitcast double* [[CN_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[CN_PRIV_BCAST]], i8* [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[CN_PRIV_BCAST]], i8* align {{[0-9]+}} [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}})
// firstprivate(d)
// TCHECK: [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
// TCHECK: [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[D_PRIV_BCAST]], i8* align {{[0-9]+}} [[D_IN_BCAST]],{{.+}})
#pragma omp target firstprivate(ptr)
// firstprivate(b)
// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
// TCHECK: [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
// TCHECK: ret void
// TCHECK: [[C_SZ_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[C_SZ2]], 2
// TCHECK: [[C_PRIV_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_PRIV]] to i8*
// TCHECK: [[C_IN_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_ADDR_REF]] to i8*
- // TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+ // TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}})
// finish
// TCHECK: [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
// firstprivate(b)
// TCHECK: [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
// TCHECK: [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
-// TCHECK: call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+// TCHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[B_PRIV_BCAST]], i8* align {{[0-9]+}} [[B_IN_BCAST]],{{.+}})
// TCHECK: ret void
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK-64: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK-32: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR]],
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
#endif
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_REF]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
// CHECK: ret void
// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_REF]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
// CHECK: ret void
// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK: ret void
// CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK: ret void
#endif
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_REF]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
// CHECK: ret void
// CHECK-64-DAG: store {{.+}}, {{.+}} [[TVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR]],
// CHECK-DAG: [[VEC_TGT_REF:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_REF]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_REF]],
// CHECK-DAG: [[S_ARR_BEGIN:%.+]] = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK-DAG: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_TGT_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_TGT_BCAST]],
// CHECK-64-DAG: store {{.+}}, {{.+}} [[SVAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[SVAR_ADDR]],
// CHECK: ret void
// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK: ret void
// CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
// CHECK-64-DAG: store {{.+}}, {{.+}} [[T_VAR_TGT]],
// CHECK-32-DAG: store {{.+}}, {{.+}} [[T_VAR_ADDR1]],
// CHECK-DAG: [[VEC_TGT_BCAST:%.+]] = bitcast {{.+}} [[VEC_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VEC_TGT_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_TGT_BCAST]],
// CHECK-DAG: {{.+}} = getelementptr {{.+}} [[S_ARR_TGT]],
// CHECK: call void @llvm.memcpy.{{.+}}(
// CHECK-DAG: [[VAR_ADDR_BCAST:%.+]] = bitcast {{.+}} [[VAR_TGT]] to
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_BCAST]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_BCAST]],
// CHECK: ret void
#endif
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK-64: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK-32: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR]],
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK-32: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
#endif
// CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
// CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
// CK1: [[BC1:%.+]] = bitcast %struct.anon* [[CAPTURES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 4, i32 4, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 4 [[SHAREDS_REF]], i8* align 4 [[BC1]], i[[sz]] 4, i1 false)
// CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates, %struct.kmp_task_t_with_privates* [[BC]], i32 0, i32 1
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t, %struct..kmp_privates.t* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE00]] to i8*), i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [1 x %struct.kmp_depend_info], [1 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
// CK1: [[SHAREDS:%.+]] = getelementptr inbounds %struct.kmp_task_t, %struct.kmp_task_t* [[TASK_T]], i32 0, i32 0
// CK1: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS]],
// CK1: [[BC1:%.+]] = bitcast %struct.anon{{.+}}* [[CAPTURES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[SHAREDS_REF]], i8* [[BC1]], i[[sz]] 1, i32 1, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align 1 [[SHAREDS_REF]], i8* align 1 [[BC1]], i[[sz]] 1, i1 false)
// CK1: [[PRIVS:%.+]] = getelementptr inbounds %struct.kmp_task_t_with_privates{{.+}}, %struct.kmp_task_t_with_privates{{.+}}* [[RES_BC]], i32 0, i32 1
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([1 x i[[sz]]]* [[SIZE02]] to i8*), i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [3 x %struct.kmp_depend_info], [3 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint i32* %{{.+}} to i[[sz]]
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [1 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [1 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
// CK1: [[BC_SIZES:%.+]] = bitcast i[[sz]]* [[GEPS0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* [[BC_SIZES]], i[[sz]] {{8|4}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} [[BC_SIZES]], i[[sz]] {{8|4}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [4 x %struct.kmp_depend_info], [4 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint float* %{{.+}} to i[[sz]]
// CK1: [[PRIVS_BASEPTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 0
// CK1: [[BC_PRIVS_BASEPTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_BASEPTRS]] to i8*
// CK1: [[BC_BASEPTRS:%.+]] = bitcast i8** [[GEPBP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_BASEPTRS]], i8* [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_BASEPTRS]], i8* align {{8|4}} [[BC_BASEPTRS]], i[[sz]] {{16|8}}, i1 false)
// CK1: [[PRIVS_PTRS:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 1
// CK1: [[BC_PRIVS_PTRS:%.+]] = bitcast [2 x i8*]* [[PRIVS_PTRS]] to i8*
// CK1: [[BC_PTRS:%.+]] = bitcast i8** [[GEPP0]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_PTRS]], i8* [[BC_PTRS]], i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_PTRS]], i8* align {{8|4}} [[BC_PTRS]], i[[sz]] {{16|8}}, i1 false)
// CK1: [[PRIVS_SIZES:%.+]] = getelementptr inbounds %struct..kmp_privates.t{{.+}}, %struct..kmp_privates.t{{.+}}* [[PRIVS]], i32 0, i32 2
// CK1: [[BC_PRIVS_SIZES:%.+]] = bitcast [2 x i[[sz]]]* [[PRIVS_SIZES]] to i8*
- // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* [[BC_PRIVS_SIZES]], i8* bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i32 {{8|4}}, i1 false)
+ // CK1: call void @llvm.memcpy.p0i8.p0i8.i[[sz]](i8* align {{8|4}} [[BC_PRIVS_SIZES]], i8* align {{8|4}} bitcast ([2 x i[[sz]]]* [[SIZE04]] to i8*), i[[sz]] {{16|8}}, i1 false)
// CK1: [[DEP:%.+]] = getelementptr inbounds [5 x %struct.kmp_depend_info], [5 x %struct.kmp_depend_info]* [[MAIN_DEP:%.+]], i[[sz]] 0, i[[sz]] 0
// CK1: [[DEP_ADR:%.+]] = getelementptr inbounds %struct.kmp_depend_info, %struct.kmp_depend_info* [[DEP]], i32 0, i32 0
// CK1: [[BC_ADR:%.+]] = ptrtoint double* %{{.+}} to i[[sz]]
// CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
// CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS]]* [[CAPTURES]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 16, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[BITCAST]], i64 16, i1 false)
// CHECK: [[PRIORITY_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 4
// CHECK: [[PRIORITY:%.+]] = bitcast %union{{.+}}* [[PRIORITY_REF_PTR]] to i32*
// CHECK: store i32 {{.+}}, i32* [[PRIORITY]]
// CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
// CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS1]]* [[CAPTURES]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 8, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[BITCAST]], i64 8, i1 false)
// CHECK: [[DEP:%.*]] = getelementptr inbounds [4 x [[KMP_DEPEND_INFO]]], [4 x [[KMP_DEPEND_INFO]]]* [[DEPENDENCIES:%.*]], i64 0, i64 0
// CHECK: [[T0:%.*]] = getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* [[DEP]], i32 0, i32 0
// CHECK: store i64 ptrtoint (i32* @{{.+}} to i64), i64* [[T0]]
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// Also copy address of private copy to the corresponding shareds reference.
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[ACOMB:.+]] to i8*), i8** [[TMP9]],
// CHECK-DAG: [[TMP10:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPA]], i32 0, i32 5
// CHECK-DAG: [[TMP11:%.+]] = bitcast i32* [[TMP10]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP11]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP11]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[TMP13:%.+]] = bitcast float* [[B]] to i8*
// CHECK-DAG: store i8* [[TMP13]], i8** [[TMP12:%[^,]+]],
// CHECK-DAG: [[TMP12]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB:%[^,]+]], i32 0, i32 0
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[BCOMB:.+]] to i8*), i8** [[TMP17]],
// CHECK-DAG: [[TMP18:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPB]], i32 0, i32 5
// CHECK-DAG: [[TMP19:%.+]] = bitcast i32* [[TMP18]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP19]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP19]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[TMP21:%.+]] = bitcast i32* [[ARGC_ADDR]] to i8*
// CHECK-DAG: store i8* [[TMP21]], i8** [[TMP20:%[^,]+]],
// CHECK-DAG: [[TMP20]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC:%[^,]+]], i32 0, i32 0
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[ARGCCOMB:.+]] to i8*), i8** [[TMP25]],
// CHECK-DAG: [[TMP26:%.+]] = getelementptr inbounds [[T1]], [[T1]]* [[GEPARGC]], i32 0, i32 5
// CHECK-DAG: [[TMP27:%.+]] = bitcast i32* [[TMP26]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP27]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[TMP28:%.+]] = bitcast [3 x [[T1]]]* [[RD_IN1]] to i8*
// CHECK-DAG: [[TMP29:%.+]] = call i8* @__kmpc_task_reduction_init(i32 [[GTID]], i32 3, i8* [[TMP28]])
// DEBUG-DAG: call void @llvm.dbg.declare(metadata i8** [[TD1]],
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[CCOMB:.+]] to i8*), i8** [[TMP35]],
// CHECK-DAG: [[TMP36:%.+]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPC]], i32 0, i32 5
// CHECK-DAG: [[TMP37:%.+]] = bitcast i32* [[TMP36]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP37]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP37]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[TMP39:%.+]] = bitcast i16* [[VLA]] to i8*
// CHECK-DAG: store i8* [[TMP39]], i8** [[TMP38:%[^,]+]],
// CHECK-DAG: [[TMP38]] = getelementptr inbounds [[T2]], [[T2]]* [[GEPVLA:%[^,]+]], i32 0, i32 0
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// Also copy address of private copy to the corresponding shareds reference.
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// Also copy address of private copy to the corresponding shareds reference.
// CHECK-NEXT: br i1
// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: load i32, i32* %
// CHECK: store i32 %{{.+}}, i32* %
// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
// CHECK: phi [[S_DOUBLE_TY]]*
// CHECK: phi [[S_DOUBLE_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: icmp eq [[S_DOUBLE_TY]]* %
// CHECK-NEXT: br i1
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: load i32, i32* %
// CHECK: store i32 %{{.+}}, i32* %
// CHECK: br label
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
// CHECK: store i32 %{{.+}}, i32* %
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
// CHECK: phi [[S_INT_TY]]*
// CHECK: phi [[S_INT_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: icmp eq [[S_INT_TY]]* %
// CHECK-NEXT: br i1
// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: br label
// CHECK: ret
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[RED_COMB1:.+]] to i8*), i8** [[TMP25]],
// CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 5
// CHECK-DAG: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP27]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 0
// CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, %
// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 [[LB_ADD_LEN]]
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[RED_COMB3:.+]] to i8*), i8** [[TMP49]],
// CHECK-DAG: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 5
// CHECK-DAG: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP51]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP51]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[TMP53:%.*]] = bitcast float* [[VLA]] to i8*
// CHECK-DAG: store i8* [[TMP53]], i8** [[TMP52:%[^,]+]],
// CHECK-DAG: [[TMP52]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8:%.+]], i32 0, i32 0
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// Also copy address of private copy to the corresponding shareds reference.
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 40, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// Also copy address of private copy to the corresponding shareds reference.
// CHECK-NEXT: br i1
// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: load i32, i32* %
// CHECK: store i32 %{{.+}}, i32* %
// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
// CHECK: phi [[S_DOUBLE_TY]]*
// CHECK: phi [[S_DOUBLE_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: icmp eq [[S_DOUBLE_TY]]* %
// CHECK-NEXT: br i1
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: load i32, i32* %
// CHECK: store i32 %{{.+}}, i32* %
// CHECK: br label
// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[SHAREDS_REF]], i8* align 8 [[CAPTURES_ADDR]], i64 32, i1 false)
// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
// CHECK: store i32 %{{.+}}, i32* %
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
// CHECK: phi [[S_INT_TY]]*
// CHECK: phi [[S_INT_TY]]*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: icmp eq [[S_INT_TY]]* %
// CHECK-NEXT: br i1
// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align {{[0-9]+}} %
// CHECK: br label
// CHECK: ret
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[RED_COMB1:.+]] to i8*), i8** [[TMP25]],
// CHECK-DAG: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_]], i32 0, i32 5
// CHECK-DAG: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP27]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP27]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 0
// CHECK-DAG: [[LB_ADD_LEN:%.*]] = add nsw i64 -1, %
// CHECK-DAG: [[ARRAYIDX6:%.*]] = getelementptr inbounds [100 x %struct.S], [100 x %struct.S]* [[C]], i64 0, i64 [[LB_ADD_LEN]]
// CHECK-DAG: store i8* bitcast (void (i8*, i8*)* @[[RED_COMB3:.+]] to i8*), i8** [[TMP49]],
// CHECK-DAG: [[TMP50:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_7]], i32 0, i32 5
// CHECK-DAG: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to i8*
-// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* [[TMP51]], i8 0, i64 4, i32 8, i1 false)
+// CHECK-DAG: call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP51]], i8 0, i64 4, i1 false)
// CHECK-DAG: [[TMP53:%.*]] = bitcast float* [[VLA]] to i8*
// CHECK-DAG: store i8* [[TMP53]], i8** [[TMP52:%[^,]+]],
// CHECK-DAG: [[TMP52]] = getelementptr inbounds [[STRUCT_KMP_TASK_RED_INPUT_T]], %struct.kmp_task_red_input_t* [[DOTRD_INPUT_GEP_8:%.+]], i32 0, i32 0
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
#endif
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
// CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}*{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
#endif
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
// CHECK: define internal void [[TPAR_OUTL:@.+]](i{{[0-9]+}}* noalias [[GTID_ADDR1:%.+]], i{{[0-9]+}}* noalias %{{.+}}, {{.+}}, {{.+}}, [2 x i{{[0-9]+}}]*{{.+}} [[VEC_IN1:%.+]], i{{[0-9]+}}*{{.+}} [[T_VAR_IN1:%.+]], [2 x [[S_INT_TY]]]*{{.+}} [[S_ARR_IN1:%.+]], [[S_INT_TY]]*{{.+}} [[VAR_IN1:%.+]])
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
// CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]],
// firstprivate vec(vec): copy from *_addr into priv1 and then from priv1 into priv2
// CHECK-DAG: [[VEC_DEST_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK-DAG: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST_PRIV]], i8* [[VEC_SRC]], {{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST_PRIV]], i8* align {{[0-9]+}} [[VEC_SRC]], {{.+}})
// firstprivate(s_arr)
// CHECK-DAG: [[S_ARR_PRIV_BGN:%.+]] = getelementptr{{.*}} [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]],
// CHECK-DAG: [[TMP_VAL:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK-DAG: [[S_ARR_PTR_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_PTR]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST]], i8* [[TMP_VAL_BCAST]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_FLOAT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[TMP_PRIV]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_ADDR_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: [[SVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[S_VAR_PRIV]],
// CHECK: store i{{[0-9]+}} [[SVAR_VAL]], i{{[0-9]+}}* [[SVAR_ADDR_REF]],
// CHECK: ret void
// CHECK-DAG: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK-DAG: [[S_ARR_PTR_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_PTR1]] to i8*
// CHECK-DAG: [[TMP_VAL_BCAST1:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_PTR_BCAST1]], i8* [[TMP_VAL_BCAST1]],
+// CHECK-DAG: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_PTR_BCAST1]], i8* align {{[0-9]+}} [[TMP_VAL_BCAST1]],
// CHECK: call void @__kmpc_for_static_fini(
// lastprivates
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_ADDR_REF1]],
// CHECK: [[BCAST_VEC_ADDR_REF:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_ADDR_REF1]] to i8*
// CHECK: [[BCAST_VEC_PRIV:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[BCAST_VEC_ADDR_REF]], i8* [[BCAST_VEC_PRIV]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[BCAST_VEC_ADDR_REF]], i8* align {{[0-9]+}} [[BCAST_VEC_PRIV]],
// CHECK: [[S_ARR_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_PRIV_BCAST:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_PRIV1]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_BEGIN_GEP:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_BEGIN]], i{{[0-9]+}} 2
// CHECK: [[S_ARR_DST_EL:%.+]] = phi [[S_INT_TY]]*{{.+}}
// CHECK: [[S_ARR_DST_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_DST_EL]] to i8*
// CHECK: [[S_ARR_SRC_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[S_ARR_SRC_EL]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[S_ARR_DST_BCAST]], i8* [[S_ARR_SRC_BCAST]]{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[S_ARR_DST_BCAST]], i8* align {{[0-9]+}} [[S_ARR_SRC_BCAST]]{{.+}})
// CHECK: [[S_ARR_DST_NEXT:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_DST_EL]], i{{[0-9]+}} 1
// CHECK: [[S_ARR_SRC_NEXT:%.+]] = getelementptr{{.+}}
// CHECK: [[CPY_IS_FINISHED:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_DST_NEXT]], [[S_ARR_BEGIN_GEP]]
// CHECK: [[TMP_VAL1:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[TMP_PRIV1]],
// CHECK: [[VAR_ADDR_REF_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_ADDR1_REF]] to i8*
// CHECK: [[TMP_VAL1_BCAST:%.+]] = bitcast [[S_INT_TY]]* [[TMP_VAL1]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VAR_ADDR_REF_BCAST]], i8* [[TMP_VAL1_BCAST]],{{.+}})
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VAR_ADDR_REF_BCAST]], i8* align {{[0-9]+}} [[TMP_VAL1_BCAST]],{{.+}})
// CHECK: ret void
// CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
#endif
// CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[VEC_DEST]], i8* align {{[0-9]+}} [[VEC_SRC]],
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], align 128
// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
-// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}} 128,
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* align 128 [[VEC_DEST]], i8* align 128 [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}},
// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
// ARRAY: call i8* @llvm.stacksave()
// ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i{{[0-9]+}} [[SIZE]], i1 false)
#pragma omp target
#pragma omp teams firstprivate(a, s, vla1, vla2)
s[0].St_func(s, n, vla1);
// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
// ARRAY: call i8* @llvm.stacksave()
// ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i{{[0-9]+}} [[SIZE]], i1 false)
#endif
#endif