}
bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
- unsigned,
+ unsigned Alignment,
MachineMemOperand::Flags,
bool *Fast) const {
  // Depends on what it gets converted into if the type is weird.
  // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
+ auto Ty = VT.getSimpleVT().SimpleTy;
- switch (VT.getSimpleVT().SimpleTy) {
- default:
- return false;
- case MVT::i8:
- case MVT::i16:
- case MVT::i32: {
+ if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
    // Unaligned access can use (for example) LDRB, LDRH, LDR
if (AllowsUnaligned) {
if (Fast)
*Fast = Subtarget->hasV7Ops();
return true;
}
- return false;
}
- case MVT::f64:
- case MVT::v2f64: {
+
+ if (Ty == MVT::f64 || Ty == MVT::v2f64) {
    // For any little-endian targets with NEON, we can support unaligned ld/st
    // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
    // A big-endian target may also explicitly support unaligned accesses.
    if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
      if (Fast)
        *Fast = true;
return true;
}
- return false;
}
+
+ if (!Subtarget->hasMVEIntegerOps())
+ return false;
+ if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 &&
+ Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 &&
+ Ty != MVT::v2f64)
+ return false;
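+  // Past this point, Ty is known to be one of the 128-bit vector types
+  // that fit in an MVE Q register.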
+
+ if (Subtarget->isLittle()) {
+ // In little-endian MVE, the store instructions VSTRB.U8,
+ // VSTRH.U16 and VSTRW.U32 all store the vector register in
+ // exactly the same format, and differ only in the range of
+ // their immediate offset field and the required alignment.
+ //
+ // In particular, VSTRB.U8 can store a vector at byte alignment.
+ // So at this stage we can simply say that loads/stores of all
+ // 128-bit wide vector types are permitted at any alignment,
+ // because we know at least _one_ instruction can manage that.
+ //
+ // Later on we might find that some of those loads are better
+ // generated as VLDRW.U32 if alignment permits, to take
+ // advantage of the larger immediate range. But for the moment,
+ // all that matters is that if we don't lower the load then
+ // _some_ instruction can handle it.
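+ //
+ // (For example, storing q0 with VSTRB.U8 or VSTRW.U32 produces the
+ // same 16-byte memory image in little-endian mode, whatever the
+ // element size of the access.)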
+ if (Fast)
+ *Fast = true;
+ return true;
+ } else {
+ // In big-endian MVE, those instructions aren't so similar
+ // after all, because they reorder the bytes of the vector
+ // differently. So this time we can only store a particular
+ // kind of vector if its alignment is at least the element
+ // type. And we can't store vectors of i64 or f64 at all
+ // without having to do some postprocessing, because there's
+ // no VSTRD.U64.
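+ //
+ // (For example, a v4i32 store at alignment 2 could only use
+ // VSTRH.U16, which in big-endian mode lays out the bytes of each
+ // 32-bit element differently from VSTRW.U32.)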
+ if (Ty == MVT::v16i8 ||
+ ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) ||
+ ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) {
+ if (Fast)
+ *Fast = true;
+ return true;
+ }
}
+
+ return false;
}
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
def : MVE_unpred_vector_store_typed<v4i32, RegImmInst, StoreKind, shift>;
def : MVE_unpred_vector_store_typed<v4f32, RegImmInst, StoreKind, shift>;
def : MVE_unpred_vector_store_typed<v2i64, RegImmInst, StoreKind, shift>;
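+  // MVE has no dedicated f64 vector store, so v2f64 reuses the same
+  // patterns as the other 128-bit types.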
+ def : MVE_unpred_vector_store_typed<v2f64, RegImmInst, StoreKind, shift>;
}
class MVE_unpred_vector_load_typed<ValueType Ty, Instruction RegImmInst,
PatFrag LoadKind, int shift>
: Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
(Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
+
multiclass MVE_unpred_vector_load<Instruction RegImmInst, PatFrag LoadKind,
int shift> {
def : MVE_unpred_vector_load_typed<v16i8, RegImmInst, LoadKind, shift>;
def : MVE_unpred_vector_load_typed<v4i32, RegImmInst, LoadKind, shift>;
def : MVE_unpred_vector_load_typed<v4f32, RegImmInst, LoadKind, shift>;
def : MVE_unpred_vector_load_typed<v2i64, RegImmInst, LoadKind, shift>;
+ def : MVE_unpred_vector_load_typed<v2f64, RegImmInst, LoadKind, shift>;
}
let Predicates = [HasMVEInt, IsLE] in {
store <4 x i32> %result, <4 x i32>* %resultp, align 16
ret void
}
+
+define void @vector_f64_copy(<2 x double>* %from, <2 x double>* %to) {
+; CHECK-LABEL: vector_f64_copy:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+ %v = load <2 x double>, <2 x double>* %from, align 16
+ store <2 x double> %v, <2 x double>* %to, align 16
+ ret void
+}
+
+define arm_aapcs_vfpcc <16 x i8> @stack_slot_handling(<16 x i8> %a) #0 {
+; CHECK-LABEL: stack_slot_handling:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: push {r4, r6, r7, lr}
+; CHECK-NEXT: add r7, sp, #8
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: mov r4, sp
+; CHECK-NEXT: bfc r4, #0, #4
+; CHECK-NEXT: mov sp, r4
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: sub.w r4, r7, #8
+; CHECK-NEXT: mov sp, r4
+; CHECK-NEXT: pop {r4, r6, r7, pc}
+entry:
+ %a.addr = alloca <16 x i8>, align 8
+ store <16 x i8> %a, <16 x i8>* %a.addr, align 8
+ %0 = load <16 x i8>, <16 x i8>* %a.addr, align 8
+ ret <16 x i8> %0
+}
+
+attributes #0 = { noinline optnone }
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
+define i8* @post_ldrwu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r2, r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwu32_m4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_m4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #-4]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -4
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwu32_508(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r2, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwu32_512(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r2, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwu32_m508(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r2, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwu32_m512(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r2, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrhu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #8]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #10]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh.w r2, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #5]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #7]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #9]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #8]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh.w r2, [r0, #254]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #256]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #258]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #260]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh.w r2, [r0, #256]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #258]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #260]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #262]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrhs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0, #4]
+; CHECK-NEXT: ldrsh.w r3, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #8]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #10]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0, #3]
+; CHECK-NEXT: ldrsh.w r3, [r0, #5]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #7]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #9]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhs32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0, #2]
+; CHECK-NEXT: ldrsh.w r3, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #8]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhs32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0, #254]
+; CHECK-NEXT: ldrsh.w r3, [r0, #256]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #258]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #260]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhs32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0, #256]
+; CHECK-NEXT: ldrsh.w r3, [r0, #258]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #260]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #262]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrhu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r2, r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu16_2(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r2, r0, #2
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu16_254(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r2, r0, #254
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrhu16_256(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r2, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrbu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #4]
+; CHECK-NEXT: ldrb r3, [r0, #5]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #7]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #3]
+; CHECK-NEXT: ldrb r3, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #5]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb.w r2, [r0, #127]
+; CHECK-NEXT: ldrb.w r3, [r0, #128]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #129]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #130]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb.w r2, [r0, #128]
+; CHECK-NEXT: ldrb.w r3, [r0, #129]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #130]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #131]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrbs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbs32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #127]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #128]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #129]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #130]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbs32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #128]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #129]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #130]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #131]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrbu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #4]
+; CHECK-NEXT: ldrb r3, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb r2, [r0, #8]
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrb r2, [r0, #9]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #10]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #11]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #3]
+; CHECK-NEXT: ldrb r3, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrb r2, [r0, #8]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #9]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #10]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb.w r2, [r0, #127]
+; CHECK-NEXT: ldrb.w r3, [r0, #128]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #129]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #130]
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb.w r2, [r0, #131]
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrb.w r2, [r0, #132]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb.w r2, [r0, #133]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb.w r2, [r0, #134]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb.w r2, [r0, #128]
+; CHECK-NEXT: ldrb.w r3, [r0, #129]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #130]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #131]
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb.w r2, [r0, #132]
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrb.w r2, [r0, #133]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb.w r2, [r0, #134]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb.w r2, [r0, #135]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrbs16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #8]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #9]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #10]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #11]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbs16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #8]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #9]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #10]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbs16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #127]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #128]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #129]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #130]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #131]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #132]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #133]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #134]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbs16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0, #128]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #129]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #130]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #131]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #132]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #133]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #134]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #135]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %x
+}
+
+
+define i8* @post_ldrbu8_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu8_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r2, r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu8_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r2, r0, #127
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrbu8_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwf32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x float>*
+ %1 = load <4 x float>, <4 x float>* %0, align 8
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 8
+ ret i8* %x
+}
+
+define i8* @post_ldrwf16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 8
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 8
+ ret i8* %x
+}
+
+
+
+
+
+define i8* @post_strw32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strw32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strw32_m4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #-4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strw32_508(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strw32_512(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #512
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strw32_m508(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strw32_m512(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #512
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %y
+}
+
+
+define i8* @post_strh32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str.w r1, [r0, #3]
+; CHECK-NEXT: str.w r2, [r0, #7]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh32_2(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str.w r1, [r0, #2]
+; CHECK-NEXT: str.w r2, [r0, #6]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh32_254(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str.w r1, [r0, #254]
+; CHECK-NEXT: str.w r2, [r0, #258]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh32_256(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0, #256]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %y
+}
+
+
+define i8* @post_strh16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh16_2(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh16_254(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strh16_256(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #256
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %y
+}
+
+
+define i8* @post_strb32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str.w r1, [r0, #3]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb32_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str.w r1, [r0, #127]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb32_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str.w r1, [r0, #128]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %y
+}
+
+
+define i8* @post_strb16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str.w r1, [r0, #3]
+; CHECK-NEXT: str.w r2, [r0, #7]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb16_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str.w r1, [r0, #127]
+; CHECK-NEXT: str.w r2, [r0, #131]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb16_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0, #128]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %y
+}
+
+
+define i8* @post_strb8_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb8_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb8_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strb8_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #128]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strf32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x float>*
+ %1 = load <4 x float>, <4 x float>* %0, align 8
+ %2 = bitcast i8* %z to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 8
+ ret i8* %y
+}
+
+define i8* @post_strf16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 8
+ %2 = bitcast i8* %z to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 8
+ ret i8* %y
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
+define i8* @post_ldrwu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwu32_m4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_m4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: subs r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwu32_508(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwu32_512(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwu32_m508(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwu32_m512(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwu32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrhu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldrh r3, [r2, #4]!
+; CHECK-NEXT: ldrh r0, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r0
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldrh r3, [r2, #2]!
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrh r3, [r0, #4]
+; CHECK-NEXT: ldrh r0, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r0
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrhs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsh.w r2, [r0]
+; CHECK-NEXT: ldrsh.w r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #6]
+; CHECK-NEXT: ldrsh r2, [r0, #4]!
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0]
+; CHECK-NEXT: ldrsh.w r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhs32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsh.w r2, [r0]
+; CHECK-NEXT: ldrsh.w r3, [r0, #4]
+; CHECK-NEXT: ldrsh.w r12, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh r2, [r0, #2]!
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhs32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0]
+; CHECK-NEXT: ldrsh.w r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhs32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhs32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0]
+; CHECK-NEXT: ldrsh.w r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrhu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu16_2(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu16_254(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrhu16_256(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrhu16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrbu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb r2, [r0, #3]!
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrbs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb r2, [r0, #3]!
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrbu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldrb.w lr, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb r3, [r2, #4]!
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r3
+; CHECK-NEXT: ldrb r3, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: ldrb r3, [r0, #6]
+; CHECK-NEXT: ldrb r0, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb r3, [r2, #3]!
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb.w lr, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[3], r3
+; CHECK-NEXT: ldrb r3, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[4], lr
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: ldrb r3, [r0, #6]
+; CHECK-NEXT: ldrb r0, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrb r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #7]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0]
+; CHECK-NEXT: ldrb r3, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[1], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: ldrb r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrb r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #7]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrbs16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldrsb r3, [r2, #4]!
+; CHECK-NEXT: vmov.16 q0[4], r3
+; CHECK-NEXT: ldrsb.w r3, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: ldrsb.w r3, [r0, #6]
+; CHECK-NEXT: ldrsb.w r0, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: ldrsb r3, [r2, #3]!
+; CHECK-NEXT: vmov.16 q0[3], r3
+; CHECK-NEXT: ldrsb.w r3, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[4], r3
+; CHECK-NEXT: ldrsb.w r3, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r3
+; CHECK-NEXT: ldrsb.w r3, [r0, #6]
+; CHECK-NEXT: ldrsb.w r0, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb.w r2, [r0]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_ldrbu8_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu8_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu8_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu8_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwf32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x float>*
+ %1 = load <4 x float>, <4 x float>* %0, align 8
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwf16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 8
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 8
+ ret i8* %z
+}
+
+
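+; The store tests below mirror the load tests above: %y is both the store
+; address and the base of the returned (post-incremented) pointer %z.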
+define i8* @post_strw32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_m4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: subs r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_508(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_512(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_m508(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_m512(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_strh32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: str r2, [r0, #4]!
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_2(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_254(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_256(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_strh16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_2(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_254(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_256(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_strb32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0], #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0], #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb32_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0], #127
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb32_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0], #128
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_strb16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: str r2, [r0, #4]!
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb16_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb16_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @post_strb8_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb8_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb8_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb8_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strf32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x float>*
+ %1 = load <4 x float>, <4 x float>* %0, align 8
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strf16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 8
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 8
+ ret i8* %z
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
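+; Unlike the post-increment tests in the file above, these tests access
+; memory through the already-offset pointer %z and return %z, so the ideal
+; codegen is a pre-indexed (writeback) access such as "ldrh r2, [r0, #4]!".
+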
+define i8* @pre_ldrwu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrwu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrwu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrwu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrwu32_m4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrwu32_m4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #-4]
+; CHECK-NEXT: subs r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -4
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrwu32_508(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrwu32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrwu32_512(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrwu32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrwu32_m508(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrwu32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrwu32_m512(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrwu32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @pre_ldrhu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0, #4]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0, #3]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0, #2]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh r2, [r0, #254]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh r2, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh r2, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrh.w r2, [r0, #256]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #258]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #260]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrh.w r2, [r0, #262]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @pre_ldrhs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsh r2, [r0, #4]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r2, [r0, #2]
+; CHECK-NEXT: ldrsh.w r3, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsh r2, [r0, #3]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r2, [r0, #2]
+; CHECK-NEXT: ldrsh.w r3, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhs32_2(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhs32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsh r2, [r0, #2]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r2, [r0, #2]
+; CHECK-NEXT: ldrsh.w r3, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhs32_254(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhs32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsh r2, [r0, #254]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r2, [r0, #2]
+; CHECK-NEXT: ldrsh.w r3, [r0, #4]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #6]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhs32_256(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhs32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrsh.w r2, [r0, #256]
+; CHECK-NEXT: ldrsh.w r3, [r0, #258]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsh.w r12, [r0, #260]
+; CHECK-NEXT: vmov.32 q0[1], r3
+; CHECK-NEXT: ldrsh.w lr, [r0, #262]
+; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vmov.32 q0[3], lr
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @pre_ldrhu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu16_2(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu16_254(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrhu16_256(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrhu16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+
+define i8* @pre_ldrbu32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrb r2, [r0, #4]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrbu32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrb r2, [r0, #3]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrbu32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbu32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrb r2, [r0, #127]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrbu32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbu32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrb r2, [r0, #128]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[2], r3
+; CHECK-NEXT: vmov.32 q0[3], r12
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @pre_ldrbs32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #4]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrbs32_3(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #3]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrbs32_127(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbs32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #127]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrbs32_128(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbs32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #128]!
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 8
+ ret i8* %z
+}
+
+
+define i8* @pre_ldrbu16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #4]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[2], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[3], r12
+; CHECK-NEXT: ldrb r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[4], lr
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @pre_ldrbu16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: pre_ldrbu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #3]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[2], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[3], r12
+; CHECK-NEXT: ldrb r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[4], lr
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #127]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[2], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[3], r12
+; CHECK-NEXT: ldrb r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[4], lr
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r7, lr}
+; CHECK-NEXT: push {r7, lr}
+; CHECK-NEXT: ldrb r2, [r0, #128]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrb r2, [r0, #1]
+; CHECK-NEXT: ldrb r3, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrb.w r12, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[2], r3
+; CHECK-NEXT: ldrb.w lr, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[3], r12
+; CHECK-NEXT: ldrb r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[4], lr
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrb r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrb r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: pop {r7, pc}
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+
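+; Same pattern as above, but sign-extending via ldrsb; these stay within r2,
+; so no extra registers need saving.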
+define i8* @post_ldrbs16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #4]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs16_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #3]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs16_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #127]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbs16_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbs16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrsb r2, [r0, #128]!
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #1]
+; CHECK-NEXT: vmov.16 q0[1], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #2]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #3]
+; CHECK-NEXT: vmov.16 q0[3], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #4]
+; CHECK-NEXT: vmov.16 q0[4], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #5]
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #6]
+; CHECK-NEXT: vmov.16 q0[6], r2
+; CHECK-NEXT: ldrsb.w r2, [r0, #7]
+; CHECK-NEXT: vmov.16 q0[7], r2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 8
+ ret i8* %z
+}
+
+
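+; Full-width <16 x i8> copies need no extension, so the checks expect a plain
+; vldrw.u32/vstrw.32 pair. Offsets of 4 and 128 fold into the load's
+; immediate field; 3 and 127 do not, consistent with the W-form offset having
+; to be a multiple of 4.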
+define i8* @post_ldrbu8_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu8_3(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu8_127(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrbu8_128(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrbu8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #128]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
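+; Float vectors take the same path: a single vldrw.u32/vstrw.32 copy.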
+define i8* @post_ldrwf32_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x float>*
+ %1 = load <4 x float>, <4 x float>* %0, align 8
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_ldrwf16_4(i8* %x, i8* %y) {
+; CHECK-LABEL: post_ldrwf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 8
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 8
+ ret i8* %z
+}
+
+
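+; Post-increment style <4 x i32> stores. Small multiples of 4 (#4, #-4) fold
+; into the vstrw offset; the other offsets here are applied to the base
+; register with add.w/sub.w first.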
+define i8* @post_strw32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_m4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #-4]
+; CHECK-NEXT: subs r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_508(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_512(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_m508(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strw32_m512(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strw32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 8
+ ret i8* %z
+}
+
+
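+; <4 x i16> (64-bit) copies are expected to expand to a pair of 32-bit
+; scalar transfers, with the pointer update folded into the first store's
+; pre-indexed writeback.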
+define i8* @post_strh32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #4]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #3]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_2(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #2]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_254(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #254]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh32_256(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: strd r1, r2, [r0, #256]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %1 = load <4 x i16>, <4 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i16>*
+ store <4 x i16> %1, <4 x i16>* %2, align 8
+ ret i8* %z
+}
+
+
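+; <8 x i16> copies go through vldrw.u32/vstrw.32; only the #4 offset is
+; folded into the store here.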
+define i8* @post_strh16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_2(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_254(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strh16_256(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strh16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 8
+ ret i8* %z
+}
+
+
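+; <4 x i8> (32-bit) copies become a single scalar ldr/str, with the pointer
+; update folded into the store's pre-indexed writeback.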
+define i8* @post_strb32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0, #4]!
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb32_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0, #3]!
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb32_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0, #127]!
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb32_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldr r1, [r1]
+; CHECK-NEXT: str r1, [r0, #128]!
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <4 x i8>*
+ store <4 x i8> %1, <4 x i8>* %2, align 8
+ ret i8* %z
+}
+
+
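+; <8 x i8> (64-bit) copies expand to ldrd plus two scalar stores, as in the
+; strh32 tests above.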
+define i8* @post_strb16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #4]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb16_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #3]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb16_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #127]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb16_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: ldrd r1, r2, [r1]
+; CHECK-NEXT: str r1, [r0, #128]!
+; CHECK-NEXT: str r2, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <8 x i8>*
+ %1 = load <8 x i8>, <8 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <8 x i8>*
+ store <8 x i8> %1, <8 x i8>* %2, align 8
+ ret i8* %z
+}
+
+
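+; <16 x i8> copies use vldrw.u32/vstrw.32; as with the loads, offsets 4 and
+; 128 fold into the store immediate while 3 and 127 do not.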
+define i8* @post_strb8_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb8_3(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb8_127(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strb8_128(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strb8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #128]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 8
+ %2 = bitcast i8* %z to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 8
+ ret i8* %z
+}
+
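+; Float vector stores take the same vldrw.u32/vstrw.32 path.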
+define i8* @post_strf32_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x float>*
+ %1 = load <4 x float>, <4 x float>* %0, align 8
+ %2 = bitcast i8* %z to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 8
+ ret i8* %z
+}
+
+define i8* @post_strf16_4(i8* %y, i8* %x) {
+; CHECK-LABEL: post_strf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 8
+ %2 = bitcast i8* %z to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 8
+ ret i8* %z
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
+%struct.s_int8_t = type { [16 x i8], [16 x i8] }
+%struct.s_int16_t = type { [8 x i16], [8 x i16] }
+%struct.s_int32_t = type { [4 x i32], [4 x i32] }
+%struct.s_float16_t = type { [8 x half], [8 x half] }
+%struct.s_float32_t = type { [4 x float], [4 x float] }
+
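+; Copy the first half of each struct into the second: the element-aligned
+; accesses are expected to pick the matching element-size MVE instruction
+; (vldrb.u8 at align 1, vldrh.u16 at align 2, vldrw.u32 at align 4) with the
+; #16 offset folded in.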
+define hidden void @fwd_int8_t(%struct.s_int8_t* noalias %v) local_unnamed_addr #0 {
+; CHECK-LABEL: fwd_int8_t:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r0]
+; CHECK-NEXT: vstrb.8 q0, [r0, #16]
+; CHECK-NEXT: bx lr
+entry:
+ %arrayidx3 = getelementptr inbounds %struct.s_int8_t, %struct.s_int8_t* %v, i32 0, i32 1, i32 0
+ %0 = bitcast %struct.s_int8_t* %v to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %arrayidx3 to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret void
+}
+
+define hidden void @fwd_int16_t(%struct.s_int16_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: fwd_int16_t:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r0, #16]
+; CHECK-NEXT: bx lr
+entry:
+ %arrayidx3 = getelementptr inbounds %struct.s_int16_t, %struct.s_int16_t* %v, i32 0, i32 1, i32 0
+ %0 = bitcast %struct.s_int16_t* %v to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i16* %arrayidx3 to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret void
+}
+
+define hidden void @fwd_int32_t(%struct.s_int32_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: fwd_int32_t:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0, #16]
+; CHECK-NEXT: bx lr
+entry:
+ %arrayidx3 = getelementptr inbounds %struct.s_int32_t, %struct.s_int32_t* %v, i32 0, i32 1, i32 0
+ %0 = bitcast %struct.s_int32_t* %v to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 4
+ %2 = bitcast i32* %arrayidx3 to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret void
+}
+
+define hidden void @fwd_float16_t(%struct.s_float16_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: fwd_float16_t:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r0, #16]
+; CHECK-NEXT: bx lr
+entry:
+ %arrayidx3 = getelementptr inbounds %struct.s_float16_t, %struct.s_float16_t* %v, i32 0, i32 1, i32 0
+ %0 = bitcast %struct.s_float16_t* %v to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast half* %arrayidx3 to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret void
+}
+
+define hidden void @fwd_float32_t(%struct.s_float32_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: fwd_float32_t:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0, #16]
+; CHECK-NEXT: bx lr
+entry:
+ %d = getelementptr inbounds %struct.s_float32_t, %struct.s_float32_t* %v, i32 0, i32 1
+ %0 = bitcast %struct.s_float32_t* %v to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 4
+ %2 = bitcast [4 x float]* %d to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret void
+}
+
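+; The backward copies are the same, but with a negative (-16) folded offset.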
+define hidden void @bwd_int8_t(%struct.s_int8_t* noalias %v) local_unnamed_addr #0 {
+; CHECK-LABEL: bwd_int8_t:
+; CHECK: @ %bb.0: @ %for.end
+; CHECK-NEXT: vldrb.u8 q0, [r0]
+; CHECK-NEXT: vstrb.8 q0, [r0, #-16]
+; CHECK-NEXT: bx lr
+for.end:
+ %0 = bitcast %struct.s_int8_t* %v to <16 x i8>*
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %arrayidx3 = getelementptr inbounds %struct.s_int8_t, %struct.s_int8_t* %v, i32 -1, i32 1, i32 0
+ %2 = bitcast i8* %arrayidx3 to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret void
+}
+
+define hidden void @bwd_int16_t(%struct.s_int16_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: bwd_int16_t:
+; CHECK: @ %bb.0: @ %for.end
+; CHECK-NEXT: vldrh.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r0, #-16]
+; CHECK-NEXT: bx lr
+for.end:
+ %0 = bitcast %struct.s_int16_t* %v to <8 x i16>*
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %arrayidx3 = getelementptr inbounds %struct.s_int16_t, %struct.s_int16_t* %v, i32 -1, i32 1, i32 0
+ %2 = bitcast i16* %arrayidx3 to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret void
+}
+
+define hidden void @bwd_int32_t(%struct.s_int32_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: bwd_int32_t:
+; CHECK: @ %bb.0: @ %for.end
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0, #-16]
+; CHECK-NEXT: bx lr
+for.end:
+ %0 = bitcast %struct.s_int32_t* %v to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 4
+ %arrayidx3 = getelementptr inbounds %struct.s_int32_t, %struct.s_int32_t* %v, i32 -1, i32 1, i32 0
+ %2 = bitcast i32* %arrayidx3 to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret void
+}
+
+define hidden void @bwd_float16_t(%struct.s_float16_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: bwd_float16_t:
+; CHECK: @ %bb.0: @ %for.end
+; CHECK-NEXT: vldrh.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r0, #-16]
+; CHECK-NEXT: bx lr
+for.end:
+ %0 = bitcast %struct.s_float16_t* %v to <8 x half>*
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %arrayidx3 = getelementptr inbounds %struct.s_float16_t, %struct.s_float16_t* %v, i32 -1, i32 1, i32 0
+ %2 = bitcast half* %arrayidx3 to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret void
+}
+
+define hidden void @bwd_float32_t(%struct.s_float32_t* noalias nocapture %v) local_unnamed_addr #0 {
+; CHECK-LABEL: bwd_float32_t:
+; CHECK: @ %bb.0: @ %for.end
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0, #-16]
+; CHECK-NEXT: bx lr
+for.end:
+ %0 = bitcast %struct.s_float32_t* %v to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 4
+ %d = getelementptr inbounds %struct.s_float32_t, %struct.s_float32_t* %v, i32 -1, i32 1
+ %2 = bitcast [4 x float]* %d to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret void
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
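+; Under-aligned <4 x i32> accesses should remain legal by selecting the MVE
+; load/store whose element alignment requirement the access still meets:
+; vldrw.u32 at align 4, vldrh.u16 at align 2, vldrb.u8 at align 1.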
+define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4(<4 x i32>* %vp) {
+; CHECK-LABEL: load_4xi32_a4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %vp, align 4
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a2(<4 x i32>* %vp) {
+; CHECK-LABEL: load_4xi32_a2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %vp, align 2
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a1(<4 x i32>* %vp) {
+; CHECK-LABEL: load_4xi32_a1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %vp, align 1
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc void @store_4xi32_a4(<4 x i32>* %vp, <4 x i32> %val) {
+; CHECK-LABEL: store_4xi32_a4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ store <4 x i32> %val, <4 x i32>* %vp, align 4
+ ret void
+}
+
+define arm_aapcs_vfpcc void @store_4xi32_a2(<4 x i32>* %vp, <4 x i32> %val) {
+; CHECK-LABEL: store_4xi32_a2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ store <4 x i32> %val, <4 x i32>* %vp, align 2
+ ret void
+}
+
+define arm_aapcs_vfpcc void @store_4xi32_a1(<4 x i32>* %vp, <4 x i32> %val) {
+; CHECK-LABEL: store_4xi32_a1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrb.8 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ store <4 x i32> %val, <4 x i32>* %vp, align 1
+ ret void
+}
+
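+; Offsets of +/-508 from the base pointer are materialized with add.w/sub.w
+; rather than folded into the vldrw immediate here.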
+define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_pos(i32* %ip) {
+; CHECK-LABEL: load_4xi32_a4_offset_pos:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %ipoffset = getelementptr inbounds i32, i32* %ip, i32 127
+ %vp = bitcast i32* %ipoffset to <4 x i32>*
+ %0 = load <4 x i32>, <4 x i32>* %vp, align 4
+ ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @load_4xi32_a4_offset_neg(i32* %ip) {
+; CHECK-LABEL: load_4xi32_a4_offset_neg:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %ipoffset = getelementptr inbounds i32, i32* %ip, i32 -127
+ %vp = bitcast i32* %ipoffset to <4 x i32>*
+ %0 = load <4 x i32>, <4 x i32>* %vp, align 4
+ ret <4 x i32> %0
+}
+
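+; Frame-index addressing: the sp-relative #16 offset folds directly into the
+; MVE load/store immediate.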
+define arm_aapcs_vfpcc <4 x i32> @loadstore_4xi32_stack_off16() {
+; CHECK-LABEL: loadstore_4xi32_stack_off16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #40
+; CHECK-NEXT: sub sp, #40
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: movs r0, #3
+; CHECK-NEXT: vstrw.32 q0, [sp, #16]
+; CHECK-NEXT: str r0, [sp, #16]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16]
+; CHECK-NEXT: add sp, #40
+; CHECK-NEXT: bx lr
+entry:
+ %c = alloca [1 x [5 x [2 x i32]]], align 4
+ %0 = bitcast [1 x [5 x [2 x i32]]]* %c to i8*
+ %arrayidx5 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 0, i32 0
+ %1 = bitcast [1 x [5 x [2 x i32]]]* %c to <4 x i32>*
+ store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %1, align 4
+ %arrayidx5.2 = getelementptr inbounds [1 x [5 x [2 x i32]]], [1 x [5 x [2 x i32]]]* %c, i32 0, i32 0, i32 2, i32 0
+ %2 = bitcast i32* %arrayidx5.2 to <4 x i32>*
+ store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* %2, align 4
+ store i32 3, i32* %arrayidx5.2, align 4
+ %3 = load <4 x i32>, <4 x i32>* %2, align 4
+ ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @loadstore_8xi16_stack_off16() {
+; CHECK-LABEL: loadstore_8xi16_stack_off16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #40
+; CHECK-NEXT: sub sp, #40
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: vdup.16 q0, r0
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrh.16 q0, [r0]
+; CHECK-NEXT: movs r0, #3
+; CHECK-NEXT: vstrh.16 q0, [sp, #16]
+; CHECK-NEXT: strh.w r0, [sp, #16]
+; CHECK-NEXT: vldrh.u16 q0, [sp, #16]
+; CHECK-NEXT: add sp, #40
+; CHECK-NEXT: bx lr
+entry:
+ %c = alloca [1 x [10 x [2 x i16]]], align 2
+ %0 = bitcast [1 x [10 x [2 x i16]]]* %c to i8*
+ %arrayidx5 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 0, i32 0
+ %1 = bitcast [1 x [10 x [2 x i16]]]* %c to <8 x i16>*
+ store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %1, align 2
+ %arrayidx5.2 = getelementptr inbounds [1 x [10 x [2 x i16]]], [1 x [10 x [2 x i16]]]* %c, i32 0, i32 0, i32 4, i32 0
+ %2 = bitcast i16* %arrayidx5.2 to <8 x i16>*
+ store <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>* %2, align 2
+ store i16 3, i16* %arrayidx5.2, align 2
+ %3 = load <8 x i16>, <8 x i16>* %2, align 2
+ ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @loadstore_16xi8_stack_off16() {
+; CHECK-LABEL: loadstore_16xi8_stack_off16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #40
+; CHECK-NEXT: sub sp, #40
+; CHECK-NEXT: movs r0, #1
+; CHECK-NEXT: vdup.8 q0, r0
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vstrb.8 q0, [r0]
+; CHECK-NEXT: movs r0, #3
+; CHECK-NEXT: vstrb.8 q0, [sp, #16]
+; CHECK-NEXT: strb.w r0, [sp, #16]
+; CHECK-NEXT: vldrb.u8 q0, [sp, #16]
+; CHECK-NEXT: add sp, #40
+; CHECK-NEXT: bx lr
+entry:
+ %c = alloca [1 x [20 x [2 x i8]]], align 1
+ %0 = bitcast [1 x [20 x [2 x i8]]]* %c to i8*
+ %arrayidx5 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 0, i32 0
+ %1 = bitcast [1 x [20 x [2 x i8]]]* %c to <16 x i8>*
+ store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %1, align 1
+ %arrayidx5.2 = getelementptr inbounds [1 x [20 x [2 x i8]]], [1 x [20 x [2 x i8]]]* %c, i32 0, i32 0, i32 8, i32 0
+ %2 = bitcast i8* %arrayidx5.2 to <16 x i8>*
+ store <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>* %2, align 1
+ store i8 3, i8* %arrayidx5.2, align 1
+ %3 = load <16 x i8>, <16 x i8>* %2, align 1
+ ret <16 x i8> %3
+}