From cf5b8bc093bca4ad6643ec1028ec20e3500e1b9a Mon Sep 17 00:00:00 2001 From: Artur Pilipenko Date: Mon, 6 Feb 2017 14:15:31 +0000 Subject: [PATCH] Add DAGCombiner load combine tests with non-zero offset This is separated from https://reviews.llvm.org/D29394 review. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294185 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AArch64/load-combine-big-endian.ll | 140 ++++++++++++++ test/CodeGen/AArch64/load-combine.ll | 140 ++++++++++++++ test/CodeGen/ARM/load-combine-big-endian.ll | 180 ++++++++++++++++++ test/CodeGen/ARM/load-combine.ll | 180 ++++++++++++++++++ test/CodeGen/X86/load-combine.ll | 168 +++++++++++++++- 5 files changed, 805 insertions(+), 3 deletions(-) diff --git a/test/CodeGen/AArch64/load-combine-big-endian.ll b/test/CodeGen/AArch64/load-combine-big-endian.ll index b19ef3ec692..692a57c4471 100644 --- a/test/CodeGen/AArch64/load-combine-big-endian.ll +++ b/test/CodeGen/AArch64/load-combine-big-endian.ll @@ -191,3 +191,143 @@ define i64 @load_i64_by_i8(i64* %arg) { %tmp37 = or i64 %tmp33, %tmp36 ret i64 %tmp37 } + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrb w10, [x0, #3] +; CHECK-NEXT: ldrb w11, [x0, #4] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldurb w8, [x0, #-4] +; CHECK-NEXT: ldurb w9, [x0, #-3] +; CHECK-NEXT: ldurb w10, [x0, #-2] +; CHECK-NEXT: ldurb w11, [x0, #-1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + 
%tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldrb w8, [x0, #4] +; CHECK-NEXT: ldrb w9, [x0, #3] +; CHECK-NEXT: ldrb w10, [x0, #2] +; CHECK-NEXT: ldrb w11, [x0, #1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldurb w8, [x0, #-1] +; CHECK-NEXT: ldurb w9, [x0, #-2] +; CHECK-NEXT: ldurb w10, [x0, #-3] +; CHECK-NEXT: ldurb w11, [x0, #-4] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} diff --git a/test/CodeGen/AArch64/load-combine.ll b/test/CodeGen/AArch64/load-combine.ll index 4644fa263ee..86461e2a1d7 100644 --- a/test/CodeGen/AArch64/load-combine.ll +++ b/test/CodeGen/AArch64/load-combine.ll @@ -178,3 +178,143 @@ define i64 @load_i64_by_i8_bswap(i64* %arg) { %tmp37 = or i64 %tmp33, %tmp36 ret i64 %tmp37 } + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldrb w8, [x0, #1] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrb w10, [x0, #3] +; CHECK-NEXT: ldrb w11, [x0, #4] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = 
zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldurb w8, [x0, #-4] +; CHECK-NEXT: ldurb w9, [x0, #-3] +; CHECK-NEXT: ldurb w10, [x0, #-2] +; CHECK-NEXT: ldurb w11, [x0, #-1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldrb w8, [x0, #4] +; CHECK-NEXT: ldrb w9, [x0, #3] +; CHECK-NEXT: ldrb w10, [x0, #2] +; CHECK-NEXT: ldrb w11, [x0, #1] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldurb w8, [x0, #-1] +; CHECK-NEXT: ldurb w9, [x0, #-2] +; CHECK-NEXT: ldurb w10, [x0, #-3] +; CHECK-NEXT: ldurb w11, [x0, #-4] +; CHECK-NEXT: bfi w8, w9, #8, #8 +; 
CHECK-NEXT: bfi w8, w10, #16, #8 +; CHECK-NEXT: bfi w8, w11, #24, #8 +; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: ret + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} diff --git a/test/CodeGen/ARM/load-combine-big-endian.ll b/test/CodeGen/ARM/load-combine-big-endian.ll index 6f0e008a655..795e69fc4f5 100644 --- a/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/test/CodeGen/ARM/load-combine-big-endian.ll @@ -269,3 +269,183 @@ define i64 @load_i64_by_i8(i64* %arg) { %tmp37 = or i64 %tmp33, %tmp36 ret i64 %tmp37 } + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldrb r1, [r0, #1] +; CHECK-NEXT: ldrb r2, [r0, #2] +; CHECK-NEXT: ldrb r3, [r0, #3] +; CHECK-NEXT: ldrb r0, [r0, #4] +; CHECK-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK-ARMv6: ldrb r1, [r0, #1] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] +; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldrb r1, [r0, #-4] +; CHECK-NEXT: ldrb r2, [r0, #-3] +; CHECK-NEXT: ldrb r3, [r0, #-2] +; CHECK-NEXT: ldrb r0, [r0, #-1] +; CHECK-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: +; CHECK-ARMv6: ldrb r1, [r0, #-4] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] +; CHECK-ARMv6-NEXT: orr 
r1, r1, r2, lsl #8 +; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldrb r1, [r0, #1] +; CHECK-NEXT: ldrb r2, [r0, #2] +; CHECK-NEXT: ldrb r3, [r0, #3] +; CHECK-NEXT: ldrb r0, [r0, #4] +; CHECK-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK-ARMv6: ldrb r1, [r0, #1] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] +; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldrb r1, [r0, #-4] +; CHECK-NEXT: ldrb r2, [r0, #-3] +; CHECK-NEXT: ldrb r3, [r0, #-2] +; CHECK-NEXT: ldrb r0, [r0, #-1] +; CHECK-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK-ARMv6: ldrb r1, [r0, #-4] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] +; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to 
i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} diff --git a/test/CodeGen/ARM/load-combine.ll b/test/CodeGen/ARM/load-combine.ll index 4ee7780bff7..bafa13894c6 100644 --- a/test/CodeGen/ARM/load-combine.ll +++ b/test/CodeGen/ARM/load-combine.ll @@ -227,3 +227,183 @@ define i64 @load_i64_by_i8_bswap(i64* %arg) { %tmp37 = or i64 %tmp33, %tmp36 ret i64 %tmp37 } + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK: ldrb r1, [r0, #1] +; CHECK-NEXT: ldrb r2, [r0, #2] +; CHECK-NEXT: ldrb r3, [r0, #3] +; CHECK-NEXT: ldrb r0, [r0, #4] +; CHECK-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset: +; CHECK-ARMv6: ldrb r1, [r0, #1] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] +; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: ldrb r1, [r0, #-4] +; CHECK-NEXT: ldrb r2, [r0, #-3] +; CHECK-NEXT: ldrb r3, [r0, #-2] +; CHECK-NEXT: ldrb r0, [r0, #-1] +; CHECK-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset: +; CHECK-ARMv6: ldrb r1, [r0, #-4] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] +; CHECK-ARMv6-NEXT: orr r1, r1, r2, lsl #8 +; CHECK-ARMv6-NEXT: orr r1, r1, r3, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 4 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = 
getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[1] is 4 byte aligned +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: ldrb r1, [r0, #1] +; CHECK-NEXT: ldrb r2, [r0, #2] +; CHECK-NEXT: ldrb r3, [r0, #3] +; CHECK-NEXT: ldrb r0, [r0, #4] +; CHECK-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK-ARMv6: ldrb r1, [r0, #1] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #2] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #3] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #4] +; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; // p[-4] is 4 byte aligned +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: ldrb r1, [r0, #-4] +; CHECK-NEXT: ldrb r2, [r0, #-3] +; CHECK-NEXT: ldrb r3, [r0, #-2] +; CHECK-NEXT: ldrb r0, [r0, #-1] +; CHECK-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-NEXT: mov pc, lr + +; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK-ARMv6: ldrb r1, [r0, #-4] +; CHECK-ARMv6-NEXT: ldrb r2, [r0, #-3] +; CHECK-ARMv6-NEXT: ldrb r3, [r0, #-2] +; CHECK-ARMv6-NEXT: ldrb r0, [r0, #-1] +; CHECK-ARMv6-NEXT: orr r0, r0, r3, lsl #8 +; CHECK-ARMv6-NEXT: orr r0, r0, r2, lsl #16 +; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #24 +; CHECK-ARMv6-NEXT: bx lr + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 
%tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 4 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll index 08fc1c9c3f4..8942ca09c14 100644 --- a/test/CodeGen/X86/load-combine.ll +++ b/test/CodeGen/X86/load-combine.ll @@ -574,8 +574,8 @@ define i32 @load_i32_by_i8_bswap_unrelated_load(i32* %arg, i32* %arg1) { ; Non-zero offsets are not supported for now ; i8* p; ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24) -define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) { -; CHECK-LABEL: load_i32_by_i8_unsupported_offset: +define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK: # BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movzbl 1(%eax), %ecx @@ -590,7 +590,7 @@ define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) { ; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; -; CHECK64-LABEL: load_i32_by_i8_unsupported_offset: +; CHECK64-LABEL: load_i32_by_i8_nonzero_offset: ; CHECK64: # BB#0: ; CHECK64-NEXT: movzbl 1(%rdi), %eax ; CHECK64-NEXT: movzbl 2(%rdi), %ecx @@ -626,6 +626,168 @@ define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) { ret i32 %tmp18 } +; i8* p; +; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24) +define i32 @load_i32_by_i8_neg_offset(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl -4(%eax), %ecx +; CHECK-NEXT: movzbl -3(%eax), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movzbl -2(%eax), %ecx +; CHECK-NEXT: shll $16, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: movzbl -1(%eax), %eax +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_neg_offset: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl -4(%rdi), %eax +; CHECK64-NEXT: movzbl -3(%rdi), %ecx +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: movzbl -2(%rdi), %edx +; CHECK64-NEXT: shll $16, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl -1(%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: retq + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; +; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24) +define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl 4(%eax), %ecx +; 
CHECK-NEXT: movzbl 3(%eax), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movzbl 2(%eax), %ecx +; CHECK-NEXT: shll $16, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: movzbl 1(%eax), %eax +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_nonzero_offset_bswap: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl 4(%rdi), %eax +; CHECK64-NEXT: movzbl 3(%rdi), %ecx +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: movzbl 2(%rdi), %edx +; CHECK64-NEXT: shll $16, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl 1(%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: retq + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + +; i8* p; +; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24) +define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) { +; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movzbl -1(%eax), %ecx +; CHECK-NEXT: movzbl -2(%eax), %edx +; CHECK-NEXT: shll $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: movzbl -3(%eax), %ecx +; CHECK-NEXT: shll $16, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: movzbl -4(%eax), %eax +; CHECK-NEXT: shll $24, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: retl +; +; CHECK64-LABEL: load_i32_by_i8_neg_offset_bswap: +; CHECK64: # BB#0: +; CHECK64-NEXT: movzbl -1(%rdi), %eax +; CHECK64-NEXT: movzbl -2(%rdi), %ecx +; CHECK64-NEXT: shll $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx +; CHECK64-NEXT: movzbl -3(%rdi), %edx +; CHECK64-NEXT: shll $16, %edx +; CHECK64-NEXT: orl %ecx, %edx +; CHECK64-NEXT: movzbl -4(%rdi), %eax +; CHECK64-NEXT: shll $24, %eax +; CHECK64-NEXT: orl %edx, %eax +; CHECK64-NEXT: retq + + %tmp = bitcast i32* %arg to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1 + %tmp2 = load i8, i8* %tmp1, align 1 + %tmp3 = zext i8 %tmp2 to i32 + %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2 + %tmp5 = load i8, i8* %tmp4, align 1 + %tmp6 = zext i8 %tmp5 to i32 + %tmp7 = shl nuw nsw i32 %tmp6, 8 + %tmp8 = or i32 %tmp7, %tmp3 + %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3 + %tmp10 = load i8, i8* %tmp9, align 1 + %tmp11 = zext i8 %tmp10 to i32 + %tmp12 = shl nuw nsw i32 %tmp11, 16 + %tmp13 = or i32 %tmp8, %tmp12 + %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4 + %tmp15 = load i8, i8* %tmp14, align 1 + %tmp16 = zext i8 %tmp15 to i32 + %tmp17 = shl nuw nsw i32 %tmp16, 24 + %tmp18 = or i32 %tmp13, %tmp17 + ret i32 %tmp18 +} + ; i8* p; i32 i; ; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3] define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) { -- 2.50.1
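Note (illustrative, not part of the patch): the byte-by-byte load/or sequences pinned by the tests above are the open-coded expansion of a single wide load. If the combine from https://reviews.llvm.org/D29394 is later taught to handle non-zero offsets, the direct pattern should fold to one i32 load at the offset, and the byte-swapped pattern to the same load followed by a bswap, on a little-endian target (on a big-endian target the two forms trade places). A minimal sketch of those folded equivalents in IR, with hypothetical function names:

; Folded form of load_i32_by_i8_nonzero_offset on a little-endian target:
; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
; is one i32 load from p+1, which the tests state is 4-byte aligned.
define i32 @load_i32_by_i8_nonzero_offset_folded(i32* %arg) {
  %tmp = bitcast i32* %arg to i8*
  %base = getelementptr inbounds i8, i8* %tmp, i32 1
  %wide = bitcast i8* %base to i32*
  %val = load i32, i32* %wide, align 4
  ret i32 %val
}

; Folded form of load_i32_by_i8_nonzero_offset_bswap on a little-endian target:
; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
; is the same wide load followed by a byte swap.
declare i32 @llvm.bswap.i32(i32)

define i32 @load_i32_by_i8_nonzero_offset_bswap_folded(i32* %arg) {
  %tmp = bitcast i32* %arg to i8*
  %base = getelementptr inbounds i8, i8* %tmp, i32 1
  %wide = bitcast i8* %base to i32*
  %val = load i32, i32* %wide, align 4
  %rev = call i32 @llvm.bswap.i32(i32 %val)
  ret i32 %rev
}

The negative-offset variants fold the same way with a GEP index of -4. The CHECK lines above intentionally record the current, uncombined codegen, so a later DAGCombiner change surfaces as an update to these patterns.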