-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mcpu=pwr9 -O3 -verify-machineinstrs -ppc-vsr-nums-as-vr \
; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \
; RUN: < %s | FileCheck %s
; CHECK-NEXT: xvnegsp v0, v1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lfd f0, 0(r3)
+; CHECK: lfd f0, 0(r3)
; CHECK-NEXT: xxpermdi v1, f0, f0, 2
-; CHECK-NEXT: vperm v6, v3, v1, v2
-; CHECK-NEXT: vperm v1, v1, v3, v4
-; CHECK-NEXT: xvnegsp v6, v6
+; CHECK-NEXT: vperm v6, v1, v3, v4
+; CHECK-NEXT: vperm v1, v3, v1, v2
; CHECK-NEXT: xvnegsp v1, v1
-; CHECK-NEXT: vabsduw v6, v6, v5
-; CHECK-NEXT: vabsduw v1, v1, v0
-; CHECK-NEXT: vadduwm v1, v1, v6
+; CHECK-NEXT: xvnegsp v6, v6
+; CHECK-NEXT: vabsduw v1, v1, v5
+; CHECK-NEXT: vabsduw v6, v6, v0
+; CHECK-NEXT: vadduwm v1, v6, v1
; CHECK-NEXT: xxswapd v6, v1
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: xxspltw v6, v1, 2
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: vextuwrx r7, r5, v1
-; CHECK-NEXT: ldux r8, r3, r4
-; CHECK-NEXT: add r3, r3, r4
+; CHECK-NEXT: lfdx f0, r3, r4
; CHECK-NEXT: add r6, r7, r6
-; CHECK-NEXT: mtvsrd f0, r8
-; CHECK-NEXT: xxswapd v1, vs0
+; CHECK-NEXT: add r7, r3, r4
+; CHECK-NEXT: xxpermdi v1, f0, f0, 2
+; CHECK-NEXT: add r3, r7, r4
; CHECK-NEXT: vperm v6, v3, v1, v2
; CHECK-NEXT: vperm v1, v1, v3, v4
; CHECK-NEXT: xvnegsp v6, v6
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: xxspltw v6, v1, 2
; CHECK-NEXT: vadduwm v1, v1, v6
-; CHECK-NEXT: vextuwrx r7, r5, v1
-; CHECK-NEXT: add r6, r7, r6
+; CHECK-NEXT: vextuwrx r8, r5, v1
+; CHECK-NEXT: add r6, r8, r6
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
; CHECK-NEXT: extsw r3, r6
; P9BE-NEXT: xvnegsp v0, v1
; P9BE-NEXT: .p2align 4
; P9BE-NEXT: .LBB0_1: # %for.cond1.preheader
-; P9BE-NEXT: # =>This Inner Loop Header: Depth=1
-; P9BE-NEXT: lfd f0, 0(r3)
+; P9BE: lfd f0, 0(r3)
; P9BE-NEXT: xxlor v1, vs0, vs0
; P9BE-NEXT: vperm v6, v3, v1, v4
; P9BE-NEXT: vperm v1, v3, v1, v2
; P9BE-NEXT: xxspltw v6, v1, 1
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: vextuwlx r7, r5, v1
+; P9BE-NEXT: lfdx f0, r3, r4
; P9BE-NEXT: add r6, r7, r6
-; P9BE-NEXT: ldux r7, r3, r4
-; P9BE-NEXT: add r3, r3, r4
-; P9BE-NEXT: mtvsrd v1, r7
+; P9BE-NEXT: add r7, r3, r4
+; P9BE-NEXT: xxlor v1, vs0, vs0
+; P9BE-NEXT: add r3, r7, r4
; P9BE-NEXT: vperm v6, v3, v1, v2
; P9BE-NEXT: vperm v1, v3, v1, v4
; P9BE-NEXT: xvnegsp v6, v6
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: xxspltw v6, v1, 1
; P9BE-NEXT: vadduwm v1, v1, v6
-; P9BE-NEXT: vextuwlx r7, r5, v1
-; P9BE-NEXT: add r6, r7, r6
+; P9BE-NEXT: vextuwlx r8, r5, v1
+; P9BE-NEXT: add r6, r8, r6
; P9BE-NEXT: bdnz .LBB0_1
; P9BE-NEXT: # %bb.2: # %for.cond.cleanup
; P9BE-NEXT: extsw r3, r6
; return i_sum;
;}
+; test32: loads two <4 x i8> vectors with align 1 from %pix2 + %i_pix2 (byte
+; offsets 0 and 4) and feeds them into vector arithmetic. The checks below
+; require both loads to be selected as lfiwzx, with no update-form lwzux and
+; no mtvsrws appearing before the first lfiwzx.
+define void @test32(i8* nocapture readonly %pix2, i32 signext %i_pix2) {
+entry:
+ %idx.ext63 = sext i32 %i_pix2 to i64
+ %add.ptr64 = getelementptr inbounds i8, i8* %pix2, i64 %idx.ext63
+ %arrayidx5.1 = getelementptr inbounds i8, i8* %add.ptr64, i64 4
+ %0 = bitcast i8* %add.ptr64 to <4 x i8>*
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %reorder_shuffle117 = shufflevector <4 x i8> %1, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %2 = zext <4 x i8> %reorder_shuffle117 to <4 x i32>
+ %3 = sub nsw <4 x i32> zeroinitializer, %2
+ %4 = bitcast i8* %arrayidx5.1 to <4 x i8>*
+ %5 = load <4 x i8>, <4 x i8>* %4, align 1
+ %reorder_shuffle115 = shufflevector <4 x i8> %5, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ %6 = zext <4 x i8> %reorder_shuffle115 to <4 x i32>
+ %7 = sub nsw <4 x i32> zeroinitializer, %6
+ %8 = shl nsw <4 x i32> %7, <i32 16, i32 16, i32 16, i32 16>
+ %9 = add nsw <4 x i32> %8, %3
+ %10 = sub nsw <4 x i32> %9, zeroinitializer
+ %11 = shufflevector <4 x i32> undef, <4 x i32> %10, <4 x i32> <i32 2, i32 7, i32 0, i32 5>
+ %12 = add nsw <4 x i32> zeroinitializer, %11
+ %13 = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ store <4 x i32> %13, <4 x i32>* undef, align 16
+ ret void
+; Default (CHECK) prefix expectations.
+; CHECK-LABEL: test32:
+; CHECK-NOT: lwzux
+; CHECK-NOT: mtvsrws
+; CHECK: lfiwzx
+; CHECK: lfiwzx
+; Same expectations under the P9BE check prefix. The directive suffix attaches
+; directly to the prefix (P9BE-LABEL, P9BE-NOT); a "P9BE-CHECK-*" spelling is
+; not a FileCheck directive and would be silently skipped.
+; NOTE(review): assumes a RUN line with --check-prefix=P9BE exists - confirm.
+; P9BE-LABEL: test32:
+; P9BE-NOT: lwzux
+; P9BE-NOT: mtvsrws
+; P9BE: lfiwzx
+; P9BE: lfiwzx
+}
+
+; test16: loads two i16 scalars from %sums at indices %delta and %delta + 8,
+; inserts them into lanes 2 and 3 of a <4 x i16>, and zero-extends the vector.
+; The checks below require both loads to be selected as lxsihzx, with no
+; update-form lhzux appearing before the first lxsihzx.
+define void @test16(i16* nocapture readonly %sums, i32 signext %delta, i32 signext %thresh) {
+entry:
+ %idxprom = sext i32 %delta to i64
+ %add14 = add nsw i32 %delta, 8
+ %idxprom15 = sext i32 %add14 to i64
+ br label %for.body
+
+for.body: ; preds = %entry
+ %arrayidx8 = getelementptr inbounds i16, i16* %sums, i64 %idxprom
+ %0 = load i16, i16* %arrayidx8, align 2
+ %arrayidx16 = getelementptr inbounds i16, i16* %sums, i64 %idxprom15
+ %1 = load i16, i16* %arrayidx16, align 2
+ %2 = insertelement <4 x i16> undef, i16 %0, i32 2
+ %3 = insertelement <4 x i16> %2, i16 %1, i32 3
+ %4 = zext <4 x i16> %3 to <4 x i32>
+ %5 = sub nsw <4 x i32> zeroinitializer, %4
+ %6 = sub nsw <4 x i32> zeroinitializer, %5
+ %7 = select <4 x i1> undef, <4 x i32> %6, <4 x i32> %5
+ %bin.rdx = add <4 x i32> %7, zeroinitializer
+ %rdx.shuf54 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx55 = add <4 x i32> %bin.rdx, %rdx.shuf54
+ %8 = extractelement <4 x i32> %bin.rdx55, i32 0
+ %op.extra = add nuw i32 %8, 0
+ %cmp25 = icmp slt i32 %op.extra, %thresh
+ br i1 %cmp25, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ unreachable
+
+if.end: ; preds = %for.body
+ ret void
+; Default (CHECK) prefix expectations.
+; CHECK-LABEL: test16:
+; CHECK-NOT: lhzux
+; CHECK: lxsihzx
+; CHECK: lxsihzx
+; Same expectations under the P9BE check prefix. The directive suffix attaches
+; directly to the prefix (P9BE-LABEL, P9BE-NOT); a "P9BE-CHECK-*" spelling is
+; not a FileCheck directive and would be silently skipped.
+; NOTE(review): assumes a RUN line with --check-prefix=P9BE exists - confirm.
+; P9BE-LABEL: test16:
+; P9BE-NOT: lhzux
+; P9BE: lxsihzx
+; P9BE: lxsihzx
+}
+
+; test8: loads two i8 scalars from %sums at indices %delta and %delta + 8,
+; inserts them into lanes 2 and 3 of a <4 x i8>, and zero-extends the vector.
+; The checks below require both loads to be selected as lxsibzx, with no
+; update-form lbzux appearing before the first lxsibzx.
+define void @test8(i8* nocapture readonly %sums, i32 signext %delta, i32 signext %thresh) {
+entry:
+ %idxprom = sext i32 %delta to i64
+ %add14 = add nsw i32 %delta, 8
+ %idxprom15 = sext i32 %add14 to i64
+ br label %for.body
+
+for.body: ; preds = %entry
+ %arrayidx8 = getelementptr inbounds i8, i8* %sums, i64 %idxprom
+ %0 = load i8, i8* %arrayidx8, align 2
+ %arrayidx16 = getelementptr inbounds i8, i8* %sums, i64 %idxprom15
+ %1 = load i8, i8* %arrayidx16, align 2
+ %2 = insertelement <4 x i8> undef, i8 %0, i32 2
+ %3 = insertelement <4 x i8> %2, i8 %1, i32 3
+ %4 = zext <4 x i8> %3 to <4 x i32>
+ %5 = sub nsw <4 x i32> zeroinitializer, %4
+ %6 = sub nsw <4 x i32> zeroinitializer, %5
+ %7 = select <4 x i1> undef, <4 x i32> %6, <4 x i32> %5
+ %bin.rdx = add <4 x i32> %7, zeroinitializer
+ %rdx.shuf54 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx55 = add <4 x i32> %bin.rdx, %rdx.shuf54
+ %8 = extractelement <4 x i32> %bin.rdx55, i32 0
+ %op.extra = add nuw i32 %8, 0
+ %cmp25 = icmp slt i32 %op.extra, %thresh
+ br i1 %cmp25, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ unreachable
+
+if.end: ; preds = %for.body
+ ret void
+; Default (CHECK) prefix expectations.
+; CHECK-LABEL: test8:
+; CHECK-NOT: lbzux
+; CHECK: lxsibzx
+; CHECK: lxsibzx
+; Same expectations under the P9BE check prefix. The directive suffix attaches
+; directly to the prefix (P9BE-LABEL, P9BE-NOT); a "P9BE-CHECK-*" spelling is
+; not a FileCheck directive and would be silently skipped.
+; NOTE(review): assumes a RUN line with --check-prefix=P9BE exists - confirm.
+; P9BE-LABEL: test8:
+; P9BE-NOT: lbzux
+; P9BE: lxsibzx
+; P9BE: lxsibzx
+}