From: Matthew Simpson Date: Tue, 7 Feb 2017 19:17:44 +0000 (+0000) Subject: [LV] Simplify ARM/AArch64 interleaved access cost model tests (NFC) X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=85dcf5ecdf1a4041e3074e0d84b52b4d48fd33f1;p=llvm [LV] Simplify ARM/AArch64 interleaved access cost model tests (NFC) This patch removes unneeded instructions from the existing ARM/AArch64 interleaved access cost model tests. I'll be adding a similar set of tests in a follow-on patch to increase coverage. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294336 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index df1f9c61940..4f78f3f5007 100644 --- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -1,81 +1,70 @@ -; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2 +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" -@AB = common global [1024 x i8] zeroinitializer, align 4 -@CD = common global [1024 x i8] zeroinitializer, align 4 - -define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { entry: br label %for.body -; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved -; access group is 2. - -; CHECK: LV: Checking a loop in "test_byte_interleaved_cost" -; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 -; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv - %tmp = load i8, i8* %arrayidx0, align 4 - %tmp1 = or i64 %indvars.iv, 1 - %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 - %tmp2 = load i8, i8* %arrayidx1, align 4 - %add = add nsw i8 %tmp, %C - %mul = mul nsw i8 %tmp2, %D - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv - store i8 %add, i8* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 - store i8 %mul, i8* %arrayidx3, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, 1024 - br i1 %cmp, label %for.body, label %for.end +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-LABEL: Checking a loop in "i8_factor_2" +; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 + %tmp2 = load i8, i8* %tmp0, align 1 + %tmp3 = load i8, i8* %tmp1, align 1 + store i8 0, i8* %tmp0, align 1 + store i8 0, i8* %tmp1, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end -for.end: ; preds = %for.body +for.end: ret void } -%ig.factor.8 = type { double*, double, double, double, double, double, double, double } -define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) { +%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64} +define void @i64_factor_8(%i64.8* %data, i64 %n) { entry: br label %for.body -; Check the default cost of a strided load with a factor that is greater than -; the maximum allowed. In this test, the interleave factor would be 8, which is -; not supported. - -; CHECK: LV: Checking a loop in "wide_interleaved_group" -; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %1 = load double, double* %0, align 8 -; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %5 = load double, double* %4, align 8 -; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store double %9, double* %10, align 8 - +; The interleave factor in this test is 8, which is greater than the maximum +; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI +; implementation for determining the cost of the interleaved load group. The +; stores do not form a legal interleaved group because the group would contain +; gaps. +; +; VF_2-LABEL: Checking a loop in "i64_factor_8" +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] - %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ] - %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2 - %1 = load double, double* %0, align 8 - %2 = fcmp fast olt double %1, %a - %3 = select i1 %2, double 0.000000e+00, double %1 - %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6 - %5 = load double, double* %4, align 8 - %6 = fcmp fast olt double %5, %a - %7 = select i1 %6, double 0.000000e+00, double %5 - %8 = fmul fast double %7, %b - %9 = fadd fast double %8, %3 - %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3 - store double %9, double* %10, align 8 - %11 = fmul fast double %9, %9 - %12 = fadd fast double %11, %r + %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2 + %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6 + %tmp2 = load i64, i64* %tmp0, align 8 + %tmp3 = load i64, i64* %tmp1, align 8 + store i64 0, i64* %tmp0, align 8 + store i64 0, i64* %tmp1, align 8 %i.next = add nuw nsw i64 %i, 1 - %13 = trunc i64 %i.next to i32 - %cond = icmp eq i32 %13, %n - br i1 %cond, label %for.exit, label %for.body + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end -for.exit: - %r.lcssa = phi double [ %12, %for.body ] - ret double %r.lcssa +for.end: + ret void } diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll index de3626b57d8..3ccd9217eee 100644 --- a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll +++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll @@ -1,39 +1,37 @@ -; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s +; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8 +; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16 ; REQUIRES: asserts target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "armv8--linux-gnueabihf" -@AB = common global [1024 x i8] zeroinitializer, align 4 -@CD = common global [1024 x i8] zeroinitializer, align 4 - -define void @test_byte_interleaved_cost(i8 %C, i8 %D) { +%i8.2 = type {i8, i8} +define void @i8_factor_2(%i8.2* %data, i64 %n) { entry: br label %for.body -; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved -; access group is 2. - -; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 -; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction: %tmp = load i8, i8* %arrayidx0, align 4 - -for.body: ; preds = %for.body, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv - %tmp = load i8, i8* %arrayidx0, align 4 - %tmp1 = or i64 %indvars.iv, 1 - %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1 - %tmp2 = load i8, i8* %arrayidx1, align 4 - %add = add nsw i8 %tmp, %C - %mul = mul nsw i8 %tmp2, %D - %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv - store i8 %add, i8* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1 - store i8 %mul, i8* %arrayidx3, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 - %cmp = icmp slt i64 %indvars.iv.next, 1024 - br i1 %cmp, label %for.body, label %for.end +; VF_8-LABEL: Checking a loop in "i8_factor_2" +; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1 +; VF_16-LABEL: Checking a loop in "i8_factor_2" +; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1 +for.body: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] + %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0 + %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1 + %tmp2 = load i8, i8* %tmp0, align 1 + %tmp3 = load i8, i8* %tmp1, align 1 + store i8 0, i8* %tmp0, align 1 + store i8 0, i8* %tmp1, align 1 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end -for.end: ; preds = %for.body +for.end: ret void }