From: Eric Christopher Date: Wed, 20 Feb 2019 04:42:07 +0000 (+0000) Subject: Temporarily Revert "[X86][SLP] Enable SLP vectorization for 128-bit horizontal X86... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=46f8c8ccf19e45b29452001471cabcc2c601e908;p=llvm Temporarily Revert "[X86][SLP] Enable SLP vectorization for 128-bit horizontal X86 instructions (add, sub)" As this has broken the lto bootstrap build for 3 days and is showing a significant regression on the Dither_benchmark results (from the LLVM benchmark suite) -- specifically, on the BENCHMARK_FLOYD_DITHER_128, BENCHMARK_FLOYD_DITHER_256, and BENCHMARK_FLOYD_DITHER_512; the others are unchanged. These have regressed by about 28% on Skylake, 34% on Haswell, and over 40% on Sandybridge. This reverts commit r353923. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354434 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 57dcef20896..90adb3a4146 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -146,13 +146,6 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const { return 32; } -// Use horizontal 128-bit operations, which use low and high -// 64-bit parts of vector register. This also allows vectorizer -// to use partial vector operations. -unsigned X86TTIImpl::getMinVectorRegisterBitWidth() const { - return 64; -} - unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { return getRegisterBitWidth(true); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index 36a941da66d..8bfcd07be04 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -113,7 +113,6 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; - unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(unsigned VF); int getArithmeticInstrCost( diff --git a/test/Transforms/SLPVectorizer/X86/addsub.ll b/test/Transforms/SLPVectorizer/X86/addsub.ll index 6ab5e163258..510bc9e5b20 100644 --- a/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -348,18 +348,22 @@ define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias noca define void @no_vec_shuff_reorder() #0 { ; CHECK-LABEL: @no_vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP5]], <2 x float>* bitcast ([4 x float]* @fc to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2) to <2 x float>*), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> -; CHECK-NEXT: store <2 x float> [[TMP10]], <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2) to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] +; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] +; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] +; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] +; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 diff --git a/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/test/Transforms/SLPVectorizer/X86/alternate-fp.ll index 585381fad5e..9602314c6d4 100644 --- a/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -125,15 +125,14 @@ define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) { ; SSE-NEXT: ret <4 x float> [[TMP1]] ; ; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A:%.*]], i32 2 +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], +; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00 ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP4]], i32 1 +; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1 ; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: ret <4 x float> [[R3]] diff --git a/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/test/Transforms/SLPVectorizer/X86/alternate-int.ll index 3faabe4bf52..fe8ae4d1c3d 100644 --- a/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -83,17 +83,20 @@ define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) { ; ; SLM-LABEL: @add_mul_v4i32( ; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2 ; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 ; SLM-NEXT: [[AB0:%.*]] = mul i32 [[A0]], [[B0]] -; SLM-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[B]] +; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]] +; SLM-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B2]] ; SLM-NEXT: [[AB3:%.*]] = mul i32 [[A3]], [[B3]] ; SLM-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0 -; SLM-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[TMP2]], i32 1 -; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[TMP3]], i32 2 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 ; SLM-NEXT: ret <4 x i32> [[R3]] ; @@ -271,28 +274,34 @@ define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( ; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SSE-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP3]], i32 2 -; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3 -; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 -; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4 -; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] @@ -477,110 +486,26 @@ define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) { } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; SSE-LABEL: @sdiv_v8i32_undefs( -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; SSE-NEXT: ret <8 x i32> [[R7]] -; -; SLM-LABEL: @sdiv_v8i32_undefs( -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; SLM-NEXT: ret <8 x i32> [[R7]] -; -; AVX1-LABEL: @sdiv_v8i32_undefs( -; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; AVX1-NEXT: ret <8 x i32> [[R7]] -; -; AVX2-LABEL: @sdiv_v8i32_undefs( -; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 -; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], -; AVX2-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX2-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 1 -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP6]], i32 2 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 5 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 -; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP8]], i32 6 -; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; AVX2-NEXT: ret <8 x i32> [[R7]] -; -; AVX512-LABEL: @sdiv_v8i32_undefs( -; AVX512-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 -; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], -; AVX512-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX512-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; AVX512-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 1 -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP6]], i32 2 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 5 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 -; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP8]], i32 6 -; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; AVX512-NEXT: ret <8 x i32> [[R7]] +; CHECK-LABEL: @sdiv_v8i32_undefs( +; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; CHECK-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll index 548a0fab6ad..e7bff49c09f 100644 --- a/test/Transforms/SLPVectorizer/X86/crash_7zip.ll +++ b/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -11,23 +11,27 @@ define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145. ; CHECK-LABEL: @LzmaDec_DecodeReal2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P:%.*]], i64 0, i32 4 +; CHECK-NEXT: [[CODE21_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P]], i64 0, i32 5 ; CHECK-NEXT: br label [[DO_BODY66_I:%.*]] ; CHECK: do.body66.i: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP5:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[RANGE_2_I:%.*]] = phi i32 [ [[RANGE_4_I:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CODE_2_I:%.*]] = phi i32 [ [[CODE_4_I:%.*]], [[DO_COND_I]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[DOTRANGE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_2_I]] +; CHECK-NEXT: [[DOTCODE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_2_I]] ; CHECK-NEXT: br i1 undef, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] ; CHECK: if.else.i: -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], undef +; CHECK-NEXT: [[SUB91_I:%.*]] = sub i32 [[DOTRANGE_2_I]], undef +; CHECK-NEXT: [[SUB92_I:%.*]] = sub i32 [[DOTCODE_2_I]], undef ; CHECK-NEXT: br label [[DO_COND_I]] ; CHECK: do.cond.i: -; CHECK-NEXT: [[TMP5]] = phi <2 x i32> [ [[TMP4]], [[IF_ELSE_I]] ], [ [[TMP3]], [[DO_BODY66_I]] ] +; CHECK-NEXT: [[RANGE_4_I]] = phi i32 [ [[SUB91_I]], [[IF_ELSE_I]] ], [ undef, [[DO_BODY66_I]] ] +; CHECK-NEXT: [[CODE_4_I]] = phi i32 [ [[SUB92_I]], [[IF_ELSE_I]] ], [ [[DOTCODE_2_I]], [[DO_BODY66_I]] ] ; CHECK-NEXT: br i1 undef, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] ; CHECK: do.end1006.i: -; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[RANGE20_I]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[DOTRANGE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_4_I]] +; CHECK-NEXT: [[DOTCODE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_4_I]] +; CHECK-NEXT: store i32 [[DOTRANGE_4_I]], i32* [[RANGE20_I]], align 4 +; CHECK-NEXT: store i32 [[DOTCODE_4_I]], i32* [[CODE21_I]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll index 11dbbdaa165..a1a3f50765a 100644 --- a/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -14,18 +14,23 @@ define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btCons ; CHECK-NEXT: ret void ; CHECK: if.else: ; CHECK-NEXT: [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] ; CHECK: land.lhs.true.i.1: ; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] ; CHECK: if.then7.1: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 0, 1 +; CHECK-NEXT: store i32 [[INC_1]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_1:%.*]] = add nsw i32 6, -1 +; CHECK-NEXT: store i32 [[DEC_1]], i32* [[NUB5]], align 4 ; CHECK-NEXT: br label [[FOR_INC_1]] ; CHECK: for.inc.1: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ , [[IF_THEN7_1]] ], [ , [[LAND_LHS_TRUE_I_1]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[DEC_1]], [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[INC_1]], [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[INC_2]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1 +; CHECK-NEXT: store i32 [[DEC_2]], i32* [[NUB5]], align 4 ; CHECK-NEXT: unreachable ; entry: @@ -69,14 +74,15 @@ define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332:%.*]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS:%.*]], i64 0, i32 2, i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS]], i64 0, i32 2, i64 0, i32 0, i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX36]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], undef -; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], undef -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; CHECK-NEXT: store float [[TMP4]], float* undef, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX26]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP5]], <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[ADD587:%.*]] = fadd float undef, undef +; CHECK-NEXT: [[SUB600:%.*]] = fsub float [[ADD587]], undef +; CHECK-NEXT: store float [[SUB600]], float* undef, align 4 +; CHECK-NEXT: [[SUB613:%.*]] = fsub float [[ADD587]], [[SUB600]] +; CHECK-NEXT: store float [[SUB613]], float* [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[ADD626:%.*]] = fadd float [[TMP0]], undef +; CHECK-NEXT: [[SUB639:%.*]] = fsub float [[ADD626]], undef +; CHECK-NEXT: [[SUB652:%.*]] = fsub float [[ADD626]], [[SUB639]] +; CHECK-NEXT: store float [[SUB652]], float* [[ARRAYIDX36]], align 4 ; CHECK-NEXT: br i1 undef, label [[IF_ELSE1609:%.*]], label [[IF_THEN1595:%.*]] ; CHECK: if.then1595: ; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[FOR_BODY_LR_PH_I_I1702:%.*]] diff --git a/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll index 677247a8788..7ec4fc18598 100644 --- a/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll +++ b/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll @@ -24,30 +24,34 @@ define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(%class. ; CHECK: for.body233: ; CHECK-NEXT: br i1 undef, label [[FOR_BODY233]], label [[FOR_END271]] ; CHECK: for.end271: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ , [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> undef, [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] +; CHECK-NEXT: [[SUB275:%.*]] = fsub float undef, [[TMP1]] +; CHECK-NEXT: [[SUB279:%.*]] = fsub float undef, [[TMP0]] ; CHECK-NEXT: br i1 undef, label [[IF_THEN291:%.*]], label [[RETURN]] ; CHECK: if.then291: -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[MUL292:%.*]] = fmul float [[SUB275]], 5.000000e-01 +; CHECK-NEXT: [[ADD294:%.*]] = fadd float [[TMP1]], [[MUL292]] +; CHECK-NEXT: [[MUL295:%.*]] = fmul float [[SUB279]], 5.000000e-01 +; CHECK-NEXT: [[ADD297:%.*]] = fadd float [[TMP0]], [[MUL295]] ; CHECK-NEXT: br i1 undef, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]] ; CHECK: if.else319: ; CHECK-NEXT: br i1 undef, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]] ; CHECK: if.then325: ; CHECK-NEXT: br label [[IF_END327]] ; CHECK: if.end327: -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float undef, i32 1 ; CHECK-NEXT: br i1 undef, label [[IF_THEN329:%.*]], label [[IF_END332]] ; CHECK: if.then329: ; CHECK-NEXT: br label [[IF_END332]] ; CHECK: if.end332: -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP6]], [[IF_THEN329]] ], [ [[TMP6]], [[IF_END327]] ], [ , [[IF_THEN291]] ] -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> [[TMP3]], [[TMP7]] +; CHECK-NEXT: [[DX272_1:%.*]] = phi float [ [[SUB275]], [[IF_THEN329]] ], [ [[SUB275]], [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ] +; CHECK-NEXT: [[DY276_1:%.*]] = phi float [ undef, [[IF_THEN329]] ], [ undef, [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ] +; CHECK-NEXT: [[SUB334:%.*]] = fsub float [[ADD294]], [[DX272_1]] +; CHECK-NEXT: [[SUB338:%.*]] = fsub float [[ADD297]], [[DY276_1]] ; CHECK-NEXT: [[ARRAYIDX_I_I606:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113:%.*]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES:%.*]], i64 0, i32 0, i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX_I_I606]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP8]], <2 x float>* [[TMP9]], align 4 +; CHECK-NEXT: store float [[SUB334]], float* [[ARRAYIDX_I_I606]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_I607:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES]], i64 0, i32 0, i64 1 +; CHECK-NEXT: store float [[SUB338]], float* [[ARRAYIDX3_I607]], align 4 ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: ; CHECK-NEXT: ret void diff --git a/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll index 7948a9154c8..47e89df5ab6 100644 --- a/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -55,32 +55,35 @@ define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) { ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[REORDER_SHUFFLE]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> zeroinitializer, [[TMP0]] -; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] -; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], -; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> -; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], -; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> zeroinitializer, [[TMP8]] -; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] -; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] -; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0 -; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], -; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> -; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], -; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 +; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] +; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> zeroinitializer, [[TMP0]] +; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] +; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], +; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> +; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], +; AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> zeroinitializer, [[TMP12]] +; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] +; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], +; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> +; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], +; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: diff --git a/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll index 8e4c282ecdc..9a07cfc8431 100644 --- a/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll +++ b/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -27,24 +27,25 @@ define void @SIM4() { ; CHECK: land.rhs.lr.ph: ; CHECK-NEXT: unreachable ; CHECK: if.end98: +; CHECK-NEXT: [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] ; CHECK: if.then103: ; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef ; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2 -; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 +; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[COND125]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[DOTSUB100]], i32 1 ; CHECK-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK: for.cond.i: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ undef, [[LAND_RHS_I874:%.*]] ], [ [[TMP1]], [[IF_THEN103]] ] +; CHECK-NEXT: [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ] +; CHECK-NEXT: [[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ] ; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] ; CHECK: land.rhs.i874: ; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]] ; CHECK: for.end.i: ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] ; CHECK: if.then.i: -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> undef, [[TMP2]] +; CHECK-NEXT: [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef +; CHECK-NEXT: [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef ; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]] ; CHECK: if.end.i: ; CHECK-NEXT: [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]] @@ -65,12 +66,14 @@ define void @SIM4() { ; CHECK: while.end275.i: ; CHECK-NEXT: br label [[EXTEND_BW_EXIT]] ; CHECK: extend_bw.exit: -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP3]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] ; CHECK-NEXT: br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]] ; CHECK: if.then157: -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> , [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[FROM1115]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1 +; CHECK-NEXT: store i32 [[ADD158]], i32* [[FROM299]], align 4 +; CHECK-NEXT: [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1 +; CHECK-NEXT: store i32 [[ADD160]], i32* [[FROM1115]], align 4 ; CHECK-NEXT: br label [[LAND_LHS_TRUE167]] ; CHECK: land.lhs.true167: ; CHECK-NEXT: unreachable diff --git a/test/Transforms/SLPVectorizer/X86/fptosi.ll b/test/Transforms/SLPVectorizer/X86/fptosi.ll index 790a2f40924..69b9f32b733 100644 --- a/test/Transforms/SLPVectorizer/X86/fptosi.ll +++ b/test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -221,38 +221,32 @@ define void @fptosi_8f64_8i16() #0 { } define void @fptosi_8f64_8i8() #0 { -; SSE-LABEL: @fptosi_8f64_8i8( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i8 -; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i8 -; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i8 -; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i8 -; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i8 -; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i8 -; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i8 -; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i8 -; SSE-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; SSE-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; SSE-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; SSE-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; SSE-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; SSE-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; SSE-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; SSE-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 -; SSE-NEXT: ret void -; -; AVX-LABEL: @fptosi_8f64_8i8( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8> -; AVX-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 -; AVX-NEXT: ret void +; CHECK-LABEL: @fptosi_8f64_8i8( +; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i8 +; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i8 +; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i8 +; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i8 +; CHECK-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i8 +; CHECK-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i8 +; CHECK-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i8 +; CHECK-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i8 +; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 +; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -461,9 +455,30 @@ define void @fptosi_8f32_8i16() #0 { define void @fptosi_8f32_8i8() #0 { ; CHECK-LABEL: @fptosi_8f32_8i8( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8> -; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 +; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 +; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i8 +; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i8 +; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i8 +; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i8 +; CHECK-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i8 +; CHECK-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i8 +; CHECK-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i8 +; CHECK-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i8 +; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 +; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 ; CHECK-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 diff --git a/test/Transforms/SLPVectorizer/X86/fptoui.ll b/test/Transforms/SLPVectorizer/X86/fptoui.ll index a5c5fb04dd0..33d998ffbbc 100644 --- a/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ b/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -305,71 +305,32 @@ define void @fptoui_8f64_8i16() #0 { } define void @fptoui_8f64_8i8() #0 { -; SSE-LABEL: @fptoui_8f64_8i8( -; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8 -; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8 -; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8 -; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8 -; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8 -; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8 -; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8 -; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8 -; SSE-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; SSE-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; SSE-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; SSE-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; SSE-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; SSE-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; SSE-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; SSE-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 -; SSE-NEXT: ret void -; -; AVX256NODQ-LABEL: @fptoui_8f64_8i8( -; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8 -; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8 -; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8 -; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8 -; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8 -; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8 -; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8 -; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8 -; AVX256NODQ-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; AVX256NODQ-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; AVX256NODQ-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; AVX256NODQ-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; AVX256NODQ-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; AVX256NODQ-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; AVX256NODQ-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; AVX256NODQ-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @fptoui_8f64_8i8( -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i8> -; AVX512-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @fptoui_8f64_8i8( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i8> -; AVX256DQ-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 -; AVX256DQ-NEXT: ret void +; CHECK-LABEL: @fptoui_8f64_8i8( +; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 +; CHECK-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8 +; CHECK-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8 +; CHECK-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8 +; CHECK-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8 +; CHECK-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8 +; CHECK-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8 +; CHECK-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8 +; CHECK-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8 +; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 +; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 @@ -655,38 +616,32 @@ define void @fptoui_8f32_8i16() #0 { } define void @fptoui_8f32_8i8() #0 { -; SSE-LABEL: @fptoui_8f32_8i8( -; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i8 -; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i8 -; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i8 -; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i8 -; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i8 -; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i8 -; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i8 -; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i8 -; SSE-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; SSE-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; SSE-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; SSE-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; SSE-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; SSE-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; SSE-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; SSE-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 -; SSE-NEXT: ret void -; -; AVX-LABEL: @fptoui_8f32_8i8( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i8> -; AVX-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 -; AVX-NEXT: ret void +; CHECK-LABEL: @fptoui_8f32_8i8( +; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 +; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 +; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 +; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 +; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 +; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 +; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 +; CHECK-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i8 +; CHECK-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i8 +; CHECK-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i8 +; CHECK-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i8 +; CHECK-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i8 +; CHECK-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i8 +; CHECK-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i8 +; CHECK-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i8 +; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 +; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 +; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 +; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 +; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 +; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 +; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 +; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 diff --git a/test/Transforms/SLPVectorizer/X86/insertvalue.ll b/test/Transforms/SLPVectorizer/X86/insertvalue.ll index 2e4fa364ed8..1af11609fe6 100644 --- a/test/Transforms/SLPVectorizer/X86/insertvalue.ll +++ b/test/Transforms/SLPVectorizer/X86/insertvalue.ll @@ -216,21 +216,24 @@ top: define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) { ; CHECK-LABEL: @julia_load_array_of_i16( ; CHECK-NEXT: top: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x i16]* [[A:%.*]] to <4 x i16>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 4 -; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x i16]* [[B:%.*]] to <4 x i16>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 4 -; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i16> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 -; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 -; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[TMP6]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[TMP7]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[TMP8]], 3 +; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4 +; CHECK-NEXT: [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0 +; CHECK-NEXT: [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2 +; CHECK-NEXT: [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1 +; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4 +; CHECK-NEXT: [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0 +; CHECK-NEXT: [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2 +; CHECK-NEXT: [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1 +; CHECK-NEXT: [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3 +; CHECK-NEXT: [[C1:%.*]] = sub i16 [[A1]], [[B1]] +; CHECK-NEXT: [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3 +; CHECK-NEXT: [[C0:%.*]] = sub i16 [[A0]], [[B0]] +; CHECK-NEXT: [[C2:%.*]] = sub i16 [[A2]], [[B2]] +; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0 +; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1 +; CHECK-NEXT: [[C3:%.*]] = sub i16 [[A3]], [[B3]] +; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2 +; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3 ; CHECK-NEXT: store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/test/Transforms/SLPVectorizer/X86/phi.ll b/test/Transforms/SLPVectorizer/X86/phi.ll index be63297284b..a0a13b2b5aa 100644 --- a/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/test/Transforms/SLPVectorizer/X86/phi.ll @@ -146,49 +146,44 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP26:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP9]], [[MUL]] -; CHECK-NEXT: [[TMP10:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 +; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> , float [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> undef, float [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP18]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float 8.000000e+00, i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = fmul <4 x float> [[TMP15]], [[TMP21]] -; CHECK-NEXT: [[TMP23]] = fadd <4 x float> [[TMP7]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP24]], 121 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x float> undef, float [[ADD6]], i32 0 -; CHECK-NEXT: [[TMP26]] = insertelement <2 x float> [[TMP25]], float [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> , float [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP12]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> undef, float [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float 8.000000e+00, i32 3 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <4 x float> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP23]], i32 3 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP27]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[TMP23]], i32 2 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[TMP23]], i32 1 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x float> [[TMP23]], i32 0 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP30]] +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 3 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 2 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 1 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 0 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll index e6029fb5a97..a0d3c939d4f 100644 --- a/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll +++ b/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -slp-min-reg-size=128 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s ; RUN: FileCheck --input-file=%t --check-prefix=YAML %s define i32 @foo(i32* nocapture readonly %diff) #0 { diff --git a/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/test/Transforms/SLPVectorizer/X86/reorder_phi.ll index 333ee0df6a1..afee26f0d40 100644 --- a/test/Transforms/SLPVectorizer/X86/reorder_phi.ll +++ b/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -9,38 +9,33 @@ define void @foo (%struct.complex* %A, %struct.complex* %B, %struct.complex* %R ; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP25:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP24:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> undef, float [[TMP8]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP6]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> undef, float [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x float> [[TMP17]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = fsub <2 x float> [[TMP13]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP13]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP21]], <2 x float> [[TMP22]], <2 x i32> -; CHECK-NEXT: [[TMP24]] = fadd <2 x float> [[TMP2]], [[TMP23]] -; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP26]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18]] = fadd float [[TMP3]], [[TMP14]] +; CHECK-NEXT: [[TMP19]] = fadd float [[TMP2]], [[TMP17]] +; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP27]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP24]], <2 x float>* [[TMP29]], align 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 +; CHECK-NEXT: store float [[TMP18]], float* [[TMP22]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1 +; CHECK-NEXT: store float [[TMP19]], float* [[TMP23]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/test/Transforms/SLPVectorizer/X86/resched.ll b/test/Transforms/SLPVectorizer/X86/resched.ll index 848b70c4d86..b8b1ff00db4 100644 --- a/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/test/Transforms/SLPVectorizer/X86/resched.ll @@ -38,47 +38,44 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 ; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 +; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP17:%.*]] = lshr <2 x i32> [[TMP16]], +; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 ; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP19]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP21]], i32 2 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP23]], i32 3 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP25]], i32 4 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP29]], i32 6 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP31]], i32 7 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP33]], i32 8 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP35]], i32 9 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP37]], i32 10 -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP39]], i32 11 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP41]], i32 12 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <16 x i32> [[TMP42]], i32 [[TMP43]], i32 13 -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 -; CHECK-NEXT: [[TMP46:%.*]] = insertelement <16 x i32> [[TMP44]], i32 [[TMP45]], i32 14 -; CHECK-NEXT: [[TMP47:%.*]] = insertelement <16 x i32> [[TMP46]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP48:%.*]] = trunc <16 x i32> [[TMP47]] to <16 x i8> -; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i8> , [[TMP48]] +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 +; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> +; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> , [[TMP43]] ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP50:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP49]], <16 x i8>* [[TMP50]], align 1 +; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll index dcb730ff2fd..c7e419b5f56 100644 --- a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -25,37 +25,39 @@ define float @foo(float* nocapture readonly %A) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 ; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] -; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> , [[REORDER_SHUFFLE1]] -; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 +; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] +; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 +; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 -; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP12]] +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]] +; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] ; CHECK-NEXT: ret float [[ADD17]] ; entry: diff --git a/test/Transforms/SLPVectorizer/X86/saxpy.ll b/test/Transforms/SLPVectorizer/X86/saxpy.ll index 944dfac2c96..f2f858e3c7d 100644 --- a/test/Transforms/SLPVectorizer/X86/saxpy.ll +++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll @@ -63,11 +63,15 @@ define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[I:%.*]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> undef, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 undef, [[TMP4]] +; CHECK-NEXT: store i32 [[TMP5]], i32* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 undef, [[TMP9]] +; CHECK-NEXT: store i32 [[TMP10]], i32* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; %1 = add i64 %i, 1 diff --git a/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll index 60b2aa5ba73..3abde37048f 100644 --- a/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll +++ b/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll @@ -14,10 +14,14 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], ; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> , [[TMP1]] ; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0) to <2 x i32>*), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> , [[TMP4]] -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0) to <2 x i32>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4 +; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31 +; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1 +; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4 +; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31 +; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1 +; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/test/Transforms/SLPVectorizer/X86/sext.ll b/test/Transforms/SLPVectorizer/X86/sext.ll index e940bb56f8e..75406c927d9 100644 --- a/test/Transforms/SLPVectorizer/X86/sext.ll +++ b/test/Transforms/SLPVectorizer/X86/sext.ll @@ -808,20 +808,18 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>* -; SSE2-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1 -; SSE2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i32_to_4i64( @@ -847,18 +845,17 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>* -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1 -; AVX1-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> -; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 -; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 -; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 -; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 +; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i32_to_4i64( diff --git a/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll index ceabea78fd8..237673da5a9 100644 --- a/test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ b/test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -125,38 +125,70 @@ define void @lshr_v8i64() { define void @lshr_v16i32() { ; SSE-LABEL: @lshr_v16i32( -; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @a32 to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @b32 to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP12:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP13:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP14:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP15:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP16:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14) to <2 x i32>*), align 4 -; SSE-NEXT: [[TMP17:%.*]] = lshr <2 x i32> [[TMP1]], [[TMP9]] -; SSE-NEXT: [[TMP18:%.*]] = lshr <2 x i32> [[TMP2]], [[TMP10]] -; SSE-NEXT: [[TMP19:%.*]] = lshr <2 x i32> [[TMP3]], [[TMP11]] -; SSE-NEXT: [[TMP20:%.*]] = lshr <2 x i32> [[TMP4]], [[TMP12]] -; SSE-NEXT: [[TMP21:%.*]] = lshr <2 x i32> [[TMP5]], [[TMP13]] -; SSE-NEXT: [[TMP22:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP14]] -; SSE-NEXT: [[TMP23:%.*]] = lshr <2 x i32> [[TMP7]], [[TMP15]] -; SSE-NEXT: [[TMP24:%.*]] = lshr <2 x i32> [[TMP8]], [[TMP16]] -; SSE-NEXT: store <2 x i32> [[TMP17]], <2 x i32>* bitcast ([16 x i32]* @c32 to <2 x i32>*), align 4 -; SSE-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2) to <2 x i32>*), align 4 -; SSE-NEXT: store <2 x i32> [[TMP19]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <2 x i32>*), align 4 -; SSE-NEXT: store <2 x i32> [[TMP20]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6) to <2 x i32>*), align 4 -; SSE-NEXT: store <2 x i32> [[TMP21]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <2 x i32>*), align 4 -; SSE-NEXT: store <2 x i32> [[TMP22]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10) to <2 x i32>*), align 4 -; SSE-NEXT: store <2 x i32> [[TMP23]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <2 x i32>*), align 4 -; SSE-NEXT: store <2 x i32> [[TMP24]], <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14) to <2 x i32>*), align 4 +; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 +; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 +; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 +; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 +; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 +; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 +; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 +; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 +; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 +; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 +; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 +; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 +; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 +; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 +; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 +; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 +; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4 +; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4 +; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4 +; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4 +; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4 +; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4 +; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4 +; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4 +; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4 +; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4 +; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4 +; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4 +; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4 +; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4 +; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4 +; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4 +; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]] +; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 +; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 +; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 +; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 +; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 +; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 +; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 +; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 +; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 +; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 +; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 +; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 +; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 +; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 +; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v16i32( diff --git a/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/test/Transforms/SLPVectorizer/X86/shift-shl.ll index 6993184fb35..8eadd04d1d0 100644 --- a/test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ b/test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -241,38 +241,134 @@ define void @shl_v16i32() { define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @a16 to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @b16 to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP12:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP13:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP14:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP15:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP16:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28) to <4 x i16>*), align 2 -; SSE-NEXT: [[TMP17:%.*]] = shl <4 x i16> [[TMP1]], [[TMP9]] -; SSE-NEXT: [[TMP18:%.*]] = shl <4 x i16> [[TMP2]], [[TMP10]] -; SSE-NEXT: [[TMP19:%.*]] = shl <4 x i16> [[TMP3]], [[TMP11]] -; SSE-NEXT: [[TMP20:%.*]] = shl <4 x i16> [[TMP4]], [[TMP12]] -; SSE-NEXT: [[TMP21:%.*]] = shl <4 x i16> [[TMP5]], [[TMP13]] -; SSE-NEXT: [[TMP22:%.*]] = shl <4 x i16> [[TMP6]], [[TMP14]] -; SSE-NEXT: [[TMP23:%.*]] = shl <4 x i16> [[TMP7]], [[TMP15]] -; SSE-NEXT: [[TMP24:%.*]] = shl <4 x i16> [[TMP8]], [[TMP16]] -; SSE-NEXT: store <4 x i16> [[TMP17]], <4 x i16>* bitcast ([32 x i16]* @c16 to <4 x i16>*), align 2 -; SSE-NEXT: store <4 x i16> [[TMP18]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4) to <4 x i16>*), align 2 -; SSE-NEXT: store <4 x i16> [[TMP19]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <4 x i16>*), align 2 -; SSE-NEXT: store <4 x i16> [[TMP20]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12) to <4 x i16>*), align 2 -; SSE-NEXT: store <4 x i16> [[TMP21]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <4 x i16>*), align 2 -; SSE-NEXT: store <4 x i16> [[TMP22]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20) to <4 x i16>*), align 2 -; SSE-NEXT: store <4 x i16> [[TMP23]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <4 x i16>*), align 2 -; SSE-NEXT: store <4 x i16> [[TMP24]], <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28) to <4 x i16>*), align 2 +; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2 +; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]] +; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]] +; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]] +; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]] +; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]] +; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]] +; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]] +; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]] +; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]] +; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]] +; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]] +; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]] +; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]] +; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]] +; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]] +; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]] +; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]] +; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]] +; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]] +; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]] +; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]] +; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]] +; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]] +; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]] +; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]] +; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]] +; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]] +; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]] +; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]] +; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]] +; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]] +; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]] +; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2 +; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v32i16( diff --git a/test/Transforms/SLPVectorizer/X86/sitofp.ll b/test/Transforms/SLPVectorizer/X86/sitofp.ll index 67f554275b4..d4c4be5bbf6 100644 --- a/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -598,35 +598,14 @@ define void @sitofp_8i8_8f64() #0 { ; define void @sitofp_2i64_2f32() #0 { -; SSE-LABEL: @sitofp_2i64_2f32( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX256NODQ-LABEL: @sitofp_2i64_2f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @sitofp_2i64_2f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> -; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @sitofp_2i64_2f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> -; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 -; AVX256DQ-NEXT: ret void +; CHECK-LABEL: @sitofp_2i64_2f32( +; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float +; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float +; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 diff --git a/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index 95b7d6bf55e..19f06805ff5 100644 --- a/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -172,13 +172,13 @@ define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, fl ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 ; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 +; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 diff --git a/test/Transforms/SLPVectorizer/X86/uitofp.ll b/test/Transforms/SLPVectorizer/X86/uitofp.ll index 92ff1fbaae8..092918115eb 100644 --- a/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ b/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -550,35 +550,14 @@ define void @uitofp_8i8_8f64() #0 { ; define void @uitofp_2i64_2f32() #0 { -; SSE-LABEL: @uitofp_2i64_2f32( -; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX256NODQ-LABEL: @uitofp_2i64_2f32( -; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; AVX256NODQ-NEXT: ret void -; -; AVX512-LABEL: @uitofp_2i64_2f32( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float> -; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 -; AVX512-NEXT: ret void -; -; AVX256DQ-LABEL: @uitofp_2i64_2f32( -; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float> -; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 -; AVX256DQ-NEXT: ret void +; CHECK-LABEL: @uitofp_2i64_2f32( +; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; CHECK-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; CHECK-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 diff --git a/test/Transforms/SLPVectorizer/X86/vec-reg-64bit.ll b/test/Transforms/SLPVectorizer/X86/vec-reg-64bit.ll deleted file mode 100644 index f71f3a7d5c1..00000000000 --- a/test/Transforms/SLPVectorizer/X86/vec-reg-64bit.ll +++ /dev/null @@ -1,51 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -basicaa -slp-vectorizer -mcpu=btver2 -S | FileCheck %s --check-prefix=VECT -; RUN: opt < %s -basicaa -slp-vectorizer -mcpu=btver2 -slp-min-reg-size=128 -S | FileCheck %s --check-prefix=NOVECT - -; Check SLPVectorizer works for packed horizontal 128-bit instrs. -; See llvm.org/PR32433 - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define void @add_pairs_128(<4 x float>, float* nocapture) #0 { -; VECT-LABEL: @add_pairs_128( -; VECT-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i32 0 -; VECT-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; VECT-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; VECT-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; VECT-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 -; VECT-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[TMP5]], i32 1 -; VECT-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i32 0 -; VECT-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP6]], i32 1 -; VECT-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP8]], [[TMP10]] -; VECT-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 1 -; VECT-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP1]] to <2 x float>* -; VECT-NEXT: store <2 x float> [[TMP11]], <2 x float>* [[TMP13]], align 4 -; VECT-NEXT: ret void -; -; NOVECT-LABEL: @add_pairs_128( -; NOVECT-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i32 0 -; NOVECT-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 -; NOVECT-NEXT: [[TMP5:%.*]] = fadd float [[TMP3]], [[TMP4]] -; NOVECT-NEXT: store float [[TMP5]], float* [[TMP1:%.*]], align 4 -; NOVECT-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOVECT-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOVECT-NEXT: [[TMP8:%.*]] = fadd float [[TMP6]], [[TMP7]] -; NOVECT-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 1 -; NOVECT-NEXT: store float [[TMP8]], float* [[TMP9]], align 4 -; NOVECT-NEXT: ret void -; - %3 = extractelement <4 x float> %0, i32 0 - %4 = extractelement <4 x float> %0, i32 1 - %5 = fadd float %3, %4 - store float %5, float* %1, align 4 - %6 = extractelement <4 x float> %0, i32 2 - %7 = extractelement <4 x float> %0, i32 3 - %8 = fadd float %6, %7 - %9 = getelementptr inbounds float, float* %1, i64 1 - store float %8, float* %9, align 4 - ret void -} - -attributes #0 = { nounwind } diff --git a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll index 0df2e751ceb..2b593b78652 100644 --- a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -47,16 +47,17 @@ define void @add1(i32* noalias %dst, i32* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> , [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP5]], 3 +; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 ; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -94,12 +95,13 @@ define void @sub0(i32* noalias %dst, i32* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -212,14 +214,13 @@ define void @addsub0(i32* noalias %dst, i32* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -247,22 +248,21 @@ define void @addsub1(i32* noalias %dst, i32* noalias %src) { ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 +; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3 +; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -338,16 +338,17 @@ define void @shl0(i32* noalias %dst, i32* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 +; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP5]], 3 +; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 ; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -456,16 +457,17 @@ define void @add1f(float* noalias %dst, float* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 ; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> , [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP5]], 3.000000e+00 +; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -503,12 +505,13 @@ define void @sub0f(float* noalias %dst, float* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 +; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -621,14 +624,13 @@ define void @addsub0f(float* noalias %dst, float* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[INCDEC_PTR3]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP7]], align 4 +; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 +; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -656,22 +658,21 @@ define void @addsub1f(float* noalias %dst, float* noalias %src) { ; CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP6]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP7]], -3.000000e+00 +; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 ; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -700,20 +701,21 @@ define void @mulf(float* noalias %dst, float* noalias %src) { ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP4]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00 +; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 ; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -784,16 +786,17 @@ define void @add1fn(float* noalias %dst, float* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 ; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 +; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> , [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP5]], 3.000000e+00 +; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -831,12 +834,13 @@ define void @sub0fn(float* noalias %dst, float* noalias %src) { ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 +; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -940,20 +944,21 @@ define void @mulfn(float* noalias %dst, float* noalias %src) { ; CHECK-LABEL: @mulfn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 +; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>* -; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP4]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00 +; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 ; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; diff --git a/test/Transforms/SLPVectorizer/X86/zext.ll b/test/Transforms/SLPVectorizer/X86/zext.ll index 02684e18d9b..a05e4186134 100644 --- a/test/Transforms/SLPVectorizer/X86/zext.ll +++ b/test/Transforms/SLPVectorizer/X86/zext.ll @@ -682,20 +682,18 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { ; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 ; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 ; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; SSE2-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>* -; SSE2-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1 -; SSE2-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; SSE2-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> -; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 -; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 -; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 -; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 +; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64 +; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64 +; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; SSE2-NEXT: ret <4 x i64> [[V3]] ; ; SLM-LABEL: @loadext_4i32_to_4i64( @@ -721,18 +719,17 @@ define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>* -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1 -; AVX1-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> -; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 -; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 -; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 -; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 +; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 +; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i32_to_4i64(