From: Sanjay Patel
Date: Tue, 10 Mar 2015 16:08:36 +0000 (+0000)
Subject: [X86, AVX] replace vinsertf128 intrinsics with generic shuffles
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=137e1f3f289a14153b31ea5dee1856d143305563;p=llvm

[X86, AVX] replace vinsertf128 intrinsics with generic shuffles

We want to replace as much custom x86 shuffling via intrinsics as possible
because pushing the code down the generic shuffle optimization path allows
for better codegen and less complexity in LLVM.

This is the sibling patch for the Clang half of this change:
http://reviews.llvm.org/D8088

Differential Revision: http://reviews.llvm.org/D8086

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231794 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 4a59f0d54e0..998b249b832 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1183,19 +1183,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx_vextractf128_si_256 :
         GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
         Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx_vinsertf128_pd_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_pd256">,
-        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                   llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_ps_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_ps256">,
-        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                   llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_si_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_si256">,
-        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                   llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
 }

 // Vector convert
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c13abd34584..a5f0868894b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4956,9 +4956,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, Res);
     return nullptr;
   }
-  case Intrinsic::x86_avx_vinsertf128_pd_256:
-  case Intrinsic::x86_avx_vinsertf128_ps_256:
-  case Intrinsic::x86_avx_vinsertf128_si_256:
   case Intrinsic::x86_avx2_vinserti128: {
     EVT DestVT = TLI.getValueType(I.getType());
     EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 6800e7a0a13..f9493bc2a81 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the auto-upgrade helper functions
+// This file implements the auto-upgrade helper functions.
+// This is where deprecated IR intrinsics and other IR features are updated to
+// current specifications.
 //
 //===----------------------------------------------------------------------===//
@@ -156,6 +158,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name.startswith("x86.avx2.pcmpeq.") ||
         Name.startswith("x86.avx2.pcmpgt.") ||
         Name.startswith("x86.avx.vpermil.") ||
+        Name == "x86.avx.vinsertf128.pd.256" ||
+        Name == "x86.avx.vinsertf128.ps.256" ||
+        Name == "x86.avx.vinsertf128.si.256" ||
         Name == "x86.avx.movnt.dq.256" ||
         Name == "x86.avx.movnt.pd.256" ||
         Name == "x86.avx.movnt.ps.256" ||
@@ -626,6 +631,51 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       }
       Rep = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
+    } else if (Name == "llvm.x86.avx.vinsertf128.pd.256" ||
+               Name == "llvm.x86.avx.vinsertf128.ps.256" ||
+               Name == "llvm.x86.avx.vinsertf128.si.256") {
+      Value *Op0 = CI->getArgOperand(0);
+      Value *Op1 = CI->getArgOperand(1);
+      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+      VectorType *VecTy = cast<VectorType>(CI->getType());
+      unsigned NumElts = VecTy->getNumElements();
+
+      // Mask off the high bits of the immediate value; hardware ignores those.
+      Imm = Imm & 1;
+
+      // Extend the second operand into a vector that is twice as big.
+      Value *UndefV = UndefValue::get(Op1->getType());
+      SmallVector<Constant*, 8> Idxs;
+      for (unsigned i = 0; i != NumElts; ++i) {
+        Idxs.push_back(Builder.getInt32(i));
+      }
+      Rep = Builder.CreateShuffleVector(Op1, UndefV, ConstantVector::get(Idxs));
+
+      // Insert the second operand into the first operand.
+
+      // Note that there is no guarantee that instruction lowering will actually
+      // produce a vinsertf128 instruction for the created shuffles. In
+      // particular, the 0 immediate case involves no lane changes, so it can
+      // be handled as a blend.
+
+      // Example of shuffle mask for 32-bit elements:
+      // Imm = 1  <i32 0, i32 1, i32 2,  i32 3,  i32 8, i32 9, i32 10, i32 11>
+      // Imm = 0  <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6,  i32 7 >
+
+      SmallVector<Constant*, 8> Idxs2;
+      // The low half of the result is either the low half of the 1st operand
+      // or the low half of the 2nd operand (the inserted vector).
+      for (unsigned i = 0; i != NumElts / 2; ++i) {
+        unsigned Idx = Imm ? i : (i + NumElts);
+        Idxs2.push_back(Builder.getInt32(Idx));
+      }
+      // The high half of the result is either the low half of the 2nd operand
+      // (the inserted vector) or the high half of the 1st operand.
+      for (unsigned i = NumElts / 2; i != NumElts; ++i) {
+        unsigned Idx = Imm ? (i + NumElts / 2) : i;
+        Idxs2.push_back(Builder.getInt32(Idx));
+      }
+      Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
     } else {
       bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
       if (Name == "llvm.x86.avx.vpermil.pd.256")
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 8d04c16879a..a5ad7ba2616 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+
+define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
+  ret <8 x i32> %res
+}
+
+; Verify that high bits of the immediate are masked off. This should be the equivalent
+; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
+; not a vinsertf128 $1.
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+  ; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vblendpd
   %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 3716cf84989..96d80ea7ae6 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone

-define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-
-
 define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vperm2f128
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-vinsertf128.ll b/test/CodeGen/X86/avx-vinsertf128.ll
index d0f8f4ebaea..38389de7a8a 100644
--- a/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+
+; CHECK-LABEL: A:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
   ret <8 x float> %shuffle
 }

+; CHECK-LABEL: B:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

 ; Just check that no crash happens
-; CHECK-SSE: _insert_crash
+; CHECK-LABEL: _insert_crash:
 define void @insert_crash() nounwind {
 allocas:
   %v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32>
@@ -39,7 +40,7 @@ allocas:

 ;; DAG Combine must remove useless vinsertf128 instructions

-; CHECK: DAGCombineA
+; CHECK-LABEL: DAGCombineA:
 ; CHECK-NOT: vinsertf128 $1
 define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32>
@@ -47,7 +48,7 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   ret <4 x i32> %2
 }

-; CHECK: DAGCombineB
+; CHECK-LABEL: DAGCombineB:
 ; CHECK: vpaddd %xmm
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
   %1 = add <8 x i32> %v1, %v2
   ret <8 x i32> %2
 }

-; CHECK: insert_pd
-define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vinsertf128
-%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
-ret <4 x double> %res
-}
-
-; CHECK: insert_undef_pd
+; CHECK-LABEL: insert_undef_pd:
 define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
 declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

-; CHECK: insert_ps
-define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
-ret <8 x float> %res
-}
-
-; CHECK: insert_undef_ps
+; CHECK-LABEL: insert_undef_ps:
 define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

-; CHECK: insert_si
-define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
-ret <8 x i32> %res
-}
-
-; CHECK: insert_undef_si
+; CHECK-LABEL: insert_undef_si:
 define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
 declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

 ; rdar://10643481
-; CHECK: vinsertf128_combine
+; CHECK-LABEL: vinsertf128_combine:
 define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovaps
 ; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
 }

 ; rdar://11076953
-; CHECK: vinsertf128_ucombine
+; CHECK-LABEL: vinsertf128_ucombine:
 define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovups
 ; CHECK: vinsertf128
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index a44d44d1b69..b337a80b84b 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
 ; if it's faster.

-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads_swap
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
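
Note on the upgrade logic in AutoUpgrade.cpp above: each vinsertf128 intrinsic call is
rewritten into two generic shufflevector instructions, one that widens the 128-bit operand
and one that does the insert. The following hand-written IR is a rough sketch of that
expansion for the imm = 1 case; the function and value names are invented for illustration
and do not appear anywhere in the patch.

define <8 x float> @upgraded_vinsertf128_ps_imm1(<8 x float> %a, <4 x float> %b) {
  ; Widen <4 x float> %b to 256 bits; mask indices 4-7 read from the undef
  ; operand, so the high half of %wide is undefined.
  %wide = shufflevector <4 x float> %b, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; imm = 1: keep the low half of %a (indices 0-3) and take the low half of the
  ; widened %b (indices 8-11) as the new high half. This is the "Imm = 1"
  ; example mask shown in the AutoUpgrade.cpp comment.
  %res = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}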
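A similar sketch for the masked-immediate case exercised by test_x86_avx_vinsertf128_si_256_2
above (again with made-up names): an immediate of 2 becomes 0 after the Imm & 1 masking, so
the inserted vector lands in the low half. Because no 128-bit half has to cross lanes,
codegen is free to select a blend instead of a vinsertf128, which is what the test checks for.

define <8 x i32> @upgraded_vinsertf128_si_imm2(<8 x i32> %a, <4 x i32> %b) {
  ; Imm & 1 turns the out-of-range immediate 2 into 0.
  %wide = shufflevector <4 x i32> %b, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; imm = 0: %b becomes the low half of the result and the high half of %a is
  ; kept. This is the "Imm = 0" example mask shown in the AutoUpgrade.cpp comment.
  %res = shufflevector <8 x i32> %a, <8 x i32> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}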