From: Sanjay Patel
Date: Tue, 10 Mar 2015 16:08:36 +0000 (+0000)
Subject: [X86, AVX] replace vinsertf128 intrinsics with generic shuffles
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=137e1f3f289a14153b31ea5dee1856d143305563;p=llvm

[X86, AVX] replace vinsertf128 intrinsics with generic shuffles

We want to replace as much custom x86 shuffling via intrinsics as possible
because pushing the code down the generic shuffle optimization path allows
for better codegen and less complexity in LLVM.

This is the sibling patch for the Clang half of this change:
http://reviews.llvm.org/D8088

Differential Revision: http://reviews.llvm.org/D8086

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@231794 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 4a59f0d54e0..998b249b832 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1183,19 +1183,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
   def int_x86_avx_vextractf128_si_256 :
         GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
         Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx_vinsertf128_pd_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_pd256">,
-        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                   llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_ps_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_ps256">,
-        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                   llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vinsertf128_si_256 :
-        GCCBuiltin<"__builtin_ia32_vinsertf128_si256">,
-        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                   llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
 }

 // Vector convert
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c13abd34584..a5f0868894b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4956,9 +4956,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     setValue(&I, Res);
     return nullptr;
   }
-  case Intrinsic::x86_avx_vinsertf128_pd_256:
-  case Intrinsic::x86_avx_vinsertf128_ps_256:
-  case Intrinsic::x86_avx_vinsertf128_si_256:
   case Intrinsic::x86_avx2_vinserti128: {
     EVT DestVT = TLI.getValueType(I.getType());
     EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 6800e7a0a13..f9493bc2a81 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the auto-upgrade helper functions
+// This file implements the auto-upgrade helper functions.
+// This is where deprecated IR intrinsics and other IR features are updated to
+// current specifications.
 //
 //===----------------------------------------------------------------------===//
@@ -156,6 +158,9 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name.startswith("x86.avx2.pcmpeq.") ||
         Name.startswith("x86.avx2.pcmpgt.") ||
         Name.startswith("x86.avx.vpermil.") ||
+        Name == "x86.avx.vinsertf128.pd.256" ||
+        Name == "x86.avx.vinsertf128.ps.256" ||
+        Name == "x86.avx.vinsertf128.si.256" ||
         Name == "x86.avx.movnt.dq.256" ||
         Name == "x86.avx.movnt.pd.256" ||
         Name == "x86.avx.movnt.ps.256" ||
@@ -626,6 +631,51 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       }
       Rep = Builder.CreateShuffleVector(Op0, Op1, ConstantVector::get(Idxs));
+    } else if (Name == "llvm.x86.avx.vinsertf128.pd.256" ||
+               Name == "llvm.x86.avx.vinsertf128.ps.256" ||
+               Name == "llvm.x86.avx.vinsertf128.si.256") {
+      Value *Op0 = CI->getArgOperand(0);
+      Value *Op1 = CI->getArgOperand(1);
+      unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+      VectorType *VecTy = cast<VectorType>(CI->getType());
+      unsigned NumElts = VecTy->getNumElements();
+
+      // Mask off the high bits of the immediate value; hardware ignores those.
+      Imm = Imm & 1;
+
+      // Extend the second operand into a vector that is twice as big.
+      Value *UndefV = UndefValue::get(Op1->getType());
+      SmallVector<Constant*, 8> Idxs;
+      for (unsigned i = 0; i != NumElts; ++i) {
+        Idxs.push_back(Builder.getInt32(i));
+      }
+      Rep = Builder.CreateShuffleVector(Op1, UndefV, ConstantVector::get(Idxs));
+
+      // Insert the second operand into the first operand.
+
+      // Note that there is no guarantee that instruction lowering will actually
+      // produce a vinsertf128 instruction for the created shuffles. In
+      // particular, the 0 immediate case involves no lane changes, so it can
+      // be handled as a blend.
+
+      // Example of shuffle mask for 32-bit elements:
+      // Imm = 1  <i32 0, i32 1, i32 2,  i32 3,  i32 8, i32 9, i32 10, i32 11>
+      // Imm = 0  <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6,  i32 7 >
+
+      SmallVector<Constant*, 8> Idxs2;
+      // The low half of the result is either the low half of the 1st operand
+      // or the low half of the 2nd operand (the inserted vector).
+      for (unsigned i = 0; i != NumElts / 2; ++i) {
+        unsigned Idx = Imm ? i : (i + NumElts);
+        Idxs2.push_back(Builder.getInt32(Idx));
+      }
+      // The high half of the result is either the low half of the 2nd operand
+      // (the inserted vector) or the high half of the 1st operand.
+      for (unsigned i = NumElts / 2; i != NumElts; ++i) {
+        unsigned Idx = Imm ? (i + NumElts / 2) : i;
+        Idxs2.push_back(Builder.getInt32(Idx));
+      }
+      Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
     } else {
       bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
       if (Name == "llvm.x86.avx.vpermil.pd.256")
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 8d04c16879a..a5ad7ba2616 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,5 +1,41 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
+
+define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
+
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
+  ; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
+  ret <8 x i32> %res
+}
+
+; Verify that high bits of the immediate are masked off. This should be the equivalent
+; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
+; not a vinsertf128 $1.
+define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
+  ; CHECK-NOT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+
 define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vblendpd
   %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 3716cf84989..96d80ea7ae6 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2187,30 +2187,6 @@ define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone

-define <4 x double> @test_x86_avx_vinsertf128_pd_256(<4 x double> %a0, <2 x double> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vinsertf128_ps_256(<8 x float> %a0, <4 x float> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vinsertf128_si_256(<8 x i32> %a0, <4 x i32> %a1) {
-  ; CHECK: vinsertf128
-  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
-
-
 define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
   ; CHECK: vperm2f128
   %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-vinsertf128.ll b/test/CodeGen/X86/avx-vinsertf128.ll
index d0f8f4ebaea..38389de7a8a 100644
--- a/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/test/CodeGen/X86/avx-vinsertf128.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+
+; CHECK-LABEL: A:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
@@ -9,6 +9,7 @@ entry:
   ret <8 x float> %shuffle
 }

+; CHECK-LABEL: B:
 ; CHECK-NOT: vunpck
 ; CHECK: vinsertf128 $1
 define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
@@ -22,7 +23,7 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
 declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone

 ; Just check that no crash happens
-; CHECK-SSE: _insert_crash
+; CHECK-LABEL: _insert_crash:
 define void @insert_crash() nounwind {
 allocas:
   %v1.i.i451 = shufflevector <4 x double> zeroinitializer, <4 x double> undef, <4 x i32>
@@ -39,7 +40,7 @@ allocas:

 ;; DAG Combine must remove useless vinsertf128 instructions

-; CHECK: DAGCombineA
+; CHECK-LABEL: DAGCombineA:
 ; CHECK-NOT: vinsertf128 $1
 define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   %1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32>
@@ -47,7 +48,7 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
   ret <4 x i32> %2
 }

-; CHECK: DAGCombineB
+; CHECK-LABEL: DAGCombineB:
 ; CHECK: vpaddd %xmm
 ; CHECK-NOT: vinsertf128 $1
 ; CHECK: vpaddd %xmm
@@ -57,14 +58,7 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
   %1 = add <8 x i32> %v1, %v2
   ret <8 x i32> %2
 }

-; CHECK: insert_pd
-define <4 x double> @insert_pd(<4 x double> %a0, <2 x double> %a1) {
-; CHECK: vinsertf128
-%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 0)
-ret <4 x double> %res
-}
-
-; CHECK: insert_undef_pd
+; CHECK-LABEL: insert_undef_pd:
 define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -73,14 +67,7 @@ ret <4 x double> %res
 declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

-; CHECK: insert_ps
-define <8 x float> @insert_ps(<8 x float> %a0, <4 x float> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 0)
-ret <8 x float> %res
-}
-
-; CHECK: insert_undef_ps
+; CHECK-LABEL: insert_undef_ps:
 define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -89,14 +76,7 @@ ret <8 x float> %res
 declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

-; CHECK: insert_si
-define <8 x i32> @insert_si(<8 x i32> %a0, <4 x i32> %a1) {
-; CHECK: vinsertf128
-%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 0)
-ret <8 x i32> %res
-}
-
-; CHECK: insert_undef_si
+; CHECK-LABEL: insert_undef_si:
 define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
 ; CHECK: vmovaps %ymm1, %ymm0
 %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -105,7 +85,7 @@ ret <8 x i32> %res
 declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

 ; rdar://10643481
-; CHECK: vinsertf128_combine
+; CHECK-LABEL: vinsertf128_combine:
 define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovaps
 ; CHECK: vinsertf128
@@ -118,7 +98,7 @@ entry:
 }

 ; rdar://11076953
-; CHECK: vinsertf128_ucombine
+; CHECK-LABEL: vinsertf128_ucombine:
 define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
 ; CHECK-NOT: vmovups
 ; CHECK: vinsertf128
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index a44d44d1b69..b337a80b84b 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -48,58 +48,6 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
 ; Merge two consecutive 16-byte subvector loads into a single 32-byte load
 ; if it's faster.

-declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
-
-; Use the vinsertf128 intrinsic to model source code
-; that explicitly uses AVX intrinsics.
-define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 1
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1)
-  ret <8 x float> %v3
-}
-
-; Swap the operands of the shufflevector and vinsertf128 to ensure that the
-; pattern still matches.
-define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) {
-  ; CHECK-LABEL: combine_16_byte_loads_swap
-
-  ; SANDYB: vmovups
-  ; SANDYB-NEXT: vinsertf128
-  ; SANDYB-NEXT: retq
-
-  ; BTVER2: vmovups
-  ; BTVER2-NEXT: retq
-
-  ; HASWELL: vmovups
-  ; HASWELL-NEXT: retq
-
-  %ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
-  %ptr2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
-  %v1 = load <4 x float>, <4 x float>* %ptr1, align 1
-  %v2 = load <4 x float>, <4 x float>* %ptr2, align 1
-  %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32>
-  %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0)
-  ret <8 x float> %v3
-}
-
-; Replace the vinsertf128 intrinsic with a shufflevector as might be
-; expected from auto-vectorized code.
 define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
   ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic
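
Note on the upgrade logic in AutoUpgrade.cpp above: each vinsertf128 intrinsic call is
rewritten into two generic shufflevector instructions, one that widens the 128-bit operand
and one that does the insert. The following hand-written IR is a rough sketch of that
expansion for the imm = 1 case; the function and value names are invented for illustration
and do not appear anywhere in the patch.

define <8 x float> @upgraded_vinsertf128_ps_imm1(<8 x float> %a, <4 x float> %b) {
  ; Widen <4 x float> %b to 256 bits; mask indices 4-7 read from the undef
  ; operand, so the high half of %wide is undefined.
  %wide = shufflevector <4 x float> %b, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; imm = 1: keep the low half of %a (indices 0-3) and take the low half of the
  ; widened %b (indices 8-11) as the new high half. This is the "Imm = 1"
  ; example mask shown in the AutoUpgrade.cpp comment.
  %res = shufflevector <8 x float> %a, <8 x float> %wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}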
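A similar sketch for the masked-immediate case exercised by test_x86_avx_vinsertf128_si_256_2
above (again with made-up names): an immediate of 2 becomes 0 after the Imm & 1 masking, so
the inserted vector lands in the low half. Because no 128-bit half has to cross lanes,
codegen is free to select a blend instead of a vinsertf128, which is what the test checks for.

define <8 x i32> @upgraded_vinsertf128_si_imm2(<8 x i32> %a, <4 x i32> %b) {
  ; Imm & 1 turns the out-of-range immediate 2 into 0.
  %wide = shufflevector <4 x i32> %b, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; imm = 0: %b becomes the low half of the result and the high half of %a is
  ; kept. This is the "Imm = 0" example mask shown in the AutoUpgrade.cpp comment.
  %res = shufflevector <8 x i32> %a, <8 x i32> %wide, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %res
}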