[x86] enable storeOfVectorConstantIsCheap() target hook

author Sanjay Patel <spatel@rotateright.com>

Sat, 16 Sep 2017 13:29:12 +0000 (13:29 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Sat, 16 Sep 2017 13:29:12 +0000 (13:29 +0000)
author Sanjay Patel <spatel@rotateright.com>
Sat, 16 Sep 2017 13:29:12 +0000 (13:29 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Sat, 16 Sep 2017 13:29:12 +0000 (13:29 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h

index 251d6ec435bb328758b1be20d6d3c2a34304958b..71caa562eec393b12b5f477ec9d11ddc73acde0e 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -1037,6 +1037,13 @@ namespace llvm {
      bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                   unsigned Index) const override;
  
+    bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
+                                      unsigned AddrSpace) const override {
+      // Replacing more than 2 scalar stores reduces the instruction count even
+      // after adding a vector constant load; MemVT and AddrSpace are not used.
+      return NumElem > 2;
+    }
+
      /// Intel processors have a unified instruction and data cache
      const char * getClearCacheBuiltinName() const override {
        return nullptr; // nothing to do, move along.
diff --git a/test/CodeGen/X86/avx512-regcall-Mask.ll b/test/CodeGen/X86/avx512-regcall-Mask.ll

index 781112866ca5cd3992cc900aad86ae5a093e5c02..36fe322d9827c0d9859250fbc4f7bfea892f7648 100644 (file)
--- a/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -96,34 +96,21 @@ define x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1, <64 x i1>
  }
  
  ; X32-LABEL:  caller_argv64i1:
-; X32:        movl    $2, %eax
-; X32:        movl    $1, %ecx
-; X32:        movl    $2, %edx
-; X32:        movl    $1, %edi
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        pushl    ${{1|2}}
-; X32:        call{{.*}}   _test_argv64i1
-        
+; X32:  pushl %edi
+; X32:  subl  $88, %esp
+; X32:  vmovaps __xmm@00000001000000020000000100000002, %xmm0 # xmm0 = [2,1,2,1]
+; X32:  vmovups %xmm0, 64(%esp)
+; X32:  vmovaps LCPI1_1, %zmm0          # zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
+; X32:  vmovups %zmm0, (%esp)
+; X32:  movl  $1, 84(%esp)
+; X32:  movl  $2, 80(%esp)
+; X32:  movl  $2, %eax
+; X32:  movl  $1, %ecx
+; X32:  movl  $2, %edx
+; X32:  movl  $1, %edi
+; X32:  vzeroupper
+; X32:  calll _test_argv64i1
+ 
  ; WIN64-LABEL: caller_argv64i1:
  ; WIN64:       movabsq    $4294967298, %rax
  ; WIN64:       movq   %rax, (%rsp)
diff --git a/test/CodeGen/X86/merge-store-constants.ll b/test/CodeGen/X86/merge-store-constants.ll

index a06f43f7a1181f5291901754455c9dc96d0dc5a8..f5c36ca4c2f8fb59f49a6bfdbc23eda600c7b7c3 100644 (file)
--- a/test/CodeGen/X86/merge-store-constants.ll
+++ b/test/CodeGen/X86/merge-store-constants.ll
@@ -6,18 +6,14 @@ define void @big_nonzero_16_bytes(i32* nocapture %a) {
  ; X32-LABEL: big_nonzero_16_bytes:
  ; X32:       # BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl $1, (%eax)
-; X32-NEXT:    movl $2, 4(%eax)
-; X32-NEXT:    movl $3, 8(%eax)
-; X32-NEXT:    movl $4, 12(%eax)
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
+; X32-NEXT:    vmovups %xmm0, (%eax)
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: big_nonzero_16_bytes:
  ; X64:       # BB#0:
-; X64-NEXT:    movabsq $8589934593, %rax # imm = 0x200000001
-; X64-NEXT:    movq %rax, (%rdi)
-; X64-NEXT:    movabsq $17179869187, %rax # imm = 0x400000003
-; X64-NEXT:    movq %rax, 8(%rdi)
+; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,3,4]
+; X64-NEXT:    vmovups %xmm0, (%rdi)
  ; X64-NEXT:    retq
    %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
    %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
@@ -30,29 +26,48 @@ define void @big_nonzero_16_bytes(i32* nocapture %a) {
    ret void
  }
  
+; TODO: We assumed that two 64-bit scalar stores were better than one vector load plus one
+; vector store. But if the 64-bit constants can't be represented as sign-extended 32-bit
+; immediates, the scalar form needs extra instructions (a movabsq per store on x86-64).
+
+define void @big_nonzero_16_bytes_big64bit_constants(i64* nocapture %a) {
+; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,3]
+; X32-NEXT:    vmovups %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
+; X64:       # BB#0:
+; X64-NEXT:    movabsq $4294967297, %rax # imm = 0x100000001
+; X64-NEXT:    movq %rax, (%rdi)
+; X64-NEXT:    movabsq $12884901889, %rax # imm = 0x300000001
+; X64-NEXT:    movq %rax, 8(%rdi)
+; X64-NEXT:    retq
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1
+
+  store i64 4294967297, i64* %a
+  store i64 12884901889, i64* %arrayidx1
+  ret void
+}
+
  ; Splats may be an opportunity to use a broadcast op.
  
  define void @big_nonzero_32_bytes_splat(i32* nocapture %a) {
  ; X32-LABEL: big_nonzero_32_bytes_splat:
  ; X32:       # BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl $42, (%eax)
-; X32-NEXT:    movl $42, 4(%eax)
-; X32-NEXT:    movl $42, 8(%eax)
-; X32-NEXT:    movl $42, 12(%eax)
-; X32-NEXT:    movl $42, 16(%eax)
-; X32-NEXT:    movl $42, 20(%eax)
-; X32-NEXT:    movl $42, 24(%eax)
-; X32-NEXT:    movl $42, 28(%eax)
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X32-NEXT:    vmovups %ymm0, (%eax)
+; X32-NEXT:    vzeroupper
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: big_nonzero_32_bytes_splat:
  ; X64:       # BB#0:
-; X64-NEXT:    movabsq $180388626474, %rax # imm = 0x2A0000002A
-; X64-NEXT:    movq %rax, (%rdi)
-; X64-NEXT:    movq %rax, 8(%rdi)
-; X64-NEXT:    movq %rax, 16(%rdi)
-; X64-NEXT:    movq %rax, 24(%rdi)
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X64-NEXT:    vmovups %ymm0, (%rdi)
+; X64-NEXT:    vzeroupper
  ; X64-NEXT:    retq
    %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
    %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
@@ -79,37 +94,29 @@ define void @big_nonzero_63_bytes(i8* nocapture %a) {
  ; X32-LABEL: big_nonzero_63_bytes:
  ; X32:       # BB#0:
  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl $0, 4(%eax)
-; X32-NEXT:    movl $1, (%eax)
-; X32-NEXT:    movl $0, 12(%eax)
-; X32-NEXT:    movl $2, 8(%eax)
-; X32-NEXT:    movl $0, 20(%eax)
-; X32-NEXT:    movl $3, 16(%eax)
-; X32-NEXT:    movl $0, 28(%eax)
-; X32-NEXT:    movl $4, 24(%eax)
-; X32-NEXT:    movl $0, 36(%eax)
-; X32-NEXT:    movl $5, 32(%eax)
-; X32-NEXT:    movl $0, 44(%eax)
-; X32-NEXT:    movl $6, 40(%eax)
+; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
+; X32-NEXT:    vmovups %ymm0, (%eax)
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [5,0,6,0]
+; X32-NEXT:    vmovups %xmm0, 32(%eax)
  ; X32-NEXT:    movl $0, 52(%eax)
  ; X32-NEXT:    movl $7, 48(%eax)
  ; X32-NEXT:    movl $8, 56(%eax)
  ; X32-NEXT:    movw $9, 60(%eax)
  ; X32-NEXT:    movb $10, 62(%eax)
+; X32-NEXT:    vzeroupper
  ; X32-NEXT:    retl
  ;
  ; X64-LABEL: big_nonzero_63_bytes:
  ; X64:       # BB#0:
-; X64-NEXT:    movq $1, (%rdi)
-; X64-NEXT:    movq $2, 8(%rdi)
-; X64-NEXT:    movq $3, 16(%rdi)
-; X64-NEXT:    movq $4, 24(%rdi)
+; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,3,4]
+; X64-NEXT:    vmovups %ymm0, (%rdi)
  ; X64-NEXT:    movq $5, 32(%rdi)
  ; X64-NEXT:    movq $6, 40(%rdi)
  ; X64-NEXT:    movq $7, 48(%rdi)
  ; X64-NEXT:    movl $8, 56(%rdi)
  ; X64-NEXT:    movw $9, 60(%rdi)
  ; X64-NEXT:    movb $10, 62(%rdi)
+; X64-NEXT:    vzeroupper
  ; X64-NEXT:    retq
    %a8 = bitcast i8* %a to i64*
    %arrayidx8 = getelementptr inbounds i64, i64* %a8, i64 1
author	Sanjay Patel <spatel@rotateright.com>
	Sat, 16 Sep 2017 13:29:12 +0000 (13:29 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Sat, 16 Sep 2017 13:29:12 +0000 (13:29 +0000)
lib/Target/X86/X86ISelLowering.h		patch \| blob \| history
test/CodeGen/X86/avx512-regcall-Mask.ll		patch \| blob \| history
test/CodeGen/X86/merge-store-constants.ll		patch \| blob \| history