AMDGPU: Split MUBUF offset into aligned components

author Nicolai Haehnle <nhaehnle@gmail.com>

Tue, 10 Oct 2017 12:22:23 +0000 (12:22 +0000)

committer Nicolai Haehnle <nhaehnle@gmail.com>

Tue, 10 Oct 2017 12:22:23 +0000 (12:22 +0000)
author Nicolai Haehnle <nhaehnle@gmail.com>
Tue, 10 Oct 2017 12:22:23 +0000 (12:22 +0000)
committer Nicolai Haehnle <nhaehnle@gmail.com>
Tue, 10 Oct 2017 12:22:23 +0000 (12:22 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

index 7faf3e123f8119ca445a9f80f60b4cc88776b542..5bd1092196a508f9dffdd18b88068d5dc4c112e0 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1235,24 +1235,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
                                               SDValue &SOffset,
                                               SDValue &ImmOffset) const {
    SDLoc DL(Constant);
+  const uint32_t Align = 4;
+  const uint32_t MaxImm = alignDown(4095, Align);
    uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue();
    uint32_t Overflow = 0;
  
-  if (Imm >= 4096) {
-    if (Imm <= 4095 + 64) {
-      // Use an SOffset inline constant for 1..64
-      Overflow = Imm - 4095;
-      Imm = 4095;
+  if (Imm > MaxImm) {
+    if (Imm <= MaxImm + 64) {
+      // Use an SOffset inline constant for 4..64
+      Overflow = Imm - MaxImm;
+      Imm = MaxImm;
      } else {
        // Try to keep the same value in SOffset for adjacent loads, so that
        // the corresponding register contents can be re-used.
        //
-      // Load values with all low-bits set into SOffset, so that a larger
-      // range of values can be covered using s_movk_i32
-      uint32_t High = (Imm + 1) & ~4095;
-      uint32_t Low = (Imm + 1) & 4095;
+      // Load values with all low-bits (except for alignment bits) set into
+      // SOffset, so that a larger range of values can be covered using
+      // s_movk_i32.
+      //
+      // Atomic operations fail to work correctly when individual address
+      // components are unaligned, even if their sum is aligned.
+      uint32_t High = (Imm + Align) & ~4095;
+      uint32_t Low = (Imm + Align) & 4095;
        Imm = Low;
-      Overflow = High - 1;
+      Overflow = High - Align;
      }
    }
  
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll

index 98f7058b5ef8095a51036bd118f82140ec199d5d..b6f72a114d93a2619a869c30e56e72e15f873326 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -3,7 +3,7 @@
  
  ;CHECK-LABEL: {{^}}test1:
  ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
  ;CHECK: s_waitcnt vmcnt(0)
  ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
  ;CHECK: s_waitcnt vmcnt(0)
@@ -14,7 +14,7 @@
  ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
  ;CHECK-DAG: s_waitcnt vmcnt(0)
  ;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
-;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc
+;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
  ;CHECK: s_waitcnt vmcnt(0)
  ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
  define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
@@ -71,24 +71,24 @@ main_body:
  ;CHECK-LABEL: {{^}}test3:
  ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
  ;CHECK: s_waitcnt vmcnt(0)
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
+;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
  ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
  ;CHECK: s_waitcnt vmcnt(0)
  ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
  ;CHECK: s_waitcnt vmcnt(0)
  ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
  ;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
+;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc
  ;CHECK-DAG: s_waitcnt vmcnt(0)
  ;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
-;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc
+;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
  define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
  main_body:
    %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
    %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
    %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
    %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
-  %ofs.5 = add i32 %voffset, 42
+  %ofs.5 = add i32 %voffset, 44
    %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
    %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
  
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll

index 9cb9f25520b8bc7725a38f973df37ebf4e1b9e4b..d5159934d3f92f0d1c41d975bf51a7f711dd2db8 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -27,20 +27,20 @@ main_body:
  }
  
  ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
-;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c
+;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x1038
  ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen
  ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095
-;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
+;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
  ;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1
+;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
+;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
  ;CHECK: s_waitcnt
  define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
  main_body:
-  %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0)
-  %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0)
+  %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0)
+  %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36856, i1 0, i1 0)
    %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
    %d.3 = fadd <4 x float> %d.0, %d.1
    %data = fadd <4 x float> %d.2, %d.3
@@ -48,10 +48,10 @@ main_body:
  }
  
  ;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
-;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65
+;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xffc
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:68
  ;VI-NOT: s_mov
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81
+;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84
  ;VI: s_waitcnt
  define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) {
  main_body:
@@ -80,11 +80,11 @@ main_body:
  }
  
  ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
  ;CHECK: s_waitcnt
  define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
  main_body:
-  %ofs = add i32 %1, 58
+  %ofs = add i32 %1, 60
    %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
    ret <4 x float> %data
  }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll

index 5c93ae0e78672f82aff238c9fc18420e32648ac8..03caca8d29c45a8052ab44859296546a09a82bc6 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -18,18 +18,18 @@ main_body:
  }
  
  ;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
  ;CHECK: s_waitcnt
  define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
  main_body:
-  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
    ret <4 x float> %data
  }
  
  ;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
  ;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
-;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
-;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1
+;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
+;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
  ;CHECK: s_waitcnt
  define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
  main_body:
@@ -56,11 +56,11 @@ main_body:
  }
  
  ;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
  ;CHECK: s_waitcnt
  define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
  main_body:
-  %ofs = add i32 %1, 58
+  %ofs = add i32 %1, 60
    %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
    ret <4 x float> %data
  }
author	Nicolai Haehnle <nhaehnle@gmail.com>
	Tue, 10 Oct 2017 12:22:23 +0000 (12:22 +0000)
committer	Nicolai Haehnle <nhaehnle@gmail.com>
	Tue, 10 Oct 2017 12:22:23 +0000 (12:22 +0000)
lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll		patch \| blob \| history