From 902e7e59d192dff97b799f4e81bbee71bfde74f5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 19 Apr 2017 17:42:39 +0000
Subject: [PATCH] AMDGPU: Don't align callable functions to 256

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@300720 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp |  4 +++-
 test/CodeGen/AMDGPU/hsa-func-align.ll  | 18 ++++++++++++++++++
 test/CodeGen/AMDGPU/hsa-func.ll        | 13 +++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/AMDGPU/hsa-func-align.ll

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c35a0912adb..7ee4bcb86fe 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -184,9 +184,11 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
 
   // The starting address of all shader programs must be 256 bytes aligned.
-  MF.setAlignment(8);
+  // Regular functions just need the basic required instruction alignment.
+  MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);
 
   SetupMachineFunction(MF);
 
diff --git a/test/CodeGen/AMDGPU/hsa-func-align.ll b/test/CodeGen/AMDGPU/hsa-func-align.ll
new file mode 100644
index 00000000000..a00f5e2669d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-func-align.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj < %s | llvm-readobj -symbols -s -sd | FileCheck -check-prefix=ELF %s
+
+; ELF: Section {
+; ELF: Name: .text
+; ELF: SHF_ALLOC (0x2)
+; ELF: SHF_EXECINSTR (0x4)
+; ELF: AddressAlignment: 32
+; ELF: }
+
+; HSA: .globl simple_align16
+; HSA: .p2align 5
+define void @simple_align16(i32 addrspace(1)* addrspace(2)* %ptr.out) align 32 {
+entry:
+  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index b4cdd4030d8..147cf9bbe6c 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -14,6 +14,7 @@
 ; ELF: Flags [ (0x6)
 ; ELF: SHF_ALLOC (0x2)
 ; ELF: SHF_EXECINSTR (0x4)
+; ELF: AddressAlignment: 4
 ; ELF: }
 
 ; ELF: SHT_NOTE
@@ -36,6 +37,8 @@
 ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
 
 ; HSA-NOT: .amdgpu_hsa_kernel simple
+; HSA: .globl simple
+; HSA: .p2align 2
 ; HSA: {{^}}simple:
 ; HSA: .amd_kernel_code_t
 ; HSA: enable_sgpr_private_segment_buffer = 1
@@ -58,3 +61,13 @@ entry:
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
+
+; Ignore explicit alignment that is too low.
+; HSA: .globl simple_align2
+; HSA: .p2align 2
+define void @simple_align2(i32 addrspace(1)* addrspace(2)* %ptr.out) align 2 {
+entry:
+  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
-- 
2.50.1