if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32 || Size == 64)
+ if (Size == 32)
return ScalarVT.getSimpleVT();
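+
+ // 64-bit vector elements are split into 32-bit pieces for non-kernel calling
+ // conventions, so report i32 as the register type (e.g. a <2 x i64> argument
+ // is passed in four i32 registers).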
+ if (Size == 64)
+ return MVT::i32;
+
if (Size == 16 &&
Subtarget->has16BitInsts() &&
isPowerOf2_32(VT.getVectorNumElements()))
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32 || Size == 64)
+ if (Size == 32)
return NumElts;
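+
+ // Each 64-bit element is passed in two 32-bit registers.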
+ if (Size == 64)
+ return 2 * NumElts;
+
// FIXME: Fails to break down as we want with v3.
if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
return VT.getVectorNumElements() / 2;
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32 || Size == 64) {
+ if (Size == 32) {
RegisterVT = ScalarVT.getSimpleVT();
IntermediateVT = RegisterVT;
NumIntermediates = NumElts;
return NumIntermediates;
}
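+
+ // Break 64-bit elements into pairs of i32 intermediates; e.g. v3i64 is
+ // decomposed into six i32 intermediates.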
+ if (Size == 64) {
+ RegisterVT = MVT::i32;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = 2 * NumElts;
+ return NumIntermediates;
+ }
+
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will still be
// inconsistent.
declare void @external_void_func_f32(float) #0
declare void @external_void_func_f64(double) #0
declare void @external_void_func_v2f32(<2 x float>) #0
+declare void @external_void_func_v2f64(<2 x double>) #0
+declare void @external_void_func_v3f64(<3 x double>) #0
declare void @external_void_func_v2i16(<2 x i16>) #0
declare void @external_void_func_v2f16(<2 x half>) #0
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
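+ ; 8589934593  = 0x0000000200000001 -> v0 = 1, v1 = 2
+ ; 17179869187 = 0x0000000400000003 -> v2 = 3, v3 = 4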
+ call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v3i64:
; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: v_mov_b32_e32 v4, s
-; GCN: v_mov_b32_e32 v5, s
+; GCN: v_mov_b32_e32 v4, 1
+; GCN: v_mov_b32_e32 v5, 2
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
ret void
}
-; FIXME: Immedites should fold directly into v_mov_b32s
; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
; GCN: buffer_load_dwordx4 v[0:3]
-; GCN-DAG: v_mov_b32_e32 v4, s
-; GCN-DAG: v_mov_b32_e32 v5, s
-; GCN-DAG: v_mov_b32_e32 v6, s
-; GCN-DAG: v_mov_b32_e32 v7, s
+; GCN-DAG: v_mov_b32_e32 v4, 1
+; GCN-DAG: v_mov_b32_e32 v5, 2
+; GCN-DAG: v_mov_b32_e32 v6, 3
+; GCN-DAG: v_mov_b32_e32 v7, 4
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm:
+; GCN: v_mov_b32_e32 v0, 0{{$}}
+; GCN: v_mov_b32_e32 v1, 2.0
+; GCN: v_mov_b32_e32 v2, 0{{$}}
+; GCN: v_mov_b32_e32 v3, 0x40100000
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
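+ ; double 2.0 = 0x4000000000000000 -> v0 = 0, v1 = 0x40000000 (printed as the inline constant 2.0)
+ ; double 4.0 = 0x4010000000000000 -> v2 = 0, v3 = 0x40100000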
+ call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v3, 0x40100000
+; GCN-DAG: v_mov_b32_e32 v4, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v5, 0x40200000
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
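+ ; As above, plus double 8.0 = 0x4020000000000000 -> v4 = 0, v5 = 0x40200000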
+ call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
; GFX9: buffer_load_dword v0
; GFX9-NOT: v0