From: Matt Arsenault Date: Tue, 31 Jul 2018 19:29:04 +0000 (+0000) Subject: AMDGPU: Break 64-bit arguments into 32-bit pieces X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4b6157df8bff29ffd276e959815e672cab47f5bd;p=llvm AMDGPU: Break 64-bit arguments into 32-bit pieces git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@338421 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index d6647c7fe85..ee3c0289b6d 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -701,9 +701,12 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); - if (Size == 32 || Size == 64) + if (Size == 32) return ScalarVT.getSimpleVT(); + if (Size == 64) + return MVT::i32; + if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(VT.getVectorNumElements())) @@ -721,9 +724,12 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); - if (Size == 32 || Size == 64) + if (Size == 32) return NumElts; + if (Size == 64) + return 2 * NumElts; + // FIXME: Fails to break down as we want with v3. 
if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) return VT.getVectorNumElements() / 2; @@ -740,13 +746,20 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); - if (Size == 32 || Size == 64) { + if (Size == 32) { RegisterVT = ScalarVT.getSimpleVT(); IntermediateVT = RegisterVT; NumIntermediates = NumElts; return NumIntermediates; } + if (Size == 64) { + RegisterVT = MVT::i32; + IntermediateVT = RegisterVT; + NumIntermediates = 2 * NumElts; + return NumIntermediates; + } + // FIXME: We should fix the ABI to be the same on targets without 16-bit // support, but unless we can properly handle 3-vectors, it will be still be // inconsistent. diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll index 599e3595cc9..2cea1414507 100644 --- a/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/test/CodeGen/AMDGPU/call-argument-types.ll @@ -25,6 +25,8 @@ declare void @external_void_func_f16(half) #0 declare void @external_void_func_f32(float) #0 declare void @external_void_func_f64(double) #0 declare void @external_void_func_v2f32(<2 x float>) #0 +declare void @external_void_func_v2f64(<2 x double>) #0 +declare void @external_void_func_v3f64(<3 x double>) #0 declare void @external_void_func_v2i16(<2 x i16>) #0 declare void @external_void_func_v2f16(<2 x half>) #0 @@ -274,10 +276,21 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN-DAG: v_mov_b32_e32 v2, 3 +; GCN-DAG: v_mov_b32_e32 v3, 4 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { + call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v3i64: ; GCN: 
buffer_load_dwordx4 v[0:3] -; GCN: v_mov_b32_e32 v4, s -; GCN: v_mov_b32_e32 v5, s +; GCN: v_mov_b32_e32 v4, 1 +; GCN: v_mov_b32_e32 v5, 2 ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { @@ -288,13 +301,12 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ret void } -; FIXME: Immedites should fold directly into v_mov_b32s ; GCN-LABEL: {{^}}test_call_external_void_func_v4i64: ; GCN: buffer_load_dwordx4 v[0:3] -; GCN-DAG: v_mov_b32_e32 v4, s -; GCN-DAG: v_mov_b32_e32 v5, s -; GCN-DAG: v_mov_b32_e32 v6, s -; GCN-DAG: v_mov_b32_e32 v7, s +; GCN-DAG: v_mov_b32_e32 v4, 1 +; GCN-DAG: v_mov_b32_e32 v5, 2 +; GCN-DAG: v_mov_b32_e32 v6, 3 +; GCN-DAG: v_mov_b32_e32 v7, 4 ; GCN: s_waitcnt ; GCN-NEXT: s_swappc_b64 @@ -342,6 +354,30 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm: +; GCN: v_mov_b32_e32 v0, 0{{$}} +; GCN: v_mov_b32_e32 v1, 2.0 +; GCN: v_mov_b32_e32 v2, 0{{$}} +; GCN: v_mov_b32_e32 v3, 0x40100000 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { + call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm: +; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v1, 2.0 +; GCN-DAG: v_mov_b32_e32 v2, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v3, 0x40100000 +; GCN-DAG: v_mov_b32_e32 v4, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v5, 0x40200000 +; GCN-DAG: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { + call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>) + ret void
}
+ ; GCN-LABEL: {{^}}test_call_external_void_func_v2i16: ; GFX9: buffer_load_dword v0 ; GFX9-NOT: v0