if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32 || Size == 64)
+ if (Size == 32)
return ScalarVT.getSimpleVT();
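+
+ // 64-bit vector elements are split into 32-bit pieces for non-kernel calling
+ // conventions, so report i32 as the register type (e.g. a <2 x i64> argument
+ // is passed in four i32 registers).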
+ if (Size == 64)
+ return MVT::i32;
+
if (Size == 16 &&
Subtarget->has16BitInsts() &&
isPowerOf2_32(VT.getVectorNumElements()))
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32 || Size == 64)
+ if (Size == 32)
return NumElts;
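+
+ // Each 64-bit element is passed in two 32-bit registers.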
+ if (Size == 64)
+ return 2 * NumElts;
+
// FIXME: Fails to break down as we want with v3.
if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
return VT.getVectorNumElements() / 2;
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32 || Size == 64) {
+ if (Size == 32) {
RegisterVT = ScalarVT.getSimpleVT();
IntermediateVT = RegisterVT;
NumIntermediates = NumElts;
return NumIntermediates;
}
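+
+ // Break 64-bit elements into pairs of i32 intermediates; e.g. v3i64 is
+ // decomposed into six i32 intermediates.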
+ if (Size == 64) {
+ RegisterVT = MVT::i32;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = 2 * NumElts;
+ return NumIntermediates;
+ }
+
// FIXME: We should fix the ABI to be the same on targets without 16-bit
// support, but unless we can properly handle 3-vectors, it will still be
// inconsistent.
declare void @external_void_func_f32(float) #0
declare void @external_void_func_f64(double) #0
declare void @external_void_func_v2f32(<2 x float>) #0
+declare void @external_void_func_v2f64(<2 x double>) #0
+declare void @external_void_func_v3f64(<3 x double>) #0
declare void @external_void_func_v2i16(<2 x i16>) #0
declare void @external_void_func_v2f16(<2 x half>) #0
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
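+ ; 8589934593  = 0x0000000200000001 -> v0 = 1, v1 = 2
+ ; 17179869187 = 0x0000000400000003 -> v2 = 3, v3 = 4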
+ call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v3i64:
; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: v_mov_b32_e32 v4, s
-; GCN: v_mov_b32_e32 v5, s
+; GCN: v_mov_b32_e32 v4, 1
+; GCN: v_mov_b32_e32 v5, 2
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
ret void
}
-; FIXME: Immedites should fold directly into v_mov_b32s
; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
; GCN: buffer_load_dwordx4 v[0:3]
-; GCN-DAG: v_mov_b32_e32 v4, s
-; GCN-DAG: v_mov_b32_e32 v5, s
-; GCN-DAG: v_mov_b32_e32 v6, s
-; GCN-DAG: v_mov_b32_e32 v7, s
+; GCN-DAG: v_mov_b32_e32 v4, 1
+; GCN-DAG: v_mov_b32_e32 v5, 2
+; GCN-DAG: v_mov_b32_e32 v6, 3
+; GCN-DAG: v_mov_b32_e32 v7, 4
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm:
+; GCN: v_mov_b32_e32 v0, 0{{$}}
+; GCN: v_mov_b32_e32 v1, 2.0
+; GCN: v_mov_b32_e32 v2, 0{{$}}
+; GCN: v_mov_b32_e32 v3, 0x40100000
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
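+ ; double 2.0 = 0x4000000000000000 -> v0 = 0, v1 = 0x40000000 (printed as the inline constant 2.0)
+ ; double 4.0 = 0x4010000000000000 -> v2 = 0, v3 = 0x40100000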
+ call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
+ ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v3, 0x40100000
+; GCN-DAG: v_mov_b32_e32 v4, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v5, 0x40200000
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
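+ ; As above, plus double 8.0 = 0x4020000000000000 -> v4 = 0, v5 = 0x40200000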
+ call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
; GFX9: buffer_load_dword v0
; GFX9-NOT: v0