MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (CC != CallingConv::AMDGPU_KERNEL &&
- VT.isVector() && VT.getVectorNumElements() == 3) {
+ // TODO: Consider splitting all arguments into 32-bit pieces.
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
- if (ScalarVT.getSizeInBits() == 32)
+ unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 32 || Size == 64)
return ScalarVT.getSimpleVT();
}
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (CC != CallingConv::AMDGPU_KERNEL &&
- VT.isVector() && VT.getVectorNumElements() == 3) {
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
- if (ScalarVT.getSizeInBits() == 32)
- return 3;
+ unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 32 || Size == 64)
+ return VT.getVectorNumElements();
}
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
LLVMContext &Context, CallingConv::ID CC,
EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const {
-
- if (CC != CallingConv::AMDGPU_KERNEL && VT.getVectorNumElements() == 3) {
+ if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
- if (ScalarVT.getSizeInBits() == 32 ||
- ScalarVT.getSizeInBits() == 64) {
+ unsigned Size = ScalarVT.getSizeInBits();
+ if (Size == 32 || Size == 64) {
RegisterVT = ScalarVT.getSimpleVT();
IntermediateVT = RegisterVT;
- NumIntermediates = 3;
+ NumIntermediates = VT.getVectorNumElements();
return NumIntermediates;
}
}
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
+ assert(!Arg->VT.isVector() && "vector type argument should have been split");
+
// First check if it's a PS input addr.
if (CallConv == CallingConv::AMDGPU_PS &&
!Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
++PSInputNum;
}
- // Second split vertices into their elements.
- if (Arg->VT.isVector()) {
- ISD::InputArg NewArg = *Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg->VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned J = 0; J != NumElements; ++J) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
- }
- } else {
- Splits.push_back(*Arg);
- }
+ Splits.push_back(*Arg);
}
}
; FIXME: Immediates should fold directly into v_mov_b32s
; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: v_mov_b32_e32 v4, s
-; GCN: v_mov_b32_e32 v5, s
-; GCN: v_mov_b32_e32 v6, s
-; GCN: v_mov_b32_e32 v7, s
+; GCN-DAG: v_mov_b32_e32 v4, s
+; GCN-DAG: v_mov_b32_e32 v5, s
+; GCN-DAG: v_mov_b32_e32 v6, s
+; GCN-DAG: v_mov_b32_e32 v7, s
; GCN: s_waitcnt
; GCN-NEXT: s_swappc_b64
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
+ call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
; HSA-DAG: s_mov_b32 s33, s9
; MESA-DAG: s_mov_b32 s33, s3{{$}}
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
+ call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
ret void
}
+; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN-DAG: v_mov_b32_e32 v4, 5
+; GCN-DAG: v_mov_b32_e32 v5, 6
+; GCN-DAG: v_mov_b32_e32 v6, 7
+; GCN-DAG: v_mov_b32_e32 v7, 8
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
+ call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
+ ret void
+}
+
; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
}
; GCN-LABEL: {{^}}v_mad_mix_v2f32:
-; GFX900: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1]
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
-; GFX906: v_mov_b32_e32 v3, v1
-; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1]
+; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_e32 v1, v3
; CIVI: v_mac_f32
define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle:
; GCN: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
; GFX900-NEXT: s_setpc_b64
-; GFX906-NEXT: v_mov_b32_e32 v3, v1
-; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_e32 v0, v3
; GFX906-NEXT: s_setpc_b64
; CIVI: v_mac_f32
}
; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1:
-; GFX9: v_mov_b32_e32 v2, v1
; GFX9: v_mov_b32_e32 v3, 1.0
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>
}
; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi:
-; GFX9: v_mov_b32_e32 v2, v1
; GFX9: v_mov_b32_e32 v3, 0x3e230000
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
+
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>
}
; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi:
-; GFX9: v_mov_b32_e32 v2, v1
; GFX9: v_mov_b32_e32 v3, 0.15915494
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
%src0.ext = fpext <2 x half> %src0 to <2 x float>
%src1.ext = fpext <2 x half> %src1 to <2 x float>