SOPK_ZEXT = UINT64_C(1) << 38,
SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
- VOPAsmPrefer32Bit = UINT64_C(1) << 41
-
+ VOPAsmPrefer32Bit = UINT64_C(1) << 41,
+ HasFPClamp = UINT64_C(1) << 42
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ const SISubtarget *ST;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+ const MachineOperand *isClamp(const MachineInstr &MI) const;
+ bool tryFoldClamp(MachineInstr &MI);
+
public:
SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
}
}
+const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ switch (Op) {
+ case AMDGPU::V_MAX_F32_e64:
+ case AMDGPU::V_MAX_F16_e64: {
+ if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+ return nullptr;
+
+ // Make sure sources are identical.
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src0->isReg() || !Src1->isReg() ||
+ Src0->getReg() != Src1->getReg() ||
+ Src0->getSubReg() != Src1->getSubReg() ||
+ Src0->getSubReg() != AMDGPU::NoSubRegister)
+ return nullptr;
+
+ // Can't fold up if we have modifiers.
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return nullptr;
+ return Src0;
+ }
+ default:
+ return nullptr;
+ }
+}
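+
+// A sketch of the pattern isClamp accepts (VOP3 operand order:
+// src0_modifiers, src0, src1_modifiers, src1, clamp, omod):
+//   %21 = V_MAX_F32_e64 0, %20, 0, %20, 1, 0, implicit %exec
+// Here isClamp returns the %20 source operand.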
+
+// The clamp pattern always uses its source register twice in the same
+// v_max instruction, so plain MRI.hasOneNonDBGUse would always fail here.
+// Instead, count distinct non-debug user instructions.
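+// For instance (a sketch), in %21 = V_MAX_F32_e64 0, %20, 0, %20, 1, 0, ...
+// %20 has two non-debug uses but only one user instruction.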
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+ int Count = 0;
+ for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+ I != E; ++I) {
+ if (++Count > 1)
+ return false;
+ }
+
+ return true;
+}
+
+// FIXME: Does this need to check IEEE bit on function?
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+ const MachineOperand *ClampSrc = isClamp(MI);
+ if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+ if (!TII->hasFPClamp(*Def))
+ return false;
+ MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+ if (!DefClamp)
+ return false;
+
+ DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+
+ // Clamp is applied after omod, so it is OK if omod is set.
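+ // e.g. if Def carries omod mul:2, setting its clamp bit yields
+ // clamp(2.0 * result), which is exactly what the separate clamping
+ // v_max computed on the omod-scaled value.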
+ DefClamp->setImm(1);
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
+}
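+
+// The net rewrite is then (sketch, constants as in the MIR test below,
+// where 1065353216 is the bit pattern of 1.0f):
+//   %20 = V_ADD_F32_e64 0, %17, 0, 1065353216, 0, 0, implicit %exec
+//   %21 = V_MAX_F32_e64 0, %20, 0, %20, 1, 0, implicit %exec
+// -->
+//   %20 = V_ADD_F32_e64 0, %17, 0, 1065353216, 1, 0, implicit %exec
+// with uses of %21 rewritten to %20 and the v_max erased.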
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
MRI = &MF.getRegInfo();
- TII = ST.getInstrInfo();
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
Next = std::next(I);
MachineInstr &MI = *I;
- if (!isSafeToFold(MI))
+ if (!isSafeToFold(MI)) {
+ // TODO: Try omod also.
+ tryFoldClamp(MI);
continue;
+ }
MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
// is unable to infer the encoding from the operands.
field bit VOPAsmPrefer32Bit = 0;
+ // This bit indicates that the instruction has a floating point result
+ // type, so the clamp modifier has floating point semantics (clamping
+ // the result to [0.0, 1.0]).
+ field bit FPClamp = 0;
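+ // e.g. the VOP3 forms of v_add_f32 and v_add_f16 set FPClamp; on an
+ // integer-result instruction the clamp bit instead means saturation.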
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
let TSFlags{41} = VOPAsmPrefer32Bit;
+ let TSFlags{42} = FPClamp;
let SchedRW = [Write32Bit];
return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
}
+ static bool hasFPClamp(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp;
+ }
+
+ bool hasFPClamp(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
field bit HasOMod = HasModifiers;
field bit HasClamp = HasModifiers;
field bit HasSDWAClamp = HasSrc0;
+ field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
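+ // e.g. an f32 or f16 DstVT yields HasFPClamp = HasClamp; an i32 DstVT
+ // yields 0.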
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
let VOP3 = 1;
let VALU = 1;
+ let FPClamp = P.HasFPClamp;
let Uses = [EXEC];
let AsmVariantName = AMDGPUAsmVariants.VOP3;
--- /dev/null
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_multi_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[ADD]], [[ADD]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ store volatile float %add, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_dbg_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_dbg_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_neg_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[A]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[FLOOR]], -[[FLOOR]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %floor = call float @llvm.floor.f32(float %a)
+ %neg.floor = fsub float -0.0, %floor
+ %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_non_clamp_max_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 0, [[ADD]]{{$}}
+define amdgpu_kernel void @v_non_clamp_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ store float %max, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f32_denormals:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f32_denormals(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+ %a = load float, float addrspace(1)* %gep0
+ %add = fadd float %a, 1.0
+ %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+ %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+ store float %clamp, float addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f16_denorm:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
+define amdgpu_kernel void @v_clamp_add_src_f16_denorm(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %add = fadd half %a, 1.0
+ %max = call half @llvm.maxnum.f16(half %add, half 0.0)
+ %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+ store half %clamp, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f16_no_denormals:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI-NOT: [[A]]
+; VI: v_add_f16_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
+define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* %out, half addrspace(1)* %aptr) #3 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+ %a = load half, half addrspace(1)* %gep0
+ %add = fadd half %a, 1.0
+ %max = call half @llvm.maxnum.f16(half %add, half 0.0)
+ %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+ store half %clamp, half addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_v2f32:
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}}
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
+ %a = load <2 x float>, <2 x float> addrspace(1)* %gep0
+ %add = fadd <2 x float> %a, <float 1.0, float 1.0>
+ %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer)
+ %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+ store <2 x float> %clamp, <2 x float> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="+fp32-denormals" }
+attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8}
+!8 = !DIBasicType(name: "float", size: 32, align: 32)
+!9 = !DIExpression()
+!10 = !DILocation(line: 1, column: 42, scope: !5)
--- /dev/null
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s
+--- |
+ define amdgpu_kernel void @v_max_self_clamp_not_set_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) {
+ ret void
+ }
+
+ define amdgpu_kernel void @v_clamp_omod_already_set_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) {
+ ret void
+ }
+
+...
+---
+# GCN-LABEL: name: v_max_self_clamp_not_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec
+
+name: v_max_self_clamp_not_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: v_clamp_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
+name: v_clamp_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: sgpr_64 }
+ - { id: 1, class: sreg_32_xm0 }
+ - { id: 2, class: sgpr_32 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: sreg_64_xexec }
+ - { id: 6, class: sreg_32 }
+ - { id: 7, class: sreg_32 }
+ - { id: 8, class: sreg_32_xm0 }
+ - { id: 9, class: sreg_64 }
+ - { id: 10, class: sreg_32_xm0 }
+ - { id: 11, class: sreg_32_xm0 }
+ - { id: 12, class: sgpr_64 }
+ - { id: 13, class: sgpr_128 }
+ - { id: 14, class: sreg_32_xm0 }
+ - { id: 15, class: sreg_64 }
+ - { id: 16, class: sgpr_128 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vreg_64 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vreg_64 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vreg_64 }
+ - { id: 26, class: vreg_64 }
+liveins:
+ - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+ - { reg: '%vgpr0', virtual-reg: '%3' }
+body: |
+ bb.0 (%ir-block.0):
+ liveins: %sgpr0_sgpr1, %vgpr0
+
+ %3 = COPY %vgpr0
+ %0 = COPY %sgpr0_sgpr1
+ %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+ %25 = REG_SEQUENCE %3, 1, %24, 2
+ %10 = S_MOV_B32 61440
+ %11 = S_MOV_B32 0
+ %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+ %13 = REG_SEQUENCE killed %5, 17, %12, 18
+ %14 = S_MOV_B32 2
+ %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+ %16 = REG_SEQUENCE killed %4, 17, %12, 18
+ %18 = COPY %26
+ %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+ %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+ %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
+ BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+ S_ENDPGM
+...
; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
-; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
-; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
-; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
; FIXME: Better to fold neg into max
-; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
-; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
-; SI: v_cvt_f16_f32
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
; FIXME: Better to fold neg/abs into max
-; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]|
-; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
-; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid