static cl::opt<bool> EnableSDWAPeephole(
"amdgpu-sdwa-peephole",
cl::desc("Enable SDWA peepholer"),
- cl::init(false));
+ cl::init(true));
// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
if (SuperReg.getReg() != SubReg.getReg())
return false;
- LaneBitmask::Type SuperMask =
- TRI->getSubRegIndexLaneMask(SuperReg.getSubReg()).getAsInteger();
- LaneBitmask::Type SubMask =
- TRI->getSubRegIndexLaneMask(SubReg.getSubReg()).getAsInteger();
- return TRI->regmaskSubsetEqual(&SubMask, &SuperMask);
+ LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
+ LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
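+ // SubReg is contained in SuperReg iff every lane of SubMask is also set in
+ // SuperMask, i.e. (SuperMask | ~SubMask) has all lanes set.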
+ SuperMask |= ~SubMask;
+ return SuperMask.all();
}
uint64_t SDWASrcOperand::getSrcMods() const {
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_v2i16:
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; VI: v_add_i32
-; VI: v_add_i32
+; VI: v_add_i32_sdwa
define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
%add = add <2 x i16> %a, %b
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
; GFX9: v_pk_add_u16
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; VI: v_add_u16_e32
+; VI: v_add_u16_sdwa
; VI: v_add_u16_e32
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-; CIVI: v_ashrrev_i32_e32
-; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CIVI: v_or_b32_e32
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_ashrrev_i32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = ashr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
; GCN: {{buffer|flat}}_load_dwordx2
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; FUNC-LABEL: {{^}}v_ctlz_i8:
; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
; CI: v_or_b32_e32
-; VI: flat_load_ushort [[LO:v[0-9]+]]
; VI: flat_load_ushort [[HI:v[0-9]+]]
+; VI: flat_load_ushort [[LO:v[0-9]+]]
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[LO]]
-; VI-DAG: v_and_b32_e32 [[FABS_HI:v[0-9]+]], [[MASK]], [[HI]]
-; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; VI: v_or_b32
+; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
+; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
; VI: flat_store_dword
; GFX9: s_load_dword [[VAL:s[0-9]+]]
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: flat_store_dwordx2
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
-; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16(
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16_imm_a(
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[A_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_1]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fadd_v2f16_imm_b(
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
-; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
+; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; VI-NOT: 0xffff
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmul_v2f16(
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; CI: v_or_b32_e32
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
-; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
-; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+
define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
store <2 x half> %fneg, <2 x half> addrspace(1)* %out
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s
; GCN-LABEL: {{^}}fpext_f16_to_f32
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GFX89-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
; GCN: s_endpgm
+
define amdgpu_kernel void @fpext_v2f16_to_v2f32(
<2 x float> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) #0 {
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
; GCN: buffer_load_dword
-; GCN-DAG: v_lshrrev_b32_e32
-; GCN-DAG: v_cvt_f32_f16_e32
+; SIGFX9-DAG: v_lshrrev_b32_e32
+; SIGFX9-DAG: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
+
define amdgpu_kernel void @fpext_v2f16_to_v2f64(
<2 x double> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
-; GCN: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
-; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
+; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
+; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+
+; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
; GCN: buffer_store_dword v[[R_V2_I16]]
; GCN: s_endpgm
+
define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
<2 x i16> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: v_cvt_i32_f32_e32
; GCN: v_cvt_i32_f32_e32
; GCN: buffer_store_dwordx2
; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: s_endpgm
define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
<2 x i64> addrspace(1)* %r,
; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
-; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
-; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+; VI: v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
; GCN: buffer_store_dword v[[R_V2_I16]]
; GCN: s_endpgm
+
define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
<2 x i16> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: v_cvt_u32_f32_e32
; GCN: v_cvt_u32_f32_e32
; GCN: buffer_store_dwordx2
; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64
; GCN: buffer_load_dword
; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
; GCN: s_endpgm
define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
<2 x i64> addrspace(1)* %r,
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s
; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
-; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
+; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
+
define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x float> addrspace(1)* %a) {
; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
-; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
+
define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x double> addrspace(1)* %a) {
; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-
-
-; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-DAG: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-
-; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI-DAG: v_subrev_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
+
define amdgpu_kernel void @fsub_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SIVI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-
-; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-
-; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
-; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
+
define amdgpu_kernel void @fsub_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SIVI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-
-; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
-; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
-; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}}
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
+
define amdgpu_kernel void @fsub_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
-; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
-; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+
+; VI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
+; ...
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
-; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
-; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
+; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
+; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
+; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]]
+; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]]
+; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; XSI-NOT: v_cvt_f32_f16
; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
+; XVI: v_cvt_f32_f16_sdwa
; XVI-NOT: v_cvt_f32_f16
; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
-; VI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
+; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
+; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
-; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
+; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
+; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]]
+
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-NOT: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; SI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; GCN-DAG: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; SI-DAG: v_cvt_f16_f32_e32
+; SI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; GCN-DAG: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
; VI-DAG: buffer_load_dword
; VI-NOT: and
-; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16,
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
-; VI: v_or_b32
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
%x = load <2 x half>, <2 x half> addrspace(1)* %in
; GCN-LABEL: {{^}}ceil_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI-DAG: v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @ceil_v2f16(
; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
-
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
-; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
+; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GCN-NOT: and
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @cos_v2f16(
-; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs -mattr=-flat-for-global < %s | FileCheck %s
+; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -amdgpu-sdwa-peephole=0 -verify-machineinstrs -mattr=-flat-for-global < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_debug_value:
; CHECK: s_load_dwordx2 s[4:5]
; GCN-LABEL: {{^}}exp2_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_exp_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @exp2_v2f16(
; GCN-LABEL: {{^}}floor_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @floor_v2f16(
}
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
-; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]]
-
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]]
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; VI_DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
-; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
-
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
-; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
-
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-
-
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
-; VI-FLUSH: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
-; VI-FLUSH: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
; VI-FLUSH-NOT: v_and_b32
-; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]]
+; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
+; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
+
define amdgpu_kernel void @fmuladd_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
; GCN-LABEL: {{^}}log2_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI-DAG: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @log2_v2f16(
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16(
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16_imm_a(
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NOT: and
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16(
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
+; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI_DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN-LABEL: {{^}}rint_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
-; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NOT: v_and_b32
-; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN-LABEL: {{^}}sin_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-
-
-; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
-; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
-; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-
-; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
-; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
-; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
+; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @sin_v2f16(
; GCN-LABEL: {{^}}sqrt_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_sqrt_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: v_and_b32
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @sqrt_v2f16(
; GCN-LABEL: {{^}}trunc_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI-DAG: v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN-NOT: v_and_b32
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @trunc_v2f16(
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-; CIVI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; VI: v_lshrrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sge_v2i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
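; For contrast with the "load without gep" form noted in the FIXME above, a
; minimal sketch of the gep-indexed, per-workitem load sequence used by other
; tests in this file follows; the %gep.a/%gep.b/%val names are illustrative
; only and not part of this test:
;   %tid = call i32 @llvm.amdgcn.workitem.id.x()
;   %gep.a = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
;   %gep.b = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
;   %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.a
;   %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.b
;   %cmp = icmp sge <2 x i16> %a, %b
;   %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
;   store <2 x i16> %val, <2 x i16> addrspace(1)* %out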
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sge_v3i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI-NOT: v_max_i16
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-LABEL: {{^}}v_test_umax_ugt_v2i16:
; VI: v_max_u16_e32
-; VI: v_max_u16_e32
+; VI: v_max_u16_sdwa
; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
; SI-NOT: v_min_u32_e32
; VI: v_min_u16_e32
-; VI: v_min_u16_e32
+; VI: v_min_u16_sdwa
; VI: v_min_u16_e32
; VI-NOT: v_min_u16_e32
-; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s
+
+; FIXME: We have to disable the SDWA peephole here because dead-code
+; elimination, which runs after the peephole, changes the register numbers
+; and breaks this test.
; Spill all SGPRs so multiple VGPRs are required for spilling all of them.
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji --amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
; GCN-LABEL: {{^}}add_shr_i32:
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
entry:
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
entry:
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa
-; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
entry:
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_f16_sdwa
-; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
entry:
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
entry:
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
-; SDWA: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
entry:
ret void
}
+; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
+; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_cvt_f32_i32_sdwa
+
+; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+
+define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
+ <2 x half> addrspace(1)* %r,
+ <2 x i16> addrspace(1)* %a) {
+entry:
+ %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+ %r.val = sitofp <2 x i16> %a.val to <2 x half>
+ store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+ ret void
+}
+
; GCN-LABEL: {{^}}mac_v2half:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
}
; GCN-LABEL: {{^}}select_v2f16:
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cvt_f32_f16_e32
-; SI: v_cmp_lt_f32_e64
-; SI: v_cmp_lt_f32_e32
-; VI: v_cmp_lt_f16_e32
-; VI: v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
-; SI: v_cvt_f16_f32_e32
-; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cmp_lt_f32_e64
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e64
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+
+; VI: v_cmp_lt_f16_e64
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e64
+; VI: v_cndmask_b32_e32
+
; GCN: s_endpgm
define amdgpu_kernel void @select_v2f16(
<2 x half> addrspace(1)* %r,
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
-; CIVI: v_lshlrev_b32_e32
-; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; CIVI: v_or_b32_e32
+; VI: v_lshlrev_b32_e32
+; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_lshlrev_b32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32
define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = shl <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
-; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; VI: v_lshlrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16
; GCN: buffer_load_dword
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN: buffer_store_dword
-; GCN: s_endpgm
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+
define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {
}
; GCN-LABEL: {{^}}sitofp_v2i32_to_v2f16
-; GCN: buffer_load_dwordx2
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f32_i32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN: buffer_store_dword
-; GCN: s_endpgm
+; GCN: buffer_load_dwordx2
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_i32_e32
+; VI-DAG: v_cvt_f32_i32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i32> addrspace(1)* %a) {
; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
-; CIVI: v_sub_i32_e32
-; CIVI-DAG: v_sub_i32_e32
-; CIVI: v_bfe_i32
-; CIVI-DAG: v_bfe_i32
-; CIVI-DAG: v_add_i32_e32
-; CIVI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
-; CIVI: v_add_i32_e32
-; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
-; CIVI: v_or_b32_e32
+; VI: v_sub_i32_e32
+; VI-DAG: v_sub_i32_e32
+; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_add_i32_e32
+; VI: v_add_i32_e32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_sub_i32_e32
+; CI-DAG: v_sub_i32_e32
+; CI: v_bfe_i32
+; CI-DAG: v_bfe_i32
+; CI-DAG: v_add_i32_e32
+; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
+; CI: v_add_i32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; CI: v_or_b32_e32
define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
%neg = sub <2 x i16> zeroinitializer, %val
%cond = icmp sgt <2 x i16> %val, %neg
; FUNC-LABEL: {{^}}ashr_v2i16:
; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
%a = load <2 x i16>, <2 x i16> addrspace(1)* %in
; FUNC-LABEL: {{^}}ashr_v4i16:
; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
%a = load <4 x i16>, <4 x i16> addrspace(1)* %in
; GCN-LABEL: {{^}}v_test_sub_v2i16:
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
; VI: v_subrev_i32_e32
-; VI: v_subrev_i32_e32
+; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
%add = sub <2 x i16> %a, %b
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
; GFX9: v_pk_sub_i16
; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
-; VI: v_subrev_u16_e32
+; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_subrev_u16_e32
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16
; GCN: buffer_load_dword
-; SI: v_cvt_f32_u32_e32
-; SI: v_cvt_f32_u32_e32
-; VI: v_cvt_f32_i32_e32
-; VI: v_cvt_f32_i32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN: buffer_store_dword
-; GCN: s_endpgm
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
<2 x half> addrspace(1)* %r,
<2 x i16> addrspace(1)* %a) {
; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16
; GCN: buffer_load_dwordx2
-; GCN: v_cvt_f32_u32_e32
-; GCN: v_cvt_f32_u32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_u32_e32
+; VI-DAG: v_cvt_f32_u32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI: v_or_b32_e32
+
; GCN: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; VI-NOT: and
-; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-
-; VI-DAG: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
-; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_1:[0-9]+]], 16, v[[C_F16_1]]
-; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
; VI-NOT: and
-; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[C_V2_F16]]
-
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
; SI: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
-; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
-; VI-DAG: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI-DAG: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_same_add(
<2 x half> addrspace(1)* %r0,
}
; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math:
+
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+
; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
<2 x half> addrspace(1)* %r,
}
; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:
+
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+
; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
<2 x half> addrspace(1)* %r,
}
; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:
+
; SI: v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; SI: v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+
; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+
; GCN: s_endpgm
define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
<2 x half> addrspace(1)* %r,