defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
} // End FPDPRounding = 1
-defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
-defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
-defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
let isCommutable = 1 in {
let FPDPRounding = 1 in {
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
-defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
+defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
-defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
-defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
-defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
-defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
+defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16, umax>;
+defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>;
+defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
+defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
    isConvertibleToThreeAddress = 1 in {
// Note: 16-bit instructions produce a 0 result in the high 16-bits
// on GFX8 and GFX9 and preserve high 16 bits on GFX10+
-def ClearHI16 : OutPatFrag<(ops node:$op),
- (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>;
-
-multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst,
- bit PreservesHI16 = 0> {
-
-def : GCNPat<
- (op i16:$src0, i16:$src1),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
->;
+multiclass Arithmetic_i16_0Hi_Pats <SDPatternOperator op, Instruction inst> {
def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
+ (inst $src0, $src1)
>;
def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)),
- sub0,
- (V_MOV_B32_e32 (i32 0)), sub1)
->;
-}
-
-multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst,
- bit PreservesHI16 = 0> {
-
-def : GCNPat<
- (op i16:$src0, i16:$src1),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)),
- (inst VSrc_b32:$src1, VSrc_b32:$src0))
->;
-
-def : GCNPat<
- (i32 (zext (op i16:$src0, i16:$src1))),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)),
- (inst VSrc_b32:$src1, VSrc_b32:$src0))
->;
-
-
-def : GCNPat<
- (i64 (zext (op i16:$src0, i16:$src1))),
- (REG_SEQUENCE VReg_64,
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst VSrc_b32:$src1, VSrc_b32:$src0)),
- (inst VSrc_b32:$src1, VSrc_b32:$src0)),
- sub0,
+ (inst $src0, $src1), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
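// Illustrative sketch (not from this patch): passing an SDPatternOperator such
// as add or lshl_rev to VOP2Inst above lets TableGen generate the plain i16
// selection pattern from the instruction definition itself, so only the zext
// forms still need the explicit Arithmetic_i16_0Hi_Pats helpers. The *_rev
// fragments swap the DAG operands so the shift amount lands in src0; the
// removed reversed-shift pattern was therefore equivalent to something like:
//   def : GCNPat<(shl i16:$x, i16:$y), (V_LSHLREV_B16_e64 $y, $x)>;
// which the lshl_rev-based instruction definition now produces automatically.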
let Predicates = [Has16BitInsts] in {
let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
-}
-
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>;
-}
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
-}
-
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>;
+defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>;
}
def : ZExt_i16_i1_Pat<zext>;
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec
+ ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_ASHRREV_I16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:sgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec
+ ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHRREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:sgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:sgpr(s32) = COPY $sgpr0
%2:vgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s16) = G_TRUNC %0
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_AND_B32_e64_]], 0, 16, implicit $exec
+ ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec
; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
; GFX10: $vcc_hi = IMPLICIT_DEF
; GFX10: [[COPY:%[0-9]+]]:sreg_32_xm0 = COPY $sgpr0
; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX10: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec
; GFX10: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec
- ; GFX10: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[V_LSHLREV_B16_e64_]], [[V_MOV_B32_e32_]], implicit $exec
- ; GFX10: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+ ; GFX10: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]]
%0:sgpr(s32) = COPY $sgpr0
%1:vgpr(s32) = COPY $vgpr0
%2:sgpr(s16) = G_TRUNC %0
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s7
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: global_load_ushort v2, v[2:3], off
-; GFX10-DL-NEXT: global_load_ushort v7, v[0:1], off
+; GFX10-DL-NEXT: global_load_ushort v0, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_and_b32_sdwa v3, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v0
; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
-; GFX10-DL-NEXT: v_bfe_i32 v0, v7, 0, 8
+; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3
; GFX10-DL-NEXT: v_mov_b32_e32 v4, s4
; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2
; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3
-; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
+; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010
; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010
; GFX10-DL-NEXT: v_mad_i32_i24 v4, s0, s1, v4
; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24
; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v4
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_bfe_i32 s1, s3, 0x80000
; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 16
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, sext(s2), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s2
; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2
; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2
-; GFX10-DL-NEXT: v_and_b32_sdwa v5, sext(s3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s3
; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x80000
; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
-; GFX10-DL-NEXT: v_and_b32_sdwa v8, sext(s4), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s4
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, sext(s5), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s5
; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v2
; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT: s_movk_i32 s3, 0xff
+; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
-; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_and_b32 s0, s3, s2
-; GFX10-DL-NEXT: s_and_b32 s1, s4, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010
-; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010
-; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3
-; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24
-; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2
+; GFX10-DL-NEXT: s_and_b32 s0, s4, s3
+; GFX10-DL-NEXT: s_and_b32 s1, s5, s3
+; GFX10-DL-NEXT: v_mov_b32_e32 v4, s6
+; GFX10-DL-NEXT: v_and_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x80010
+; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x80010
+; GFX10-DL-NEXT: v_mad_u32_u24 v4, s0, s1, v4
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24
+; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, v2, v3, v4
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s2
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-DL-NEXT: v_and_b32_sdwa v5, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s3
; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v5, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24
-; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s3, s4
-; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5
-; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24
-; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s1, s3
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s4
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-DL-NEXT: v_and_b32_sdwa v5, v6, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, v7, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX10-DL-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s2
+; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s3
+; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24
+; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24
+; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1
+; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4
+; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s0
+; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v5
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
+; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
+; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 12
-; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 12
-; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12
+; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12
; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000
-; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40004
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
-; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40004
-; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40008
-; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008
-; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
-; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x40010
-; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
-; GFX10-DL-NEXT: v_mul_i32_i24_e64 v6, s1, s8
+; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000
+; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004
+; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5
-; GFX10-DL-NEXT: s_bfe_i32 s1, s2, 0x40014
-; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014
-; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40018
-; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2
+; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010
+; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010
+; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014
+; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018
-; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28
+; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018
; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28
+; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3
-; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s0, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s9, s10, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s8, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s11, s12, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12
; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12
; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
-; GFX10-DL-NEXT: s_bfe_i32 s0, s5, 0x40004
-; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40008
-; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40008
-; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v2, v5, v2
-; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
-; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40010
-; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s1, s9
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004
+; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008
+; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v2, 12, v2
-; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014
-; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40014
+; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010
+; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010
+; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
+; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014
+; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018
; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28
; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v3, s6, s7, v3
-; GFX10-DL-NEXT: v_mad_i32_i24 v3, s8, s0, v3
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX10-DL-NEXT: v_mad_u32_u24 v2, v4, v2, v3
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
+; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2
-; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s9, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2
; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
;
; GFX10-DL-LABEL: idot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
-; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
-; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
+; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
-; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
+; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
+; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 4
-; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 4
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s0
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
-; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s9
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s16
-; GFX10-DL-NEXT: s_lshr_b32 s11, s0, 12
-; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8
-; GFX10-DL-NEXT: s_lshr_b32 s18, s1, 12
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10
-; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s18
-; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2
-; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v23, 12, s11
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v31, 12, s17
-; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 4
+; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 4
+; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 12
+; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 12
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s4
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s6
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5
+; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 8
+; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 8
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
-; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s0
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v15
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v27, 12, s7
-; GFX10-DL-NEXT: v_and_b32_e32 v14, v31, v2
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v23, 12, v6
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v9
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v7
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 20
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10
+; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 20
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v8
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s0
+; GFX10-DL-NEXT: s_lshr_b32 s8, s5, 16
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s1
+; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28
+; GFX10-DL-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28
+; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s9
+; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24
+; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v13, 12, v13
-; GFX10-DL-NEXT: v_and_b32_e32 v10, v27, v2
-; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16
-; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 20
-; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16
-; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 20
-; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v5, v5, v2
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v27, 12, v14
-; GFX10-DL-NEXT: v_and_b32_e32 v8, v8, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v15, v15, v2
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s6
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s5
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v19, 12, s12
-; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 28
-; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24
-; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 28
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v35, 12, s13
-; GFX10-DL-NEXT: v_and_b32_e32 v6, v23, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v7, v7, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v5, v27, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v13, v13, v2
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v15
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s15
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v7, v5
-; GFX10-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s14
-; GFX10-DL-NEXT: v_and_b32_e32 v11, v11, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v12, v12, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v18, v35, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v19, v19, v2
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v15, v6, v13
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v7, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_e32 v9, v9, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v16, v16, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v17, v17, v2
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s8
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s1
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v9
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v13
+; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v10
+; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v8
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v35, 12, v18
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v19
-; GFX10-DL-NEXT: v_and_b32_sdwa v5, v5, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v15, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v9
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v31, 12, v10
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v16, 12, v16
-; GFX10-DL-NEXT: v_and_b32_e32 v7, v11, v2
-; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
-; GFX10-DL-NEXT: v_ashrrev_i16_e64 v17, 12, v17
-; GFX10-DL-NEXT: v_and_b32_e32 v10, v12, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v11, v19, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v6, v35, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v8, v9, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v13, v16, v2
-; GFX10-DL-NEXT: v_and_b32_e32 v9, v31, v2
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v10, v11
-; GFX10-DL-NEXT: v_and_b32_e32 v12, v17, v2
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v7, v6
-; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v13
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v9, v12
-; GFX10-DL-NEXT: v_and_b32_sdwa v9, v10, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5
-; GFX10-DL-NEXT: v_and_b32_sdwa v7, v7, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v19, v10
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v15, v9
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-DL-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v2
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v4
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v7
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v11, v12
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8
+; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-DL-NEXT: global_load_ubyte v3, v[0:1], off
+; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004
; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004
; GFX10-DL-NEXT: s_and_b32 s8, s5, 15
; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1
; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8
; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s7, s9
-; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s7, s9
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3
; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014
; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s1
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s1
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
+; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3
; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40014
; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28
; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010
-; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, s6, s0
+; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s0
; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010
; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018
; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40018
-; GFX10-DL-NEXT: v_or_b32_e32 v5, v4, v5
-; GFX10-DL-NEXT: v_and_b32_sdwa v6, v11, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s1, s8
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s7, s9
-; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, s0, s4
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v5
-; GFX10-DL-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_and_b32_sdwa v2, v8, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX10-DL-NEXT: v_or_b32_sdwa v7, v9, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX10-DL-NEXT: v_or_b32_e32 v2, v6, v7
-; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2
+; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s1, s8
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s7, s9
+; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s0, s4
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4
+; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5
+; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7
+; GFX10-DL-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX10-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-DL-NEXT: v_or_b32_e32 v11, v5, v6
+; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v11
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v10
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v6
-; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v14
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
; GCN-LABEL: {{^}}shl_i16:
; GCN: v_lshlrev_b16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @shl_i16(i16 %x, i16 %y) {
%res = shl i16 %x, %y
ret i16 %res
; GCN-LABEL: {{^}}lshr_i16:
; GCN: v_lshrrev_b16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @lshr_i16(i16 %x, i16 %y) {
%res = lshr i16 %x, %y
ret i16 %res
; GCN-LABEL: {{^}}ashr_i16:
; GCN: v_ashrrev_i16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @ashr_i16(i16 %x, i16 %y) {
%res = ashr i16 %x, %y
ret i16 %res
; GCN-LABEL: {{^}}add_u16:
; GCN: v_add_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @add_u16(i16 %x, i16 %y) {
%res = add i16 %x, %y
ret i16 %res
; GCN-LABEL: {{^}}sub_u16:
; GCN: v_sub_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @sub_u16(i16 %x, i16 %y) {
%res = sub i16 %x, %y
ret i16 %res
; GCN-LABEL: {{^}}mul_lo_u16:
; GCN: v_mul_lo_u16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @mul_lo_u16(i16 %x, i16 %y) {
%res = mul i16 %x, %y
ret i16 %res
; GCN-LABEL: {{^}}min_u16:
; GCN: v_min_u16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @min_u16(i16 %x, i16 %y) {
%cmp = icmp ule i16 %x, %y
%res = select i1 %cmp, i16 %x, i16 %y
; GCN-LABEL: {{^}}min_i16:
; GCN: v_min_i16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @min_i16(i16 %x, i16 %y) {
%cmp = icmp sle i16 %x, %y
%res = select i1 %cmp, i16 %x, i16 %y
; GCN-LABEL: {{^}}max_u16:
; GCN: v_max_u16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @max_u16(i16 %x, i16 %y) {
%cmp = icmp uge i16 %x, %y
%res = select i1 %cmp, i16 %x, i16 %y
; GCN-LABEL: {{^}}max_i16:
; GCN: v_max_i16_e{{32|64}} [[OP:v[0-9]+]],
-; GFX9-NEXT: s_setpc_b64
-; GFX10: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: s_setpc_b64
define i16 @max_i16(i16 %x, i16 %y) {
%cmp = icmp sge i16 %x, %y
%res = select i1 %cmp, i16 %x, i16 %y
ret i16 %res
}
+
+; GCN-LABEL: {{^}}shl_i16_zext_i32:
+; GCN: v_lshlrev_b16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @shl_i16_zext_i32(i16 %x, i16 %y) {
+ %res = shl i16 %x, %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}lshr_i16_zext_i32:
+; GCN: v_lshrrev_b16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) {
+ %res = lshr i16 %x, %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}ashr_i16_zext_i32:
+; GCN: v_ashrrev_i16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) {
+ %res = ashr i16 %x, %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}add_u16_zext_i32:
+; GCN: v_add_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
+ %res = add i16 %x, %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}sub_u16_zext_i32:
+; GCN: v_sub_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
+ %res = sub i16 %x, %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}mul_lo_u16_zext_i32:
+; GCN: v_mul_lo_u16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
+ %res = mul i16 %x, %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}min_u16_zext_i32:
+; GCN: v_min_u16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
+ %cmp = icmp ule i16 %x, %y
+ %res = select i1 %cmp, i16 %x, i16 %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}min_i16_zext_i32:
+; GCN: v_min_i16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
+ %cmp = icmp sle i16 %x, %y
+ %res = select i1 %cmp, i16 %x, i16 %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}max_u16_zext_i32:
+; GCN: v_max_u16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
+ %cmp = icmp uge i16 %x, %y
+ %res = select i1 %cmp, i16 %x, i16 %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
+
+; GCN-LABEL: {{^}}max_i16_zext_i32:
+; GCN: v_max_i16_e{{32|64}} [[OP:v[0-9]+]],
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
+; GCN-NEXT: s_setpc_b64
+define i32 @max_i16_zext_i32(i16 %x, i16 %y) {
+ %cmp = icmp sge i16 %x, %y
+ %res = select i1 %cmp, i16 %x, i16 %y
+ %zext = zext i16 %res to i32
+ ret i32 %zext
+}
; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-
-
+; GFX10: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, v
+; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
entry:
%a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
;
; GFX89: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
;
-; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) {
entry: