Src3, Rnd),
Mask, PassThru, Subtarget, DAG);
}
+ case IFMA_OP_MASKZ:
+ case IFMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = Src1;
+
+ // For the zero-masking variant, the pass-through is a zero vector.
+ if (IntrData->Type == IFMA_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ // Note that we need to swizzle the operands so that the multiply operands
+ // come first.
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src2, Src3, Src1),
+ Mask, PassThru, Subtarget, DAG);
+ }
case TERLOG_OP_MASK:
case TERLOG_OP_MASKZ: {
SDValue Src1 = Op.getOperand(1);
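For reference, a self-contained scalar model of what the IFMA_OP_MASK/IFMA_OP_MASKZ lowering above computes per 64-bit lane. This is an illustrative sketch (the helper name is invented, and it assumes VPMADD52L adds the low 52 bits of the 104-bit product of the two 52-bit multiply operands), not the actual DAG code:

#include <cstdint>

// One lane of the masked IFMA lowering: the multiply operands come first,
// matching the swizzled node, and Src1 doubles as the accumulator and the
// merge-masking pass-through.
static uint64_t lowerMadd52LoLane(bool MaskBit, bool ZeroMask, uint64_t Src1,
                                  uint64_t Src2, uint64_t Src3) {
  const uint64_t Mask52 = (1ULL << 52) - 1;
  unsigned __int128 Prod =
      (unsigned __int128)(Src2 & Mask52) * (Src3 & Mask52);
  uint64_t Madd = Src1 + ((uint64_t)Prod & Mask52);
  uint64_t PassThru = ZeroMask ? 0 : Src1; // IFMA_OP_MASKZ zeros pass-through
  return MaskBit ? Madd : PassThru;
}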
// Multiply and Add Packed Integers.
VPMADDUBSW, VPMADDWD,
+
+ // AVX512IFMA multiply and add.
+ // NOTE: The operand order of these nodes differs from the instruction's;
+ // they compute op0 * op1 + op2.
VPMADD52L, VPMADD52H,
// FMA nodes.
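A worked example of the low/high split these nodes compute; this standalone arithmetic check is illustrative and not part of the patch (it relies on the compiler-specific unsigned __int128 type):

#include <cassert>
#include <cstdint>

int main() {
  // op0 = 2^51, op1 = 2: the 104-bit product is exactly 2^52, so its low
  // 52 bits are zero and its high 52 bits equal 1.
  const uint64_t Mask52 = (1ULL << 52) - 1;
  unsigned __int128 Prod = (unsigned __int128)(1ULL << 51) * 2;
  uint64_t Acc = 7;
  assert(Acc + ((uint64_t)Prod & Mask52) == 7);         // VPMADD52L lane
  assert(Acc + ((uint64_t)(Prod >> 52) & Mask52) == 8); // VPMADD52H lane
  return 0;
}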
let Constraints = "$src1 = $dst" in {
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _> {
+ // NOTE: The SDNode takes the multiply operands first, with the addend last.
+ // This lets tablegen autogenerate commuted load patterns (a commutativity
+ // check is sketched after this multiclass).
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1, 1>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
AVX512FMA3Base;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
AVX512FMA3Base;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
- (OpNode _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+ (OpNode _.RC:$src2,
+ (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1)>,
AVX512FMA3Base, EVEX_B;
}
-
- // TODO: Should be able to match a memory op in operand 2.
- // TODO: These instructions should be marked Commutable on operand 2 and 3.
}
} // Constraints = "$src1 = $dst"
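Because the multiply operands now come first and the node is marked commutative in them (the SDNPCommutative change below), swapping them never changes the result, which is what makes the tablegen-generated commuted load patterns safe. A minimal standalone check of that property, with invented names:

#include <cassert>
#include <cstdint>
#include <initializer_list>

// The 52-bit madd commutes in its multiply operands, so a load can be
// folded into either one.
static uint64_t madd52Lo(uint64_t A, uint64_t B, uint64_t Acc) {
  const uint64_t Mask52 = (1ULL << 52) - 1;
  unsigned __int128 Prod = (unsigned __int128)(A & Mask52) * (B & Mask52);
  return Acc + ((uint64_t)Prod & Mask52);
}

int main() {
  for (uint64_t A : {0ULL, 1ULL, (1ULL << 52) - 1, 0x123456789ABCDULL})
    for (uint64_t B : {0ULL, 3ULL, 1ULL << 51})
      assert(madd52Lo(A, B, 42) == madd52Lo(B, A, 42));
  return 0;
}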
def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
-def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma>;
-def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma>;
+def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
+def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
+ IFMA_OP_MASK, IFMA_OP_MASKZ,
VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST32x2_TO_VEC,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0),
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm2
-; CHECK-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x1load = load i64, i64* %x1ptr
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm2
-; CHECK-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%x1load = load i64, i64* %x1ptr
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpbroadcastq (%rdi), %zmm2
-; CHECK-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%x1load = load i64, i64* %x1ptr
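For context, this is the kind of source that now folds into a single broadcast-memory madd. This usage sketch assumes the standard AVX512IFMA intrinsics from immintrin.h (compiled with -mavx512ifma) and is not part of the test file:

#include <immintrin.h>

// Broadcast a scalar multiplier into the 52-bit high madd. With the
// commuted broadcast patterns above, this can now compile to a single
// vpmadd52huq with a (%rdi){1to8} memory operand instead of a separate
// vpbroadcastq.
__m512i madd52hi_bcast(__m512i Acc, __m512i X, const long long *P) {
  __m512i B = _mm512_set1_epi64(*P);
  return _mm512_madd52hi_epu64(Acc, X, B); // Acc + high52(X * B) per lane
}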