list<dag> Pattern,
list<dag> MaskingPattern,
bit IsCommutable = 0> {
- let isCommutable = IsCommutable in
+ let isCommutable = IsCommutable in {
    def NAME: AVX512<O, F, Outs, Ins,
                       OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
                                     "$dst, "#IntelSrcAsm#"}",
                       Pattern>;

    def NAME#k: AVX512<O, F, Outs, MaskingIns,
                       OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
                                     "$dst {${mask}}, "#IntelSrcAsm#"}",
                       MaskingPattern>, EVEX_K;
+ }
}
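// With the braces above, `let isCommutable = IsCommutable` now covers the
// masked record NAME#k as well as the unmasked NAME, so the load-folding
// logic is allowed to commute the masked compares too.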
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
// NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
// There are just too many permutations due to commutability and bitcasts
// (see the IR sketch after this block).
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
- let isCommutable = 1 in
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (null_frag), (null_frag)>,
+ (null_frag), (null_frag), 1>,
EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
}
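To see why the pattern space explodes, consider the following synthetic IR (an illustration, not part of this patch; the function names are invented): both functions should select to a single vcmpeqpd with the load folded, the second one only after commuting the operands, and each shape recurs again under a writemask and with bitcasts around the load.

define <8 x i1> @cmp_direct(<8 x double> %x, <8 x double>* %p) {
  %y = load <8 x double>, <8 x double>* %p
  %c = fcmp oeq <8 x double> %x, %y
  ret <8 x i1> %c
}

define <8 x i1> @cmp_commuted(<8 x double> %x, <8 x double>* %p) {
  %y = load <8 x double>, <8 x double>* %p
  %c = fcmp oeq <8 x double> %y, %x
  ret <8 x i1> %c
}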
declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
+define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
+ ;CHECK-LABEL: stack_fold_cmppd_mask:
+ ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
+ %2 = load <8 x double>, <8 x double>* %a2
+ %3 = fadd <8 x double> %a1, %2
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, i32 4)
+ %6 = and <8 x i1> %4, %5
+ %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
+ ret <8 x double> %7
+}
+
+define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
+ ;CHECK-LABEL: stack_fold_cmppd_mask_commuted:
+ ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
+ %2 = load <8 x double>, <8 x double>* %a2
+ %3 = fadd <8 x double> %a1, %2
+ %4 = bitcast i8 %mask to <8 x i1>
+ %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, i32 4)
+ %6 = and <8 x i1> %4, %5
+ %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
+ ret <8 x double> %7
+}
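A note on the test mechanics (a summary, not part of the patch): the "nop" asm returns a value in an xmm register and clobbers xmm1-xmm31 plus flags, so every 512-bit argument must be spilled across it; the CHECK line then demands that %a0 reaches the compare as a folded 64-byte stack reload rather than through a separate reload instruction. The extra braces in {{{%k[0-7]}}} make FileCheck match the literal { } of the {%k1}-style writemask annotation around the {{%k[0-7]}} register regex.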
+
define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_cmpps
;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload
}
declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
+define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
+ ;CHECK-LABEL: stack_fold_cmpps_mask:
+ ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
+ %2 = load <16 x float>, <16 x float>* %a2
+ %3 = fadd <16 x float> %a1, %2
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, i32 4)
+ %6 = and <16 x i1> %4, %5
+ %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
+ ret <16 x float> %7
+}
+
+define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
+ ;CHECK-LABEL: stack_fold_cmpps_mask_commuted:
+ ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ ; load and fadd are here to keep the operations below the side-effecting block and to avoid folding the wrong load
+ %2 = load <16 x float>, <16 x float>* %a2
+ %3 = fadd <16 x float> %a1, %2
+ %4 = bitcast i16 %mask to <16 x i1>
+ %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, i32 4)
+ %6 = and <16 x i1> %4, %5
+ %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
+ ret <16 x float> %7
+}
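The _commuted variants differ only in swapping the intrinsic's first two vector operands, which puts the spilled %a0 on the side that does not normally accept a memory operand; folding the reload there requires the compare to be commutable, which is presumably what the isCommutable changes in the .td hunks above enable (EQ is a symmetric predicate, so the same vcmpeqpd/vcmpeqps output is expected).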
+
define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_divsd_int
;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload