From: Craig Topper
Date: Thu, 6 Jun 2019 16:39:04 +0000 (+0000)
Subject: [X86] Make masked floating point equality/ordered compares commutable for load foldin...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=64c4bd40c6a08afc1ba9c52a7970e02032cfc612;p=llvm

[X86] Make masked floating point equality/ordered compares commutable for load folding purposes.

Same as what is supported for the unmasked form.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@362717 91177308-0d34-0410-b5e6-96231b3b80d8
---
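Not part of the patch, just a reduced sketch for context (the hypothetical function below is illustrative; the stack-folding tests added in the diff are the real coverage): vcmppd/vcmpps can only fold a memory operand in the second source position, so when the value coming from memory (or reloaded from a spill slot) is the first operand of an EQ/ordered compare, the compare has to be commuted before the load can be folded. With this change the masked form is allowed to commute the same way as the unmasked form, so IR shaped like this can still select a memory-folded vcmpeqpd under a write mask:

; sketch only - not part of this patch
define <8 x double> @masked_cmpeq_fold_sketch(<8 x double> %a, <8 x double>* %p, i8 %mask, <8 x double> %t, <8 x double> %f) {
  ; the loaded value is the *first* compare operand, so folding the load
  ; into vcmpeqpd relies on the compare being commutable
  %v = load <8 x double>, <8 x double>* %p
  %k = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %v, <8 x double> %a, i32 0, i32 4)
  %m = bitcast i8 %mask to <8 x i1>
  %km = and <8 x i1> %m, %k
  %r = select <8 x i1> %km, <8 x double> %t, <8 x double> %f
  ret <8 x double> %r
}
declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)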
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 4302b3e1ed5..8987b6a33bf 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -365,7 +365,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
                                       list<dag> Pattern,
                                       list<dag> MaskingPattern,
                                       bit IsCommutable = 0> {
-  let isCommutable = IsCommutable in
+  let isCommutable = IsCommutable in {
     def NAME: AVX512<O, F, Outs, Ins,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
                                      "$dst, "#IntelSrcAsm#"}",
@@ -375,6 +375,7 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
                        "$dst {${mask}}, "#IntelSrcAsm#"}",
                        MaskingPattern>, EVEX_K;
+  }
 }
 
 multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
@@ -5788,11 +5789,10 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
   // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
   // There are just too many permuations due to commutability and bitcasts.
   let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
-  let isCommutable = 1 in
   defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                    "$src2, $src1", "$src1, $src2",
-                   (null_frag), (null_frag)>,
+                   (null_frag), (null_frag), 1>,
                    EVEX_4V, Sched<[sched]>;
   let mayLoad = 1 in
   defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512.ll b/test/CodeGen/X86/stack-folding-fp-avx512.ll
--- a/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -146,6 +146,34 @@ define i8 @stack_fold_cmppd(<8 x double> %a0, <8 x double> %a1) {
 }
 declare <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double>, <8 x double>, i32, i32)
 
+define <8 x double> @stack_fold_cmppd_mask(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
+  ;CHECK-LABEL: stack_fold_cmppd_mask:
+  ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <8 x double>, <8 x double>* %a2
+  %3 = fadd <8 x double> %a1, %2
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %3, <8 x double> %a0, i32 0, i32 4)
+  %6 = and <8 x i1> %4, %5
+  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
+  ret <8 x double> %7
+}
+
+define <8 x double> @stack_fold_cmppd_mask_commuted(<8 x double> %a0, <8 x double> %a1, <8 x double>* %a2, i8 %mask, <8 x double> %b0, <8 x double> %b1) {
+  ;CHECK-LABEL: stack_fold_cmppd_mask_commuted:
+  ;CHECK: vcmpeqpd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <8 x double>, <8 x double>* %a2
+  %3 = fadd <8 x double> %a1, %2
+  %4 = bitcast i8 %mask to <8 x i1>
+  %5 = call <8 x i1> @llvm.x86.avx512.cmp.pd.512(<8 x double> %a0, <8 x double> %3, i32 0, i32 4)
+  %6 = and <8 x i1> %4, %5
+  %7 = select <8 x i1> %6, <8 x double> %b0, <8 x double> %b1
+  ret <8 x double> %7
+}
+
 define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
   ;CHECK-LABEL: stack_fold_cmpps
   ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -156,6 +184,34 @@ define i16 @stack_fold_cmpps(<16 x float> %a0, <16 x float> %a1) {
 }
 declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32)
 
+define <16 x float> @stack_fold_cmpps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
+  ;CHECK-LABEL: stack_fold_cmpps_mask:
+  ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <16 x float>, <16 x float>* %a2
+  %3 = fadd <16 x float> %a1, %2
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %3, <16 x float> %a0, i32 0, i32 4)
+  %6 = and <16 x i1> %4, %5
+  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
+  ret <16 x float> %7
+}
+
+define <16 x float> @stack_fold_cmpps_mask_commuted(<16 x float> %a0, <16 x float> %a1, <16 x float>* %a2, i16 %mask, <16 x float> %b0, <16 x float> %b1) {
+  ;CHECK-LABEL: stack_fold_cmpps_mask_commuted:
+  ;CHECK: vcmpeqps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ; load and fadd are here to keep the operations below the side effecting block and to avoid folding the wrong load
+  %2 = load <16 x float>, <16 x float>* %a2
+  %3 = fadd <16 x float> %a1, %2
+  %4 = bitcast i16 %mask to <16 x i1>
+  %5 = call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a0, <16 x float> %3, i32 0, i32 4)
+  %6 = and <16 x i1> %4, %5
+  %7 = select <16 x i1> %6, <16 x float> %b0, <16 x float> %b1
+  ret <16 x float> %7
+}
+
 define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_divsd_int
   ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload