From: Craig Topper
Date: Thu, 26 Sep 2019 04:42:58 +0000 (+0000)
Subject: [X86] Mark the EVEX encoded PSADBW instructions as commutable to enable load folding...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=dcdbbe2ec49b0810c88ea6524433477ec9c1c930;p=llvm

[X86] Mark the EVEX encoded PSADBW instructions as commutable to enable load folding of the other operand.

The SSE and VEX versions are already correct.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@372941 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 801d22a36a5..e56e42001e9 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -11136,6 +11136,7 @@ defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
 multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
                                 string OpcodeStr, X86FoldableSchedWrite sched,
                                 X86VectorVTInfo _dst, X86VectorVTInfo _src> {
+  let isCommutable = 1 in
   def rr : AVX512BI
[...]
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ ... @@ define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) {
 }
 declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone

+define <8 x i64> @stack_fold_psadbw_commute(<64 x i8> %a0, <64 x i8> %a1) {
+; CHECK-LABEL: stack_fold_psadbw_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a1, <64 x i8> %a0)
+  ret <8 x i64> %2
+}
+
 define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
 ; CHECK-LABEL: stack_fold_pshufb_zmm:
 ; CHECK:       # %bb.0:
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index f3873a9e491..e1eaa1a52db 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -2896,6 +2896,20 @@ define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) {
 }
 declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone

+define <2 x i64> @stack_fold_psadbw_commute(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: stack_fold_psadbw_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a1, <16 x i8> %a0)
+  ret <2 x i64> %2
+}
+
 define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: stack_fold_psadbw_ymm:
 ; CHECK:       # %bb.0:
@@ -2911,6 +2925,20 @@ define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
 }
 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

+define <4 x i64> @stack_fold_psadbw_ymm_commute(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: stack_fold_psadbw_ymm_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    vpsadbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; CHECK-NEXT:    retq
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a1, <32 x i8> %a0)
+  ret <4 x i64> %2
+}
+
 define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: stack_fold_pshufb:
 ; CHECK:       # %bb.0:
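
For reference, the codegen difference the new *_commute tests lock in looks roughly like this for the 512-bit case. The "before" lines are an assumption about pre-patch output rather than something taken from this commit, and the stack offset is illustrative (the tests match it with a regex):

    # Assumed pre-patch: %a1 is spilled across the inline asm, and because the
    # memory-operand fold can only target the second source, the reload stays
    # a separate instruction when the spilled value is the first intrinsic operand.
    vmovups -64(%rsp), %zmm1            # 64-byte Reload (assumed)
    vpsadbw %zmm1, %zmm0, %zmm0
    # Post-patch: psadbw(a, b) == psadbw(b, a), so with isCommutable set the
    # register operands can be swapped and the reload folds into the memory form,
    # matching the CHECK lines in the tests above.
    vpsadbw -64(%rsp), %zmm0, %zmm0     # 64-byte Folded Reload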