From bc72642b221bb59499d4635c20318aa6ae7d00df Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Mon, 2 Sep 2019 12:32:28 +0000 Subject: [PATCH] [X86][BtVer2] Fix latency and throughput of conditional SIMD store instructions. On BtVer2 conditional SIMD stores are heavily microcoded. The latency is directly proportional to the number of packed elements extracted from the input vector. Also, according to micro-benchmarks, most of the computation seems to be done in the integer unit. Only a minority of the uOPs are executed by the FPU. The observed behaviour on the FPU looks similar to this: - The input MASK value is moved to the Integer Unit -- [ a VMOVMSK-like uOP executed on JFPU0 ]. - In parallel, each element of the input XMM/YMM is extracted and then sent to the Integer Unit through JFPU1. As expected, a (conditional) store is executed for every extracted element. Interestingly, a (speculative) load is executed for every extracted element too. It is as if a "LOAD - BIT_EXTRACT - CMOV" sequence of uOPs is repeated by the integer unit for every conditionally stored element. VMASKMOVDQU is a special case: the number of speculative loads is always 2 (presumably, one load per quadword). That means extra shifts and masking are performed on (one of) the loaded quadwords before each conditional store (that also explains the big number of non-FP uOPs retired). This patch replaces the existing writes for conditional SIMD stores (i.e. WriteFMaskedStore and WriteFMaskedStoreY) with the following new writes: WriteFMaskedStore32 [ XMM Packed Single ] WriteFMaskedStore32Y [ YMM Packed Single ] WriteFMaskedStore64 [ XMM Packed Double ] WriteFMaskedStore64Y [ YMM Packed Double ] Added a wrapper class named X86SchedWriteMaskMove in X86Schedule.td to describe both RM and MR variants for conditional SIMD moves in a single tablegen definition. Instances of that class are then passed as input to multiclass avx_movmask_rm when constructing MASKMOVPS/PD definitions. 
Since this patch introduces new writes, I had to update all the X86 scheduling models. Differential Revision: https://reviews.llvm.org/D66801 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@370649 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrSSE.td | 18 ++++++++------ lib/Target/X86/X86SchedBroadwell.td | 8 +++++-- lib/Target/X86/X86SchedHaswell.td | 8 +++++-- lib/Target/X86/X86SchedSandyBridge.td | 8 +++++-- lib/Target/X86/X86SchedSkylakeClient.td | 8 +++++-- lib/Target/X86/X86SchedSkylakeServer.td | 8 +++++-- lib/Target/X86/X86Schedule.td | 24 +++++++++++++++++-- lib/Target/X86/X86ScheduleAtom.td | 6 +++-- lib/Target/X86/X86ScheduleBdVer2.td | 6 +++-- lib/Target/X86/X86ScheduleBtVer2.td | 19 +++++++++++++-- lib/Target/X86/X86ScheduleSLM.td | 8 +++++-- lib/Target/X86/X86ScheduleZnver1.td | 8 +++++-- .../llvm-mca/X86/BtVer2/resources-avx1.s | 22 ++++++++--------- .../llvm-mca/X86/BtVer2/resources-sse2.s | 6 ++--- 14 files changed, 114 insertions(+), 43 deletions(-) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 16c76ca7a6e..b3c88982a39 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7061,27 +7061,29 @@ let Predicates = [HasAVX1Only] in { // multiclass avx_movmask_rm opc_rm, bits<8> opc_mr, string OpcodeStr, Intrinsic IntLd, Intrinsic IntLd256, - Intrinsic IntSt, Intrinsic IntSt256> { + Intrinsic IntSt, Intrinsic IntSt256, + X86SchedWriteMaskMove schedX, + X86SchedWriteMaskMove schedY> { def rm : AVX8I, - VEX_4V, Sched<[WriteFMaskedLoad]>; + VEX_4V, Sched<[schedX.RM]>; def Yrm : AVX8I, - VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; + VEX_4V, VEX_L, Sched<[schedY.RM]>; def mr : AVX8I, - VEX_4V, Sched<[WriteFMaskedStore]>; + VEX_4V, Sched<[schedX.MR]>; def Ymr : AVX8I, - VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; + VEX_4V, VEX_L, Sched<[schedY.MR]>; } let ExeDomain = SSEPackedSingle in @@ -7089,13 +7091,15 @@ defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", 
int_x86_avx_maskload_ps, int_x86_avx_maskload_ps_256, int_x86_avx_maskstore_ps, - int_x86_avx_maskstore_ps_256>; + int_x86_avx_maskstore_ps_256, + WriteFMaskMove32, WriteFMaskMove32Y>; let ExeDomain = SSEPackedDouble in defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", int_x86_avx_maskload_pd, int_x86_avx_maskload_pd_256, int_x86_avx_maskstore_pd, - int_x86_avx_maskstore_pd_256>; + int_x86_avx_maskstore_pd_256, + WriteFMaskMove64, WriteFMaskMove64Y>; //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 7574e4b8f89..9b1fcaa8a13 100644 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -232,8 +232,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 284d1567c5c..06f417501b2 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -231,8 +231,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index d40bdf728a4..26d4d8fa354 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -208,8 +208,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : 
X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 8f3e4ae62d5..9a511ecc007 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -226,8 +226,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index 58caf1dacfc..a8c65435ab9 100644 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -226,8 +226,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 55ca85ec1e3..95f710061ae 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -102,6 +102,12 @@ class X86SchedWriteMoveLS { + SchedWrite RM = LoadRM; + SchedWrite MR = StoreMR; +} + // Multiclass that wraps X86SchedWriteMoveLS for each vector width. class X86SchedWriteMoveLSWidths; +// Conditional SIMD Packed Loads and Stores wrappers. +def WriteFMaskMove32 + : X86SchedWriteMaskMove; +def WriteFMaskMove64 + : X86SchedWriteMaskMove; +def WriteFMaskMove32Y + : X86SchedWriteMaskMove; +def WriteFMaskMove64Y + : X86SchedWriteMaskMove; + // Vector width wrappers. 
def SchedWriteFAdd : X86SchedWriteWidths; diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index b0334655de7..78acb1065ec 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -216,8 +216,10 @@ defm : X86WriteResUnsupported; def : WriteRes; def : WriteRes; defm : X86WriteResUnsupported; -defm : X86WriteResUnsupported; -defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; +defm : X86WriteResUnsupported; def : WriteRes; def : WriteRes; diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index 8cc01c3acec..d7aea3cf4e9 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -726,8 +726,10 @@ defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; -defm : PdWriteRes; -defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; +defm : PdWriteRes; defm : PdWriteRes; defm : PdWriteRes; diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 32549bc06e0..3addf048dba 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -512,8 +512,11 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; @@ -818,6 +821,18 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> { } def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; +/////////////////////////////////////////////////////////////////////////////// +// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ +/////////////////////////////////////////////////////////////////////////////// + +def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> { + let Latency = 34; + let ResourceCycles = [1, 1, 
2, 2, 2, 16, 42]; + let NumMicroOps = 63; +} +def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64, + VMASKMOVDQU, VMASKMOVDQU64)>; + /////////////////////////////////////////////////////////////////////////////// // SchedWriteVariant definitions. /////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 34c251a5c5b..8e3ce721f1a 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -186,8 +186,12 @@ def : WriteRes; def : WriteRes; def : WriteRes; def : WriteRes; -def : WriteRes; -def : WriteRes; + +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + def : WriteRes; def : WriteRes; def : WriteRes; diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 65f6d89df61..06201f4a3a8 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -268,8 +268,12 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; -defm : X86WriteRes; -defm : X86WriteRes; + +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; +defm : X86WriteRes; + defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; diff --git a/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s b/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s index 5b9c1dd66e9..1378a115b88 100644 --- a/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s +++ b/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s @@ -1219,15 +1219,15 @@ vzeroupper # CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %xmm2 # CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %ymm2 # CHECK-NEXT: 1 3 1.00 * U vldmxcsr (%rax) -# CHECK-NEXT: 1 1 1.00 * * U vmaskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 63 34 21.00 * * U vmaskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 1 6 1.00 * vmaskmovpd (%rax), %xmm0, %xmm2 # CHECK-NEXT: 2 6 2.00 * vmaskmovpd (%rax), %ymm0, %ymm2 -# CHECK-NEXT: 1 6 2.00 * * vmaskmovpd %xmm0, %xmm1, (%rax) -# 
CHECK-NEXT: 2 6 2.00 * * vmaskmovpd %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 10 13 2.00 * * vmaskmovpd %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 18 16 4.00 * * vmaskmovpd %ymm0, %ymm1, (%rax) # CHECK-NEXT: 1 6 1.00 * vmaskmovps (%rax), %xmm0, %xmm2 # CHECK-NEXT: 2 6 2.00 * vmaskmovps (%rax), %ymm0, %ymm2 -# CHECK-NEXT: 1 6 2.00 * * vmaskmovps %xmm0, %xmm1, (%rax) -# CHECK-NEXT: 2 6 2.00 * * vmaskmovps %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 19 16 5.00 * * vmaskmovps %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 36 22 10.00 * * vmaskmovps %ymm0, %ymm1, (%rax) # CHECK-NEXT: 1 2 1.00 vmaxpd %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 7 1.00 * vmaxpd (%rax), %xmm1, %xmm2 # CHECK-NEXT: 2 2 2.00 vmaxpd %ymm0, %ymm1, %ymm2 @@ -1740,7 +1740,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 56.00 - - 365.00 915.00 447.50 461.50 394.00 - 51.00 132.00 135.50 159.50 38.00 +# CHECK-NEXT: 86.00 30.00 - 362.00 907.00 449.50 480.50 414.00 - 78.00 154.00 135.50 159.50 38.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: @@ -1933,15 +1933,15 @@ vzeroupper # CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %xmm2 # CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %ymm2 # CHECK-NEXT: - - - - - - - 1.00 - - - - - - vldmxcsr (%rax) -# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmaskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 21.00 21.00 - 1.00 - 1.00 2.00 2.00 - 16.00 2.00 - - - vmaskmovdqu %xmm0, %xmm1 # CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovpd (%rax), %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovpd (%rax), %ymm0, %ymm2 -# CHECK-NEXT: - - - 2.00 2.00 0.50 0.50 - - 1.00 - - - - vmaskmovpd %xmm0, %xmm1, (%rax) -# CHECK-NEXT: - - - 2.00 2.00 1.00 1.00 - - 2.00 - - - - vmaskmovpd %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 1.00 1.00 - 1.00 - 1.00 2.00 2.00 - 
2.00 2.00 - - - vmaskmovpd %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 2.00 2.00 - 1.00 - 1.00 4.00 4.00 - 4.00 4.00 - - - vmaskmovpd %ymm0, %ymm1, (%rax) # CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovps (%rax), %xmm0, %xmm2 # CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovps (%rax), %ymm0, %ymm2 -# CHECK-NEXT: - - - 2.00 2.00 0.50 0.50 - - 1.00 - - - - vmaskmovps %xmm0, %xmm1, (%rax) -# CHECK-NEXT: - - - 2.00 2.00 1.00 1.00 - - 2.00 - - - - vmaskmovps %ymm0, %ymm1, (%rax) +# CHECK-NEXT: 2.00 2.00 - 1.00 - 1.00 5.00 4.00 - 4.00 5.00 - - - vmaskmovps %xmm0, %xmm1, (%rax) +# CHECK-NEXT: 4.00 4.00 - 1.00 - 1.00 10.00 8.00 - 8.00 10.00 - - - vmaskmovps %ymm0, %ymm1, (%rax) # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vmaxpd %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmaxpd (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vmaxpd %ymm0, %ymm1, %ymm2 diff --git a/test/tools/llvm-mca/X86/BtVer2/resources-sse2.s b/test/tools/llvm-mca/X86/BtVer2/resources-sse2.s index c9d7f32d4e0..924066ae87b 100644 --- a/test/tools/llvm-mca/X86/BtVer2/resources-sse2.s +++ b/test/tools/llvm-mca/X86/BtVer2/resources-sse2.s @@ -465,7 +465,7 @@ xorpd (%rax), %xmm2 # CHECK-NEXT: 1 19 19.00 divsd %xmm0, %xmm2 # CHECK-NEXT: 1 24 19.00 * divsd (%rax), %xmm2 # CHECK-NEXT: 1 1 1.00 * * U lfence -# CHECK-NEXT: 1 1 1.00 * * U maskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 63 34 21.00 * * U maskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 1 2 1.00 maxpd %xmm0, %xmm2 # CHECK-NEXT: 1 7 1.00 * maxpd (%rax), %xmm2 # CHECK-NEXT: 1 2 1.00 maxsd %xmm0, %xmm2 @@ -693,7 +693,7 @@ xorpd (%rax), %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 17.00 - - 49.00 204.00 128.50 141.50 118.00 - 16.00 54.00 67.50 67.50 12.00 +# CHECK-NEXT: 38.00 21.00 - 50.00 204.00 129.50 142.50 120.00 - 31.00 55.00 67.50 67.50 12.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: 
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: @@ -755,7 +755,7 @@ xorpd (%rax), %xmm2 # CHECK-NEXT: - - - - 19.00 - 1.00 - - - - - - - divsd %xmm0, %xmm2 # CHECK-NEXT: - - - - 19.00 - 1.00 1.00 - - - - - - divsd (%rax), %xmm2 # CHECK-NEXT: - - - - - - - - - 1.00 - - - - lfence -# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - maskmovdqu %xmm0, %xmm1 +# CHECK-NEXT: 21.00 21.00 - 1.00 - 1.00 2.00 2.00 - 16.00 2.00 - - - maskmovdqu %xmm0, %xmm1 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxpd %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - maxpd (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxsd %xmm0, %xmm2 -- 2.40.0