From: Paul Dagnelie <pcd@delphix.com>
Date: Fri, 7 Jun 2019 02:10:43 +0000 (-0700)
Subject: Allow metaslab to be unloaded even when not freed from
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=893a6d62c1895f3e3eeb660b048236571995a564;p=zfs

Allow metaslab to be unloaded even when not freed from

On large systems, the memory used by loaded metaslabs can become
a concern. While range trees are a fairly efficient data structure,
on heavily fragmented pools they can still consume a significant
amount of memory. This problem is amplified when we fail to unload
metaslabs that we aren't using. Currently, we only unload a metaslab
during metaslab_sync_done; in order for that function to be called
on a given metaslab in a given txg, we have to have dirtied that
metaslab in that txg. If the dirtying was the result of an allocation,
we wouldn't be unloading it (since it wouldn't be 8 txgs since it
was selected), so in effect we only unload a metaslab during txgs
where it's being freed from.

We move the unload logic from sync_done to a new function, and
call that function on all metaslabs in a given vdev during
vdev_sync_done().

Reviewed-by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #8837
---

diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h
index 2790d06c7..330902529 100644
--- a/include/sys/metaslab.h
+++ b/include/sys/metaslab.h
@@ -50,6 +50,7 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
 void metaslab_fini(metaslab_t *);
 
 int metaslab_load(metaslab_t *);
+void metaslab_potentially_unload(metaslab_t *, uint64_t);
 void metaslab_unload(metaslab_t *);
 
 uint64_t metaslab_allocated_space(metaslab_t *);
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index d1d5a243f..41cbaad5f 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  */
@@ -2949,6 +2949,30 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 	dmu_tx_commit(tx);
 }
 
+void
+metaslab_potentially_unload(metaslab_t *msp, uint64_t txg)
+{
+	/*
+	 * If the metaslab is loaded and we've not tried to load or allocate
+	 * from it in 'metaslab_unload_delay' txgs, then unload it.
+	 */
+	if (msp->ms_loaded &&
+	    msp->ms_disabled == 0 &&
+	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
+		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+			VERIFY0(range_tree_space(
+			    msp->ms_allocating[(txg + t) & TXG_MASK]));
+		}
+		if (msp->ms_allocator != -1) {
+			metaslab_passivate(msp, msp->ms_weight &
+			    ~METASLAB_ACTIVE_MASK);
+		}
+
+		if (!metaslab_debug_unload)
+			metaslab_unload(msp);
+	}
+}
+
 /*
  * Called after a transaction group has completely synced to mark
  * all of the metaslab's free space as usable.
@@ -3086,27 +3110,6 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 */
 	metaslab_recalculate_weight_and_sort(msp);
 
-	/*
-	 * If the metaslab is loaded and we've not tried to load or allocate
-	 * from it in 'metaslab_unload_delay' txgs, then unload it.
-	 */
-	if (msp->ms_loaded &&
-	    msp->ms_disabled == 0 &&
-	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
-
-		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
-			VERIFY0(range_tree_space(
-			    msp->ms_allocating[(txg + t) & TXG_MASK]));
-		}
-		if (msp->ms_allocator != -1) {
-			metaslab_passivate(msp, msp->ms_weight &
-			    ~METASLAB_ACTIVE_MASK);
-		}
-
-		if (!metaslab_debug_unload)
-			metaslab_unload(msp);
-	}
-
 	ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
 	ASSERT0(range_tree_space(msp->ms_freeing));
 	ASSERT0(range_tree_space(msp->ms_freed));
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index 1c4812cd8..81ef87e25 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -3234,6 +3234,20 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
 	    != NULL)
 		metaslab_sync_done(msp, txg);
 
+	/*
+	 * Because this function is only called on dirty vdevs, it's possible
+	 * we won't consider all metaslabs for unloading on every
+	 * txg. However, unless the system is largely idle it is likely that
+	 * we will dirty all vdevs within a few txgs.
+	 */
+	for (int i = 0; i < vd->vdev_ms_count; i++) {
+		msp = vd->vdev_ms[i];
+		mutex_enter(&msp->ms_lock);
+		if (msp->ms_sm != NULL)
+			metaslab_potentially_unload(msp, txg);
+		mutex_exit(&msp->ms_lock);
+	}
+
 	if (reassess)
 		metaslab_sync_reassess(vd->vdev_mg);
 }