]> granicus.if.org Git - zfs/commitdiff
bpobj_enqueue_subobj() should copy small subobj's
authorMatthew Ahrens <mahrens@delphix.com>
Wed, 31 Oct 2018 16:58:17 +0000 (09:58 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Wed, 31 Oct 2018 16:58:17 +0000 (11:58 -0500)
When we delete a snapshot, we consolidate some bpobj's together because
we no longer need to keep their entries in separate buckets.  This is
done in constant time by including the "sub" bpobj by reference in the
parent bpobj.

After many snapshots have been deleted, we may have many sub-bpobj's.
Usually, most sub-bpobj's don't contain many BP's.  Compared to this
small payload, the sub-bpobj is relatively heavyweight since it is a
object in the MOS.  A common scenario on a long-lived pool is for the
vast majority of MOS objects to be small sub-bpobj's.

To improve this situation, when consolidating bpobj's together,
bpobj_enqueue_subobj() can copy the contents of small bpobj's into the
parent, and then delete the enqueued bpobj, rather than including it by
reference.  Since this copying is limited in size (to one block), the
consolidation is still constant time, though with a larger constant due
to reading in the one block of the enqueued bpobj.

This idea and mechanism are similar to how we handle "sub-subobj's".
When including a sub-bpobj by reference, if the sub-bpobj itself has
less than a block of sub-sub-bpobj's, the list of sub-sub-bpobj's is
copied to the parent bpobj's list of sub-bpobj's.

Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Zuchowski <pzuchowski@datto.com>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #8053
Issue #7908

module/zfs/bpobj.c

index 66ab84df58b2a94be95f043427caf803065628d0..94bd1fd8f0bafb8c2701a652c359fe1dd42352af 100644 (file)
@@ -377,11 +377,98 @@ bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
        return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
 }
 
+/*
+ * Logically add subobj's contents to the parent bpobj.
+ *
+ * In the most general case, this is accomplished in constant time by adding
+ * a reference to subobj.  This case is used when enqueuing a large subobj:
+ * +--------------+                        +--------------+
+ * | bpobj        |----------------------->| subobj list  |
+ * +----+----+----+----+----+              +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp |              | obj | obj | obj |
+ * +----+----+----+----+----+              +-----+-----+-----+
+ *
+ * +--------------+                        +--------------+
+ * | sub-bpobj    |----------------------> | subsubobj    |
+ * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
+ * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
+ *
+ * Result: sub-bpobj added to parent's subobj list.
+ * +--------------+                        +--------------+
+ * | bpobj        |----------------------->| subobj list  |
+ * +----+----+----+----+----+              +-----+-----+--+--+-----+
+ * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
+ * +----+----+----+----+----+              +-----+-----+-----+--|--+
+ *                                                              |
+ *       /-----------------------------------------------------/
+ *       v
+ * +--------------+                        +--------------+
+ * | sub-bpobj    |----------------------> | subsubobj    |
+ * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
+ * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
+ *
+ *
+ * In a common case, the subobj is small: its bp's and its list of subobj's
+ * are each stored in a single block.  In this case we copy the subobj's
+ * contents to the parent:
+ * +--------------+                        +--------------+
+ * | bpobj        |----------------------->| subobj list  |
+ * +----+----+----+----+----+              +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp |              | obj | obj | obj |
+ * +----+----+----+----+----+              +-----+-----+-----+
+ *                          ^                                ^
+ * +--------------+         |              +--------------+  |
+ * | sub-bpobj    |---------^------------> | subsubobj    |  ^
+ * +----+----+----+         |              +-----+-----+--+  |
+ * | BP | BP |-->-->-->-->-/               | OBJ | OBJ |-->-/
+ * +----+----+                             +-----+-----+
+ *
+ * Result: subobj destroyed, contents copied to parent:
+ * +--------------+                        +--------------+
+ * | bpobj        |----------------------->| subobj list  |
+ * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
+ * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
+ * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
+ *
+ *
+ * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
+ * but retain the sub-bpobj:
+ * +--------------+                        +--------------+
+ * | bpobj        |----------------------->| subobj list  |
+ * +----+----+----+----+----+              +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp |              | obj | obj | obj |
+ * +----+----+----+----+----+              +-----+-----+-----+
+ *                                                           ^
+ * +--------------+                        +--------------+  |
+ * | sub-bpobj    |----------------------> | subsubobj    |  ^
+ * +----+----+----+----+---------+----+    +-----+-----+--+  |
+ * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->-/
+ * +----+----+----+----+---------+----+    +-----+-----+
+ *
+ * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
+ * +--------------+                     +--------------+
+ * | bpobj        |-------------------->| subobj list  |
+ * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
+ * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
+ * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
+ *                                                                       |
+ *       /--------------------------------------------------------------/
+ *       v
+ * +--------------+
+ * | sub-bpobj    |
+ * +----+----+----+----+---------+----+
+ * | bp | bp | bp | bp |   ...   | bp |
+ * +----+----+----+----+---------+----+
+ */
 void
 bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 {
        bpobj_t subbpo;
        uint64_t used, comp, uncomp, subsubobjs;
+       boolean_t copy_subsub = B_TRUE;
+       boolean_t copy_bps = B_TRUE;
 
        ASSERT(bpobj_is_open(bpo));
        ASSERT(subobj != 0);
@@ -406,61 +493,110 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
 
        mutex_enter(&bpo->bpo_lock);
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
-       if (bpo->bpo_phys->bpo_subobjs == 0) {
-               bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
-                   DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
-                   DMU_OT_NONE, 0, tx);
-       }
 
-       ASSERTV(dmu_object_info_t doi);
-       ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi));
-       ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+       dmu_object_info_t doi;
 
-       dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
-           bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
-           sizeof (subobj), &subobj, tx);
-       bpo->bpo_phys->bpo_num_subobjs++;
+       if (bpo->bpo_phys->bpo_subobjs != 0) {
+               ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+                   &doi));
+               ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+       }
 
        /*
         * If subobj has only one block of subobjs, then move subobj's
-        * subobjs to bpo's subobj list directly.  This reduces
-        * recursion in bpobj_iterate due to nested subobjs.
+        * subobjs to bpo's subobj list directly.  This reduces recursion in
+        * bpobj_iterate due to nested subobjs.
         */
        subsubobjs = subbpo.bpo_phys->bpo_subobjs;
        if (subsubobjs != 0) {
-               dmu_object_info_t doi;
-
-               VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
-               if (doi.doi_max_offset == doi.doi_data_block_size) {
-                       dmu_buf_t *subdb;
-                       uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
-
-                       VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, subsubobjs,
-                           0, FTAG, &subdb, 0));
-                       /*
-                        * Make sure that we are not asking dmu_write()
-                        * to write more data than we have in our buffer.
-                        */
-                       VERIFY3U(subdb->db_size, >=,
-                           numsubsub * sizeof (subobj));
-                       dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
-                           bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
-                           numsubsub * sizeof (subobj), subdb->db_data, tx);
-                       dmu_buf_rele(subdb, FTAG);
-                       bpo->bpo_phys->bpo_num_subobjs += numsubsub;
-
-                       dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
-                       subbpo.bpo_phys->bpo_subobjs = 0;
-                       VERIFY3U(0, ==, dmu_object_free(bpo->bpo_os,
-                           subsubobjs, tx));
+               VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+               if (doi.doi_max_offset > doi.doi_data_block_size) {
+                       copy_subsub = B_FALSE;
+               }
+       }
+
+       /*
+        * If, in addition to having only one block of subobj's, subobj has
+        * only one block of bp's, then move subobj's bp's to bpo's bp list
+        * directly. This reduces recursion in bpobj_iterate due to nested
+        * subobjs.
+        */
+       VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
+       if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
+               copy_bps = B_FALSE;
+       }
+
+       if (copy_subsub && subsubobjs != 0) {
+               dmu_buf_t *subdb;
+               uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+               VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
+                   0, FTAG, &subdb, 0));
+               /*
+                * Make sure that we are not asking dmu_write()
+                * to write more data than we have in our buffer.
+                */
+               VERIFY3U(subdb->db_size, >=,
+                   numsubsub * sizeof (subobj));
+               if (bpo->bpo_phys->bpo_subobjs == 0) {
+                       bpo->bpo_phys->bpo_subobjs =
+                           dmu_object_alloc(bpo->bpo_os,
+                           DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+                           DMU_OT_NONE, 0, tx);
                }
+               dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+                   bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+                   numsubsub * sizeof (subobj), subdb->db_data, tx);
+               dmu_buf_rele(subdb, FTAG);
+               bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+               dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+               subbpo.bpo_phys->bpo_subobjs = 0;
+               VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
        }
+
+       if (copy_bps) {
+               dmu_buf_t *bps;
+               uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;
+
+               ASSERT(copy_subsub);
+               VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
+                   0, FTAG, &bps, 0));
+
+               /*
+                * Make sure that we are not asking dmu_write()
+                * to write more data than we have in our buffer.
+                */
+               VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
+               dmu_write(bpo->bpo_os, bpo->bpo_object,
+                   bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+                   numbps * sizeof (blkptr_t),
+                   bps->db_data, tx);
+               dmu_buf_rele(bps, FTAG);
+               bpo->bpo_phys->bpo_num_blkptrs += numbps;
+
+               bpobj_close(&subbpo);
+               VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
+       } else {
+               bpobj_close(&subbpo);
+               if (bpo->bpo_phys->bpo_subobjs == 0) {
+                       bpo->bpo_phys->bpo_subobjs =
+                           dmu_object_alloc(bpo->bpo_os,
+                           DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+                           DMU_OT_NONE, 0, tx);
+               }
+
+               dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+                   bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+                   sizeof (subobj), &subobj, tx);
+               bpo->bpo_phys->bpo_num_subobjs++;
+       }
+
        bpo->bpo_phys->bpo_bytes += used;
        bpo->bpo_phys->bpo_comp += comp;
        bpo->bpo_phys->bpo_uncomp += uncomp;
        mutex_exit(&bpo->bpo_lock);
 
-       bpobj_close(&subbpo);
 }
 
 void