granicus.if.org Git - zfs/commitdiff
Stack overflow in recursive bpobj_iterate_impl
author: Paul Zuchowski <31706010+PaulZ-98@users.noreply.github.com>
Wed, 6 Mar 2019 17:50:55 +0000 (12:50 -0500)
committer: Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 6 Mar 2019 17:50:55 +0000 (09:50 -0800)
The function bpobj_iterate_impl overflows the stack when bpobjs
are deeply nested. Rewrite the function to eliminate the recursion.

Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Zuchowski <pzuchowski@datto.com>
Closes #7674
Closes #7675
Closes #7908

include/sys/dmu.h
module/zfs/bpobj.c
module/zfs/dbuf.c

index 63c51ecfb3a58450c07e64fd300089c6be9ff14c..e4c2ebc2f2d7e62b9cb520245a221b7d33742b48 100644 (file)
@@ -740,6 +740,7 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
 
index 94bd1fd8f0bafb8c2701a652c359fe1dd42352af..633801956ea7293b76db6c400f430e450f796c74 100644 (file)
@@ -206,29 +206,73 @@ bpobj_is_empty(bpobj_t *bpo)
            (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
 }
 
-static int
-bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
-    boolean_t free)
+/*
+ * A recursive iteration of the bpobjs would be nice here but we run the risk
+ * of overflowing function stack space.  Instead, find each subobj and add it
+ * to the head of our list so it can be scanned for subobjs.  Like a
+ * recursive implementation, the "deepest" subobjs will be freed first.
+ * When a subobj is found to have no additional subobjs, free it.
+ */
+typedef struct bpobj_info {
+       bpobj_t *bpi_bpo;
+       /*
+        * This object is a subobj of bpi_parent,
+        * at bpi_index in its subobj array.
+        */
+       struct bpobj_info *bpi_parent;
+       uint64_t bpi_index;
+       /* How many of our subobj's are left to process. */
+       uint64_t bpi_unprocessed_subobjs;
+       /* True after having visited this bpo's directly referenced BPs. */
+       boolean_t bpi_visited;
+       list_node_t bpi_node;
+} bpobj_info_t;
+
+static bpobj_info_t *
+bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
 {
-       dmu_object_info_t doi;
-       int epb;
-       int64_t i;
-       int err = 0;
-       dmu_buf_t *dbuf = NULL;
+       bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
+       bpi->bpi_bpo = bpo;
+       bpi->bpi_parent = parent;
+       bpi->bpi_index = index;
+       if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+               bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
+       }
+       return (bpi);
+}
 
-       ASSERT(bpobj_is_open(bpo));
-       mutex_enter(&bpo->bpo_lock);
+/*
+ * Update bpobj and all of its parents with new space accounting.
+ */
+static void
+propagate_space_reduction(bpobj_info_t *bpi, uint64_t freed,
+    uint64_t comp_freed, uint64_t uncomp_freed, dmu_tx_t *tx)
+{
 
-       if (free)
-               dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+       for (; bpi != NULL; bpi = bpi->bpi_parent) {
+               bpobj_t *p = bpi->bpi_bpo;
+               ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
+               p->bpo_phys->bpo_bytes -= freed;
+               ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
+               if (p->bpo_havecomp) {
+                       p->bpo_phys->bpo_comp -= comp_freed;
+                       p->bpo_phys->bpo_uncomp -= uncomp_freed;
+               }
+       }
+}
 
-       for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
-               blkptr_t *bparray;
-               blkptr_t *bp;
-               uint64_t offset, blkoff;
+static int
+bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
+    dmu_tx_t *tx, boolean_t free)
+{
+       int err = 0;
+       uint64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
+       dmu_buf_t *dbuf = NULL;
+       bpobj_t *bpo = bpi->bpi_bpo;
 
-               offset = i * sizeof (blkptr_t);
-               blkoff = P2PHASE(i, bpo->bpo_epb);
+       for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+               uint64_t offset = i * sizeof (blkptr_t);
+               uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
 
                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
@@ -242,119 +286,199 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
 
-               bparray = dbuf->db_data;
-               bp = &bparray[blkoff];
+               blkptr_t *bparray = dbuf->db_data;
+               blkptr_t *bp = &bparray[blkoff];
                err = func(arg, bp, tx);
                if (err)
                        break;
+
                if (free) {
-                       bpo->bpo_phys->bpo_bytes -=
-                           bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
-                       ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
-                       if (bpo->bpo_havecomp) {
-                               bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
-                               bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
-                       }
+                       spa_t *spa = dmu_objset_spa(bpo->bpo_os);
+                       freed += bp_get_dsize_sync(spa, bp);
+                       comp_freed += BP_GET_PSIZE(bp);
+                       uncomp_freed += BP_GET_UCSIZE(bp);
+                       ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
                        bpo->bpo_phys->bpo_num_blkptrs--;
                        ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
                }
        }
+       if (free) {
+               propagate_space_reduction(bpi, freed, comp_freed,
+                   uncomp_freed, tx);
+               VERIFY0(dmu_free_range(bpo->bpo_os,
+                   bpo->bpo_object,
+                   bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+                   DMU_OBJECT_END, tx));
+       }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
-       if (free) {
-               VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
-                   (i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx));
-       }
-       if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
-               goto out;
+       return (err);
+}
 
-       ASSERT(bpo->bpo_havecomp);
-       err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
-       if (err) {
-               mutex_exit(&bpo->bpo_lock);
-               return (err);
-       }
-       ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
-       epb = doi.doi_data_block_size / sizeof (uint64_t);
+/*
+ * Given an initial bpo, start by freeing the BPs that are directly referenced
+ * by that bpo. If the bpo has subobjs, read in its last subobj and push the
+ * subobj to our stack. By popping items off our stack, eventually we will
+ * encounter a bpo that has no subobjs.  We can free its bpobj_info_t, and if
+ * requested also free the now-empty bpo from disk and decrement
+ * its parent's subobj count. We continue popping each subobj from our stack,
+ * visiting its last subobj until they too have no more subobjs, and so on.
+ */
+static int
+bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
+    dmu_tx_t *tx, boolean_t free)
+{
+       list_t stack;
+       bpobj_info_t *bpi;
+       int err = 0;
 
-       for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
-               uint64_t *objarray;
-               uint64_t offset, blkoff;
-               bpobj_t sublist;
-               uint64_t used_before, comp_before, uncomp_before;
-               uint64_t used_after, comp_after, uncomp_after;
+       /*
+        * Create a "stack" for us to work with without worrying about
+        * stack overflows. Initialize it with the initial_bpo.
+        */
+       list_create(&stack, sizeof (bpobj_info_t),
+           offsetof(bpobj_info_t, bpi_node));
+       mutex_enter(&initial_bpo->bpo_lock);
+       list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
 
-               offset = i * sizeof (uint64_t);
-               blkoff = P2PHASE(i, epb);
+       while ((bpi = list_head(&stack)) != NULL) {
+               bpobj_t *bpo = bpi->bpi_bpo;
 
-               if (dbuf == NULL || dbuf->db_offset > offset) {
-                       if (dbuf)
-                               dmu_buf_rele(dbuf, FTAG);
-                       err = dmu_buf_hold(bpo->bpo_os,
-                           bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
-                       if (err)
+               ASSERT3P(bpo, !=, NULL);
+               ASSERT(MUTEX_HELD(&bpo->bpo_lock));
+               ASSERT(bpobj_is_open(bpo));
+
+               if (free)
+                       dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+               if (bpi->bpi_visited == B_FALSE) {
+                       err = bpobj_iterate_blkptrs(bpi, func, arg, tx, free);
+                       bpi->bpi_visited = B_TRUE;
+                       if (err != 0)
                                break;
                }
+               /*
+                * We've finished with this bpo's directly-referenced BP's and
+                * it has no more unprocessed subobjs. We can free its
+                * bpobj_info_t (unless it is the topmost, initial_bpo).
+                * If we are freeing from disk, we can also do that.
+                */
+               if (bpi->bpi_unprocessed_subobjs == 0) {
+                       /*
+                        * If there are no entries, there should
+                        * be no bytes.
+                        */
+                       if (bpobj_is_empty(bpo)) {
+                               ASSERT0(bpo->bpo_phys->bpo_bytes);
+                               ASSERT0(bpo->bpo_phys->bpo_comp);
+                               ASSERT0(bpo->bpo_phys->bpo_uncomp);
+                       }
 
-               ASSERT3U(offset, >=, dbuf->db_offset);
-               ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+                       /* The initial_bpo has no parent and is not closed. */
+                       if (bpi->bpi_parent != NULL) {
+                               if (free) {
+                                       bpobj_t *p = bpi->bpi_parent->bpi_bpo;
+
+                                       ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
+                                       ASSERT3U(p->bpo_phys->bpo_num_subobjs,
+                                           >, 0);
+                                       ASSERT3U(bpi->bpi_index, ==,
+                                           p->bpo_phys->bpo_num_subobjs - 1);
+                                       ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
+                                           tx));
+
+                                       p->bpo_phys->bpo_num_subobjs--;
+
+                                       VERIFY0(dmu_free_range(p->bpo_os,
+                                           p->bpo_phys->bpo_subobjs,
+                                           bpi->bpi_index * sizeof (uint64_t),
+                                           sizeof (uint64_t), tx));
+
+                                       /* eliminate the empty subobj list */
+                                       if (bpo->bpo_havesubobj &&
+                                           bpo->bpo_phys->bpo_subobjs != 0) {
+                                               ASSERT0(bpo->bpo_phys->
+                                                   bpo_num_subobjs);
+                                               err = dmu_object_free(
+                                                   bpo->bpo_os,
+                                                   bpo->bpo_phys->bpo_subobjs,
+                                                   tx);
+                                               if (err)
+                                                       break;
+                                               bpo->bpo_phys->bpo_subobjs = 0;
+                                       }
+                                       err = dmu_object_free(p->bpo_os,
+                                           bpo->bpo_object, tx);
+                                       if (err)
+                                               break;
+                               }
+
+                               mutex_exit(&bpo->bpo_lock);
+                               bpobj_close(bpo);
+                               kmem_free(bpo, sizeof (bpobj_t));
+                       } else {
+                               mutex_exit(&bpo->bpo_lock);
+                       }
 
-               objarray = dbuf->db_data;
-               err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
-               if (err)
-                       break;
-               if (free) {
-                       err = bpobj_space(&sublist,
-                           &used_before, &comp_before, &uncomp_before);
-                       if (err != 0) {
-                               bpobj_close(&sublist);
+                       /*
+                        * Finished processing this bpo. Unlock, and free
+                        * our "stack" info.
+                        */
+                       list_remove_head(&stack);
+                       kmem_free(bpi, sizeof (bpobj_info_t));
+               } else {
+                       /*
+                        * We have unprocessed subobjs. Process the next one.
+                        */
+                       ASSERT(bpo->bpo_havecomp);
+
+                       /* Add the last subobj to stack. */
+                       int64_t i = bpi->bpi_unprocessed_subobjs - 1;
+                       uint64_t offset = i * sizeof (uint64_t);
+
+                       uint64_t obj_from_sublist;
+                       err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+                           offset, sizeof (uint64_t), &obj_from_sublist,
+                           DMU_READ_PREFETCH);
+                       if (err)
                                break;
-                       }
-               }
-               err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
-               if (free) {
-                       VERIFY3U(0, ==, bpobj_space(&sublist,
-                           &used_after, &comp_after, &uncomp_after));
-                       bpo->bpo_phys->bpo_bytes -= used_before - used_after;
-                       ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
-                       bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
-                       bpo->bpo_phys->bpo_uncomp -=
-                           uncomp_before - uncomp_after;
-               }
+                       bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
+                           KM_SLEEP);
 
-               bpobj_close(&sublist);
-               if (err)
-                       break;
-               if (free) {
-                       err = dmu_object_free(bpo->bpo_os,
-                           objarray[blkoff], tx);
+                       err = bpobj_open(sublist, bpo->bpo_os,
+                           obj_from_sublist);
                        if (err)
                                break;
-                       bpo->bpo_phys->bpo_num_subobjs--;
-                       ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+
+                       list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
+                       mutex_enter(&sublist->bpo_lock);
+                       bpi->bpi_unprocessed_subobjs--;
                }
        }
-       if (dbuf) {
-               dmu_buf_rele(dbuf, FTAG);
-               dbuf = NULL;
-       }
-       if (free) {
-               VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
-                   bpo->bpo_phys->bpo_subobjs,
-                   (i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx));
-       }
+       /*
+        * Cleanup anything left on the "stack" after we left the loop.
+        * Every bpo on the stack is locked so we must remember to undo
+        * that now (in LIFO order).
+        */
+       while ((bpi = list_remove_head(&stack)) != NULL) {
+               bpobj_t *bpo = bpi->bpi_bpo;
+               ASSERT(err != 0);
+               ASSERT3P(bpo, !=, NULL);
 
-out:
-       /* If there are no entries, there should be no bytes. */
-       if (bpobj_is_empty(bpo)) {
-               ASSERT0(bpo->bpo_phys->bpo_bytes);
-               ASSERT0(bpo->bpo_phys->bpo_comp);
-               ASSERT0(bpo->bpo_phys->bpo_uncomp);
+               mutex_exit(&bpo->bpo_lock);
+
+               /* do not free the initial_bpo */
+               if (bpi->bpi_parent != NULL) {
+                       bpobj_close(bpi->bpi_bpo);
+                       kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
+               }
+               kmem_free(bpi, sizeof (bpobj_info_t));
        }
 
-       mutex_exit(&bpo->bpo_lock);
+       list_destroy(&stack);
+
        return (err);
 }
 
index 9f3c9bfd5b95e6e94720ede601d51a96ee291c06..28ff5fc7e4ad1565d2035f3fcc3ec7bd186efd53 100644 (file)
@@ -2311,6 +2311,23 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
            DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
 }
 
+boolean_t
+dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+       mutex_enter(&db->db_mtx);
+       for (dbuf_dirty_record_t *dr = db->db_last_dirty;
+           dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+               if (dr->dr_txg == tx->tx_txg) {
+                       mutex_exit(&db->db_mtx);
+                       return (B_TRUE);
+               }
+       }
+       mutex_exit(&db->db_mtx);
+       return (B_FALSE);
+}
+
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
@@ -4564,6 +4581,7 @@ EXPORT_SYMBOL(dbuf_release_bp);
 EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_is_dirty);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
 EXPORT_SYMBOL(dmu_buf_will_fill);
 EXPORT_SYMBOL(dmu_buf_fill_done);