Illumos 4390 - I/O errors can corrupt space map when deleting fs/vol
author     Matthew Ahrens <mahrens@delphix.com>
           Thu, 5 Jun 2014 21:20:08 +0000 (13:20 -0800)
committer  Brian Behlendorf <behlendorf1@llnl.gov>
           Mon, 4 Aug 2014 18:50:52 +0000 (11:50 -0700)
4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4390
  https://github.com/illumos/illumos-gate/commit/7fd05ac

Porting notes:

Previous stack-reduction efforts in traverse_visitbp() left a fair number
of hunks that do not merge cleanly with the upstream change.  This patch
should reduce the function's stack footprint a bit further.

The new local bptree_entry_phys_t in bptree_add() is dynamically allocated
with kmem_zalloc(), again to reduce stack usage (a sketch of the pattern
follows).
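
As a point of reference, here is a minimal sketch of that pattern
(illustrative only; the committed change is in the module/zfs/bptree.c
hunk below):

    bptree_entry_phys_t *bte;

    /* Allocate the entry from the kernel heap rather than the stack. */
    bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
    bte->be_birth_txg = birth_txg;
    bte->be_bp = *bp;
    /* be_zb is already zeroed by kmem_zalloc(), so no explicit bzero(). */
    dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
    kmem_free(bte, sizeof (*bte));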

The new zfs_free_leak_on_eio global is defined as an int rather than a
boolean_t, matching the existing zfs_recover global.  Its definition has
also been placed in zfs_debug.c, alongside zfs_recover, for consistency;
Illumos placed it in spa_misc.c.
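
Under Linux the new tunable is exposed as a module parameter; the wiring
looks like the following sketch (it mirrors the module/zfs/zfs_debug.c
hunk below):

    /* 0 = stall the async destroy on EIO (default); nonzero = leak and continue. */
    int zfs_free_leak_on_eio = B_FALSE;

    module_param(zfs_free_leak_on_eio, int, 0644);
    MODULE_PARM_DESC(zfs_free_leak_on_eio,
        "Set to ignore IO errors during free and permanently leak the space");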

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2545
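
For context, the change also registers a read-only "leaked" pool property
(see the zpool_prop.c and libzfs_pool.c hunks below).  A hypothetical
userland sketch, assuming a build environment with the libzfs headers and
library from this change, that resolves the new enum value to its
property name:

    #include <libzfs.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* zpool_prop_to_name() maps ZPOOL_PROP_LEAKED to the string "leaked". */
        (void) printf("%s\n", zpool_prop_to_name(ZPOOL_PROP_LEAKED));
        return (0);
    }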

17 files changed:
include/sys/bptree.h
include/sys/dmu.h
include/sys/dsl_dir.h
include/sys/dsl_pool.h
include/sys/dsl_scan.h
include/sys/fs/zfs.h
include/sys/zfs_debug.h
lib/libzfs/libzfs_pool.c
man/man5/zfs-module-parameters.5
module/zcommon/zpool_prop.c
module/zfs/bptree.c
module/zfs/dmu_traverse.c
module/zfs/dsl_pool.c
module/zfs/dsl_scan.c
module/zfs/spa.c
module/zfs/zfs_debug.c
module/zfs/zio.c

index 97150721187536b7323ea49769e093b74cc9bfa8..a533cb949021e475dcb7d427266957783e41f066 100644 (file)
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_BPTREE_H
@@ -50,6 +50,7 @@ typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
 int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
 
 void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
index 89a0e5bd7a93a8f69e1b74a59520f53a4ff5a748..aa3e8f25aa747a7f236c4c79485c5fefeeb03f75 100644 (file)
@@ -250,7 +250,6 @@ void zfs_znode_byteswap(void *buf, size_t size);
 
 #define        DMU_USERUSED_OBJECT     (-1ULL)
 #define        DMU_GROUPUSED_OBJECT    (-2ULL)
-#define        DMU_DEADLIST_OBJECT     (-3ULL)
 
 /*
  * artificial blkids for bonus buffer and spill blocks
index d69d47696f230ae2f396b8b31ea61f881967b86a..3aa77523293bab4f8cc777d5705bbc73e8b88fc8 100644 (file)
@@ -144,6 +144,7 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
 #define        ORIGIN_DIR_NAME "$ORIGIN"
 #define        XLATION_DIR_NAME "$XLATION"
 #define        FREE_DIR_NAME "$FREE"
+#define        LEAK_DIR_NAME "$LEAK"
 
 #ifdef ZFS_DEBUG
 #define        dprintf_dd(dd, fmt, ...) do { \
index d5bad8dc1919d3898ccdeec16a8de202a338a922..34dc65ba40ea25c02b8b3c8b66c3d05f7ee56033 100644 (file)
@@ -87,6 +87,7 @@ typedef struct dsl_pool {
        struct dsl_dir *dp_root_dir;
        struct dsl_dir *dp_mos_dir;
        struct dsl_dir *dp_free_dir;
+       struct dsl_dir *dp_leak_dir;
        struct dsl_dataset *dp_origin_snap;
        uint64_t dp_root_dir_obj;
        struct taskq *dp_iput_taskq;
index bcb85d67d38e3a0cbaecdcecc1c9156855b1cea2..de6a7d17a3efa9c57b739acf3577920881868b63 100644 (file)
@@ -116,6 +116,7 @@ typedef struct dsl_scan {
        /* for freeing blocks */
        boolean_t scn_is_bptree;
        boolean_t scn_async_destroying;
+       boolean_t scn_async_stalled;
 
        /* for debugging / information */
        uint64_t scn_visited_this_txg;
index 5371e12efa7eba6d55de0c975c28fe20b014368f..227d8b2fbd8848498f80cdab947f453e7d92298b 100644 (file)
@@ -193,6 +193,7 @@ typedef enum {
        ZPOOL_PROP_COMMENT,
        ZPOOL_PROP_EXPANDSZ,
        ZPOOL_PROP_FREEING,
+       ZPOOL_PROP_LEAKED,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
index e51207955555fe0827f4bc4b54a87e9b4a2bf7fc..829b37a4674fd056c41346d0a34fd5cb9c52595f 100644 (file)
@@ -48,6 +48,7 @@ extern "C" {
 
 extern int zfs_flags;
 extern int zfs_recover;
+extern int zfs_free_leak_on_eio;
 
 #define        ZFS_DEBUG_DPRINTF       (1<<0)
 #define        ZFS_DEBUG_DBUF_VERIFY   (1<<1)
index a5d2c28affe4f4a05bc615e85a891d4bc63c46d2..fce8fa96d2ceb798b0e09e2cd893bd99a0bc80cd 100644 (file)
@@ -316,6 +316,7 @@ zpool_get_prop_literal(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
                case ZPOOL_PROP_ALLOCATED:
                case ZPOOL_PROP_FREE:
                case ZPOOL_PROP_FREEING:
+               case ZPOOL_PROP_LEAKED:
                case ZPOOL_PROP_EXPANDSZ:
                case ZPOOL_PROP_ASHIFT:
                        if (literal)
index d3aa2e6b043b12cae298bc87ccdb41c069a72143..2f87f114c50bab4f8841e1144bd55e9ad5a26610 100644 (file)
@@ -696,6 +696,43 @@ Set additional debugging flags
 Default value: \fB1\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_free_leak_on_eio\fR (int)
+.ad
+.RS 12n
+If destroy encounters an EIO while reading metadata (e.g. indirect
+blocks), space referenced by the missing metadata can not be freed.
+Normally this causes the background destroy to become "stalled", as
+it is unable to make forward progress.  While in this stalled state,
+all remaining space to free from the error-encountering filesystem is
+"temporarily leaked".  Set this flag to cause it to ignore the EIO,
+permanently leak the space from indirect blocks that can not be read,
+and continue to free everything else that it can.
+
+The default, "stalling" behavior is useful if the storage partially
+fails (i.e. some but not all i/os fail), and then later recovers.  In
+this case, we will be able to continue pool operations while it is
+partially failed, and when it recovers, we can continue to free the
+space, with no leaks.  However, note that this case is actually
+fairly rare.
+
+Typically pools either (a) fail completely (but perhaps temporarily,
+e.g. a top-level vdev going offline), or (b) have localized,
+permanent errors (e.g. disk returns the wrong data due to bit flip or
+firmware bug).  In case (a), this setting does not matter because the
+pool will be suspended and the sync thread will not be able to make
+forward progress regardless.  In case (b), because the error is
+permanent, the best we can do is leak the minimum amount of space,
+which is what setting this flag will do.  Therefore, it is reasonable
+for this flag to normally be set, but we chose the more conservative
+approach of not setting it, so that there is no possibility of
+leaking space in the "partial temporary" failure case.
+.sp
+Default value: \fB0\fR.
+.RE
+
 .sp
 .ne 2
 .na
index 1173fc0c9cf248f1689de2b5bae31de5c2952ee2..6775c09d3473211bfca2d5ee8c257edecdd0604e 100644 (file)
@@ -81,6 +81,8 @@ zpool_prop_init(void)
            ZFS_TYPE_POOL, "<size>", "FREE");
        zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
            ZFS_TYPE_POOL, "<size>", "FREEING");
+       zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
+           ZFS_TYPE_POOL, "<size>", "LEAKED");
        zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
            PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
        zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
index 83f365864dec2f3600cab8baecdaf1f1d21406ee..cbe8d1caaed4ec667608263eccdbbb5fed428764 100644 (file)
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
        return (dmu_object_free(os, obj, tx));
 }
 
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+       dmu_buf_t *db;
+       bptree_phys_t *bt;
+       boolean_t rv;
+
+       VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+       bt = db->db_data;
+       rv = (bt->bt_begin == bt->bt_end);
+       dmu_buf_rele(db, FTAG);
+       return (rv);
+}
+
 void
 bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
     uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
 {
        dmu_buf_t *db;
        bptree_phys_t *bt;
-       bptree_entry_phys_t bte;
+       bptree_entry_phys_t *bte;
 
        /*
         * bptree objects are in the pool mos, therefore they can only be
@@ -120,10 +134,11 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
        VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
        bt = db->db_data;
 
-       bte.be_birth_txg = birth_txg;
-       bte.be_bp = *bp;
-       bzero(&bte.be_zb, sizeof (bte.be_zb));
-       dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
+       bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE);
+       bte->be_birth_txg = birth_txg;
+       bte->be_bp = *bp;
+       dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
+       kmem_free(bte, sizeof (*bte));
 
        dmu_buf_will_dirty(db, tx);
        bt->bt_end++;
@@ -153,10 +168,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        return (err);
 }
 
+/*
+ * If "free" is set:
+ *  - It is assumed that "func" will be freeing the block pointers.
+ *  - If "func" returns nonzero, the bookmark will be remembered and
+ *    iteration will be restarted from this point on next invocation.
+ *  - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ *    bptree_iterate will remember the bookmark, continue traversing
+ *    any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
 int
 bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
     void *arg, dmu_tx_t *tx)
 {
+       boolean_t ioerr = B_FALSE;
        int err;
        uint64_t i;
        dmu_buf_t *db;
@@ -182,49 +214,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
                bptree_entry_phys_t bte;
                int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
 
-               ASSERT(!free || i == ba.ba_phys->bt_begin);
-
                err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
                    &bte, DMU_READ_NO_PREFETCH);
                if (err != 0)
                        break;
 
-               if (zfs_recover)
+               if (zfs_free_leak_on_eio)
                        flags |= TRAVERSE_HARD;
+               zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
+                   "bookmark %lld/%lld/%lld/%lld",
+                   i, (longlong_t)bte.be_birth_txg,
+                   (longlong_t)bte.be_zb.zb_objset,
+                   (longlong_t)bte.be_zb.zb_object,
+                   (longlong_t)bte.be_zb.zb_level,
+                   (longlong_t)bte.be_zb.zb_blkid);
                err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
                    bte.be_birth_txg, &bte.be_zb, flags,
                    bptree_visit_cb, &ba);
                if (free) {
-                       if (err == ERESTART) {
+                       /*
+                        * The callback has freed the visited block pointers.
+                        * Record our traversal progress on disk, either by
+                        * updating this record's bookmark, or by logically
+                        * removing this record by advancing bt_begin.
+                        */
+                       if (err != 0) {
                                /* save bookmark for future resume */
                                ASSERT3U(bte.be_zb.zb_objset, ==,
                                    ZB_DESTROYED_OBJSET);
                                ASSERT0(bte.be_zb.zb_level);
                                dmu_write(os, obj, i * sizeof (bte),
                                    sizeof (bte), &bte, tx);
-                               break;
-                       }
-                       if (err != 0) {
+                               if (err == EIO || err == ECKSUM ||
+                                   err == ENXIO) {
+                                       /*
+                                        * Skip the rest of this tree and
+                                        * continue on to the next entry.
+                                        */
+                                       err = 0;
+                                       ioerr = B_TRUE;
+                               } else {
+                                       break;
+                               }
+                       } else if (ioerr) {
                                /*
-                                * We can not properly handle an i/o
-                                * error, because the traversal code
-                                * does not know how to resume from an
-                                * arbitrary bookmark.
+                                * This entry is finished, but there were
+                                * i/o errors on previous entries, so we
+                                * can't adjust bt_begin.  Set this entry's
+                                * be_birth_txg such that it will be
+                                * treated as a no-op in future traversals.
                                 */
-                               zfs_panic_recover("error %u from "
-                                   "traverse_dataset_destroyed()", err);
+                               bte.be_birth_txg = UINT64_MAX;
+                               dmu_write(os, obj, i * sizeof (bte),
+                                   sizeof (bte), &bte, tx);
                        }
 
-                       ba.ba_phys->bt_begin++;
-                       (void) dmu_free_range(os, obj,
-                           i * sizeof (bte), sizeof (bte), tx);
+                       if (!ioerr) {
+                               ba.ba_phys->bt_begin++;
+                               (void) dmu_free_range(os, obj,
+                                   i * sizeof (bte), sizeof (bte), tx);
+                       }
+               } else if (err != 0) {
+                       break;
                }
        }
 
-       ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+       ASSERT(!free || err != 0 || ioerr ||
+           ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
 
        /* if all blocks are free there should be no used space */
        if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+               if (zfs_free_leak_on_eio) {
+                       ba.ba_phys->bt_bytes = 0;
+                       ba.ba_phys->bt_comp = 0;
+                       ba.ba_phys->bt_uncomp = 0;
+               }
+
                ASSERT0(ba.ba_phys->bt_bytes);
                ASSERT0(ba.ba_phys->bt_comp);
                ASSERT0(ba.ba_phys->bt_uncomp);
index e086e2487e8ae648348ae883356d188c81e8e871..fe352469b012a9ba04daea3537c354c247848a82 100644 (file)
@@ -58,12 +58,11 @@ typedef struct traverse_data {
        zbookmark_t *td_resume;
        int td_flags;
        prefetch_data_t *td_pfd;
+       boolean_t td_paused;
        blkptr_cb_t *td_func;
        void *td_arg;
 } traverse_data_t;
 
-#define        TD_HARD(td)     (td->td_flags & TRAVERSE_HARD)
-
 static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object);
 static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
@@ -165,7 +164,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
                 * If we found the block we're trying to resume from, zero
                 * the bookmark out to indicate that we have resumed.
                 */
-               ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
                if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
                        bzero(td->td_resume, sizeof (*zb));
                        if (td->td_flags & TRAVERSE_POST)
@@ -175,14 +173,6 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
        return (RESUME_SKIP_NONE);
 }
 
-static void
-traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
-{
-       ASSERT(td->td_resume != NULL);
-       ASSERT0(zb->zb_level);
-       bcopy(zb, td->td_resume, sizeof (*td->td_resume));
-}
-
 static void
 traverse_prefetch_metadata(traverse_data_t *td,
     const blkptr_t *bp, const zbookmark_t *zb)
@@ -211,9 +201,8 @@ static int
 traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
     const blkptr_t *bp, const zbookmark_t *zb)
 {
-       int err = 0, lasterr = 0;
+       int err = 0;
        arc_buf_t *buf = NULL;
-       boolean_t pause = B_FALSE;
 
        switch (resume_skip_check(td, dnp, zb)) {
        case RESUME_SKIP_ALL:
@@ -252,7 +241,9 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 
        if (BP_IS_HOLE(bp)) {
                err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
-               return (err);
+               if (err != 0)
+                       goto post;
+               return (0);
        }
 
        if (td->td_pfd && !td->td_pfd->pd_exited &&
@@ -273,8 +264,6 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                    td->td_arg);
                if (err == TRAVERSE_VISIT_NO_CHILDREN)
                        return (0);
-               if (err == ERESTART)
-                       pause = B_TRUE; /* handle pausing at a common point */
                if (err != 0)
                        goto post;
        }
@@ -288,7 +277,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err != 0)
-                       return (err);
+                       goto post;
 
                czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE);
 
@@ -307,11 +296,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                            zb->zb_blkid * epb + i);
                        err = traverse_visitbp(td, dnp,
                            &((blkptr_t *)buf->b_data)[i], czb);
-                       if (err != 0) {
-                               if (!TD_HARD(td))
-                                       break;
-                               lasterr = err;
-                       }
+                       if (err != 0)
+                               break;
                }
 
                kmem_free(czb, sizeof (zbookmark_t));
@@ -324,7 +310,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err != 0)
-                       return (err);
+                       goto post;
                dnp = buf->b_data;
 
                for (i = 0; i < epb; i++) {
@@ -336,11 +322,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                for (i = 0; i < epb; i++) {
                        err = traverse_dnode(td, &dnp[i], zb->zb_objset,
                            zb->zb_blkid * epb + i);
-                       if (err != 0) {
-                               if (!TD_HARD(td))
-                                       break;
-                               lasterr = err;
-                       }
+                       if (err != 0)
+                               break;
                }
        } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
                uint32_t flags = ARC_WAIT;
@@ -350,7 +333,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                if (err != 0)
-                       return (err);
+                       goto post;
 
                osp = buf->b_data;
                dnp = &osp->os_meta_dnode;
@@ -365,19 +348,11 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 
                err = traverse_dnode(td, dnp, zb->zb_objset,
                    DMU_META_DNODE_OBJECT);
-               if (err && TD_HARD(td)) {
-                       lasterr = err;
-                       err = 0;
-               }
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
                        dnp = &osp->os_groupused_dnode;
                        err = traverse_dnode(td, dnp, zb->zb_objset,
                            DMU_GROUPUSED_OBJECT);
                }
-               if (err && TD_HARD(td)) {
-                       lasterr = err;
-                       err = 0;
-               }
                if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
                        dnp = &osp->os_userused_dnode;
                        err = traverse_dnode(td, dnp, zb->zb_objset,
@@ -389,19 +364,37 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                (void) arc_buf_remove_ref(buf, &buf);
 
 post:
-       if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+       if (err == 0 && (td->td_flags & TRAVERSE_POST))
                err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
-               if (err == ERESTART)
-                       pause = B_TRUE;
+
+       if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
+               /*
+                * Ignore this disk error as requested by the HARD flag,
+                * and continue traversal.
+                */
+               err = 0;
        }
 
-       if (pause && td->td_resume != NULL) {
-               ASSERT3U(err, ==, ERESTART);
-               ASSERT(!TD_HARD(td));
-               traverse_pause(td, zb);
+       /*
+        * If we are stopping here, set td_resume.
+        */
+       if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+               td->td_resume->zb_objset = zb->zb_objset;
+               td->td_resume->zb_object = zb->zb_object;
+               td->td_resume->zb_level = 0;
+               /*
+                * If we have stopped on an indirect block (e.g. due to
+                * i/o error), we have not visited anything below it.
+                * Set the bookmark to the first level-0 block that we need
+                * to visit.  This way, the resuming code does not need to
+                * deal with resuming from indirect blocks.
+                */
+               td->td_resume->zb_blkid = zb->zb_blkid <<
+                   (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+               td->td_paused = B_TRUE;
        }
 
-       return (err != 0 ? err : lasterr);
+       return (err);
 }
 
 static void
@@ -426,29 +419,21 @@ static int
 traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
     uint64_t objset, uint64_t object)
 {
-       int j, err = 0, lasterr = 0;
+       int j, err = 0;
        zbookmark_t czb;
 
        for (j = 0; j < dnp->dn_nblkptr; j++) {
                SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
                err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
-               if (err != 0) {
-                       if (!TD_HARD(td))
-                               break;
-                       lasterr = err;
-               }
+               if (err != 0)
+                       break;
        }
 
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
                SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
                err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
-               if (err != 0) {
-                       if (!TD_HARD(td))
-                               return (err);
-                       lasterr = err;
-               }
        }
-       return (err != 0 ? err : lasterr);
+       return (err);
 }
 
 /* ARGSUSED */
@@ -539,6 +524,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
        td->td_arg = arg;
        td->td_pfd = pd;
        td->td_flags = flags;
+       td->td_paused = B_FALSE;
 
        pd->pd_blks_max = zfs_pd_blks_max;
        pd->pd_flags = flags;
@@ -617,7 +603,7 @@ int
 traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
     blkptr_cb_t func, void *arg)
 {
-       int err, lasterr = 0;
+       int err;
        uint64_t obj;
        dsl_pool_t *dp = spa_get_dsl(spa);
        objset_t *mos = dp->dp_meta_objset;
@@ -630,16 +616,15 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
                return (err);
 
        /* visit each dataset */
-       for (obj = 1; err == 0 || (err != ESRCH && hard);
+       for (obj = 1; err == 0;
            err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
                dmu_object_info_t doi;
 
                err = dmu_object_info(mos, obj, &doi);
                if (err != 0) {
-                       if (!hard)
-                               return (err);
-                       lasterr = err;
-                       continue;
+                       if (hard)
+                               continue;
+                       break;
                }
 
                if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
@@ -650,25 +635,21 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
                        err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
                        dsl_pool_config_exit(dp, FTAG);
                        if (err != 0) {
-                               if (!hard)
-                                       return (err);
-                               lasterr = err;
-                               continue;
+                               if (hard)
+                                       continue;
+                               break;
                        }
                        if (ds->ds_phys->ds_prev_snap_txg > txg)
                                txg = ds->ds_phys->ds_prev_snap_txg;
                        err = traverse_dataset(ds, txg, flags, func, arg);
                        dsl_dataset_rele(ds, FTAG);
-                       if (err != 0) {
-                               if (!hard)
-                                       return (err);
-                               lasterr = err;
-                       }
+                       if (err != 0)
+                               break;
                }
        }
        if (err == ESRCH)
                err = 0;
-       return (err != 0 ? err : lasterr);
+       return (err);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPL)
index 91476cc0b6b741834b603ee241f3bb9ea3326ffb..f1ab29c6214ce9cb192ec0953b8a465f9522660b 100644 (file)
@@ -245,6 +245,13 @@ dsl_pool_open(dsl_pool_t *dp)
                    dp->dp_meta_objset, obj));
        }
 
+       /*
+        * Note: errors ignored, because the leak dir will not exist if we
+        * have not encountered a leak yet.
+        */
+       (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+           &dp->dp_leak_dir);
+
        if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
                err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
@@ -292,6 +299,8 @@ dsl_pool_close(dsl_pool_t *dp)
                dsl_dir_rele(dp->dp_mos_dir, dp);
        if (dp->dp_free_dir)
                dsl_dir_rele(dp->dp_free_dir, dp);
+       if (dp->dp_leak_dir)
+               dsl_dir_rele(dp->dp_leak_dir, dp);
        if (dp->dp_root_dir)
                dsl_dir_rele(dp->dp_root_dir, dp);
 
index f03f1f54b450ba2994b6b3c563a65c8f1249b376..0f0243b310a01df1599dee45d2e2e4b7334b7ceb 100644 (file)
@@ -65,7 +65,7 @@ int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
 int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-int zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
+int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
 
@@ -1417,7 +1417,7 @@ dsl_scan_active(dsl_scan_t *scn)
        if (spa_shutting_down(spa))
                return (B_FALSE);
        if (scn->scn_phys.scn_state == DSS_SCANNING ||
-           scn->scn_async_destroying)
+           (scn->scn_async_destroying && !scn->scn_async_stalled))
                return (B_TRUE);
 
        if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
@@ -1432,7 +1432,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
        dsl_scan_t *scn = dp->dp_scan;
        spa_t *spa = dp->dp_spa;
-       int err;
+       int err = 0;
 
        /*
         * Check for scn_restart_txg before checking spa_load_state, so
@@ -1450,7 +1450,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                dsl_scan_setup_sync(&func, tx);
        }
 
-       if (!dsl_scan_active(scn) ||
+       /*
+        * If the scan is inactive due to a stalled async destroy, try again.
+        */
+       if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) ||
            spa_sync_pass(dp->dp_spa) > 1)
                return;
 
@@ -1460,10 +1463,11 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        spa->spa_scrub_active = B_TRUE;
 
        /*
-        * First process the free list.  If we pause the free, don't do
-        * any scanning.  This ensures that there is no free list when
-        * we are scanning, so the scan code doesn't have to worry about
-        * traversing it.
+        * First process the async destroys.  If we pause, don't do
+        * any scrubbing or resilvering.  This ensures that there are no
+        * async destroys while we are scanning, so the scan code doesn't
+        * have to worry about traversing it.  It is also faster to free the
+        * blocks than to scrub them.
         */
        if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
                scn->scn_is_bptree = B_FALSE;
@@ -1473,48 +1477,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                    dsl_scan_free_block_cb, scn, tx);
                VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
 
-               if (err == 0 && spa_feature_is_active(spa,
-                   SPA_FEATURE_ASYNC_DESTROY)) {
-                       ASSERT(scn->scn_async_destroying);
-                       scn->scn_is_bptree = B_TRUE;
-                       scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
-                           NULL, ZIO_FLAG_MUSTSUCCEED);
-                       err = bptree_iterate(dp->dp_meta_objset,
-                           dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
-                           scn, tx);
-                       VERIFY0(zio_wait(scn->scn_zio_root));
+               if (err != 0 && err != ERESTART)
+                       zfs_panic_recover("error %u from bpobj_iterate()", err);
+       }
 
-                       if (err == 0) {
-                               /* finished; deactivate async destroy feature */
-                               spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY,
-                                   tx);
-                               ASSERT(!spa_feature_is_active(spa,
-                                   SPA_FEATURE_ASYNC_DESTROY));
-                               VERIFY0(zap_remove(dp->dp_meta_objset,
-                                   DMU_POOL_DIRECTORY_OBJECT,
-                                   DMU_POOL_BPTREE_OBJ, tx));
-                               VERIFY0(bptree_free(dp->dp_meta_objset,
-                                   dp->dp_bptree_obj, tx));
-                               dp->dp_bptree_obj = 0;
-                               scn->scn_async_destroying = B_FALSE;
-                       }
+       if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+               ASSERT(scn->scn_async_destroying);
+               scn->scn_is_bptree = B_TRUE;
+               scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                   NULL, ZIO_FLAG_MUSTSUCCEED);
+               err = bptree_iterate(dp->dp_meta_objset,
+                   dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+               VERIFY0(zio_wait(scn->scn_zio_root));
+
+               if (err == EIO || err == ECKSUM) {
+                       err = 0;
+               } else if (err != 0 && err != ERESTART) {
+                       zfs_panic_recover("error %u from "
+                           "traverse_dataset_destroyed()", err);
                }
-               if (scn->scn_visited_this_txg) {
-                       zfs_dbgmsg("freed %llu blocks in %llums from "
-                           "free_bpobj/bptree txg %llu",
-                           (longlong_t)scn->scn_visited_this_txg,
-                           (longlong_t)
-                           NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
-                           (longlong_t)tx->tx_txg);
-                       scn->scn_visited_this_txg = 0;
-                       /*
-                        * Re-sync the ddt so that we can further modify
-                        * it when doing bprewrite.
-                        */
-                       ddt_sync(spa, tx->tx_txg);
+
+               /*
+                * If we didn't make progress, mark the async destroy as
+                * stalled, so that we will not initiate a spa_sync() on
+                * its behalf.
+                */
+               scn->scn_async_stalled = (scn->scn_visited_this_txg == 0);
+
+               if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+                       /* finished; deactivate async destroy feature */
+                       spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+                       ASSERT(!spa_feature_is_active(spa,
+                           SPA_FEATURE_ASYNC_DESTROY));
+                       VERIFY0(zap_remove(dp->dp_meta_objset,
+                           DMU_POOL_DIRECTORY_OBJECT,
+                           DMU_POOL_BPTREE_OBJ, tx));
+                       VERIFY0(bptree_free(dp->dp_meta_objset,
+                           dp->dp_bptree_obj, tx));
+                       dp->dp_bptree_obj = 0;
+                       scn->scn_async_destroying = B_FALSE;
                }
-               if (err == ERESTART)
-                       return;
+       }
+       if (scn->scn_visited_this_txg) {
+               zfs_dbgmsg("freed %llu blocks in %llums from "
+                   "free_bpobj/bptree txg %llu; err=%u",
+                   (longlong_t)scn->scn_visited_this_txg,
+                   (longlong_t)
+                   NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+                   (longlong_t)tx->tx_txg, err);
+               scn->scn_visited_this_txg = 0;
+
+               /*
+                * Write out changes to the DDT that may be required as a
+                * result of the blocks freed.  This ensures that the DDT
+                * is clean when a scrub/resilver runs.
+                */
+               ddt_sync(spa, tx->tx_txg);
+       }
+       if (err != 0)
+               return;
+       if (!scn->scn_async_destroying && zfs_free_leak_on_eio &&
+           (dp->dp_free_dir->dd_phys->dd_used_bytes != 0 ||
+           dp->dp_free_dir->dd_phys->dd_compressed_bytes != 0 ||
+           dp->dp_free_dir->dd_phys->dd_uncompressed_bytes != 0)) {
+               /*
+                * We have finished background destroying, but there is still
+                * some space left in the dp_free_dir. Transfer this leaked
+                * space to the dp_leak_dir.
+                */
+               if (dp->dp_leak_dir == NULL) {
+                       rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+                       (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+                           LEAK_DIR_NAME, tx);
+                       VERIFY0(dsl_pool_open_special_dir(dp,
+                           LEAK_DIR_NAME, &dp->dp_leak_dir));
+                       rrw_exit(&dp->dp_config_rwlock, FTAG);
+               }
+               dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+                   dp->dp_free_dir->dd_phys->dd_used_bytes,
+                   dp->dp_free_dir->dd_phys->dd_compressed_bytes,
+                   dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+               dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+                   -dp->dp_free_dir->dd_phys->dd_used_bytes,
+                   -dp->dp_free_dir->dd_phys->dd_compressed_bytes,
+                   -dp->dp_free_dir->dd_phys->dd_uncompressed_bytes, tx);
+       }
+       if (!scn->scn_async_destroying) {
                /* finished; verify that space accounting went to zero */
                ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
                ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
index b9fa45f8299e015bc3889c6aef9bb66311359b4d..e4b71ea72d4f51b9d8f9dfa0e703f2e9833cedd4 100644 (file)
@@ -238,19 +238,25 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
        }
 
        if (pool != NULL) {
-               dsl_dir_t *freedir = pool->dp_free_dir;
-
                /*
                 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
                 * when opening pools before this version freedir will be NULL.
                 */
-               if (freedir != NULL) {
+               if (pool->dp_free_dir != NULL) {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
-                           freedir->dd_phys->dd_used_bytes, src);
+                           pool->dp_free_dir->dd_phys->dd_used_bytes, src);
                } else {
                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
                            NULL, 0, src);
                }
+
+               if (pool->dp_leak_dir != NULL) {
+                       spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+                           pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
+               } else {
+                       spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+                           NULL, 0, src);
+               }
        }
 
        spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
index 4f612e16ba8b0ffbbaf19efa17477a1012dacdf8..47b7834f58c5c91b3c21fcea21061ccdd814c78a 100644 (file)
@@ -29,7 +29,7 @@
 list_t zfs_dbgmsgs;
 int zfs_dbgmsg_size;
 kmutex_t zfs_dbgmsgs_lock;
-int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
 #endif
 
 /*
@@ -44,7 +44,38 @@ int zfs_flags = 0;
  * This should only be used as a last resort, as it typically results
  * in leaked space, or worse.
  */
-int zfs_recover = 0;
+int zfs_recover = B_FALSE;
+
+/*
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress.  While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked".  Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers.  In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks.  However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug).  In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless.  In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do.  Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
+ */
+int zfs_free_leak_on_eio = B_FALSE;
 
 
 void
@@ -163,4 +194,8 @@ MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags");
 
 module_param(zfs_recover, int, 0644);
 MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors");
+
+module_param(zfs_free_leak_on_eio, int, 0644);
+MODULE_PARM_DESC(zfs_free_leak_on_eio,
+       "Set to ignore IO errors during free and permanently leak the space");
 #endif /* _KERNEL */
index ad97ef5dbd02bf77295cff007236f58817d5a04e..58d4550fbf79be4f75df3816dd52988f467cba1d 100644 (file)
@@ -3356,13 +3356,6 @@ zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
        ASSERT(zb1->zb_objset == zb2->zb_objset);
        ASSERT(zb2->zb_level == 0);
 
-       /*
-        * A bookmark in the deadlist is considered to be after
-        * everything else.
-        */
-       if (zb2->zb_object == DMU_DEADLIST_OBJECT)
-               return (B_TRUE);
-
        /* The objset_phys_t isn't before anything. */
        if (dnp == NULL)
                return (B_FALSE);