Illumos #3741

author Will Andrews <will@firepipe.net>

Tue, 11 Jun 2013 17:12:34 +0000 (09:12 -0800)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Mon, 4 Nov 2013 18:55:25 +0000 (10:55 -0800)
author Will Andrews <will@firepipe.net>
Tue, 11 Jun 2013 17:12:34 +0000 (09:12 -0800)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Mon, 4 Nov 2013 18:55:25 +0000 (10:55 -0800)
diff --git a/include/sys/dmu.h b/include/sys/dmu.h

index 60d4aa58abc4d7ea63036986a78ec79365819a63..0dd7e28352696caf23a8c7fc918e9aee68fe7d1a 100644 (file)
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -407,6 +407,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
   * object must be held in an assigned transaction before calling
   * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
   * buffer as well.  You must release what you hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
   */
  int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
  int dmu_bonus_max(void);
@@ -662,8 +664,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
   */
  int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
  void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
  void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
  void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size.  This is specifically optimized for zfs_getattr().
+ */
  void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
      u_longlong_t *nblk512);
  
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c

index be23382b54e225954c30568a031d4de9c66e20ec..ede0d91f8c5cd83fb2bb70ae97d97cb3347495ce 100644 (file)
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -4791,6 +4791,11 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
         return (err);
  }
  
+/*
+ * Convert the zvol's volume size to an appropriate reservation.
+ * Note: If this routine is updated, it is necessary to update the ZFS test
+ * suite's shell version in reservation.kshlib.
+ */
  uint64_t
  zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
  {
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index a521501bde01f784f38f86ebbcc4e8be4d49dee3..2ae4c37a350d9f8e7242cf186dd3f28a6d115247 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -260,7 +260,18 @@ typedef struct arc_stats {
         kstat_named_t arcstat_mfu_ghost_hits;
         kstat_named_t arcstat_deleted;
         kstat_named_t arcstat_recycle_miss;
+       /*
+        * Number of buffers that could not be evicted because the hash lock
+        * was held by another thread.  The lock may not necessarily be held
+        * by something using the same buffer, since hash locks are shared
+        * by multiple buffers.
+        */
         kstat_named_t arcstat_mutex_miss;
+       /*
+        * Number of buffers skipped because they have I/O in progress, are
+        * indirect prefetch buffers that have not lived long enough, or are
+        * not from the spa we're trying to evict from.
+        */
         kstat_named_t arcstat_evict_skip;
         kstat_named_t arcstat_evict_l2_cached;
         kstat_named_t arcstat_evict_l2_eligible;
@@ -3174,6 +3185,10 @@ top:
  
                 mutex_exit(hash_lock);
  
+               /*
+                * At this point, we have a level 1 cache miss.  Try again in
+                * L2ARC if possible.
+                */
                 ASSERT3U(hdr->b_size, ==, size);
                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
                     uint64_t, size, zbookmark_t *, zb);
@@ -3445,8 +3460,8 @@ arc_buf_evict(arc_buf_t *buf)
  }
  
  /*
- * Release this buffer from the cache.  This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer.  This
+ * must be done after a read and prior to modifying the buffer contents.
   * If the buffer has more than one reference, we must make
   * a new hdr for the buffer.
   */
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c

index 9c60ec55a35a75aec019d0e50a184e47d6622298..95c7b329797c601d7e7a480fef51dcab346fc19d 100644 (file)
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -691,6 +691,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                 if (!havepzio)
                         err = zio_wait(zio);
         } else {
+               /*
+                * Another reader came in while the dbuf was in flight
+                * between UNCACHED and CACHED.  Either a writer will finish
+                * writing the buffer (sending the dbuf to CACHED) or the
+                * first reader's request will reach the read_done callback
+                * and send the dbuf to CACHED.  Otherwise, a failure
+                * occurred and the dbuf went to UNCACHED.
+                */
                 mutex_exit(&db->db_mtx);
                 if (prefetch)
                         dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
@@ -699,6 +707,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                         rw_exit(&dn->dn_struct_rwlock);
                 DB_DNODE_EXIT(db);
  
+               /* Skip the wait per the caller's request. */
                 mutex_enter(&db->db_mtx);
                 if ((flags & DB_RF_NEVERWAIT) == 0) {
                         while (db->db_state == DB_READ ||
@@ -1313,7 +1322,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
  }
  
  /*
- * Return TRUE if this evicted the dbuf.
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction.  Return whether this evicted the dbuf.
   */
  static boolean_t
  dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
@@ -2324,6 +2334,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
         ASSERT(db->db_level > 0);
         DBUF_VERIFY(db);
  
+       /* Read the block if it hasn't been read yet. */
         if (db->db_buf == NULL) {
                 mutex_exit(&db->db_mtx);
                 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -2334,10 +2345,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
  
         DB_DNODE_ENTER(db);
         dn = DB_DNODE(db);
+       /* Indirect block size must match what the dnode thinks it is. */
         ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
         dbuf_check_blkptr(dn, db);
         DB_DNODE_EXIT(db);
  
+       /* Provide the pending dirty record to child dbufs. */
         db->db_data_pending = dr;
  
         mutex_exit(&db->db_mtx);
@@ -2728,6 +2741,7 @@ dbuf_write_override_done(zio_t *zio)
         dbuf_write_done(zio, NULL, db);
  }
  
+/* Issue I/O to commit a dirty buffer to disk. */
  static void
  dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
  {
@@ -2762,11 +2776,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
         }
  
         if (parent != dn->dn_dbuf) {
+               /* Our parent is an indirect block. */
+               /* We have a dirty parent that has been scheduled for write. */
                 ASSERT(parent && parent->db_data_pending);
+               /* Our parent's buffer is one level closer to the dnode. */
                 ASSERT(db->db_level == parent->db_level-1);
+               /*
+                * We're about to modify our parent's db_data by modifying
+                * our block pointer, so the parent must be released.
+                */
                 ASSERT(arc_released(parent->db_buf));
                 zio = parent->db_data_pending->dr_zio;
         } else {
+               /* Our parent is the dnode itself. */
                 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
                     db->db_blkid != DMU_SPILL_BLKID) ||
                     (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index 9223b907bdd906283a1866ef6c17858e4e9cd000..34f3eeef90e807370287e7c0f1725f003ad8a6fe 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1965,7 +1965,7 @@ dmu_init(void)
  void
  dmu_fini(void)
  {
-       arc_fini();
+       arc_fini(); /* arc depends on l2arc, so arc must go first */
         l2arc_fini();
         dmu_tx_fini();
         zfetch_fini();
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c

index 1cad8d20e6ad0e883644704ee8b9cb95f4264608..caac601935e045481d0d168f1a84ed81e6e7bc41 100644 (file)
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1040,6 +1040,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
  
         txg_rele_to_quiesce(&tx->tx_txgh);
  
+       /*
+        * Walk the transaction's hold list, removing the hold on the
+        * associated dnode, and notifying waiters if the refcount drops to 0.
+        */
         for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
             txh = list_next(&tx->tx_holds, txh)) {
                 dnode_t *dn = txh->txh_dnode;
@@ -1157,6 +1161,10 @@ dmu_tx_commit(dmu_tx_t *tx)
  
         ASSERT(tx->tx_txg != 0);
  
+       /*
+        * Go through the transaction's hold list and remove holds on
+        * associated dnodes, notifying waiters if no holds remain.
+        */
         while ((txh = list_head(&tx->tx_holds))) {
                 dnode_t *dn = txh->txh_dnode;
  
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c

index 1763bae5184a6c40f5e72b8da9d9bc52d1998705..705478c82ef5c6be346f2cfdae9cd338101f587d 100644 (file)
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -48,11 +48,11 @@ unsigned int        zfetch_block_cap = 256;
  unsigned long  zfetch_array_rd_sz = 1024 * 1024;
  
  /* forward decls for static routines */
-static int             dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static boolean_t       dmu_zfetch_colinear(zfetch_t *, zstream_t *);
  static void            dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
  static uint64_t                dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
  static uint64_t                dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int             dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static boolean_t       dmu_zfetch_find(zfetch_t *, zstream_t *, int);
  static int             dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
  static zstream_t       *dmu_zfetch_stream_reclaim(zfetch_t *);
  static void            dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
@@ -104,9 +104,9 @@ kstat_t             *zfetch_ksp;
   * last stream, then we are probably in a strided access pattern.  So
   * combine the two sequential streams into a single strided stream.
   *
- * If no co-linear streams are found, return NULL.
+ * Returns whether co-linear streams were found.
   */
-static int
+static boolean_t
  dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
  {
         zstream_t       *z_walk;
@@ -326,7 +326,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
   * for this block read.  If so, it starts a prefetch for the stream it
   * located and returns true, otherwise it returns false
   */
-static int
+static boolean_t
  dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
  {
         zstream_t       *zs;
@@ -639,7 +639,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
  {
         zstream_t       zst;
         zstream_t       *newstream;
-       int             fetched;
+       boolean_t       fetched;
         int             inserted;
         unsigned int    blkshft;
         uint64_t        blksz;
diff --git a/module/zfs/spa.c b/module/zfs/spa.c

index bc9bf2cc3d63f25c3c5ed0e7dc760a1db427959f..c3010777159ff53f329403cc443ae2a983d0b3af 100644 (file)
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -26,6 +26,8 @@
   */
  
  /*
+ * SPA: Storage Pool Allocator
+ *
   * This file contains all the routines used when modifying on-disk SPA state.
   * This includes opening, importing, destroying, exporting a pool, and syncing a
   * pool.
diff --git a/module/zfs/txg.c b/module/zfs/txg.c

index c8a29e14fe4efe4926f9d643cdaea6907895c899..697aa090558021c539b3f781fa92733d64ffaafb 100644 (file)
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -354,6 +354,12 @@ txg_rele_to_sync(txg_handle_t *th)
         th->th_cpu = NULL;      /* defensive */
  }
  
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
  static void
  txg_quiesce(dsl_pool_t *dp, uint64_t txg)
  {
@@ -409,6 +415,9 @@ txg_do_callbacks(list_t *cb_list)
  
  /*
   * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
   */
  static void
  txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
@@ -419,7 +428,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
  
         for (c = 0; c < max_ncpus; c++) {
                 tx_cpu_t *tc = &tx->tx_cpu[c];
-               /* No need to lock tx_cpu_t at this point */
+               /*
+                * No need to lock tx_cpu_t at this point, since this can
+                * only be called once a txg has been synced.
+                */
  
                 int g = txg & TXG_MASK;
  
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c

index 07e66ebdcf38f8833e5aeb34c2dff371b386d473..0405608c2565dfc3e9faee2576e642d3d53f3474 100644 (file)
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1035,6 +1035,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
         zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
  }
  
+/* Sync the uberblocks to all vdevs in svd[]. */
  int
  vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
  {
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c

index 130ec575e6a04107dba413e6f4d9f1ac027b48f7..d2dfd5b43a4095fbee5ec518f99a8d7d4cf251e6 100644 (file)
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -431,23 +431,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
         vdev_raidz_cksum_report
  };
  
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
  static raidz_map_t *
  vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
      uint64_t nparity)
  {
         raidz_map_t *rm;
+       /* The starting RAIDZ (parent) vdev sector of the block. */
         uint64_t b = zio->io_offset >> unit_shift;
+       /* The zio's size in units of the vdev's minimum sector size. */
         uint64_t s = zio->io_size >> unit_shift;
+       /* The first column for this stripe. */
         uint64_t f = b % dcols;
+       /* The starting byte offset on each child vdev. */
         uint64_t o = (b / dcols) << unit_shift;
         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
  
+       /*
+        * "Quotient": The number of data sectors for this stripe on all but
+        * the "big column" child vdevs that also contain "remainder" data.
+        */
         q = s / (dcols - nparity);
+
+       /*
+        * "Remainder": The number of partial stripe data sectors in this I/O.
+        * This will add a sector to some, but not all, child vdevs.
+        */
         r = s - q * (dcols - nparity);
+
+       /* The number of "big columns" - those which contain remainder data. */
         bc = (r == 0 ? 0 : r + nparity);
+
+       /*
+        * The total number of data and parity sectors associated with
+        * this I/O.
+        */
         tot = s + nparity * (q + (r == 0 ? 0 : 1));
  
+       /* acols: The columns that will be accessed. */
+       /* scols: The columns that will be accessed or skipped. */
         if (q == 0) {
+               /* Our I/O request doesn't span all child vdevs. */
                 acols = bc;
                 scols = MIN(dcols, roundup(bc, nparity + 1));
         } else {
@@ -1521,6 +1548,23 @@ vdev_raidz_child_done(zio_t *zio)
         rc->rc_skipped = 0;
  }
  
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Generate the parity data
+ *   2. Create child zio write operations to each column's vdev, for both
+ *      data and parity.
+ *   3. If the column skips any sectors for padding, create optional dummy
+ *      write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ *   1. Create child zio read operations to each data column's vdev to read
+ *      the range of data required for zio.
+ *   2. If this is a scrub or resilver operation, or if any of the data
+ *      vdevs have had errors, then create zio read operations to the parity
+ *      columns' VDevs as well.
+ */
  static int
  vdev_raidz_io_start(zio_t *zio)
  {
@@ -1864,6 +1908,27 @@ done:
         return (ret);
  }
  
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ *   1. Check for errors on the child IOs.
+ *   2. Return, setting an error code if too few child VDevs were written
+ *      to reconstruct the data later.  Note that partial writes are
+ *      considered successful if they can be reconstructed at all.
+ * - For read operations:
+ *   1. Check for errors on the child IOs.
+ *   2. If data errors occurred:
+ *      a. Try to reassemble the data from the parity available.
+ *      b. If we haven't yet read the parity drives, read them now.
+ *      c. If all parity drives have been read but the data still doesn't
+ *         reassemble with a correct checksum, then try combinatorial
+ *         reconstruction.
+ *      d. If that doesn't work, return an error.
+ *   3. If there were unexpected errors or this is a resilver operation,
+ *      rewrite the vdevs that had errors.
+ */
  static void
  vdev_raidz_io_done(zio_t *zio)
  {
diff --git a/module/zfs/zfs_ctldir.c b/module/zfs/zfs_ctldir.c

index ce084fff1102325ccde408e87e6664042709c2b5..c08e9dd9b959cd1433df85b63e640ef5f1380a48 100644 (file)
--- a/module/zfs/zfs_ctldir.c
+++ b/module/zfs/zfs_ctldir.c
@@ -368,6 +368,11 @@ zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
         return (0);
  }
  
+/*
+ * Gets the full dataset name that corresponds to the given snapshot name.
+ * Example:
+ *     zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ */
  static int
  zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
  {
author	Will Andrews <will@firepipe.net>
	Tue, 11 Jun 2013 17:12:34 +0000 (09:12 -0800)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Mon, 4 Nov 2013 18:55:25 +0000 (10:55 -0800)
include/sys/dmu.h		patch \| blob \| history
lib/libzfs/libzfs_dataset.c		patch \| blob \| history
module/zfs/arc.c		patch \| blob \| history
module/zfs/dbuf.c		patch \| blob \| history
module/zfs/dmu.c		patch \| blob \| history
module/zfs/dmu_tx.c		patch \| blob \| history
module/zfs/dmu_zfetch.c		patch \| blob \| history
module/zfs/spa.c		patch \| blob \| history
module/zfs/txg.c		patch \| blob \| history
module/zfs/vdev_label.c		patch \| blob \| history
module/zfs/vdev_raidz.c		patch \| blob \| history
module/zfs/zfs_ctldir.c		patch \| blob \| history