From 4d044c4c1d68ed518fe37eea61a4cc77048940fb Mon Sep 17 00:00:00 2001
From: Serapheim Dimitropoulos <serapheim@delphix.com>
Date: Fri, 4 Aug 2017 09:30:49 -0700
Subject: [PATCH] OpenZFS 9238 - ZFS Spacemap Encoding V2

Motivation
==========

The current space map encoding has the following disadvantages:
[1] Assuming a 512-byte sector size, each entry can represent at most 16MB for
    a segment. This makes the encoding very inefficient for large regions of
    space.
[2] As vdev-wide space maps have started to be used by new features (i.e.
    device removal, zpool checkpoint) we've started imposing limits in the
    vdevs that can be used with them based on the maximum addressable offset
    (currently 64PB for a top-level vdev).

New encoding
============

The layout can be found in space_map.h and it remains backwards compatible with
the old one. The introduced two-word entry format, besides extending the limits
imposed by the single-entry layout, also includes a vdev field and some extra
padding after its prefix.

The extra padding after the prefix is reserved for future usage (e.g.
new prefixes for future encodings or new fields for flags). The new vdev field
not only makes the space maps more self-descriptive, but also opens the doors
for pool-wide space maps (expected to be used in the log spacemap project).

One final important note is that the number of bits used for vdevs is reduced
to 24 bits for blkptrs. That was decided as we don't know of any setups that
use more than 16M vdevs for the time being and we wanted to fit the vdev field
in the space map. In addition that gives us some extra bits in dva_t.

Other references:
=================

The new encoding is also discussed towards the end of the Log Space Map
presentation from 2017's OpenZFS summit.
Link: https://www.youtube.com/watch?v=jj2IxRkl5bQ

Authored by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Approved by: Gordon Ross <gwr@nexenta.com>
Ported-by: Tim Chase <tim@chase2k.com>
Signed-off-by: Tim Chase <tim@chase2k.com>

OpenZFS-commit: https://github.com/openzfs/openzfs/commit/90a56e6d
OpenZFS-issue: https://www.illumos.org/issues/9238
Closes #7665
---
 cmd/zdb/zdb.c                                 | 125 +--
 cmd/ztest/ztest.c                             |   7 +
 include/sys/spa.h                             |  12 +-
 include/sys/space_map.h                       | 114 ++-
 include/zfeature_common.h                     |   1 +
 man/man5/zpool-features.5                     |  23 +
 module/zcommon/zfeature_common.c              |   6 +
 module/zfs/metaslab.c                         |  58 +-
 module/zfs/spa_checkpoint.c                   |  50 +-
 module/zfs/space_map.c                        | 800 +++++++++++++-----
 module/zfs/vdev.c                             |   2 +-
 module/zfs/vdev_indirect.c                    |   2 +-
 module/zfs/vdev_indirect_mapping.c            |   9 +-
 .../cli_root/zpool_get/zpool_get.cfg          |   1 +
 .../checkpoint_discard_busy.ksh               |   6 +-
 15 files changed, 855 insertions(+), 361 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index d1e77cce7..0d2f3623b 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -776,7 +776,6 @@ verify_spacemap_refcounts(spa_t *spa)
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
-	uint64_t alloc, offset, entry;
 	const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 	    "INVALID", "INVALID", "INVALID", "INVALID" };
 
@@ -793,41 +792,73 @@ dump_spacemap(objset_t *os, space_map_t *sm)
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
-	alloc = 0;
-	for (offset = 0; offset < space_map_length(sm);
-	    offset += sizeof (entry)) {
-		uint8_t mapshift = sm->sm_shift;
+	uint8_t mapshift = sm->sm_shift;
+	int64_t alloc = 0;
+	uint64_t word;
+	for (uint64_t offset = 0; offset < space_map_length(sm);
+	    offset += sizeof (word)) {
 
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
-		    sizeof (entry), &entry, DMU_READ_PREFETCH));
-		if (SM_DEBUG_DECODE(entry)) {
+		    sizeof (word), &word, DMU_READ_PREFETCH));
 
+		if (sm_entry_is_debug(word)) {
 			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
-			    (u_longlong_t)(offset / sizeof (entry)),
-			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
-			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
-			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
+			    (u_longlong_t)(offset / sizeof (word)),
+			    ddata[SM_DEBUG_ACTION_DECODE(word)],
+			    (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
+			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+			continue;
+		}
+
+		uint8_t words;
+		char entry_type;
+		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
+
+		if (sm_entry_is_single_word(word)) {
+			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
+			    sm->sm_start;
+			entry_run = SM_RUN_DECODE(word) << mapshift;
+			words = 1;
 		} else {
-			(void) printf("\t    [%6llu]    %c  range:"
-			    " %010llx-%010llx  size: %06llx\n",
-			    (u_longlong_t)(offset / sizeof (entry)),
-			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
-			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + sm->sm_start),
-			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + sm->sm_start +
-			    (SM_RUN_DECODE(entry) << mapshift)),
-			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
-			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
-				alloc += SM_RUN_DECODE(entry) << mapshift;
-			else
-				alloc -= SM_RUN_DECODE(entry) << mapshift;
+			/* it is a two-word entry so we read another word */
+			ASSERT(sm_entry_is_double_word(word));
+
+			uint64_t extra_word;
+			offset += sizeof (extra_word);
+			VERIFY0(dmu_read(os, space_map_object(sm), offset,
+			    sizeof (extra_word), &extra_word,
+			    DMU_READ_PREFETCH));
+
+			ASSERT3U(offset, <=, space_map_length(sm));
+
+			entry_run = SM2_RUN_DECODE(word) << mapshift;
+			entry_vdev = SM2_VDEV_DECODE(word);
+			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
+			    mapshift) + sm->sm_start;
+			words = 2;
 		}
+
+		(void) printf("\t    [%6llu]    %c  range:"
+		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
+		    (u_longlong_t)(offset / sizeof (word)),
+		    entry_type, (u_longlong_t)entry_off,
+		    (u_longlong_t)(entry_off + entry_run),
+		    (u_longlong_t)entry_run,
+		    (u_longlong_t)entry_vdev, words);
+
+		if (entry_type == 'A')
+			alloc += entry_run;
+		else
+			alloc -= entry_run;
 	}
-	if (alloc != space_map_allocated(sm)) {
-		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
-		    "with space map summary (%llu)\n",
-		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
+	if ((uint64_t)alloc != space_map_allocated(sm)) {
+		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
+		    "with space map summary (%lld)\n",
+		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 	}
 }
 
@@ -1158,7 +1189,7 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
 	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
 		dump_dde(ddt, &dde, walk);
 
-	ASSERT(error == ENOENT);
+	ASSERT3U(error, ==, ENOENT);
 
 	(void) printf("\n");
 }
@@ -3579,15 +3610,14 @@ typedef struct checkpoint_sm_exclude_entry_arg {
 } checkpoint_sm_exclude_entry_arg_t;
 
 static int
-checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
 	vdev_t *vd = cseea->cseea_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
-	ASSERT(type == SM_FREE);
+	ASSERT(sme->sme_type == SM_FREE);
 
 	/*
 	 * Since the vdev_checkpoint_sm exists in the vdev level
@@ -3605,7 +3635,7 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
 	 * metaslab boundaries. So if needed we could add code
 	 * that handles metaslab-crossing segments in the future.
 	 */
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -3613,10 +3643,10 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
 	 * also verify that the entry is there to begin with.
 	 */
 	mutex_enter(&ms->ms_lock);
-	range_tree_remove(ms->ms_allocatable, offset, size);
+	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
-	cseea->cseea_checkpoint_size += size;
+	cseea->cseea_checkpoint_size += sme->sme_run;
 	return (0);
 }
 
@@ -4606,15 +4636,14 @@ typedef struct verify_checkpoint_sm_entry_cb_arg {
 #define	ENTRIES_PER_PROGRESS_UPDATE 10000
 
 static int
-verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
 	vdev_t *vd = vcsec->vcsec_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
-	ASSERT(type == SM_FREE);
+	ASSERT(sme->sme_type == SM_FREE);
 
 	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
 		(void) fprintf(stderr,
@@ -4628,7 +4657,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
 	/*
 	 * See comment in checkpoint_sm_exclude_entry_cb()
 	 */
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -4637,7 +4666,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
 	 * their respective ms_allocateable trees should not contain them.
 	 */
 	mutex_enter(&ms->ms_lock);
-	range_tree_verify(ms->ms_allocatable, offset, size);
+	range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	return (0);
@@ -4883,7 +4912,7 @@ verify_checkpoint(spa_t *spa)
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
-	if (error == ENOENT) {
+	if (error == ENOENT && !dump_opt['L']) {
 		/*
 		 * If the feature is active but the uberblock is missing
 		 * then we must be in the middle of discarding the
@@ -4906,7 +4935,7 @@ verify_checkpoint(spa_t *spa)
 		error = 3;
 	}
 
-	if (error == 0)
+	if (error == 0 && !dump_opt['L'])
 		verify_checkpoint_blocks(spa);
 
 	return (error);
@@ -5015,7 +5044,7 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['h'])
 		dump_history(spa);
 
-	if (rc == 0 && !dump_opt['L'])
+	if (rc == 0)
 		rc = verify_checkpoint(spa);
 
 	if (rc != 0) {
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 78ad7e8de..5347a0abe 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -202,6 +202,7 @@ extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
 extern int zfs_abd_scatter_enabled;
 extern int dmu_object_alloc_chunk_shift;
+extern boolean_t zfs_force_some_double_word_sm_entries;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -7349,6 +7350,12 @@ main(int argc, char **argv)
 
 	dprintf_setup(&argc, argv);
 	zfs_deadman_synctime_ms = 300000;
+	/*
+	 * As two-word space map entries may not come up often (especially
+	 * if pool and vdev sizes are small) we want to force at least some
+	 * of them so the feature gets tested.
+	 */
+	zfs_force_some_double_word_sm_entries = B_TRUE;
 
 	action.sa_handler = sig_handler;
 	sigemptyset(&action.sa_mask);
diff --git a/include/sys/spa.h b/include/sys/spa.h
index b6483e11b..4a3fc71f7 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -153,6 +153,7 @@ _NOTE(CONSTCOND) } while (0)
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
 #define	SPA_COMPRESSBITS	7
+#define	SPA_VDEVBITS		24
 
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
@@ -177,15 +178,15 @@ typedef struct zio_cksum_salt {
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 0	|		vdev1		| GRID  |	  ASIZE		|
+ * 0	|  pad  |	  vdev1         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 2	|		vdev2		| GRID  |	  ASIZE		|
+ * 2	|  pad  |	  vdev2         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
- * 4	|		vdev3		| GRID  |	  ASIZE		|
+ * 4	|  pad  |	  vdev3         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
@@ -443,8 +444,9 @@ typedef struct blkptr {
 #define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
 #define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
 
-#define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, 32)
-#define	DVA_SET_VDEV(dva, x)	BF64_SET((dva)->dva_word[0], 32, 32, x)
+#define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
+#define	DVA_SET_VDEV(dva, x)	\
+	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
 
 #define	DVA_GET_OFFSET(dva)	\
 	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
diff --git a/include/sys/space_map.h b/include/sys/space_map.h
index 98b87269c..64c97bb4d 100644
--- a/include/sys/space_map.h
+++ b/include/sys/space_map.h
@@ -93,50 +93,100 @@ typedef struct space_map {
 /*
  * debug entry
  *
- *    1      3         10                     50
- *  ,---+--------+------------+---------------------------------.
- *  | 1 | action |  syncpass  |        txg (lower bits)         |
- *  `---+--------+------------+---------------------------------'
- *   63  62    60 59        50 49                               0
+ *     2     2        10                     50
+ *  +-----+-----+------------+----------------------------------+
+ *  | 1 0 | act |  syncpass  |        txg (lower bits)          |
+ *  +-----+-----+------------+----------------------------------+
+ *   63 62 61 60 59        50 49                                0
  *
  *
- * non-debug entry
+ * one-word entry
  *
  *    1               47                   1           15
- *  ,-----------------------------------------------------------.
+ *  +-----------------------------------------------------------+
  *  | 0 |   offset (sm_shift units)    | type |       run       |
- *  `-----------------------------------------------------------'
- *   63  62                          17   16   15               0
+ *  +-----------------------------------------------------------+
+ *   63  62                          16   15   14               0
+ *
+ *
+ * two-word entry
+ *
+ *     2     2               36                      24
+ *  +-----+-----+---------------------------+-------------------+
+ *  | 1 1 | pad |            run            |       vdev        |
+ *  +-----+-----+---------------------------+-------------------+
+ *   63 62 61 60 59                       24 23                 0
+ *
+ *     1                            63
+ *  +------+----------------------------------------------------+
+ *  | type |                      offset                        |
+ *  +------+----------------------------------------------------+
+ *     63   62                                                  0
+ *
+ * Note that a two-word entry will not straddle a block boundary.
+ * If necessary, the last word of a block will be padded with a
+ * debug entry (with act = syncpass = txg = 0).
  */
 
-/* All this stuff takes and returns bytes */
-#define	SM_RUN_DECODE(x)	(BF64_DECODE(x, 0, 15) + 1)
-#define	SM_RUN_ENCODE(x)	BF64_ENCODE((x) - 1, 0, 15)
-#define	SM_TYPE_DECODE(x)	BF64_DECODE(x, 15, 1)
-#define	SM_TYPE_ENCODE(x)	BF64_ENCODE(x, 15, 1)
-#define	SM_OFFSET_DECODE(x)	BF64_DECODE(x, 16, 47)
-#define	SM_OFFSET_ENCODE(x)	BF64_ENCODE(x, 16, 47)
-#define	SM_DEBUG_DECODE(x)	BF64_DECODE(x, 63, 1)
-#define	SM_DEBUG_ENCODE(x)	BF64_ENCODE(x, 63, 1)
+typedef enum {
+	SM_ALLOC,
+	SM_FREE
+} maptype_t;
+
+typedef struct space_map_entry {
+	maptype_t sme_type;
+	uint32_t sme_vdev;	/* max is 2^24-1; SM_NO_VDEVID if not present */
+	uint64_t sme_offset;	/* max is 2^63-1; units of sm_shift */
+	uint64_t sme_run;	/* max is 2^36; units of sm_shift */
+} space_map_entry_t;
+
+#define	SM_NO_VDEVID	(1 << SPA_VDEVBITS)
 
-#define	SM_DEBUG_ACTION_DECODE(x)	BF64_DECODE(x, 60, 3)
-#define	SM_DEBUG_ACTION_ENCODE(x)	BF64_ENCODE(x, 60, 3)
+/* one-word entry constants */
+#define	SM_DEBUG_PREFIX	2
+#define	SM_OFFSET_BITS	47
+#define	SM_RUN_BITS	15
 
+/* two-word entry constants */
+#define	SM2_PREFIX	3
+#define	SM2_OFFSET_BITS	63
+#define	SM2_RUN_BITS	36
+
+#define	SM_PREFIX_DECODE(x)	BF64_DECODE(x, 62, 2)
+#define	SM_PREFIX_ENCODE(x)	BF64_ENCODE(x, 62, 2)
+
+#define	SM_DEBUG_ACTION_DECODE(x)	BF64_DECODE(x, 60, 2)
+#define	SM_DEBUG_ACTION_ENCODE(x)	BF64_ENCODE(x, 60, 2)
 #define	SM_DEBUG_SYNCPASS_DECODE(x)	BF64_DECODE(x, 50, 10)
 #define	SM_DEBUG_SYNCPASS_ENCODE(x)	BF64_ENCODE(x, 50, 10)
-
 #define	SM_DEBUG_TXG_DECODE(x)		BF64_DECODE(x, 0, 50)
 #define	SM_DEBUG_TXG_ENCODE(x)		BF64_ENCODE(x, 0, 50)
 
-#define	SM_RUN_MAX			SM_RUN_DECODE(~0ULL)
-
-typedef enum {
-	SM_ALLOC,
-	SM_FREE
-} maptype_t;
-
-typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg);
+#define	SM_OFFSET_DECODE(x)	BF64_DECODE(x, 16, SM_OFFSET_BITS)
+#define	SM_OFFSET_ENCODE(x)	BF64_ENCODE(x, 16, SM_OFFSET_BITS)
+#define	SM_TYPE_DECODE(x)	BF64_DECODE(x, 15, 1)
+#define	SM_TYPE_ENCODE(x)	BF64_ENCODE(x, 15, 1)
+#define	SM_RUN_DECODE(x)	(BF64_DECODE(x, 0, SM_RUN_BITS) + 1)
+#define	SM_RUN_ENCODE(x)	BF64_ENCODE((x) - 1, 0, SM_RUN_BITS)
+#define	SM_RUN_MAX		SM_RUN_DECODE(~0ULL)
+#define	SM_OFFSET_MAX		SM_OFFSET_DECODE(~0ULL)
+
+#define	SM2_RUN_DECODE(x)	(BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1)
+#define	SM2_RUN_ENCODE(x)	BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS)
+#define	SM2_VDEV_DECODE(x)	BF64_DECODE(x, 0, SPA_VDEVBITS)
+#define	SM2_VDEV_ENCODE(x)	BF64_ENCODE(x, 0, SPA_VDEVBITS)
+#define	SM2_TYPE_DECODE(x)	BF64_DECODE(x, SM2_OFFSET_BITS, 1)
+#define	SM2_TYPE_ENCODE(x)	BF64_ENCODE(x, SM2_OFFSET_BITS, 1)
+#define	SM2_OFFSET_DECODE(x)	BF64_DECODE(x, 0, SM2_OFFSET_BITS)
+#define	SM2_OFFSET_ENCODE(x)	BF64_ENCODE(x, 0, SM2_OFFSET_BITS)
+#define	SM2_RUN_MAX		SM2_RUN_DECODE(~0ULL)
+#define	SM2_OFFSET_MAX		SM2_OFFSET_DECODE(~0ULL)
+
+boolean_t sm_entry_is_debug(uint64_t e);
+boolean_t sm_entry_is_single_word(uint64_t e);
+boolean_t sm_entry_is_double_word(uint64_t e);
+
+typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg);
 
 int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
 int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg);
@@ -154,7 +204,9 @@ uint64_t space_map_allocated(space_map_t *sm);
 uint64_t space_map_length(space_map_t *sm);
 
 void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
-    dmu_tx_t *tx);
+    uint64_t vdev_id, dmu_tx_t *tx);
+uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+    uint64_t vdev_id);
 void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx);
 uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx);
 void space_map_free(space_map_t *sm, dmu_tx_t *tx);
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index c59b800d3..c5aabce0e 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -62,6 +62,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_DEVICE_REMOVAL,
 	SPA_FEATURE_OBSOLETE_COUNTS,
 	SPA_FEATURE_POOL_CHECKPOINT,
+	SPA_FEATURE_SPACEMAP_V2,
 	SPA_FEATURES
 } spa_feature_t;
 
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
index e93943ec2..8d5f46821 100644
--- a/man/man5/zpool-features.5
+++ b/man/man5/zpool-features.5
@@ -503,6 +503,29 @@ is used to checkpoint the pool.
 The feature will only return back to being \fBenabled\fR when the pool
 is rewound or the checkpoint has been discarded.
 
+.RE
+.sp
+.ne 2
+.na
+\fB\fBspacemap_v2\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:spacemap_v2
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	none
+.TE
+
+This feature enables the use of the new space map encoding which
+consists of two words (instead of one) whenever it is advantageous.
+The new encoding allows space maps to represent large regions of
+space more efficiently on-disk while also increasing their maximum
+addressable offset.
+
+This feature becomes \fBactive\fR once it is \fBenabled\fR, and never
+returns back to being \fBenabled\fR.
+
 .RE
 .sp
 .ne 2
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index b010c8843..f5c98933c 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -215,6 +215,12 @@ zpool_feature_init(void)
 	    "Pool state can be checkpointed, allowing rewind later.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 
+	zfeature_register(SPA_FEATURE_SPACEMAP_V2,
+	    "com.delphix:spacemap_v2", "spacemap_v2",
+	    "Space maps representing large segments are more efficient.",
+	    ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+	    NULL);
+
 	zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
 	    "com.delphix:extensible_dataset", "extensible_dataset",
 	    "Enhanced dataset functionality, used by other features.",
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 76fa99e8b..879238e7d 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -2025,17 +2025,6 @@ metaslab_group_preload(metaslab_group_t *mg)
  *
  * 3. The on-disk size of the space map should actually decrease.
  *
- * Checking the first condition is tricky since we don't want to walk
- * the entire AVL tree calculating the estimated on-disk size. Instead we
- * use the size-ordered range tree in the metaslab and calculate the
- * size required to write out the largest segment in our free tree. If the
- * size required to represent that segment on disk is larger than the space
- * map object then we avoid condensing this map.
- *
- * To determine the second criterion we use a best-case estimate and assume
- * each segment can be represented on-disk as a single 64-bit entry. We refer
- * to this best-case estimate as the space map's minimal form.
- *
  * Unfortunately, we cannot compute the on-disk size of the space map in this
  * context because we cannot accurately compute the effects of compression, etc.
  * Instead, we apply the heuristic described in the block comment for
@@ -2046,9 +2035,6 @@ static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
-	range_seg_t *rs;
-	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
-	dmu_object_info_t doi;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
 	uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
@@ -2074,34 +2060,22 @@ metaslab_should_condense(metaslab_t *msp)
 	msp->ms_condense_checked_txg = current_txg;
 
 	/*
-	 * Use the ms_allocatable_by_size range tree, which is ordered by
-	 * size, to obtain the largest segment in the free tree. We always
-	 * condense metaslabs that are empty and metaslabs for which a
-	 * condense request has been made.
+	 * We always condense metaslabs that are empty and metaslabs for
+	 * which a condense request has been made.
 	 */
-	rs = avl_last(&msp->ms_allocatable_by_size);
-	if (rs == NULL || msp->ms_condense_wanted)
+	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+	    msp->ms_condense_wanted)
 		return (B_TRUE);
 
-	/*
-	 * Calculate the number of 64-bit entries this segment would
-	 * require when written to disk. If this single segment would be
-	 * larger on-disk than the entire current on-disk structure, then
-	 * clearly condensing will increase the on-disk structure size.
-	 */
-	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
-	entries = size / (MIN(size, SM_RUN_MAX));
-	segsz = entries * sizeof (uint64_t);
-
-	optimal_size =
-	    sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
-	object_size = space_map_length(msp->ms_sm);
+	uint64_t object_size = space_map_length(msp->ms_sm);
+	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+	    msp->ms_allocatable, SM_NO_VDEVID);
 
+	dmu_object_info_t doi;
 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
-	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+	uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
 
-	return (segsz <= object_size &&
-	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
+	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
@@ -2177,11 +2151,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
 	 * optimal, this is typically close to optimal, and much cheaper to
 	 * compute.
 	 */
-	space_map_write(sm, condense_tree, SM_ALLOC, tx);
+	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
 
-	space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
+	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	mutex_enter(&msp->ms_lock);
 	msp->ms_condensing = B_FALSE;
 }
@@ -2293,8 +2267,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		metaslab_condense(msp, txg, tx);
 	} else {
 		mutex_exit(&msp->ms_lock);
-		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
-		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
+		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
+		    SM_NO_VDEVID, tx);
+		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
+		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
@@ -2309,7 +2285,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
-		    msp->ms_checkpointing, SM_FREE, tx);
+		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 		space_map_update(vd->vdev_checkpoint_sm);
 
diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c
index 544658821..6f7e9ab83 100644
--- a/module/zfs/spa_checkpoint.c
+++ b/module/zfs/spa_checkpoint.c
@@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_arg {
 } spa_checkpoint_discard_sync_callback_arg_t;
 
 static int
-spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
-    uint64_t size, void *arg)
+spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
 {
 	spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
 	vdev_t *vd = sdc->sdc_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	if (sdc->sdc_entry_limit == 0)
 		return (EINTR);
@@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
 	 * metaslab boundaries. So if needed we could add code
 	 * that handles metaslab-crossing segments in the future.
 	 */
-	VERIFY3U(type, ==, SM_FREE);
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_type, ==, SM_FREE);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
 	mutex_enter(&ms->ms_lock);
 	if (range_tree_is_empty(ms->ms_freeing))
 		vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
-	range_tree_add(ms->ms_freeing, offset, size);
+	range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
-	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
-	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
+	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+	    sme->sme_run);
+	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
 
-	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
-	vd->vdev_stat.vs_checkpoint_space -= size;
+	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+	vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
 	sdc->sdc_entry_limit--;
 
 	return (0);
@@ -291,12 +291,13 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
 	 * Thus, we set the maximum entries that the space map callback
 	 * will be applied to be half the entries that could fit in the
 	 * imposed memory limit.
+	 *
+	 * Note that since this is a conservative estimate we also
+	 * assume the worst case scenario in our computation where each
+	 * entry is two-word.
 	 */
 	uint64_t max_entry_limit =
-	    (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
-
-	uint64_t entries_in_sm =
-	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+	    (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
 
 	/*
 	 * Iterate from the end of the space map towards the beginning,
@@ -320,14 +321,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
 	spa_checkpoint_discard_sync_callback_arg_t sdc;
 	sdc.sdc_vd = vd;
 	sdc.sdc_txg = tx->tx_txg;
-	sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
+	sdc.sdc_entry_limit = max_entry_limit;
 
-	uint64_t entries_before = entries_in_sm;
+	uint64_t words_before =
+	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 	error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
 	    spa_checkpoint_discard_sync_callback, &sdc, tx);
 
-	uint64_t entries_after =
+	uint64_t words_after =
 	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 #ifdef ZFS_DEBUG
@@ -335,9 +337,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
 #endif
 
 	zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
-	    "deleted %llu entries - %llu entries are left",
-	    tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
-	    entries_after);
+	    "deleted %llu words - %llu words are left",
+	    tx->tx_txg, vd->vdev_id, (words_before - words_after),
+	    words_after);
 
 	if (error != EINTR) {
 		if (error != 0) {
@@ -346,15 +348,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
 			    "space map of vdev %llu\n",
 			    error, vd->vdev_id);
 		}
-		ASSERT0(entries_after);
+		ASSERT0(words_after);
 		ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
-		ASSERT0(vd->vdev_checkpoint_sm->sm_length);
+		ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
 
 		space_map_free(vd->vdev_checkpoint_sm, tx);
 		space_map_close(vd->vdev_checkpoint_sm);
 		vd->vdev_checkpoint_sm = NULL;
 
-		VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
+		VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
 	}
 }
diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c
index 0e5a4b976..5f67a7987 100644
--- a/module/zfs/space_map.c
+++ b/module/zfs/space_map.c
@@ -41,11 +41,36 @@
  * Note on space map block size:
  *
  * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * Larger blocks entail fewer I/O operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more I/O bandwidth
  * when only a few blocks have changed since the last transaction group.
  */
 
+/*
+ * Enabled whenever we want to stress test the use of double-word
+ * space map entries.
+ */
+boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+
+boolean_t
+sm_entry_is_debug(uint64_t e)
+{
+	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
+}
+
+boolean_t
+sm_entry_is_single_word(uint64_t e)
+{
+	uint8_t prefix = SM_PREFIX_DECODE(e);
+	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
+}
+
+boolean_t
+sm_entry_is_double_word(uint64_t e)
+{
+	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
+}
+
 /*
  * Iterate through the space map, invoking the callback on each (non-debug)
  * space map entry.
@@ -53,56 +78,157 @@
 int
 space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
 {
-	uint64_t *entry, *entry_map, *entry_map_end;
-	uint64_t bufsize, size, offset, end;
+	uint64_t sm_len = space_map_length(sm);
+	ASSERT3U(sm->sm_blksz, !=, 0);
+
+	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+	    ZIO_PRIORITY_SYNC_READ);
+
+	uint64_t blksz = sm->sm_blksz;
 	int error = 0;
+	for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+	    block_base += blksz) {
+		dmu_buf_t *db;
+		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
+		    block_base, FTAG, &db, DMU_READ_PREFETCH);
+		if (error != 0)
+			return (error);
 
-	end = space_map_length(sm);
+		uint64_t *block_start = db->db_data;
+		uint64_t block_length = MIN(sm_len - block_base, blksz);
+		uint64_t *block_end = block_start +
+		    (block_length / sizeof (uint64_t));
 
-	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
-	entry_map = vmem_alloc(bufsize, KM_SLEEP);
+		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
+		VERIFY3U(block_length, !=, 0);
+		ASSERT3U(blksz, ==, db->db_size);
 
-	if (end > bufsize) {
-		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
-		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
-	}
+		for (uint64_t *block_cursor = block_start;
+		    block_cursor < block_end && error == 0; block_cursor++) {
+			uint64_t e = *block_cursor;
 
-	for (offset = 0; offset < end && error == 0; offset += bufsize) {
-		size = MIN(end - offset, bufsize);
-		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
-		VERIFY(size != 0);
-		ASSERT3U(sm->sm_blksz, !=, 0);
+			if (sm_entry_is_debug(e)) /* Skip debug entries */
+				continue;
 
-		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    space_map_object(sm), offset, size);
+			uint64_t raw_offset, raw_run, vdev_id;
+			maptype_t type;
+			if (sm_entry_is_single_word(e)) {
+				type = SM_TYPE_DECODE(e);
+				vdev_id = SM_NO_VDEVID;
+				raw_offset = SM_OFFSET_DECODE(e);
+				raw_run = SM_RUN_DECODE(e);
+			} else {
+				/* it is a two-word entry */
+				ASSERT(sm_entry_is_double_word(e));
+				raw_run = SM2_RUN_DECODE(e);
+				vdev_id = SM2_VDEV_DECODE(e);
+
+				/* bounds-check, then read the second word */
+				block_cursor++;
+				VERIFY3P(block_cursor, <, block_end);
+				e = *block_cursor;
+
+				type = SM2_TYPE_DECODE(e);
+				raw_offset = SM2_OFFSET_DECODE(e);
+			}
 
-		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
-		    entry_map, DMU_READ_PREFETCH);
-		if (error != 0)
-			break;
+			uint64_t entry_offset = (raw_offset << sm->sm_shift) +
+			    sm->sm_start;
+			uint64_t entry_run = raw_run << sm->sm_shift;
 
-		entry_map_end = entry_map + (size / sizeof (uint64_t));
-		for (entry = entry_map; entry < entry_map_end && error == 0;
-		    entry++) {
-			uint64_t e = *entry;
-			uint64_t offset, size;
+			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+			ASSERT3U(entry_offset, >=, sm->sm_start);
+			ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
+			ASSERT3U(entry_run, <=, sm->sm_size);
+			ASSERT3U(entry_offset + entry_run, <=,
+			    sm->sm_start + sm->sm_size);
 
-			if (SM_DEBUG_DECODE(e))	/* Skip debug entries */
-				continue;
+			space_map_entry_t sme = {
+			    .sme_type = type,
+			    .sme_vdev = vdev_id,
+			    .sme_offset = entry_offset,
+			    .sme_run = entry_run
+			};
+			error = callback(&sme, arg);
+		}
+		dmu_buf_rele(db, FTAG);
+	}
+	return (error);
+}
 
-			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
-			    sm->sm_start;
-			size = SM_RUN_DECODE(e) << sm->sm_shift;
+/*
+ * Reads the entries from the last block of the space map into
+ * buf in reverse order. Populates nwords with number of words
+ * in the last block.
+ *
+ * Refer to block comment within space_map_incremental_destroy()
+ * to understand why this function is needed.
+ */
+static int
+space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
+    uint64_t bufsz, uint64_t *nwords)
+{
+	int error = 0;
+	dmu_buf_t *db;
+
+	/*
+	 * Find the offset of the last word in the space map and use
+	 * that to read the last block of the space map with
+	 * dmu_buf_hold().
+	 */
+	uint64_t last_word_offset =
+	    sm->sm_phys->smp_objsize - sizeof (uint64_t);
+	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
+	    FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (error != 0)
+		return (error);
 
-			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
-			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
-			VERIFY3U(offset, >=, sm->sm_start);
-			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
-			error = callback(SM_TYPE_DECODE(e), offset, size, arg);
+	ASSERT3U(sm->sm_object, ==, db->db_object);
+	ASSERT3U(sm->sm_blksz, ==, db->db_size);
+	ASSERT3U(bufsz, >=, db->db_size);
+	ASSERT(nwords != NULL);
+
+	uint64_t *words = db->db_data;
+	*nwords =
+	    (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+
+	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
+
+	uint64_t n = *nwords;
+	uint64_t j = n - 1;
+	for (uint64_t i = 0; i < n; i++) {
+		uint64_t entry = words[i];
+		if (sm_entry_is_double_word(entry)) {
+			/*
+			 * Since we are populating the buffer backwards
+			 * we have to be extra careful and add the two
+			 * words of the double-word entry in the right
+			 * order.
+			 */
+			ASSERT3U(j, >, 0);
+			buf[j - 1] = entry;
+
+			i++;
+			ASSERT3U(i, <, n);
+			entry = words[i];
+			buf[j] = entry;
+			j -= 2;
+		} else {
+			ASSERT(sm_entry_is_debug(entry) ||
+			    sm_entry_is_single_word(entry));
+			buf[j] = entry;
+			j--;
 		}
 	}
 
-	vmem_free(entry_map, bufsize);
+	/*
+	 * Assert that we wrote backwards all the
+	 * way to the beginning of the buffer.
+	 */
+	ASSERT3S(j, ==, -1);
+
+	dmu_buf_rele(db, FTAG);
 	return (error);
 }
 
@@ -116,124 +242,122 @@ int
 space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
     dmu_tx_t *tx)
 {
-	uint64_t bufsize, len;
-	uint64_t *entry_map;
-	int error = 0;
-
-	len = space_map_length(sm);
-	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
-	entry_map = zio_buf_alloc(bufsize);
+	uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+	uint64_t *buf = zio_buf_alloc(bufsz);
 
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/*
-	 * Since we can't move the starting offset of the space map
-	 * (e.g there are reference on-disk pointing to it), we destroy
-	 * its entries incrementally starting from the end.
+	 * Ideally we would want to iterate from the beginning of the
+	 * space map to the end in incremental steps. The issue with this
+	 * approach is that we don't have any field on-disk that points
+	 * us where to start between each step. We could try zeroing out
+	 * entries that we've destroyed, but this doesn't work either as
+	 * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
 	 *
-	 * The logic that follows is basically the same as the one used
-	 * in space_map_iterate() but it traverses the space map
-	 * backwards:
+	 * As a result, we destroy its entries incrementally starting from
+	 * the end after applying the callback to each of them.
 	 *
-	 * 1] We figure out the size of the buffer that we want to use
-	 *    to read the on-disk space map entries.
-	 * 2] We figure out the offset at the end of the space map where
-	 *    we will start reading entries into our buffer.
-	 * 3] We read the on-disk entries into the buffer.
-	 * 4] We iterate over the entries from end to beginning calling
-	 *    the callback function on each one. As we move from entry
-	 *    to entry we decrease the size of the space map, deleting
-	 *    effectively each entry.
-	 * 5] If there are no more entries in the space map or the
-	 *    callback returns a value other than 0, we stop iterating
-	 *    over the space map. If there are entries remaining and
-	 *    the callback returned zero we go back to step [1].
+	 * The problem with this approach is that we cannot literally
+	 * iterate through the words in the space map backwards as we
+	 * can't distinguish two-word space map entries from their second
+	 * word. Thus we do the following:
+	 *
+	 * 1] We get all the entries from the last block of the space map
+	 *    and put them into a buffer in reverse order. This way the
+	 *    last entry comes first in the buffer, the second to last is
+	 *    second, etc.
+	 * 2] We iterate through the entries in the buffer and we apply
+	 *    the callback to each one. As we move from entry to entry we
+	 *    decrease the size of the space map, effectively deleting
+	 *    each entry.
+	 * 3] If there are no more entries in the space map or the callback
+	 *    returns a value other than 0, we stop iterating over the
+	 *    space map. If there are entries remaining and the callback
+	 *    returned 0, we go back to step [1].
 	 */
-	uint64_t offset = 0, size = 0;
-	while (len > 0 && error == 0) {
-		size = MIN(bufsize, len);
-
-		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
-		VERIFY3U(size, >, 0);
-		ASSERT3U(sm->sm_blksz, !=, 0);
-
-		offset = len - size;
-
-		IMPLY(bufsize > len, offset == 0);
-		IMPLY(bufsize == len, offset == 0);
-		IMPLY(bufsize < len, offset > 0);
-
-
-		EQUIV(size == len, offset == 0);
-		IMPLY(size < len, bufsize < len);
-
-		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    space_map_object(sm), offset, size);
-
-		error = dmu_read(sm->sm_os, space_map_object(sm),
-		    offset, size, entry_map, DMU_READ_PREFETCH);
+	int error = 0;
+	while (space_map_length(sm) > 0 && error == 0) {
+		uint64_t nwords = 0;
+		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
+		    &nwords);
 		if (error != 0)
 			break;
 
-		uint64_t num_entries = size / sizeof (uint64_t);
-
-		ASSERT3U(num_entries, >, 0);
-
-		while (num_entries > 0) {
-			uint64_t e, entry_offset, entry_size;
-			maptype_t type;
-
-			e = entry_map[num_entries - 1];
+		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
 
-			ASSERT3U(num_entries, >, 0);
-			ASSERT0(error);
+		for (uint64_t i = 0; i < nwords; i++) {
+			uint64_t e = buf[i];
 
-			if (SM_DEBUG_DECODE(e)) {
+			if (sm_entry_is_debug(e)) {
 				sm->sm_phys->smp_objsize -= sizeof (uint64_t);
 				space_map_update(sm);
-				len -= sizeof (uint64_t);
-				num_entries--;
 				continue;
 			}
 
-			type = SM_TYPE_DECODE(e);
-			entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
-			    sm->sm_start;
-			entry_size = SM_RUN_DECODE(e) << sm->sm_shift;
+			int words = 1;
+			uint64_t raw_offset, raw_run, vdev_id;
+			maptype_t type;
+			if (sm_entry_is_single_word(e)) {
+				type = SM_TYPE_DECODE(e);
+				vdev_id = SM_NO_VDEVID;
+				raw_offset = SM_OFFSET_DECODE(e);
+				raw_run = SM_RUN_DECODE(e);
+			} else {
+				ASSERT(sm_entry_is_double_word(e));
+				words = 2;
+
+				raw_run = SM2_RUN_DECODE(e);
+				vdev_id = SM2_VDEV_DECODE(e);
+
+				/* bounds-check, then read the second word */
+				i++;
+				ASSERT3U(i, <, nwords);
+
+				e = buf[i];
+
+				type = SM2_TYPE_DECODE(e);
+				raw_offset = SM2_OFFSET_DECODE(e);
+			}
+
+			uint64_t entry_offset =
+			    (raw_offset << sm->sm_shift) + sm->sm_start;
+			uint64_t entry_run = raw_run << sm->sm_shift;
 
 			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
-			VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
 			VERIFY3U(entry_offset, >=, sm->sm_start);
-			VERIFY3U(entry_offset + entry_size, <=,
+			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
+			VERIFY3U(entry_run, <=, sm->sm_size);
+			VERIFY3U(entry_offset + entry_run, <=,
 			    sm->sm_start + sm->sm_size);
 
-			error = callback(type, entry_offset, entry_size, arg);
+			space_map_entry_t sme = {
+			    .sme_type = type,
+			    .sme_vdev = vdev_id,
+			    .sme_offset = entry_offset,
+			    .sme_run = entry_run
+			};
+			error = callback(&sme, arg);
 			if (error != 0)
 				break;
 
 			if (type == SM_ALLOC)
-				sm->sm_phys->smp_alloc -= entry_size;
+				sm->sm_phys->smp_alloc -= entry_run;
 			else
-				sm->sm_phys->smp_alloc += entry_size;
-
-			sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+				sm->sm_phys->smp_alloc += entry_run;
+			sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
 			space_map_update(sm);
-			len -= sizeof (uint64_t);
-			num_entries--;
 		}
-		IMPLY(error == 0, num_entries == 0);
-		EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0);
 	}
 
-	if (len == 0) {
+	if (space_map_length(sm) == 0) {
 		ASSERT0(error);
-		ASSERT0(offset);
-		ASSERT0(sm->sm_length);
 		ASSERT0(sm->sm_phys->smp_objsize);
 		ASSERT0(sm->sm_alloc);
 	}
 
-	zio_buf_free(entry_map, bufsize);
+	zio_buf_free(buf, bufsz);
 	return (error);
 }
 
@@ -244,16 +368,15 @@ typedef struct space_map_load_arg {
 } space_map_load_arg_t;
 
 static int
-space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+space_map_load_callback(space_map_entry_t *sme, void *arg)
 {
 	space_map_load_arg_t *smla = arg;
-	if (type == smla->smla_type) {
-		VERIFY3U(range_tree_space(smla->smla_rt) + size, <=,
+	if (sme->sme_type == smla->smla_type) {
+		VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
 		    smla->smla_sm->sm_size);
-		range_tree_add(smla->smla_rt, offset, size);
+		range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
 	} else {
-		range_tree_remove(smla->smla_rt, offset, size);
+		range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
 	}
 
 	return (0);
@@ -365,43 +488,237 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
 	}
 }
 
-uint64_t
-space_map_entries(space_map_t *sm, range_tree_t *rt)
+static void
+space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
 {
-	avl_tree_t *t = &rt->rt_root;
-	range_seg_t *rs;
-	uint64_t size, entries;
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+	uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+	    SM_DEBUG_ACTION_ENCODE(maptype) |
+	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
+	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
+	    sizeof (dentry), &dentry, tx);
+
+	sm->sm_phys->smp_objsize += sizeof (dentry);
+}
+
+/*
+ * Writes one or more entries given a segment.
+ *
+ * Note: The function may release the dbuf from the pointer initially
+ * passed to it, and return a different dbuf. Also, the space map's
+ * dbuf must be dirty for the changes in sm_phys to take effect.
+ */
+static void
+space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
+    uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
+{
+	ASSERT3U(words, !=, 0);
+	ASSERT3U(words, <=, 2);
+
+	/* ensure the vdev_id can be represented by the space map */
+	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
 
 	/*
-	 * All space_maps always have a debug entry so account for it here.
+	 * if this is a single word entry, ensure that no vdev was
+	 * specified.
 	 */
-	entries = 1;
+	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
+
+	dmu_buf_t *db = *dbp;
+	ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+	uint64_t *block_base = db->db_data;
+	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
+	uint64_t *block_cursor = block_base +
+	    (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+
+	ASSERT3P(block_cursor, <=, block_end);
+
+	uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+	uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
+
+	ASSERT3U(rs->rs_start, >=, sm->sm_start);
+	ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
+	ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
+	ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
+
+	while (size != 0) {
+		ASSERT3P(block_cursor, <=, block_end);
+
+		/*
+		 * If we are at the end of this block, release it and start
+		 * writing from the beginning of the next block.
+		 */
+		if (block_cursor == block_end) {
+			dmu_buf_rele(db, tag);
+
+			uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+			VERIFY0(dmu_buf_hold(sm->sm_os,
+			    space_map_object(sm), next_word_offset,
+			    tag, &db, DMU_READ_PREFETCH));
+			dmu_buf_will_dirty(db, tx);
+
+			/* update caller's dbuf */
+			*dbp = db;
+
+			ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+			block_base = db->db_data;
+			block_cursor = block_base;
+			block_end = block_base +
+			    (db->db_size / sizeof (uint64_t));
+		}
+
+		/*
+		 * If we are writing a two-word entry and we only have one
+		 * word left on this block, just pad it with an empty debug
+		 * entry and write the two-word entry in the next block.
+		 */
+		uint64_t *next_entry = block_cursor + 1;
+		if (next_entry == block_end && words > 1) {
+			ASSERT3U(words, ==, 2);
+			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+			    SM_DEBUG_ACTION_ENCODE(0) |
+			    SM_DEBUG_SYNCPASS_ENCODE(0) |
+			    SM_DEBUG_TXG_ENCODE(0);
+			block_cursor++;
+			sm->sm_phys->smp_objsize += sizeof (uint64_t);
+			ASSERT3P(block_cursor, ==, block_end);
+			continue;
+		}
+
+		uint64_t run_len = MIN(size, run_max);
+		switch (words) {
+		case 1:
+			*block_cursor = SM_OFFSET_ENCODE(start) |
+			    SM_TYPE_ENCODE(maptype) |
+			    SM_RUN_ENCODE(run_len);
+			block_cursor++;
+			break;
+		case 2:
+			/* write the first word of the entry */
+			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
+			    SM2_RUN_ENCODE(run_len) |
+			    SM2_VDEV_ENCODE(vdev_id);
+			block_cursor++;
+
+			/* move on to the second word of the entry */
+			ASSERT3P(block_cursor, <, block_end);
+			*block_cursor = SM2_TYPE_ENCODE(maptype) |
+			    SM2_OFFSET_ENCODE(start);
+			block_cursor++;
+			break;
+		default:
+			panic("%d-word space map entries are not supported",
+			    words);
+			break;
+		}
+		sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
+
+		start += run_len;
+		size -= run_len;
+	}
+	ASSERT0(size);
+
+}
+
+/*
+ * Note: The space map's dbuf must be dirty for the changes in sm_phys to
+ * take effect.
+ */
+static void
+space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    uint64_t vdev_id, dmu_tx_t *tx)
+{
+	spa_t *spa = tx->tx_pool->dp_spa;
+	dmu_buf_t *db;
+
+	space_map_write_intro_debug(sm, maptype, tx);
+
+#ifdef DEBUG
+	/*
+	 * We do this right after we write the intro debug entry
+	 * because the estimate does not take it into account.
+	 */
+	uint64_t initial_objsize = sm->sm_phys->smp_objsize;
+	uint64_t estimated_growth =
+	    space_map_estimate_optimal_size(sm, rt, vdev_id);
+	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
+#endif
 
 	/*
-	 * Traverse the range tree and calculate the number of space map
-	 * entries that would be required to write out the range tree.
+	 * Find the offset right after the last word in the space map
+	 * and use that to get a hold of the last block, so we can
+	 * start appending to it.
 	 */
-	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
-		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
-		entries += howmany(size, SM_RUN_MAX);
+	uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
+	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
+	ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+	dmu_buf_will_dirty(db, tx);
+
+	avl_tree_t *t = &rt->rt_root;
+	for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+		uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+		uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+		uint8_t words = 1;
+
+		/*
+		 * We only write two-word entries when both of the following
+		 * are true:
+		 *
+		 * [1] The feature is enabled.
+		 * [2] The offset or run is too big for a single-word entry,
+		 * 	or the vdev_id is set (meaning not equal to
+		 * 	SM_NO_VDEVID).
+		 *
+		 * Note that for purposes of testing we've added the case that
+		 * we write two-word entries occasionally when the feature is
+		 * enabled and zfs_force_some_double_word_sm_entries has been
+		 * set.
+		 */
+		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
+		    (offset >= (1ULL << SM_OFFSET_BITS) ||
+		    length > SM_RUN_MAX ||
+		    vdev_id != SM_NO_VDEVID ||
+		    (zfs_force_some_double_word_sm_entries &&
+		    spa_get_random(100) == 0)))
+			words = 2;
+
+		space_map_write_seg(sm, rs, maptype, vdev_id, words,
+		    &db, FTAG, tx);
 	}
-	return (entries);
+
+	dmu_buf_rele(db, FTAG);
+
+#ifdef DEBUG
+	/*
+	 * We expect our estimation to be based on the worst case
+	 * scenario [see comment in space_map_estimate_optimal_size()].
+	 * Therefore we expect the actual objsize to be equal or less
+	 * than whatever we estimated it to be.
+	 */
+	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
+#endif
 }
 
+/*
+ * Note: This function manipulates the state of the given space map but
+ * does not hold any locks implicitly. Thus the caller is responsible
+ * for synchronizing writes to the space map.
+ */
 void
 space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
-    dmu_tx_t *tx)
+    uint64_t vdev_id, dmu_tx_t *tx)
 {
-	objset_t *os = sm->sm_os;
-	spa_t *spa = dmu_objset_spa(os);
-	avl_tree_t *t = &rt->rt_root;
-	range_seg_t *rs;
-	uint64_t size, total, rt_space, nodes;
-	uint64_t *entry, *entry_map, *entry_map_end;
-	uint64_t expected_entries, actual_entries = 1;
-
-	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+	ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os)));
 	VERIFY3U(space_map_object(sm), !=, 0);
+
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/*
@@ -421,58 +738,10 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
 	else
 		sm->sm_phys->smp_alloc -= range_tree_space(rt);
 
-	expected_entries = space_map_entries(sm, rt);
-
-	entry_map = vmem_alloc(sm->sm_blksz, KM_SLEEP);
-	entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
-	entry = entry_map;
-
-	*entry++ = SM_DEBUG_ENCODE(1) |
-	    SM_DEBUG_ACTION_ENCODE(maptype) |
-	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
-	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
-
-	total = 0;
-	nodes = avl_numnodes(&rt->rt_root);
-	rt_space = range_tree_space(rt);
-	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
-		uint64_t start;
+	uint64_t nodes = avl_numnodes(&rt->rt_root);
+	uint64_t rt_space = range_tree_space(rt);
 
-		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
-		start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
-
-		total += size << sm->sm_shift;
-
-		while (size != 0) {
-			uint64_t run_len;
-
-			run_len = MIN(size, SM_RUN_MAX);
-
-			if (entry == entry_map_end) {
-				dmu_write(os, space_map_object(sm),
-				    sm->sm_phys->smp_objsize, sm->sm_blksz,
-				    entry_map, tx);
-				sm->sm_phys->smp_objsize += sm->sm_blksz;
-				entry = entry_map;
-			}
-
-			*entry++ = SM_OFFSET_ENCODE(start) |
-			    SM_TYPE_ENCODE(maptype) |
-			    SM_RUN_ENCODE(run_len);
-
-			start += run_len;
-			size -= run_len;
-			actual_entries++;
-		}
-	}
-
-	if (entry != entry_map) {
-		size = (entry - entry_map) * sizeof (uint64_t);
-		dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
-		    size, entry_map, tx);
-		sm->sm_phys->smp_objsize += size;
-	}
-	ASSERT3U(expected_entries, ==, actual_entries);
+	space_map_write_impl(sm, rt, maptype, vdev_id, tx);
 
 	/*
 	 * Ensure that the space_map's accounting wasn't changed
@@ -480,9 +749,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
 	 */
 	VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
 	VERIFY3U(range_tree_space(rt), ==, rt_space);
-	VERIFY3U(range_tree_space(rt), ==, total);
-
-	vmem_free(entry_map, sm->sm_blksz);
 }
 
 static int
@@ -529,7 +795,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
 		space_map_close(sm);
 		return (error);
 	}
-
 	*smp = sm;
 
 	return (0);
@@ -661,6 +926,133 @@ space_map_free(space_map_t *sm, dmu_tx_t *tx)
 	sm->sm_object = 0;
 }
 
+/*
+ * Given a range tree, makes a worst-case estimate of how much space
+ * the tree's segments would take if they were written to the given
+ * space map.
+ */
+uint64_t
+space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+    uint64_t vdev_id)
+{
+	spa_t *spa = dmu_objset_spa(sm->sm_os);
+	uint64_t shift = sm->sm_shift;
+	uint64_t *histogram = rt->rt_histogram;
+	uint64_t entries_for_seg = 0;
+
+	/*
+	 * In order to get a quick estimate of the optimal size that this
+	 * range tree would have on-disk as a space map, we iterate through
+	 * its histogram buckets instead of iterating through its nodes.
+	 *
+	 * Note that this is a highest-bound/worst-case estimate for the
+	 * following reasons:
+	 *
+	 * 1] We assume that we always add a debug padding for each block
+	 *    we write and we also assume that we start at the last word
+	 *    of a block attempting to write a two-word entry.
+	 * 2] Rounding up errors due to the way segments are distributed
+	 *    in the buckets of the range tree's histogram.
+	 * 3] The activation of zfs_force_some_double_word_sm_entries
+	 *    (tunable) when testing.
+	 *
+	 * = Math and Rounding Errors =
+	 *
+	 * rt_histogram[i] bucket of a range tree represents the number
+	 * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
+	 * that, we want to divide the buckets into groups: Buckets that
+	 * can be represented using a single-word entry, ones that can
+	 * be represented with a double-word entry, and ones that can
+	 * only be represented with multiple two-word entries.
+	 *
+	 * [Note that if the new encoding feature is not enabled there
+	 * are only two groups: single-word entry buckets and multiple
+	 * single-word entry buckets. The information below assumes
+	 * two-word entries enabled, but it can easily be applied when
+	 * the feature is not enabled]
+	 *
+	 * To find the highest bucket that can be represented with a
+	 * single-word entry we look at the maximum run that such entry
+	 * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
+	 * the run of a space map entry is shifted by sm_shift, thus we
+	 * add it to the exponent]. This way, excluding the value of the
+	 * maximum run that can be represented by a single-word entry,
+	 * all runs that are smaller exist in buckets 0 to
+	 * SM_RUN_BITS + shift - 1.
+	 *
+	 * To find the highest bucket that can be represented with a
+	 * double-word entry, we follow the same approach. Finally, any
+	 * buckets higher than that are represented with multiple two-word
+	 * entries. To be more specific, if the highest bucket whose
+	 * segments can be represented with a single two-word entry is X,
+	 * then bucket X+1 will need 2 two-word entries for each of its
+	 * segments, X+2 will need 4, X+3 will need 8, ...etc.
+	 *
+	 * With all of the above we make our estimation based on bucket
+	 * groups. There is a rounding error though. As we mentioned in
+	 * the example with the one-word entry, the maximum run that can
+	 * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
+	 * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
+	 * that length fall into the next bucket (and bucket group) where
+	 * we start counting two-word entries and this is one more reason
+	 * why the estimated size may end up being bigger than the actual
+	 * size written.
+	 */
+	uint64_t size = 0;
+	uint64_t idx = 0;
+
+	if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
+	    (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
+
+		/*
+		 * If we are trying to force some double word entries just
+		 * assume the worst-case of every single word entry being
+		 * written as a double word entry.
+		 */
+		uint64_t entry_size =
+		    (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
+		    zfs_force_some_double_word_sm_entries) ?
+		    (2 * sizeof (uint64_t)) : sizeof (uint64_t);
+
+		uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
+		for (; idx <= single_entry_max_bucket; idx++)
+			size += histogram[idx] * entry_size;
+
+		if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
+			for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+				ASSERT3U(idx, >=, single_entry_max_bucket);
+				entries_for_seg =
+				    1ULL << (idx - single_entry_max_bucket);
+				size += histogram[idx] *
+				    entries_for_seg * entry_size;
+			}
+			return (size);
+		}
+	}
+
+	ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
+
+	uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
+	for (; idx <= double_entry_max_bucket; idx++)
+		size += histogram[idx] * 2 * sizeof (uint64_t);
+
+	for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+		ASSERT3U(idx, >=, double_entry_max_bucket);
+		entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
+		size += histogram[idx] *
+		    entries_for_seg * 2 * sizeof (uint64_t);
+	}
+
+	/*
+	 * Assume the worst case where we start with the padding at the end
+	 * of the current block and we add an extra padding entry at the end
+	 * of all subsequent blocks.
+	 */
+	size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
+
+	return (size);
+}
+
 uint64_t
 space_map_object(space_map_t *sm)
 {
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index cf1bf2837..a2f1f0658 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -2634,7 +2634,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
 	mutex_exit(&vd->vdev_dtl_lock);
 
 	space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx);
-	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
+	space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(rtsync, NULL, NULL);
 
 	range_tree_destroy(rtsync);
diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c
index b14b153b2..f56d024ca 100644
--- a/module/zfs/vdev_indirect.c
+++ b/module/zfs/vdev_indirect.c
@@ -825,7 +825,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
 	    space_map_object(vd->vdev_obsolete_sm));
 
 	space_map_write(vd->vdev_obsolete_sm,
-	    vd->vdev_obsolete_segments, SM_ALLOC, tx);
+	    vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
 	space_map_update(vd->vdev_obsolete_sm);
 	range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
 }
diff --git a/module/zfs/vdev_indirect_mapping.c b/module/zfs/vdev_indirect_mapping.c
index d91f23383..a2766bd0d 100644
--- a/module/zfs/vdev_indirect_mapping.c
+++ b/module/zfs/vdev_indirect_mapping.c
@@ -14,7 +14,7 @@
  */
 
 /*
- * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
  */
 
 #include <sys/dmu_tx.h>
@@ -539,14 +539,13 @@ typedef struct load_obsolete_space_map_arg {
 } load_obsolete_space_map_arg_t;
 
 static int
-load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
 {
 	load_obsolete_space_map_arg_t *losma = arg;
-	ASSERT3S(type, ==, SM_ALLOC);
+	ASSERT3S(sme->sme_type, ==, SM_ALLOC);
 
 	vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
-	    offset, size, losma->losma_counts);
+	    sme->sme_offset, sme->sme_run, losma->losma_counts);
 
 	return (0);
 }
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index fb389cb10..8994332b5 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -74,6 +74,7 @@ typeset -a properties=(
     "feature@device_removal"
     "feature@obsolete_counts"
     "feature@zpool_checkpoint"
+    "feature@spacemap_v2"
 )
 
 # Additional properties added for Linux.
diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
index 54dcd59c3..f1abad063 100755
--- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
+++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh
@@ -19,7 +19,7 @@
 
 #
 # DESCRIPTION:
-# 	Discard checkpoint on a stressed pool. Ensure that we can
+#	Discard checkpoint on a stressed pool. Ensure that we can
 #	export and import the pool while discarding but not run any
 #	operations that have to do with the checkpoint or change the
 #	pool's config.
@@ -63,6 +63,10 @@ log_onexit test_cleanup
 # the current setup the checkpoint space maps should
 # have tens of thousands of entries.
 #
+# Note: If two-word entries are used in the space
+#	map, we should have even more time to
+#	verify this.
+#
 set_tunable64 zfs_spa_discard_memory_limit 128
 
 log_must zpool checkpoint $NESTEDPOOL
-- 
2.40.0