granicus.if.org Git - zfs/commitdiff
Illumos 4757, 4913
author: Matthew Ahrens <mahrens@delphix.com>
Thu, 5 Jun 2014 21:19:08 +0000 (13:19 -0800)
committer: Brian Behlendorf <behlendorf1@llnl.gov>
Fri, 1 Aug 2014 21:28:05 +0000 (14:28 -0700)
4757 ZFS embedded-data block pointers ("zero block compression")
4913 zfs release should not be subject to space checks

Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/4757
  https://www.illumos.org/issues/4913
  https://github.com/illumos/illumos-gate/commit/5d7b4d4

Porting notes:

For compatibility with the fastpath code, the zio_done() function
needed to be updated.  Because embedded-data block pointers do
not require DVAs to be allocated, the associated vdevs will not
be marked and therefore should not be unmarked.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2544

46 files changed:
cmd/zdb/zdb.c
cmd/zfs/zfs_main.c
cmd/zstreamdump/zstreamdump.c
cmd/ztest/ztest.c
include/libzfs.h
include/libzfs_core.h
include/sys/Makefile.am
include/sys/blkptr.h [new file with mode: 0644]
include/sys/dbuf.h
include/sys/dmu.h
include/sys/dmu_impl.h
include/sys/dmu_send.h
include/sys/spa.h
include/sys/spa_impl.h
include/sys/zfs_ioctl.h
include/sys/zio.h
include/zfeature_common.h
lib/libzfs/libzfs_sendrecv.c
lib/libzfs_core/libzfs_core.c
lib/libzpool/Makefile.am
man/man5/zpool-features.5
man/man8/zfs.8
module/zfs/Makefile.in
module/zfs/arc.c
module/zfs/blkptr.c [new file with mode: 0644]
module/zfs/bpobj.c
module/zfs/dbuf.c
module/zfs/dmu.c
module/zfs/dmu_objset.c
module/zfs/dmu_send.c
module/zfs/dmu_traverse.c
module/zfs/dnode.c
module/zfs/dnode_sync.c
module/zfs/dsl_dataset.c
module/zfs/dsl_destroy.c
module/zfs/dsl_scan.c
module/zfs/dsl_userhold.c
module/zfs/metaslab.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/zfeature_common.c
module/zfs/zfs_ioctl.c
module/zfs/zil.c
module/zfs/zio.c
module/zfs/zio_checksum.c
module/zfs/zio_compress.c

index 66b91cd97a68859fbc9adf40bbbc1275bb9828fe..dcb77c24ddbc1d83783d645df908f82363139659 100644 (file)
@@ -1047,6 +1047,16 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
                return;
        }
 
+       if (BP_IS_EMBEDDED(bp)) {
+               (void) sprintf(blkbuf,
+                   "EMBEDDED et=%u %llxL/%llxP B=%llu",
+                   (int)BPE_GET_ETYPE(bp),
+                   (u_longlong_t)BPE_GET_LSIZE(bp),
+                   (u_longlong_t)BPE_GET_PSIZE(bp),
+                   (u_longlong_t)bp->blk_birth);
+               return;
+       }
+
        blkbuf[0] = '\0';
 
        for (i = 0; i < ndvas; i++)
@@ -1066,7 +1076,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp)
                    "%llxL/%llxP F=%llu B=%llu/%llu",
                    (u_longlong_t)BP_GET_LSIZE(bp),
                    (u_longlong_t)BP_GET_PSIZE(bp),
-                   (u_longlong_t)bp->blk_fill,
+                   (u_longlong_t)BP_GET_FILL(bp),
                    (u_longlong_t)bp->blk_birth,
                    (u_longlong_t)BP_PHYSICAL_BIRTH(bp));
        }
@@ -1079,8 +1089,10 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb,
        char blkbuf[BP_SPRINTF_LEN];
        int l;
 
-       ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
-       ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+       if (!BP_IS_EMBEDDED(bp)) {
+               ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
+               ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
+       }
 
        (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb));
 
@@ -1134,10 +1146,10 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
                        err = visit_indirect(spa, dnp, cbp, &czb);
                        if (err)
                                break;
-                       fill += cbp->blk_fill;
+                       fill += BP_GET_FILL(cbp);
                }
                if (!err)
-                       ASSERT3U(fill, ==, bp->blk_fill);
+                       ASSERT3U(fill, ==, BP_GET_FILL(bp));
                (void) arc_buf_remove_ref(buf, &buf);
        }
 
@@ -1861,14 +1873,14 @@ dump_dir(objset_t *os)
 
        if (dds.dds_type == DMU_OST_META) {
                dds.dds_creation_txg = TXG_INITIAL;
-               usedobjs = os->os_rootbp->blk_fill;
+               usedobjs = BP_GET_FILL(os->os_rootbp);
                refdbytes = os->os_spa->spa_dsl_pool->
                    dp_mos_dir->dd_phys->dd_used_bytes;
        } else {
                dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
        }
 
-       ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill);
+       ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp));
 
        zdb_nicenum(refdbytes, numbuf);
 
@@ -2171,6 +2183,9 @@ typedef struct zdb_cb {
        zdb_blkstats_t  zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
        uint64_t        zcb_dedup_asize;
        uint64_t        zcb_dedup_blocks;
+       uint64_t        zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES];
+       uint64_t        zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES]
+           [BPE_PAYLOAD_SIZE];
        uint64_t        zcb_start;
        uint64_t        zcb_lastprint;
        uint64_t        zcb_totalasize;
@@ -2204,6 +2219,13 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
                zb->zb_psize_histogram[BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT]++;
        }
 
+       if (BP_IS_EMBEDDED(bp)) {
+               zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++;
+               zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)]
+                   [BPE_GET_PSIZE(bp)]++;
+               return;
+       }
+
        if (dump_opt['L'])
                return;
 
@@ -2301,7 +2323,8 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 
        is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type));
 
-       if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) {
+       if (!BP_IS_EMBEDDED(bp) &&
+           (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) {
                size_t size = BP_GET_PSIZE(bp);
                void *data = zio_data_buf_alloc(size);
                int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW;
@@ -2497,8 +2520,9 @@ dump_block_stats(spa_t *spa)
        zdb_blkstats_t *zb, *tzb;
        uint64_t norm_alloc, norm_space, total_alloc, total_found;
        int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD;
-       int leaks = 0;
+       boolean_t leaks = B_FALSE;
        int e;
+       bp_embedded_type_t i;
 
        (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
            (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
@@ -2587,7 +2611,7 @@ dump_block_stats(spa_t *spa)
                    (u_longlong_t)total_alloc,
                    (dump_opt['L']) ? "unreachable" : "leaked",
                    (longlong_t)(total_alloc - total_found));
-               leaks = 1;
+               leaks = B_TRUE;
        }
 
        if (tzb->zb_count == 0)
@@ -2617,6 +2641,23 @@ dump_block_stats(spa_t *spa)
        (void) printf("\tSPA allocated: %10llu     used: %5.2f%%\n",
            (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
 
+       for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) {
+               if (zcb.zcb_embedded_blocks[i] == 0)
+                       continue;
+               (void) printf("\n");
+               (void) printf("\tadditional, non-pointer bps of type %u: "
+                   "%10llu\n",
+                   i, (u_longlong_t)zcb.zcb_embedded_blocks[i]);
+
+               if (dump_opt['b'] >= 3) {
+                       (void) printf("\t number of (compressed) bytes:  "
+                           "number of bps\n");
+                       dump_histogram(zcb.zcb_embedded_histogram[i],
+                           sizeof (zcb.zcb_embedded_histogram[i]) /
+                           sizeof (zcb.zcb_embedded_histogram[i][0]), 0);
+               }
+       }
+
        if (dump_opt['b'] >= 2) {
                int l, t, level;
                (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE"
@@ -2718,14 +2759,14 @@ zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        avl_index_t where;
        zdb_ddt_entry_t *zdde, zdde_search;
 
-       if (BP_IS_HOLE(bp))
+       if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
                return (0);
 
        if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) {
                (void) printf("traversing objset %llu, %llu objects, "
                    "%lu blocks so far\n",
                    (u_longlong_t)zb->zb_objset,
-                   (u_longlong_t)bp->blk_fill,
+                   (u_longlong_t)BP_GET_FILL(bp),
                    avl_numnodes(t));
        }
 
index 521ce3c2bbe945d9ac66b0754d199d856335740d..84073435e2d769fa1c20899a21eec32ec150ebcf 100644 (file)
@@ -258,9 +258,9 @@ get_usage(zfs_help_t idx)
        case HELP_ROLLBACK:
                return (gettext("\trollback [-rRf] <snapshot>\n"));
        case HELP_SEND:
-               return (gettext("\tsend [-DnPpRrv] [-[iI] snapshot] "
+               return (gettext("\tsend [-DnPpRrve] [-[iI] snapshot] "
                    "<snapshot>\n"
-                   "\tsend [-i snapshot|bookmark] "
+                   "\tsend [-e] [-i snapshot|bookmark] "
                    "<filesystem|volume|snapshot>\n"));
        case HELP_SET:
                return (gettext("\tset <property=value> "
@@ -3338,6 +3338,8 @@ rollback_check_dependent(zfs_handle_t *zhp, void *data)
        zfs_close(zhp);
        return (0);
 }
+
+
 /*
  * Report any snapshots more recent than the one specified.  Used when '-r' is
  * not specified.  We reuse this same callback for the snapshot dependents - if
@@ -3677,7 +3679,7 @@ zfs_do_send(int argc, char **argv)
        boolean_t extraverbose = B_FALSE;
 
        /* check options */
-       while ((c = getopt(argc, argv, ":i:I:RDpvnP")) != -1) {
+       while ((c = getopt(argc, argv, ":i:I:RDpvnPe")) != -1) {
                switch (c) {
                case 'i':
                        if (fromname)
@@ -3712,6 +3714,9 @@ zfs_do_send(int argc, char **argv)
                case 'n':
                        flags.dryrun = B_TRUE;
                        break;
+               case 'e':
+                       flags.embed_data = B_TRUE;
+                       break;
                case ':':
                        (void) fprintf(stderr, gettext("missing argument for "
                            "'%c' option\n"), optopt);
@@ -3750,6 +3755,7 @@ zfs_do_send(int argc, char **argv)
        if (strchr(argv[0], '@') == NULL ||
            (fromname && strchr(fromname, '#') != NULL)) {
                char frombuf[ZFS_MAXNAMELEN];
+               enum lzc_send_flags lzc_flags = 0;
 
                if (flags.replicate || flags.doall || flags.props ||
                    flags.dedup || flags.dryrun || flags.verbose ||
@@ -3764,6 +3770,9 @@ zfs_do_send(int argc, char **argv)
                if (zhp == NULL)
                        return (1);
 
+               if (flags.embed_data)
+                       lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+
                if (fromname != NULL &&
                    (fromname[0] == '#' || fromname[0] == '@')) {
                        /*
@@ -3777,7 +3786,7 @@ zfs_do_send(int argc, char **argv)
                        (void) strlcat(frombuf, fromname, sizeof (frombuf));
                        fromname = frombuf;
                }
-               err = zfs_send_one(zhp, fromname, STDOUT_FILENO);
+               err = zfs_send_one(zhp, fromname, STDOUT_FILENO, lzc_flags);
                zfs_close(zhp);
                return (err != 0);
        }
index a4c451d53029e62b3b349138bcf19d9ed6167902..dd8b31ccc9253bdb600281327aeb45fed1e08f44 100644 (file)
@@ -36,7 +36,6 @@
 #include <sys/zfs_ioctl.h>
 #include <zfs_fletcher.h>
 
-uint64_t drr_record_count[DRR_NUMTYPES];
 uint64_t total_write_size = 0;
 uint64_t total_stream_len = 0;
 FILE *send_stream = 0;
@@ -81,6 +80,8 @@ int
 main(int argc, char *argv[])
 {
        char *buf = malloc(INITIAL_BUFLEN);
+       uint64_t drr_record_count[DRR_NUMTYPES] = { 0 };
+       uint64_t total_records = 0;
        dmu_replay_record_t thedrr;
        dmu_replay_record_t *drr = &thedrr;
        struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
@@ -91,6 +92,7 @@ main(int argc, char *argv[])
        struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref;
        struct drr_free *drrf = &thedrr.drr_u.drr_free;
        struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+       struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
        char c;
        boolean_t verbose = B_FALSE;
        boolean_t first = B_TRUE;
@@ -170,6 +172,7 @@ main(int argc, char *argv[])
                }
 
                drr_record_count[drr->drr_type]++;
+               total_records++;
 
                switch (drr->drr_type) {
                case DRR_BEGIN:
@@ -286,8 +289,8 @@ main(int argc, char *argv[])
                                    drro->drr_bonuslen);
                        }
                        if (drro->drr_bonuslen > 0) {
-                               (void) ssread(buf, P2ROUNDUP(drro->drr_bonuslen,
-                                   8), &zc);
+                               (void) ssread(buf,
+                                   P2ROUNDUP(drro->drr_bonuslen, 8), &zc);
                        }
                        break;
 
@@ -397,6 +400,38 @@ main(int argc, char *argv[])
                        }
                        (void) ssread(buf, drrs->drr_length, &zc);
                        break;
+               case DRR_WRITE_EMBEDDED:
+                       if (do_byteswap) {
+                               drrwe->drr_object =
+                                   BSWAP_64(drrwe->drr_object);
+                               drrwe->drr_offset =
+                                   BSWAP_64(drrwe->drr_offset);
+                               drrwe->drr_length =
+                                   BSWAP_64(drrwe->drr_length);
+                               drrwe->drr_toguid =
+                                   BSWAP_64(drrwe->drr_toguid);
+                               drrwe->drr_lsize =
+                                   BSWAP_32(drrwe->drr_lsize);
+                               drrwe->drr_psize =
+                                   BSWAP_32(drrwe->drr_psize);
+                       }
+                       if (verbose) {
+                               (void) printf("WRITE_EMBEDDED object = %llu "
+                                   "offset = %llu length = %llu\n"
+                                   "toguid = %llx comp = %u etype = %u "
+                                   "lsize = %u psize = %u\n",
+                                   (u_longlong_t)drrwe->drr_object,
+                                   (u_longlong_t)drrwe->drr_offset,
+                                   (u_longlong_t)drrwe->drr_length,
+                                   (u_longlong_t)drrwe->drr_toguid,
+                                   drrwe->drr_compression,
+                                   drrwe->drr_etype,
+                                   drrwe->drr_lsize,
+                                   drrwe->drr_psize);
+                       }
+                       (void) ssread(buf,
+                           P2ROUNDUP(drrwe->drr_psize, 8), &zc);
+                       break;
                case DRR_NUMTYPES:
                        /* should never be reached */
                        exit(1);
@@ -418,18 +453,16 @@ main(int argc, char *argv[])
            (u_longlong_t)drr_record_count[DRR_FREEOBJECTS]);
        (void) printf("\tTotal DRR_WRITE records = %lld\n",
            (u_longlong_t)drr_record_count[DRR_WRITE]);
+       (void) printf("\tTotal DRR_WRITE_BYREF records = %lld\n",
+           (u_longlong_t)drr_record_count[DRR_WRITE_BYREF]);
+       (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld\n",
+           (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED]);
        (void) printf("\tTotal DRR_FREE records = %lld\n",
            (u_longlong_t)drr_record_count[DRR_FREE]);
        (void) printf("\tTotal DRR_SPILL records = %lld\n",
            (u_longlong_t)drr_record_count[DRR_SPILL]);
        (void) printf("\tTotal records = %lld\n",
-           (u_longlong_t)(drr_record_count[DRR_BEGIN] +
-           drr_record_count[DRR_OBJECT] +
-           drr_record_count[DRR_FREEOBJECTS] +
-           drr_record_count[DRR_WRITE] +
-           drr_record_count[DRR_FREE] +
-           drr_record_count[DRR_SPILL] +
-           drr_record_count[DRR_END]));
+           (u_longlong_t)total_records);
        (void) printf("\tTotal write size = %lld (0x%llx)\n",
            (u_longlong_t)total_write_size, (u_longlong_t)total_write_size);
        (void) printf("\tTotal stream length = %lld (0x%llx)\n",
index 0a0fa7f490a86864b9ad4241f4004b25b4c6bd6e..a087444c8bda606297a2ff827e218ad3ebc06ed9 100644 (file)
@@ -52,7 +52,7 @@
  *     At random times, the child self-immolates with a SIGKILL.
  *     This is the software equivalent of pulling the power cord.
  *     The parent then runs the test again, using the existing
- *     storage pool, as many times as desired. If backwards compatability
+ *     storage pool, as many times as desired. If backwards compatibility
  *     testing is enabled ztest will sometimes run the "older" version
  *     of ztest after a SIGKILL.
  *
@@ -1301,13 +1301,13 @@ static void
 ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object,
     uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg)
 {
-       ASSERT(bt->bt_magic == BT_MAGIC);
-       ASSERT(bt->bt_objset == dmu_objset_id(os));
-       ASSERT(bt->bt_object == object);
-       ASSERT(bt->bt_offset == offset);
-       ASSERT(bt->bt_gen <= gen);
-       ASSERT(bt->bt_txg <= txg);
-       ASSERT(bt->bt_crtxg == crtxg);
+       ASSERT3U(bt->bt_magic, ==, BT_MAGIC);
+       ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os));
+       ASSERT3U(bt->bt_object, ==, object);
+       ASSERT3U(bt->bt_offset, ==, offset);
+       ASSERT3U(bt->bt_gen, <=, gen);
+       ASSERT3U(bt->bt_txg, <=, txg);
+       ASSERT3U(bt->bt_crtxg, ==, crtxg);
 }
 
 static ztest_block_tag_t *
@@ -3557,6 +3557,11 @@ ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id)
        if (error)
                fatal(0, "dmu_objset_own(%s) = %d", snap2name, error);
        error = dsl_dataset_promote(clone2name, NULL);
+       if (error == ENOSPC) {
+               dmu_objset_disown(os, FTAG);
+               ztest_record_enospc(FTAG);
+               goto out;
+       }
        if (error != EBUSY)
                fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
                    error);
@@ -3739,11 +3744,19 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id)
                return;
        }
 
-       dmu_object_set_checksum(os, bigobj,
-           (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx);
+       enum zio_checksum cksum;
+       do {
+               cksum = (enum zio_checksum)
+                   ztest_random_dsl_prop(ZFS_PROP_CHECKSUM);
+       } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+       dmu_object_set_checksum(os, bigobj, cksum, tx);
 
-       dmu_object_set_compress(os, bigobj,
-           (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx);
+       enum zio_compress comp;
+       do {
+               comp = (enum zio_compress)
+                   ztest_random_dsl_prop(ZFS_PROP_COMPRESSION);
+       } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS);
+       dmu_object_set_compress(os, bigobj, comp, tx);
 
        /*
         * For each index from n to n + s, verify that the existing bufwad
@@ -4867,8 +4880,13 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id)
        error = dsl_dataset_user_hold(holds, 0, NULL);
        fnvlist_free(holds);
 
-       if (error)
-               fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag);
+       if (error == ENOSPC) {
+               ztest_record_enospc("dsl_dataset_user_hold");
+               goto out;
+       } else if (error) {
+               fatal(0, "dsl_dataset_user_hold(%s, %s) = %u",
+                   fullname, tag, error);
+       }
 
        error = dsl_destroy_snapshot(fullname, B_FALSE);
        if (error != EBUSY) {
@@ -5336,7 +5354,7 @@ ztest_run_zdb(char *pool)
        }
 
        (void) sprintf(zdb,
-           "%s -bcc%s%s -U %s %s",
+           "%s -bcc%s%s -d -U %s %s",
            bin,
            ztest_opts.zo_verbose >= 3 ? "s" : "",
            ztest_opts.zo_verbose >= 4 ? "v" : "",
index 561b34b8756a09ccfb7b211d5305f7e012d4873c..df38d29e1863c768f6d5e165b4499a66e3924d2f 100644 (file)
@@ -39,6 +39,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/avl.h>
 #include <ucred.h>
+#include <libzfs_core.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -614,13 +615,16 @@ typedef struct sendflags {
 
        /* show progress (ie. -v) */
        boolean_t progress;
+
+       /* WRITE_EMBEDDED records of type DATA are permitted */
+       boolean_t embed_data;
 } sendflags_t;
 
 typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
 
 extern int zfs_send(zfs_handle_t *, const char *, const char *,
     sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
-extern int zfs_send_one(zfs_handle_t *, const char *, int);
+extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
 
 extern int zfs_promote(zfs_handle_t *);
 extern int zfs_hold(zfs_handle_t *, const char *, const char *,
index 484a48afe2db2f15a97ecfa8c9a9d06f7cbfc5d2..d7d767055d3358173aa894488803ac4155b55f49 100644 (file)
@@ -52,7 +52,11 @@ int lzc_hold(nvlist_t *, int, nvlist_t **);
 int lzc_release(nvlist_t *, nvlist_t **);
 int lzc_get_holds(const char *, nvlist_t **);
 
-int lzc_send(const char *, const char *, int);
+enum lzc_send_flags {
+       LZC_SEND_FLAG_EMBED_DATA = 1 << 0
+};
+
+int lzc_send(const char *, const char *, int, enum lzc_send_flags);
 int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
 int lzc_send_space(const char *, const char *, uint64_t *);
 
index 90f3cce1b6b94da1ccdce1da47deeea914ddbd30..8f01660cb3fd59c03474610be8982f8f2e7d9aaa 100644 (file)
@@ -4,6 +4,7 @@ COMMON_H = \
        $(top_srcdir)/include/sys/arc.h \
        $(top_srcdir)/include/sys/avl.h \
        $(top_srcdir)/include/sys/avl_impl.h \
+       $(top_srcdir)/include/sys/blkptr.h \
        $(top_srcdir)/include/sys/bplist.h \
        $(top_srcdir)/include/sys/bpobj.h \
        $(top_srcdir)/include/sys/bptree.h \
diff --git a/include/sys/blkptr.h b/include/sys/blkptr.h
new file mode 100644 (file)
index 0000000..b720482
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_BLKPTR_H
+#define        _SYS_BLKPTR_H
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void encode_embedded_bp_compressed(blkptr_t *, void *,
+    enum zio_compress, int, int);
+void decode_embedded_bp_compressed(const blkptr_t *, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BLKPTR_H */
index 9446d9691a214ed49f9cca0af4283579400dbc87..76daea90e9bbc3f1cf9062041b20d5ab55d35fbd 100644 (file)
@@ -272,6 +272,9 @@ void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
 void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
index ad12ecafa5483c18bd0d82b319ffb37768f72726..89a0e5bd7a93a8f69e1b74a59520f53a4ff5a748 100644 (file)
@@ -116,6 +116,14 @@ typedef enum dmu_object_byteswap {
        ((ot) & DMU_OT_METADATA) : \
        dmu_ot[(int)(ot)].ot_metadata)
 
+/*
+ * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
+ * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
+ * is repurposed for embedded BPs.
+ */
+#define        DMU_OT_HAS_FILL(ot) \
+       ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
+
 #define        DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
        ((ot) & DMU_OT_BYTESWAP_MASK) : \
        dmu_ot[(int)(ot)].ot_byteswap)
@@ -391,6 +399,11 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx);
+
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
index cb4afbaaeb0d83a80801d1561c5bc1c759640a7b..75d094f0812e9769bd3afe3202d4a6a17026f1fb 100644 (file)
@@ -269,12 +269,15 @@ typedef struct dmu_sendarg {
        int dsa_err;
        dmu_pendop_t dsa_pending_op;
        boolean_t dsa_incremental;
+       uint64_t dsa_featureflags;
        uint64_t dsa_last_data_object;
        uint64_t dsa_last_data_offset;
 } dmu_sendarg_t;
 
 void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
 void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
+int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
+    void *, dmu_buf_t **);
 
 #ifdef __cplusplus
 }
index 65514b7620aa71f5dd572d86397e3cafe245927d..de590f1d503ba94125227cfe32f8b9a3088fa01b 100644 (file)
@@ -37,12 +37,12 @@ struct dsl_dataset;
 struct drr_begin;
 struct avl_tree;
 
-int dmu_send(const char *tosnap, const char *fromsnap, int outfd,
-    struct vnode *vp, offset_t *off);
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+    int outfd, struct vnode *vp, offset_t *off);
 int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
     uint64_t *sizep);
 int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
-    int outfd, struct vnode *vp, offset_t *off);
+    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off);
 
 typedef struct dmu_recv_cookie {
        struct dsl_dataset *drc_ds;
index 5c754b0af94327fa815b9b98bcbcb7a031f8c96a..707b1987a77564920a282067fd28d25dfc135d65 100644 (file)
@@ -156,7 +156,7 @@ typedef struct zio_cksum {
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
  * 5   |G|                      offset3                                |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6   |BDX|lvl| type  | cksum | comp  |     PSIZE     |     LSIZE     |
+ * 6   |BDX|lvl| type  | cksum |E| comp|    PSIZE      |     LSIZE     |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
  * 7   |                       padding                                 |
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -190,7 +190,8 @@ typedef struct zio_cksum {
  * G           gang block indicator
  * B           byteorder (endianness)
  * D           dedup
- * X           unused
+ * X           encryption (on version 30, which is not supported)
+ * E           blkptr_t contains embedded data (see below)
  * lvl         level of indirection
  * type                DMU object type
  * phys birth  txg of block allocation; zero if same as logical birth txg
@@ -198,6 +199,100 @@ typedef struct zio_cksum {
  * fill count  number of non-zero blocks under this bp
  * checksum[4] 256-bit checksum of the data this bp describes
  */
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block, instead they
+ * have a data payload embedded in the blkptr_t itself.  See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ *     64      56      48      40      32      24      16      8       0
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0   |      payload                                                  |
+ * 1   |      payload                                                  |
+ * 2   |      payload                                                  |
+ * 3   |      payload                                                  |
+ * 4   |      payload                                                  |
+ * 5   |      payload                                                  |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6   |BDX|lvl| type  | etype |E| comp| PSIZE|              LSIZE     |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7   |      payload                                                  |
+ * 8   |      payload                                                  |
+ * 9   |      payload                                                  |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a   |                       logical birth txg                       |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b   |      payload                                                  |
+ * c   |      payload                                                  |
+ * d   |      payload                                                  |
+ * e   |      payload                                                  |
+ * f   |      payload                                                  |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload             contains the embedded data
+ * B (byteorder)       byteorder (endianness)
+ * D (dedup)           padding (set to zero)
+ * X                   encryption (set to zero; see above)
+ * E (embedded)                set to one
+ * lvl                 indirection level
+ * type                        DMU object type
+ * etype               how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp                        compression function of payload
+ * PSIZE               size of payload after compression, in bytes
+ * LSIZE               logical size of payload, in bytes
+ *                     note that 25 bits is enough to store the largest
+ *                     "normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth          transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define        BPE_GET_ETYPE(bp)       \
+       (ASSERT(BP_IS_EMBEDDED(bp)), \
+       BF64_GET((bp)->blk_prop, 40, 8))
+#define        BPE_SET_ETYPE(bp, t)    do { \
+       ASSERT(BP_IS_EMBEDDED(bp)); \
+       BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define        BPE_GET_LSIZE(bp)       \
+       (ASSERT(BP_IS_EMBEDDED(bp)), \
+       BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define        BPE_SET_LSIZE(bp, x)    do { \
+       ASSERT(BP_IS_EMBEDDED(bp)); \
+       BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define        BPE_GET_PSIZE(bp)       \
+       (ASSERT(BP_IS_EMBEDDED(bp)), \
+       BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define        BPE_SET_PSIZE(bp, x)    do { \
+       ASSERT(BP_IS_EMBEDDED(bp)); \
+       BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+       BP_EMBEDDED_TYPE_DATA,
+       BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+       NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define        BPE_NUM_WORDS 14
+#define        BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define        BPE_IS_PAYLOADWORD(bp, wp) \
+       ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
 #define        SPA_BLKPTRSHIFT 7               /* blkptr_t is 128 bytes        */
 #define        SPA_DVAS_PER_BP 3               /* Number of DVAs in a bp       */
 
@@ -244,20 +339,37 @@ typedef struct blkptr {
 #define        DVA_SET_GANG(dva, x)    BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 #define        BP_GET_LSIZE(bp)        \
-       BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
-#define        BP_SET_LSIZE(bp, x)     \
-       BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+       (BP_IS_EMBEDDED(bp) ?   \
+       (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+       BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define        BP_SET_LSIZE(bp, x)     do { \
+       ASSERT(!BP_IS_EMBEDDED(bp)); \
+       BF64_SET_SB((bp)->blk_prop, \
+           0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
 
 #define        BP_GET_PSIZE(bp)        \
-       BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
-#define        BP_SET_PSIZE(bp, x)     \
-       BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+       (BP_IS_EMBEDDED(bp) ? 0 : \
+       BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define        BP_SET_PSIZE(bp, x)     do { \
+       ASSERT(!BP_IS_EMBEDDED(bp)); \
+       BF64_SET_SB((bp)->blk_prop, \
+           16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define        BP_GET_COMPRESS(bp)             BF64_GET((bp)->blk_prop, 32, 7)
+#define        BP_SET_COMPRESS(bp, x)          BF64_SET((bp)->blk_prop, 32, 7, x)
 
-#define        BP_GET_COMPRESS(bp)             BF64_GET((bp)->blk_prop, 32, 8)
-#define        BP_SET_COMPRESS(bp, x)          BF64_SET((bp)->blk_prop, 32, 8, x)
+#define        BP_IS_EMBEDDED(bp)              BF64_GET((bp)->blk_prop, 39, 1)
+#define        BP_SET_EMBEDDED(bp, x)          BF64_SET((bp)->blk_prop, 39, 1, x)
 
-#define        BP_GET_CHECKSUM(bp)             BF64_GET((bp)->blk_prop, 40, 8)
-#define        BP_SET_CHECKSUM(bp, x)          BF64_SET((bp)->blk_prop, 40, 8, x)
+#define        BP_GET_CHECKSUM(bp)             \
+       (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
+       BF64_GET((bp)->blk_prop, 40, 8))
+#define        BP_SET_CHECKSUM(bp, x)          do { \
+       ASSERT(!BP_IS_EMBEDDED(bp)); \
+       BF64_SET((bp)->blk_prop, 40, 8, x); \
+_NOTE(CONSTCOND) } while (0)
 
 #define        BP_GET_TYPE(bp)                 BF64_GET((bp)->blk_prop, 48, 8)
 #define        BP_SET_TYPE(bp, x)              BF64_SET((bp)->blk_prop, 48, 8, x)
@@ -265,9 +377,6 @@ typedef struct blkptr {
 #define        BP_GET_LEVEL(bp)                BF64_GET((bp)->blk_prop, 56, 5)
 #define        BP_SET_LEVEL(bp, x)             BF64_SET((bp)->blk_prop, 56, 5, x)
 
-#define        BP_GET_PROP_BIT_61(bp)          BF64_GET((bp)->blk_prop, 61, 1)
-#define        BP_SET_PROP_BIT_61(bp, x)       BF64_SET((bp)->blk_prop, 61, 1, x)
-
 #define        BP_GET_DEDUP(bp)                BF64_GET((bp)->blk_prop, 62, 1)
 #define        BP_SET_DEDUP(bp, x)             BF64_SET((bp)->blk_prop, 62, 1, x)
 
@@ -275,31 +384,39 @@ typedef struct blkptr {
 #define        BP_SET_BYTEORDER(bp, x)         BF64_SET((bp)->blk_prop, 63, 1, x)
 
 #define        BP_PHYSICAL_BIRTH(bp)           \
-       ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+       (BP_IS_EMBEDDED(bp) ? 0 : \
+       (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
 #define        BP_SET_BIRTH(bp, logical, physical)     \
 {                                              \
+       ASSERT(!BP_IS_EMBEDDED(bp));            \
        (bp)->blk_birth = (logical);            \
        (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
 }
 
+#define        BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+
 #define        BP_GET_ASIZE(bp)        \
-       (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
-               DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+       (BP_IS_EMBEDDED(bp) ? 0 : \
+       DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+       DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+       DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define        BP_GET_UCSIZE(bp) \
        ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
        BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 #define        BP_GET_NDVAS(bp)        \
-       (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+       (BP_IS_EMBEDDED(bp) ? 0 : \
+       !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
        !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
        !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
 #define        BP_COUNT_GANG(bp)       \
+       (BP_IS_EMBEDDED(bp) ? 0 : \
        (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
        DVA_GET_GANG(&(bp)->blk_dva[1]) + \
-       DVA_GET_GANG(&(bp)->blk_dva[2]))
+       DVA_GET_GANG(&(bp)->blk_dva[2])))
 
 #define        DVA_EQUAL(dva1, dva2)   \
        ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
@@ -307,6 +424,7 @@ typedef struct blkptr {
 
 #define        BP_EQUAL(bp1, bp2)      \
        (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) &&    \
+       (bp1)->blk_birth == (bp2)->blk_birth &&                 \
        DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) &&    \
        DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) &&    \
        DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@@ -327,11 +445,13 @@ typedef struct blkptr {
        (zcp)->zc_word[3] = w3;                 \
 }
 
-#define        BP_IDENTITY(bp)         (&(bp)->blk_dva[0])
-#define        BP_IS_GANG(bp)          DVA_GET_GANG(BP_IDENTITY(bp))
+#define        BP_IDENTITY(bp)         (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
+#define        BP_IS_GANG(bp)          \
+       (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
 #define        DVA_IS_EMPTY(dva)       ((dva)->dva_word[0] == 0ULL &&  \
                                (dva)->dva_word[1] == 0ULL)
-#define        BP_IS_HOLE(bp)          DVA_IS_EMPTY(BP_IDENTITY(bp))
+#define        BP_IS_HOLE(bp) \
+       (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define        BP_IS_RAIDZ(bp)         (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@@ -386,6 +506,17 @@ typedef struct blkptr {
                            " birth=%lluL",                             \
                            (u_longlong_t)bp->blk_birth);               \
                }                                                       \
+       } else if (BP_IS_EMBEDDED(bp)) {                                \
+               len = func(buf + len, size - len,                       \
+                   "EMBEDDED [L%llu %s] et=%u %s "                     \
+                   "size=%llxL/%llxP birth=%lluL",                     \
+                   (u_longlong_t)BP_GET_LEVEL(bp),                     \
+                   type,                                               \
+                   (int)BPE_GET_ETYPE(bp),                             \
+                   compress,                                           \
+                   (u_longlong_t)BPE_GET_LSIZE(bp),                    \
+                   (u_longlong_t)BPE_GET_PSIZE(bp),                    \
+                   (u_longlong_t)bp->blk_birth);                       \
        } else {                                                        \
                for (d = 0; d < BP_GET_NDVAS(bp); d++) {                \
                        const dva_t *dva = &bp->blk_dva[d];             \
@@ -419,7 +550,7 @@ typedef struct blkptr {
                    (u_longlong_t)BP_GET_PSIZE(bp),                     \
                    (u_longlong_t)bp->blk_birth,                        \
                    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),                \
-                   (u_longlong_t)bp->blk_fill,                         \
+                   (u_longlong_t)BP_GET_FILL(bp),                      \
                    ws,                                                 \
                    (u_longlong_t)bp->blk_cksum.zc_word[0],             \
                    (u_longlong_t)bp->blk_cksum.zc_word[1],             \
index 5e129a937db11ea7a172f08c65d99e11a1412691..a8ade1d217044a9e99408bd5f531f776a7f592aa 100644 (file)
@@ -38,6 +38,7 @@
 #include <sys/refcount.h>
 #include <sys/bplist.h>
 #include <sys/bpobj.h>
+#include <sys/zfeature.h>
 #include <zfeature_common.h>
 
 #ifdef __cplusplus
index c7bd789e840dd0f5b66eb7e2426d55520e94dbb3..5cfdcc50fda4f840da74c92f997f0af71047a42c 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_ZFS_IOCTL_H
@@ -90,15 +90,19 @@ typedef enum drr_headertype {
  * Feature flags for zfs send streams (flags in drr_versioninfo)
  */
 
-#define        DMU_BACKUP_FEATURE_DEDUP        (0x1)
-#define        DMU_BACKUP_FEATURE_DEDUPPROPS   (0x2)
-#define        DMU_BACKUP_FEATURE_SA_SPILL     (0x4)
+#define        DMU_BACKUP_FEATURE_DEDUP                (1<<0)
+#define        DMU_BACKUP_FEATURE_DEDUPPROPS           (1<<1)
+#define        DMU_BACKUP_FEATURE_SA_SPILL             (1<<2)
+/* flags #3 - #15 are reserved for incompatible closed-source implementations */
+#define        DMU_BACKUP_FEATURE_EMBED_DATA           (1<<16)
+#define        DMU_BACKUP_FEATURE_EMBED_DATA_LZ4       (1<<17)
 
 /*
  * Mask of all supported backup features
  */
 #define        DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
-               DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+    DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
+    DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)
 
 /* Are all features in the given flag word currently supported? */
 #define        DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
@@ -140,7 +144,7 @@ typedef struct dmu_replay_record {
        enum {
                DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
                DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
-               DRR_SPILL, DRR_NUMTYPES
+               DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
        } drr_type;
        uint32_t drr_payloadlen;
        union {
@@ -217,6 +221,19 @@ typedef struct dmu_replay_record {
                        uint64_t drr_pad[4]; /* needed for crypto */
                        /* spill data follows */
                } drr_spill;
+               struct drr_write_embedded {
+                       uint64_t drr_object;
+                       uint64_t drr_offset;
+                       /* logical length, should equal blocksize */
+                       uint64_t drr_length;
+                       uint64_t drr_toguid;
+                       uint8_t drr_compression;
+                       uint8_t drr_etype;
+                       uint8_t drr_pad[6];
+                       uint32_t drr_lsize; /* uncompressed size of payload */
+                       uint32_t drr_psize; /* compr. (real) size of payload */
+                       /* (possibly compressed) content follows */
+               } drr_write_embedded;
        } drr_u;
 } dmu_replay_record_t;
 
@@ -325,8 +342,8 @@ typedef struct zfs_cmd {
        dmu_objset_stats_t zc_objset_stats;
        struct drr_begin zc_begin_record;
        zinject_record_t zc_inject_record;
-       boolean_t       zc_defer_destroy;
-       boolean_t       zc_temphold;
+       uint32_t        zc_defer_destroy;
+       uint32_t        zc_flags;
        uint64_t        zc_action_handle;
        int             zc_cleanup_fd;
        uint8_t         zc_simple;
index 129e2bcb9b335ca42e533df906a928b7f95da486..181722f6837db65d34eee3f40391fcca2cb93089 100644 (file)
@@ -82,6 +82,12 @@ enum zio_checksum {
        ZIO_CHECKSUM_FUNCTIONS
 };
 
+/*
+ * The number of "legacy" checksum functions which can be set on individual
+ * objects.
+ */
+#define        ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
 #define        ZIO_CHECKSUM_ON_VALUE   ZIO_CHECKSUM_FLETCHER_4
 #define        ZIO_CHECKSUM_DEFAULT    ZIO_CHECKSUM_ON
 
@@ -111,6 +117,12 @@ enum zio_compress {
        ZIO_COMPRESS_FUNCTIONS
 };
 
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define        ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
+
 #define        ZIO_COMPRESS_ON_VALUE   ZIO_COMPRESS_LZJB
 #define        ZIO_COMPRESS_DEFAULT    ZIO_COMPRESS_OFF
 
index acf23982973d9372aad39ac28478b729239828d3..80074db4fbcc83787735f4cf26e53bce8effd25c 100644 (file)
@@ -46,6 +46,7 @@ typedef enum spa_feature {
        SPA_FEATURE_ENABLED_TXG,
        SPA_FEATURE_HOLE_BIRTH,
        SPA_FEATURE_EXTENSIBLE_DATASET,
+       SPA_FEATURE_EMBEDDED_DATA,
        SPA_FEATURE_BOOKMARKS,
        SPA_FEATURES
 } spa_feature_t;
@@ -65,7 +66,7 @@ typedef struct zfeature_info {
        const spa_feature_t *fi_depends;
 } zfeature_info_t;
 
-typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);
+typedef int (zfeature_func_t)(zfeature_info_t *, void *);
 
 #define        ZFS_FEATURE_DEBUG
 
@@ -74,8 +75,8 @@ extern zfeature_info_t spa_feature_table[SPA_FEATURES];
 extern boolean_t zfeature_is_valid_guid(const char *);
 
 extern boolean_t zfeature_is_supported(const char *);
-extern int zfeature_lookup_name(const char *name, spa_feature_t *res);
-extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check);
+extern int zfeature_lookup_name(const char *, spa_feature_t *);
+extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t);
 
 extern void zpool_feature_init(void);
 
index a11d4bf36710138e7ca2f38eab5fa2e22a2634b5..ebd8a7be2ce4c197c6b99394089f9c5b47fe4ba8 100644 (file)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved
@@ -49,6 +49,7 @@
 #include <time.h>
 
 #include <libzfs.h>
+#include <libzfs_core.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
@@ -220,6 +221,7 @@ cksummer(void *arg)
        struct drr_object *drro = &thedrr.drr_u.drr_object;
        struct drr_write *drrw = &thedrr.drr_u.drr_write;
        struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
+       struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
        FILE *ofp;
        int outfd;
        dmu_replay_record_t wbr_drr = {0};
@@ -415,6 +417,20 @@ cksummer(void *arg)
                        break;
                }
 
+               case DRR_WRITE_EMBEDDED:
+               {
+                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       (void) ssread(buf,
+                           P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
+                       if (cksum_and_write(buf,
+                           P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
+                           &stream_cksum, outfd) == -1)
+                               goto out;
+                       break;
+               }
+
                case DRR_FREE:
                {
                        if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
@@ -796,7 +812,7 @@ typedef struct send_dump_data {
        char prevsnap[ZFS_MAXNAMELEN];
        uint64_t prevsnap_obj;
        boolean_t seenfrom, seento, replicate, doall, fromorigin;
-       boolean_t verbose, dryrun, parsable, progress;
+       boolean_t verbose, dryrun, parsable, progress, embed_data;
        int outfd;
        boolean_t err;
        nvlist_t *fss;
@@ -876,7 +892,8 @@ estimate_ioctl(zfs_handle_t *zhp, uint64_t fromsnap_obj,
  */
 static int
 dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
-    boolean_t fromorigin, int outfd, nvlist_t *debugnv)
+    boolean_t fromorigin, int outfd, enum lzc_send_flags flags,
+    nvlist_t *debugnv)
 {
        zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -890,6 +907,7 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj,
        zc.zc_obj = fromorigin;
        zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID);
        zc.zc_fromobj = fromsnap_obj;
+       zc.zc_flags = flags;
 
        VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0));
        if (fromsnap && fromsnap[0] != '\0') {
@@ -1140,8 +1158,12 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
                        }
                }
 
+               enum lzc_send_flags flags = 0;
+               if (sdd->embed_data)
+                       flags |= LZC_SEND_FLAG_EMBED_DATA;
+
                err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
-                   fromorigin, sdd->outfd, sdd->debugnv);
+                   fromorigin, sdd->outfd, flags, sdd->debugnv);
 
                if (sdd->progress) {
                        (void) pthread_cancel(tid);
@@ -1485,6 +1507,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
        sdd.parsable = flags->parsable;
        sdd.progress = flags->progress;
        sdd.dryrun = flags->dryrun;
+       sdd.embed_data = flags->embed_data;
        sdd.filter_cb = filter_func;
        sdd.filter_cb_arg = cb_arg;
        if (debugnvp)
@@ -1616,7 +1639,8 @@ err_out:
 }
 
 int
-zfs_send_one(zfs_handle_t *zhp, const char *from, int fd)
+zfs_send_one(zfs_handle_t *zhp, const char *from, int fd,
+    enum lzc_send_flags flags)
 {
        int err;
        libzfs_handle_t *hdl = zhp->zfs_hdl;
@@ -1625,7 +1649,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd)
        (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
            "warning: cannot send '%s'"), zhp->zfs_name);
 
-       err = lzc_send(zhp->zfs_name, from, fd);
+       err = lzc_send(zhp->zfs_name, from, fd, flags);
        if (err != 0) {
                switch (errno) {
                case EXDEV:
@@ -2543,6 +2567,16 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
                        (void) recv_read(hdl, fd, buf,
                            drr->drr_u.drr_spill.drr_length, B_FALSE, NULL);
                        break;
+               case DRR_WRITE_EMBEDDED:
+                       if (byteswap) {
+                               drr->drr_u.drr_write_embedded.drr_psize =
+                                   BSWAP_32(drr->drr_u.drr_write_embedded.
+                                   drr_psize);
+                       }
+                       (void) recv_read(hdl, fd, buf,
+                           P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize,
+                           8), B_FALSE, NULL);
+                       break;
                case DRR_WRITE_BYREF:
                case DRR_FREEOBJECTS:
                case DRR_FREE:
index 0c05bbd275debf64131a83ad63a514f0eda9d990..2198ecd6f24ba12fa7827ba0b6b341649ee20a1a 100644 (file)
@@ -439,6 +439,8 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
 }
 
 /*
+ * Generate a zfs send stream for the specified snapshot and write it to
+ * the specified file descriptor.
  *
  * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
  *
@@ -452,9 +454,15 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
  * snapshot in the origin, etc.
  *
  * "fd" is the file descriptor to write the send stream to.
+ *
+ * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
+ * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
+ * which the receiving system must support (as indicated by support
+ * for the "embedded_data" feature).
  */
 int
-lzc_send(const char *snapname, const char *from, int fd)
+lzc_send(const char *snapname, const char *from, int fd,
+    enum lzc_send_flags flags)
 {
        nvlist_t *args;
        int err;
@@ -463,6 +471,8 @@ lzc_send(const char *snapname, const char *from, int fd)
        fnvlist_add_int32(args, "fd", fd);
        if (from != NULL)
                fnvlist_add_string(args, "fromsnap", from);
+       if (flags & LZC_SEND_FLAG_EMBED_DATA)
+               fnvlist_add_boolean(args, "embedok");
        err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
        nvlist_free(args);
        return (err);
index b960850fbdbe2e59f7cbedb79fe099c8bd4c36f4..f4838da75fd03204ce4d3739a3636ef5c237abc9 100644 (file)
@@ -21,6 +21,7 @@ libzpool_la_SOURCES = \
        $(top_srcdir)/module/zcommon/zpool_prop.c \
        $(top_srcdir)/module/zcommon/zprop_common.c \
        $(top_srcdir)/module/zfs/arc.c \
+       $(top_srcdir)/module/zfs/blkptr.c \
        $(top_srcdir)/module/zfs/bplist.c \
        $(top_srcdir)/module/zfs/bpobj.c \
        $(top_srcdir)/module/zfs/bptree.c \
index edcdb364dce6824d0955b6e4fa6e7d9598c1bb21..ff8313bf5d8532c73821c663bd10485bcc2f2276 100644 (file)
@@ -358,6 +358,33 @@ never return to being \fBenabled\fB.
 
 .RE
 
+.sp
+.ne 2
+.na
+\fB\fBembedded_data\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID   com.delphix:embedded_data
+READ\-ONLY COMPATIBLE  no
+DEPENDENCIES   none
+.TE
+
+This feature improves the performance and compression ratio of
+highly-compressible blocks.  Blocks whose contents can compress to 112 bytes
+or smaller can take advantage of this feature.
+
+When this feature is enabled, the contents of highly-compressible blocks are
+stored in the block "pointer" itself (a misnomer in this case, as it contains
+the compressed data, rather than a pointer to its location on disk).  Thus
+the space of the block (one sector, typically 512 bytes or 4KB) is saved,
+and no additional i/o is needed to read and write the data block.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will
+never return to being \fBenabled\fR.
+
+.RE
 
 .SH "SEE ALSO"
 \fBzpool\fR(8)
index 5683206ee6cad09d7e03affb78716065dfd1085e..96cac6f5a87b2a45ac68727e2ce07ef80edc62ad 100644 (file)
@@ -174,12 +174,12 @@ zfs \- configures ZFS file systems
 
 .LP
 .nf
-\fBzfs\fR \fBsend\fR [\fB-DnPpRv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
 .fi
 
 .LP
 .nf
-\fBzfs\fR \fBsend\fR [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-e\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .fi
 
 .LP
@@ -2600,7 +2600,7 @@ See \fBzpool-features\fR(5) for details on ZFS feature flags and the
 .sp
 .ne 2
 .na
-\fBzfs send\fR [\fB-DnPpRv\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
+\fBzfs send\fR [\fB-DnPpRve\fR] [\fB-\fR[\fBiI\fR] \fIsnapshot\fR] \fIsnapshot\fR
 .ad
 .sp .6
 .RS 4n
@@ -2657,6 +2657,23 @@ Generate a deduplicated stream. Blocks which would have been sent multiple times
 .ne 2
 .mk
 .na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a more compact stream by using WRITE_EMBEDDED records for blocks
+which are stored more compactly on disk by the \fBembedded_data\fR pool
+feature.  This flag has no effect if the \fBembedded_data\fR feature is
+disabled.  The receiving system must have the \fBembedded_data\fR feature
+enabled.  If the \fBlz4_compress\fR feature is active on the sending system,
+then the receiving system must have that feature enabled as well. See
+\fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBembedded_data\fR feature.
+.RE
+
+.sp
+.ne 2
+.na
 \fB\fB-p\fR\fR
 .ad
 .sp .6
@@ -2705,7 +2722,7 @@ The format of the stream is committed. You will be able to receive your streams
 .sp
 .ne 2
 .na
-\fBzfs send\fR [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs send\fR [\fB-e\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
 .ad
 .sp .6
 .RS 4n
@@ -2822,6 +2839,22 @@ Do not actually receive the stream. This can be useful in conjunction with the \
 Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side.
 .RE
 
+.sp
+.ne 2
+.na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Generate a more compact stream by using WRITE_EMBEDDED records for blocks
+which are stored more compactly on disk by the \fBembedded_data\fR pool
+feature.  This flag has no effect if the \fBembedded_data\fR feature is
+disabled.  The receiving system must have the \fBembedded_data\fR feature
+enabled.  If the \fBlz4_compress\fR feature is active on the sending system,
+then the receiving system must have that feature enabled as well. See
+\fBzpool-features\fR(5) for details on ZFS feature flags and the
+\fBembedded_data\fR feature.
+.RE
 .RE
 
 .sp
index 56ecd49186ab2ea60abd6ae2fb54afca4a351644..48e7e97e98148f500fc48f2b4d02d4232d342425 100644 (file)
@@ -5,6 +5,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@
 obj-$(CONFIG_ZFS) := $(MODULE).o
 
 $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/bpobj.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/dbuf.o
index 7f6df0ae8cc1b03d359ecfcee38c49327dcde702..ec006cb0f6f3b51dba4f1b9c79d0b5bf2ecc4ea7 100644 (file)
@@ -812,8 +812,10 @@ buf_discard_identity(arc_buf_hdr_t *hdr)
 }
 
 static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
+       const dva_t *dva = BP_IDENTITY(bp);
+       uint64_t birth = BP_PHYSICAL_BIRTH(bp);
        uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
        kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
        arc_buf_hdr_t *buf;
@@ -845,6 +847,8 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
        arc_buf_hdr_t *fbuf;
        uint32_t i;
 
+       ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
+       ASSERT(buf->b_birth != 0);
        ASSERT(!HDR_IN_HASH_TABLE(buf));
        *lockp = hash_lock;
        mutex_enter(hash_lock);
@@ -3034,10 +3038,10 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 static void
 arc_read_done(zio_t *zio)
 {
-       arc_buf_hdr_t   *hdr, *found;
+       arc_buf_hdr_t   *hdr;
        arc_buf_t       *buf;
        arc_buf_t       *abuf;  /* buffer we're assigning to callback */
-       kmutex_t        *hash_lock;
+       kmutex_t        *hash_lock = NULL;
        arc_callback_t  *callback_list, *acb;
        int             freeable = FALSE;
 
@@ -3052,12 +3056,24 @@ arc_read_done(zio_t *zio)
         * reason for it not to be found is if we were freed during the
         * read.
         */
-       found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
-           &hash_lock);
-
-       ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
-           (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
-           (found == hdr && HDR_L2_READING(hdr)));
+       if (HDR_IN_HASH_TABLE(hdr)) {
+               arc_buf_hdr_t *found;
+
+               ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+               ASSERT3U(hdr->b_dva.dva_word[0], ==,
+                   BP_IDENTITY(zio->io_bp)->dva_word[0]);
+               ASSERT3U(hdr->b_dva.dva_word[1], ==,
+                   BP_IDENTITY(zio->io_bp)->dva_word[1]);
+
+               found = buf_hash_find(hdr->b_spa, zio->io_bp,
+                   &hash_lock);
+
+               ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
+                   hash_lock == NULL) ||
+                   (found == hdr &&
+                   DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+                   (found == hdr && HDR_L2_READING(hdr)));
+       }
 
        hdr->b_flags &= ~ARC_L2_EVICTED;
        if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
@@ -3181,17 +3197,26 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
     void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
     const zbookmark_t *zb)
 {
-       arc_buf_hdr_t *hdr;
+       arc_buf_hdr_t *hdr = NULL;
        arc_buf_t *buf = NULL;
-       kmutex_t *hash_lock;
+       kmutex_t *hash_lock = NULL;
        zio_t *rzio;
        uint64_t guid = spa_load_guid(spa);
        int rc = 0;
 
+       ASSERT(!BP_IS_EMBEDDED(bp) ||
+           BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+
 top:
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
-           &hash_lock);
-       if (hdr && hdr->b_datacnt > 0) {
+       if (!BP_IS_EMBEDDED(bp)) {
+               /*
+                * Embedded BP's have no DVA (no I/O needed to "read") and are
+                * never hashed; an anonymous arc buf is created below instead.
+                */
+               hdr = buf_hash_find(guid, bp, &hash_lock);
+       }
+
+       if (hdr != NULL && hdr->b_datacnt > 0) {
 
                *arc_flags |= ARC_CACHED;
 
@@ -3265,7 +3290,7 @@ top:
                        done(NULL, buf, private);
        } else {
                uint64_t size = BP_GET_LSIZE(bp);
-               arc_callback_t  *acb;
+               arc_callback_t *acb;
                vdev_t *vd = NULL;
                uint64_t addr = 0;
                boolean_t devw = B_FALSE;
@@ -3274,15 +3299,17 @@ top:
 
                if (hdr == NULL) {
                        /* this block is not in the cache */
-                       arc_buf_hdr_t   *exists;
+                       arc_buf_hdr_t *exists = NULL;
                        arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
                        buf = arc_buf_alloc(spa, size, private, type);
                        hdr = buf->b_hdr;
-                       hdr->b_dva = *BP_IDENTITY(bp);
-                       hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
-                       hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
-                       exists = buf_hash_insert(hdr, &hash_lock);
-                       if (exists) {
+                       if (!BP_IS_EMBEDDED(bp)) {
+                               hdr->b_dva = *BP_IDENTITY(bp);
+                               hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+                               hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
+                               exists = buf_hash_insert(hdr, &hash_lock);
+                       }
+                       if (exists != NULL) {
                                /* somebody beat us to the hash insert */
                                mutex_exit(hash_lock);
                                buf_discard_identity(hdr);
@@ -3354,7 +3381,8 @@ top:
                                vd = NULL;
                }
 
-               mutex_exit(hash_lock);
+               if (hash_lock != NULL)
+                       mutex_exit(hash_lock);
 
                /*
                 * At this point, we have a level 1 cache miss.  Try again in
@@ -3526,8 +3554,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp)
        kmutex_t *hash_lock;
        uint64_t guid = spa_load_guid(spa);
 
-       hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
-           &hash_lock);
+       ASSERT(!BP_IS_EMBEDDED(bp));
+
+       hdr = buf_hash_find(guid, bp, &hash_lock);
        if (hdr == NULL)
                return;
        if (HDR_BUF_AVAILABLE(hdr)) {
@@ -3854,7 +3883,7 @@ arc_write_done(zio_t *zio)
        ASSERT(hdr->b_acb == NULL);
 
        if (zio->io_error == 0) {
-               if (BP_IS_HOLE(zio->io_bp)) {
+               if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
                        buf_discard_identity(hdr);
                } else {
                        hdr->b_dva = *BP_IDENTITY(zio->io_bp);
@@ -3866,10 +3895,10 @@ arc_write_done(zio_t *zio)
        }
 
        /*
-        * If the block to be written was all-zero, we may have
-        * compressed it away.  In this case no write was performed
-        * so there will be no dva/birth/checksum.  The buffer must
-        * therefore remain anonymous (and uncached).
+        * If the block to be written was all-zero or compressed enough to be
+        * embedded in the BP, no write was performed so there will be no
+        * dva/birth/checksum.  The buffer must therefore remain anonymous
+        * (and uncached).
         */
        if (!BUF_EMPTY(hdr)) {
                arc_buf_hdr_t *exists;
@@ -5219,7 +5248,7 @@ static boolean_t
 l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
 {
        void *cdata;
-       size_t csize, len;
+       size_t csize, len, rounded;
 
        ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
        ASSERT(l2hdr->b_tmp_cdata != NULL);
@@ -5229,6 +5258,12 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
        csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
            cdata, l2hdr->b_asize);
 
+       rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
+       if (rounded > csize) {
+               bzero((char *)cdata + csize, rounded - csize);
+               csize = rounded;
+       }
+
        if (csize == 0) {
                /* zero block, indicate that there's nothing to write */
                zio_data_buf_free(cdata, len);
diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c
new file mode 100644 (file)
index 0000000..d56e199
--- /dev/null
@@ -0,0 +1,121 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Embedded-data Block Pointers
+ *
+ * Normally, block pointers point (via their DVAs) to a block which holds data.
+ * If the data that we need to store is very small, this is an inefficient
+ * use of space, because a block must be at minimum 1 sector (typically 512
+ * bytes or 4KB).  Additionally, reading these small blocks tends to generate
+ * more random reads.
+ *
+ * Embedded-data Block Pointers allow small pieces of data (the "payload",
+ * up to 112 bytes) to be stored in the block pointer itself, instead of
+ * being pointed to.  The "Pointer" part of this name is a bit of a
+ * misnomer, as nothing is pointed to.
+ *
+ * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
+ * be embedded in the block pointer.  The logic for this is handled in
+ * the SPA, by the zio pipeline.  Therefore most code outside the zio
+ * pipeline doesn't need special-cases to handle these block pointers.
+ *
+ * See spa.h for details on the exact layout of embedded block pointers.
+ */
+
+void
+encode_embedded_bp_compressed(blkptr_t *bp, void *data,
+    enum zio_compress comp, int uncompressed_size, int compressed_size)
+{
+       uint64_t *bp64 = (uint64_t *)bp;
+       uint64_t w = 0;
+       uint8_t *data8 = data;
+       int i;
+
+       ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
+       ASSERT(uncompressed_size == compressed_size ||
+           comp != ZIO_COMPRESS_OFF);
+       ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
+       ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+
+       bzero(bp, sizeof (*bp));
+       BP_SET_EMBEDDED(bp, B_TRUE);
+       BP_SET_COMPRESS(bp, comp);
+       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+       BPE_SET_LSIZE(bp, uncompressed_size);
+       BPE_SET_PSIZE(bp, compressed_size);
+
+       /*
+        * Encode the byte array into the words of the block pointer.
+        * First byte goes into low bits of first word (little endian).
+        */
+       for (i = 0; i < compressed_size; i++) {
+               BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
+               if (i % sizeof (w) == sizeof (w) - 1) {
+                       /* we've reached the end of a word */
+                       ASSERT3P(bp64, <, bp + 1);
+                       *bp64 = w;
+                       bp64++;
+                       if (!BPE_IS_PAYLOADWORD(bp, bp64))
+                               bp64++;
+                       w = 0;
+               }
+       }
+       /* write last partial word */
+       if (bp64 < (uint64_t *)(bp + 1))
+               *bp64 = w;
+}
+
+/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+       int psize;
+       uint8_t *buf8 = buf;
+       uint64_t w = 0;
+       const uint64_t *bp64 = (const uint64_t *)bp;
+       int i;
+
+       ASSERT(BP_IS_EMBEDDED(bp));
+
+       psize = BPE_GET_PSIZE(bp);
+
+       /*
+        * Decode the words of the block pointer into the byte array.
+        * Low bits of first word are the first byte (little endian).
+        */
+       for (i = 0; i < psize; i++) {
+               if (i % sizeof (w) == 0) {
+                       /* beginning of a word */
+                       ASSERT3P(bp64, <, bp + 1);
+                       w = *bp64;
+                       bp64++;
+                       if (!BPE_IS_PAYLOADWORD(bp, bp64))
+                               bp64++;
+               }
+               buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+       }
+}
index eb3233b1044f986141465801d7163ab11f0b07fc..7c8f932f5cf316941bb27935db9de04274b62951 100644 (file)
@@ -192,6 +192,13 @@ bpobj_close(bpobj_t *bpo)
        mutex_destroy(&bpo->bpo_lock);
 }
 
+static boolean_t
+bpobj_hasentries(bpobj_t *bpo)
+{
+       return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
+           (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+}
+
 static int
 bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
     boolean_t free)
@@ -332,9 +339,11 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
 
 out:
        /* If there are no entries, there should be no bytes. */
-       ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
-           (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
-           bpo->bpo_phys->bpo_bytes == 0);
+       if (!bpobj_hasentries(bpo)) {
+               ASSERT0(bpo->bpo_phys->bpo_bytes);
+               ASSERT0(bpo->bpo_phys->bpo_comp);
+               ASSERT0(bpo->bpo_phys->bpo_uncomp);
+       }
 
        mutex_exit(&bpo->bpo_lock);
        return (err);
@@ -378,7 +387,7 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
 
-       if (used == 0) {
+       if (!bpobj_hasentries(&subbpo)) {
                /* No point in having an empty subobj. */
                bpobj_close(&subbpo);
                bpobj_free(bpo->bpo_os, subobj, tx);
@@ -453,13 +462,29 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
 
+       if (BP_IS_EMBEDDED(bp)) {
+               /*
+                * The bpobj will compress better without the payload.
+                *
+                * Note that we store EMBEDDED bp's because they have an
+                * uncompressed size, which must be accounted for.  An
+                * alternative would be to add their size to bpo_uncomp
+                * without storing the bp, but that would create additional
+                * complications: bpo_uncomp would be inconsistent with the
+                * set of BP's stored, and bpobj_iterate() wouldn't visit
+                * all the space accounted for in the bpobj.
+                */
+               bzero(&stored_bp, sizeof (stored_bp));
+               stored_bp.blk_prop = bp->blk_prop;
+               stored_bp.blk_birth = bp->blk_birth;
+       } else if (!BP_GET_DEDUP(bp)) {
+               /* The bpobj will compress better without the checksum */
+               bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+       }
+
        /* We never need the fill count. */
        stored_bp.blk_fill = 0;
 
-       /* The bpobj will compress better if we can leave off the checksum */
-       if (!BP_GET_DEDUP(bp))
-               bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
-
        mutex_enter(&bpo->bpo_lock);
 
        offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
index c6e7197b6f57deee6781e8522a8bcd4cabdd070a..1e5fac78eedca7893a8f9b9bfa5b589ffad1ac92 100644 (file)
@@ -40,6 +40,8 @@
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
 #include <sys/range_tree.h>
 
 struct dbuf_hold_impl_data {
@@ -1492,6 +1494,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
        mutex_exit(&db->db_mtx);
 }
 
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder,
+    dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+       struct dirty_leaf *dl;
+       dmu_object_type_t type;
+
+       DB_DNODE_ENTER(db);
+       type = DB_DNODE(db)->dn_type;
+       DB_DNODE_EXIT(db);
+
+       ASSERT0(db->db_level);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+       dmu_buf_will_not_fill(dbuf, tx);
+
+       ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+       dl = &db->db_last_dirty->dt.dl;
+       encode_embedded_bp_compressed(&dl->dr_overridden_by,
+           data, comp, uncompressed_size, compressed_size);
+       BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+       BP_SET_TYPE(&dl->dr_overridden_by, type);
+       BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+       BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+       dl->dr_override_state = DR_OVERRIDDEN;
+       dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1885,7 +1919,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
        }
 
        if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
-               if (bp && !BP_IS_HOLE(bp)) {
+               if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
                        dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
                        zbookmark_t zb;
@@ -2575,7 +2609,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        uint64_t fill = 0;
        int i;
 
-       ASSERT(db->db_blkptr == bp);
+       ASSERT3P(db->db_blkptr, ==, bp);
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
@@ -2587,7 +2621,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
                    BP_GET_TYPE(bp) == dn->dn_type) ||
                    (db->db_blkid == DMU_SPILL_BLKID &&
-                   BP_GET_TYPE(bp) == dn->dn_bonustype));
+                   BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+                   BP_IS_EMBEDDED(bp));
                ASSERT(BP_GET_LEVEL(bp) == db->db_level);
        }
 
@@ -2628,12 +2663,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                        if (BP_IS_HOLE(ibp))
                                continue;
-                       fill += ibp->blk_fill;
+                       fill += BP_GET_FILL(ibp);
                }
        }
        DB_DNODE_EXIT(db);
 
-       bp->blk_fill = fill;
+       if (!BP_IS_EMBEDDED(bp))
+               bp->blk_fill = fill;
 
        mutex_exit(&db->db_mtx);
 }
@@ -2745,7 +2781,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                            dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
                        ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
                            db->db.db_size);
-                       arc_set_callback(db->db_buf, dbuf_do_evict, db);
+                       if (!arc_released(db->db_buf))
+                               arc_set_callback(db->db_buf, dbuf_do_evict, db);
                }
                DB_DNODE_EXIT(db);
                mutex_destroy(&dr->dt.di.dr_mtx);
@@ -2871,10 +2908,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
        DB_DNODE_EXIT(db);
 
-       if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-               ASSERT(db->db_state != DB_NOFILL);
+       if (db->db_level == 0 &&
+           dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+               /*
+                * The BP for this block has been provided by open context
+                * (by dmu_sync() or dmu_buf_write_embedded()).
+                */
+               void *contents = (data != NULL) ? data->b_data : NULL;
+
                dr->dr_zio = zio_write(zio, os->os_spa, txg,
-                   db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+                   db->db_blkptr, contents, db->db.db_size, &zp,
                    dbuf_write_override_ready, NULL, dbuf_write_override_done,
                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                mutex_enter(&db->db_mtx);
index d8e5739f47727696d2db2aa3a9e3c94e93aea892..305973abf7adce44daa9b345f94e2f513a87e376 100644 (file)
@@ -124,17 +124,13 @@ const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 };
 
 int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
-    void *tag, dmu_buf_t **dbp, int flags)
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
 {
        dnode_t *dn;
        uint64_t blkid;
        dmu_buf_impl_t *db;
        int err;
-       int db_flags = DB_RF_CANFAIL;
-
-       if (flags & DMU_READ_NO_PREFETCH)
-               db_flags |= DB_RF_NOPREFETCH;
 
        err = dnode_hold(os, object, FTAG, &dn);
        if (err)
@@ -143,18 +139,37 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        db = dbuf_hold(dn, blkid, tag);
        rw_exit(&dn->dn_struct_rwlock);
+       dnode_rele(dn, FTAG);
+
        if (db == NULL) {
-               err = SET_ERROR(EIO);
-       } else {
+               *dbp = NULL;
+               return (SET_ERROR(EIO));
+       }
+
+       *dbp = &db->db;
+       return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp, int flags)
+{
+       int err;
+       int db_flags = DB_RF_CANFAIL;
+
+       if (flags & DMU_READ_NO_PREFETCH)
+               db_flags |= DB_RF_NOPREFETCH;
+
+       err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+       if (err == 0) {
+               dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
                err = dbuf_read(db, NULL, db_flags);
-               if (err) {
+               if (err != 0) {
                        dbuf_rele(db, tag);
-                       db = NULL;
+                       *dbp = NULL;
                }
        }
 
-       dnode_rele(dn, FTAG);
-       *dbp = &db->db; /* NULL db plus first field offset is NULL */
        return (err);
 }
 
@@ -852,6 +867,25 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
        dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+    int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+       dmu_buf_t *db;
+
+       ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+       ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+       VERIFY0(dmu_buf_hold_noread(os, object, offset,
+           FTAG, &db));
+
+       dmu_buf_write_embedded(db,
+           data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+           uncompressed_size, compressed_size, byteorder, tx);
+
+       dmu_buf_rele(db, FTAG);
+}
+
 /*
  * DMU support for xuio
  */
@@ -1393,7 +1427,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
                         * block size still needs to be known for replay.
                         */
                        BP_SET_LSIZE(bp, db->db_size);
-               } else {
+               } else if (!BP_IS_EMBEDDED(bp)) {
                        ASSERT(BP_GET_LEVEL(bp) == 0);
                        bp->blk_fill = 1;
                }
@@ -1664,9 +1698,15 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
 {
        dnode_t *dn;
 
-       /* XXX assumes dnode_hold will not get an i/o error */
-       (void) dnode_hold(os, object, FTAG, &dn);
-       ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+       /*
+        * Send streams include each object's checksum function.  This
+        * check ensures that the receiving system can understand the
+        * checksum function transmitted.
+        */
+       ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+       VERIFY0(dnode_hold(os, object, FTAG, &dn));
+       ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
        dn->dn_checksum = checksum;
        dnode_setdirty(dn, tx);
        dnode_rele(dn, FTAG);
@@ -1678,9 +1718,14 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
 {
        dnode_t *dn;
 
-       /* XXX assumes dnode_hold will not get an i/o error */
-       (void) dnode_hold(os, object, FTAG, &dn);
-       ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+       /*
+        * Send streams include each object's compression function.  This
+        * check ensures that the receiving system can understand the
+        * compression function transmitted.
+        */
+       ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+       VERIFY0(dnode_hold(os, object, FTAG, &dn));
        dn->dn_compress = compress;
        dnode_setdirty(dn, tx);
        dnode_rele(dn, FTAG);
@@ -1843,7 +1888,7 @@ __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
        doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
        doi->doi_fill_count = 0;
        for (i = 0; i < dnp->dn_nblkptr; i++)
-               doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
+               doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
 }
 
 void
index b82783098511c1037c03c4ac709b05a8a0d73496..238892cf40631c291bc450b9ec6ce87010669569 100644 (file)
@@ -337,7 +337,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
         * default (fletcher2/off).  Snapshots don't need to know about
         * checksum/compression/copies.
         */
-       if (ds) {
+       if (ds != NULL) {
                err = dsl_prop_register(ds,
                    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
                    primary_cache_changed_cb, os);
@@ -390,7 +390,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                        kmem_free(os, sizeof (objset_t));
                        return (err);
                }
-       } else if (ds == NULL) {
+       } else {
                /* It's the meta-objset. */
                os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
                os->os_compress = ZIO_COMPRESS_LZJB;
@@ -434,17 +434,6 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                    &os->os_groupused_dnode);
        }
 
-       /*
-        * We should be the only thread trying to do this because we
-        * have ds_opening_lock
-        */
-       if (ds) {
-               mutex_enter(&ds->ds_lock);
-               ASSERT(ds->ds_objset == NULL);
-               ds->ds_objset = os;
-               mutex_exit(&ds->ds_lock);
-       }
-
        *osp = os;
        return (0);
 }
@@ -455,11 +444,19 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
        int err = 0;
 
        mutex_enter(&ds->ds_opening_lock);
-       *osp = ds->ds_objset;
-       if (*osp == NULL) {
+       if (ds->ds_objset == NULL) {
+               objset_t *os;
                err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
-                   ds, dsl_dataset_get_blkptr(ds), osp);
+                   ds, dsl_dataset_get_blkptr(ds), &os);
+
+               if (err == 0) {
+                       mutex_enter(&ds->ds_lock);
+                       ASSERT(ds->ds_objset == NULL);
+                       ds->ds_objset = os;
+                       mutex_exit(&ds->ds_lock);
+               }
        }
+       *osp = ds->ds_objset;
        mutex_exit(&ds->ds_opening_lock);
        return (err);
 }
@@ -981,6 +978,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
        objset_t *os = arg;
        dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
+       ASSERT(!BP_IS_EMBEDDED(bp));
        ASSERT3P(bp, ==, os->os_rootbp);
        ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
        ASSERT0(BP_GET_LEVEL(bp));
@@ -993,7 +991,7 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
         */
        bp->blk_fill = 0;
        for (i = 0; i < dnp->dn_nblkptr; i++)
-               bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+               bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
 }
 
 /* ARGSUSED */
index 885da23ba2b8cdbf410c1ac45b1a9677272c3b77..39f282ce662a3830468e30ef5e17199cc2a6a2c1 100644 (file)
@@ -50,7 +50,9 @@
 #include <sys/zfs_onexit.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
 
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 int zfs_send_corrupt_data = B_FALSE;
@@ -197,7 +199,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
 }
 
 static int
-dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
     uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 {
        struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
@@ -232,13 +234,22 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
        drrw->drr_offset = offset;
        drrw->drr_length = blksz;
        drrw->drr_toguid = dsp->dsa_toguid;
-       drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
-       if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
-               drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
-       DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
-       DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
-       DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
-       drrw->drr_key.ddk_cksum = bp->blk_cksum;
+       if (BP_IS_EMBEDDED(bp)) {
+               /*
+                * There's no pre-computed checksum of embedded BP's, so
+                * (like fletcher4-checksummed blocks) userland will have
+                * to compute a dedup-capable checksum itself.
+                */
+               drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+       } else {
+               drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+               if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+                       drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+               DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+               DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+               DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+               drrw->drr_key.ddk_cksum = bp->blk_cksum;
+       }
 
        if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
                return (SET_ERROR(EINTR));
@@ -247,6 +258,43 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
        return (0);
 }
 
+/*
+ * Emit a DRR_WRITE_EMBEDDED record for an embedded block pointer: the
+ * replay record header followed by the payload decoded from bp, padded to
+ * a multiple of 8 bytes.  Any pending record is flushed first.  Returns 0
+ * on success, or EINTR if dump_bytes() reports a failure.
+ */
+static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+    int blksz, const blkptr_t *bp)
+{
+       char buf[BPE_PAYLOAD_SIZE];
+       struct drr_write_embedded *drrw =
+           &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+       if (dsp->dsa_pending_op != PENDING_NONE) {
+               if (dump_bytes(dsp, dsp->dsa_drr,
+                   sizeof (dmu_replay_record_t)) != 0)
+                       return (SET_ERROR(EINTR));
+               dsp->dsa_pending_op = PENDING_NONE;
+       }
+
+       ASSERT(BP_IS_EMBEDDED(bp));
+
+       bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+       dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
+       drrw->drr_object = object;
+       drrw->drr_offset = offset;
+       drrw->drr_length = blksz;
+       drrw->drr_toguid = dsp->dsa_toguid;
+       drrw->drr_compression = BP_GET_COMPRESS(bp);
+       drrw->drr_etype = BPE_GET_ETYPE(bp);
+       drrw->drr_lsize = BPE_GET_LSIZE(bp);
+       drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+       /*
+        * decode_embedded_bp_compressed() fills only drr_psize bytes, but
+        * the payload is sent rounded up to 8 bytes.  Zero the buffer first
+        * so the padding never carries uninitialized stack contents.
+        */
+       bzero(buf, sizeof (buf));
+       decode_embedded_bp_compressed(bp, buf);
+
+       if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+               return (SET_ERROR(EINTR));
+       if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+               return (SET_ERROR(EINTR));
+       return (0);
+}
+
 static int
 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
 {
@@ -367,6 +415,33 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
        return (0);
 }
 
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+       if (!BP_IS_EMBEDDED(bp))
+               return (B_FALSE);
+
+       /*
+        * Compression function must be legacy, or explicitly enabled.
+        */
+       if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+           !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+               return (B_FALSE);
+
+       /*
+        * Embed type must be explicitly enabled.
+        */
+       switch (BPE_GET_ETYPE(bp)) {
+       case BP_EMBEDDED_TYPE_DATA:
+               if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+                       return (B_TRUE);
+               break;
+       default:
+               return (B_FALSE);
+       }
+       return (B_FALSE);
+}
+
 #define        BP_SPAN(dnp, level) \
        (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
        (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
@@ -435,11 +510,17 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 
                err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
                (void) arc_buf_remove_ref(abuf, &abuf);
+       } else if (backup_do_embed(dsp, bp)) {
+               /* it's an embedded level-0 block of a regular object */
+               int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+               err = dump_write_embedded(dsp, zb->zb_object,
+                   zb->zb_blkid * blksz, blksz, bp);
        } else { /* it's a level-0 block of a regular object */
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);
 
+               ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT0(zb->zb_level);
                if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
                    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@@ -458,7 +539,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                        }
                }
 
-               err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
+               err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
                    blksz, bp, abuf->b_data);
                (void) arc_buf_remove_ref(abuf, &abuf);
        }
@@ -472,14 +553,15 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
  */
 static int
 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
-    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, int outfd,
-    vnode_t *vp, offset_t *off)
+    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+    int outfd, vnode_t *vp, offset_t *off)
 {
        objset_t *os;
        dmu_replay_record_t *drr;
        dmu_sendarg_t *dsp;
        int err;
        uint64_t fromtxg = 0;
+       uint64_t featureflags = 0;
 
        err = dmu_objset_from_ds(ds, &os);
        if (err != 0) {
@@ -502,13 +584,23 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
                        return (SET_ERROR(EINVAL));
                }
                if (version >= ZPL_VERSION_SA) {
-                       DMU_SET_FEATUREFLAGS(
-                           drr->drr_u.drr_begin.drr_versioninfo,
-                           DMU_BACKUP_FEATURE_SA_SPILL);
+                       featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
                }
        }
 #endif
 
+       if (embedok &&
+           spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+               featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+               if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+                       featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+       } else {
+               embedok = B_FALSE;
+       }
+
+       DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+           featureflags);
+
        drr->drr_u.drr_begin.drr_creation_time =
            ds->ds_phys->ds_creation_time;
        drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
@@ -540,6 +632,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
        dsp->dsa_pending_op = PENDING_NONE;
        dsp->dsa_incremental = (fromzb != NULL);
+       dsp->dsa_featureflags = featureflags;
 
        mutex_enter(&ds->ds_sendstream_lock);
        list_insert_head(&ds->ds_sendstreams, dsp);
@@ -591,7 +684,7 @@ out:
 
 int
 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
-    int outfd, vnode_t *vp, offset_t *off)
+    boolean_t embedok, int outfd, vnode_t *vp, offset_t *off)
 {
        dsl_pool_t *dp;
        dsl_dataset_t *ds;
@@ -625,10 +718,10 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
                zb.zbm_guid = fromds->ds_phys->ds_guid;
                is_clone = (fromds->ds_dir != ds->ds_dir);
                dsl_dataset_rele(fromds, FTAG);
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
                    outfd, vp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
                    outfd, vp, off);
        }
        dsl_dataset_rele(ds, FTAG);
@@ -636,7 +729,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
 }
 
 int
-dmu_send(const char *tosnap, const char *fromsnap,
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
     int outfd, vnode_t *vp, offset_t *off)
 {
        dsl_pool_t *dp;
@@ -703,10 +796,10 @@ dmu_send(const char *tosnap, const char *fromsnap,
                        dsl_pool_rele(dp, FTAG);
                        return (err);
                }
-               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+               err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok,
                    outfd, vp, off);
        } else {
-               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+               err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok,
                    outfd, vp, off);
        }
        if (owned)
@@ -861,6 +954,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
        uint64_t fromguid = drrb->drr_fromguid;
        int flags = drrb->drr_flags;
        int error;
+       uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
        dsl_dataset_t *ds;
        const char *tofs = drba->drba_cookie->drc_tofs;
 
@@ -874,11 +968,22 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
                return (SET_ERROR(EINVAL));
 
        /* Verify pool version supports SA if SA_SPILL feature set */
-       if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
-           DMU_BACKUP_FEATURE_SA_SPILL) &&
-           spa_version(dp->dp_spa) < SPA_VERSION_SA) {
+       if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+           spa_version(dp->dp_spa) < SPA_VERSION_SA)
+               return (SET_ERROR(ENOTSUP));
+
+       /*
+        * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+        * record to a plain WRITE record, so the pool must have the
+        * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+        * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
+        */
+       if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+               return (SET_ERROR(ENOTSUP));
+       if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
                return (SET_ERROR(ENOTSUP));
-       }
 
        error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
        if (error == 0) {
@@ -1153,7 +1258,6 @@ backup_byteswap(dmu_replay_record_t *drr)
                break;
        case DRR_OBJECT:
                DO64(drr_object.drr_object);
-               /* DO64(drr_object.drr_allocation_txg); */
                DO32(drr_object.drr_type);
                DO32(drr_object.drr_bonustype);
                DO32(drr_object.drr_blksz);
@@ -1191,6 +1295,14 @@ backup_byteswap(dmu_replay_record_t *drr)
                DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
                DO64(drr_write_byref.drr_key.ddk_prop);
                break;
+       case DRR_WRITE_EMBEDDED:
+               DO64(drr_write_embedded.drr_object);
+               DO64(drr_write_embedded.drr_offset);
+               DO64(drr_write_embedded.drr_length);
+               DO64(drr_write_embedded.drr_toguid);
+               DO32(drr_write_embedded.drr_lsize);
+               DO32(drr_write_embedded.drr_psize);
+               break;
        case DRR_FREE:
                DO64(drr_free.drr_object);
                DO64(drr_free.drr_offset);
@@ -1380,7 +1492,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
        int err;
        guid_map_entry_t gmesrch;
        guid_map_entry_t *gmep;
-       avl_index_t     where;
+       avl_index_t where;
        objset_t *ref_os = NULL;
        dmu_buf_t *dbp;
 
@@ -1405,7 +1517,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
 
        err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
            drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
-       if (err)
+       if (err != 0)
                return (err);
 
        tx = dmu_tx_create(os);
@@ -1424,6 +1536,48 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
        return (0);
 }
 
+static int
+restore_write_embedded(struct restorearg *ra, objset_t *os,
+    struct drr_write_embedded *drrwnp)
+{
+       dmu_tx_t *tx;
+       int err;
+       void *data;
+
+       if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
+               return (EINVAL);
+
+       if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
+               return (EINVAL);
+
+       if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+               return (EINVAL);
+       if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+               return (EINVAL);
+
+       data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8));
+       if (data == NULL)
+               return (ra->err);
+
+       tx = dmu_tx_create(os);
+
+       dmu_tx_hold_write(tx, drrwnp->drr_object,
+           drrwnp->drr_offset, drrwnp->drr_length);
+       err = dmu_tx_assign(tx, TXG_WAIT);
+       if (err != 0) {
+               dmu_tx_abort(tx);
+               return (err);
+       }
+
+       dmu_write_embedded(os, drrwnp->drr_object,
+           drrwnp->drr_offset, data, drrwnp->drr_etype,
+           drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
+           ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+       dmu_tx_commit(tx);
+       return (0);
+}
+
 static int
 restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
 {
@@ -1618,6 +1772,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
                        ra.err = restore_write_byref(&ra, os, &drrwbr);
                        break;
                }
+               case DRR_WRITE_EMBEDDED:
+               {
+                       struct drr_write_embedded drrwe =
+                           drr->drr_u.drr_write_embedded;
+                       ra.err = restore_write_embedded(&ra, os, &drrwe);
+                       break;
+               }
                case DRR_FREE:
                {
                        struct drr_free drrf = drr->drr_u.drr_free;
index 8c44e1771ad75dfb7bb0bbe7fcef3c748fd37e7a..e086e2487e8ae648348ae883356d188c81e8e871 100644 (file)
@@ -463,7 +463,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        if (pfd->pd_cancel)
                return (SET_ERROR(EINTR));
 
-       if (BP_IS_HOLE(bp) ||
+       if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
            !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
            BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
            BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
index 5aee10409aa95498eaa410576a1bf2ab857fb029..4368164970830b7add2b8b48a7663920178cabdd 100644 (file)
@@ -1814,8 +1814,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                *offset = *offset >> span;
                for (i = BF64_GET(*offset, 0, epbs);
                    i >= 0 && i < epb; i += inc) {
-                       if (bp[i].blk_fill >= minfill &&
-                           bp[i].blk_fill <= maxfill &&
+                       if (BP_GET_FILL(&bp[i]) >= minfill &&
+                           BP_GET_FILL(&bp[i]) <= maxfill &&
                            (hole || bp[i].blk_birth > txg))
                                break;
                        if (inc > 0 || *offset > 0)
index 23892006080c194236941fb4f4b93948ecfb8e8d..676578859018ffa1813ea81b4f3745af5d3d84c9 100644 (file)
@@ -237,8 +237,6 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
 }
 #endif
 
-#define        ALL -1
-
 static void
 free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
     dmu_tx_t *tx)
@@ -601,11 +599,14 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
                dnp->dn_bonustype = dn->dn_bonustype;
                dnp->dn_bonuslen = dn->dn_bonuslen;
        }
-
        ASSERT(dnp->dn_nlevels > 1 ||
            BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+           BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
            BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
            dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+       ASSERT(dnp->dn_nlevels < 2 ||
+           BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+           BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
 
        if (dn->dn_next_type[txgoff] != 0) {
                dnp->dn_type = dn->dn_type;
index 2fe6b858dcb7a317ff6ce41f678c52d8b0bfb6a6..e23dd6b061a0edf405c739df2f12526b16e904a3 100644 (file)
@@ -1525,7 +1525,7 @@ dsl_dataset_space(dsl_dataset_t *ds,
                else
                        *availbytesp = 0;
        }
-       *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+       *usedobjsp = BP_GET_FILL(&ds->ds_phys->ds_bp);
        *availobjsp = DN_MAX_OBJECT - *usedobjsp;
 }
 
index 113b7261b95788ad95081a9d2c88adfb2806a97b..dc49fd558c3caa7a5143005d4a6292105e1640a7 100644 (file)
@@ -546,7 +546,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        struct killarg *ka = arg;
        dmu_tx_t *tx = ka->tx;
 
-       if (BP_IS_HOLE(bp))
+       if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
                return (0);
 
        if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -596,6 +596,7 @@ dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
        uint64_t count;
        objset_t *mos;
 
+       ASSERT(!dsl_dataset_is_snapshot(ds));
        if (dsl_dataset_is_snapshot(ds))
                return (SET_ERROR(EINVAL));
 
@@ -708,7 +709,7 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
            ds->ds_prev->ds_phys->ds_num_children == 2 &&
            ds->ds_prev->ds_userrefs == 0);
 
-       /* Remove our reservation */
+       /* Remove our reservation. */
        if (ds->ds_reserved != 0) {
                dsl_dataset_set_refreservation_sync_impl(ds,
                    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
index bf06b51ee4709012b572f19e49c80eb678588d89..f03f1f54b450ba2994b6b3c563a65c8f1249b376 100644 (file)
@@ -1515,6 +1515,10 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                }
                if (err == ERESTART)
                        return;
+               /* finished; verify that space accounting went to zero */
+               ASSERT0(dp->dp_free_dir->dd_phys->dd_used_bytes);
+               ASSERT0(dp->dp_free_dir->dd_phys->dd_compressed_bytes);
+               ASSERT0(dp->dp_free_dir->dd_phys->dd_uncompressed_bytes);
        }
 
        if (scn->scn_phys.scn_state != DSS_SCANNING)
@@ -1700,6 +1704,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 
        count_block(dp->dp_blkstats, bp);
 
+       if (BP_IS_EMBEDDED(bp))
+               return (0);
+
        ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
        if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
                zio_flags |= ZIO_FLAG_SCRUB;
index e24ed644408316597c982ad8091988b4fb973060..93fd5d9e1751906e810d5ef5c02324f80ef01534 100644 (file)
@@ -610,8 +610,7 @@ dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
            KM_PUSHPAGE));
 
        error = dsl_sync_task(pool, dsl_dataset_user_release_check,
-           dsl_dataset_user_release_sync, &ddura,
-           fnvlist_num_pairs(holds));
+           dsl_dataset_user_release_sync, &ddura, 0);
        fnvlist_free(ddura.ddura_todelete);
        fnvlist_free(ddura.ddura_chkholds);
 
index 2dfdafb3ab8c25dae7a37ba9da3b1ac7fd9b6caa..9c09837d57fc7aebdc9043c22aa6b58c87c39c8a 100644 (file)
@@ -2236,6 +2236,7 @@ metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
        vdev_t *vd;
 
        ASSERT(!BP_IS_HOLE(bp));
+       ASSERT(!BP_IS_EMBEDDED(bp));
        ASSERT(psize > 0);
 
        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
@@ -2259,6 +2260,7 @@ metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
        vdev_t *vd;
 
        ASSERT(!BP_IS_HOLE(bp));
+       ASSERT(!BP_IS_EMBEDDED(bp));
        ASSERT(psize > 0);
 
        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
index 5bf59315a678715d75b578813d6db870d691bcb5..b9fa45f8299e015bc3889c6aef9bb66311359b4d 100644 (file)
@@ -1872,7 +1872,7 @@ static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 {
-       if (!BP_IS_HOLE(bp)) {
+       if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
                zio_t *rio = arg;
                size_t size = BP_GET_PSIZE(bp);
                void *data = zio_data_buf_alloc(size);
@@ -2423,9 +2423,8 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 
        if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
                if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
-                   &spa->spa_feat_enabled_txg_obj) != 0) {
+                   &spa->spa_feat_enabled_txg_obj) != 0)
                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
-               }
        }
 
        spa->spa_is_initializing = B_TRUE;
@@ -5333,11 +5332,6 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
                ASSERT(!locked);
                ASSERT(vd == vd->vdev_top);
 
-               /*
-                * XXX - Once we have bp-rewrite this should
-                * become the common case.
-                */
-
                mg = vd->vdev_mg;
 
                /*
@@ -6487,7 +6481,7 @@ spa_upgrade(spa_t *spa, uint64_t version)
         * possible.
         */
        ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
-       ASSERT(version >= spa->spa_uberblock.ub_version);
+       ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
        spa->spa_uberblock.ub_version = version;
        vdev_config_dirty(spa->spa_root_vdev);
index 02ccb13a2e417e6458022deb29a2576fa20894cd..1bed90027ecdaf86abf2623380bc4f104b854cd3 100644 (file)
@@ -1293,7 +1293,10 @@ snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
                        (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
                            sizeof (type));
                }
-               checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+               if (!BP_IS_EMBEDDED(bp)) {
+                       checksum =
+                           zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+               }
                compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
        }
 
@@ -1588,7 +1591,7 @@ bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
        uint64_t dsize = 0;
        int d;
 
-       for (d = 0; d < SPA_DVAS_PER_BP; d++)
+       for (d = 0; d < BP_GET_NDVAS(bp); d++)
                dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 
        return (dsize);
@@ -1602,7 +1605,7 @@ bp_get_dsize(spa_t *spa, const blkptr_t *bp)
 
        spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 
-       for (d = 0; d < SPA_DVAS_PER_BP; d++)
+       for (d = 0; d < BP_GET_NDVAS(bp); d++)
                dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
 
        spa_config_exit(spa, SCL_VDEV, FTAG);
index 2ea22024536f1960f0ec2840ed8c0a11e9e0c927..f99e5349e8e06b82a7f85160e84f21283da0e99b 100644 (file)
@@ -213,4 +213,9 @@ zpool_feature_init(void)
            "\"zfs bookmark\" command",
            B_TRUE, B_FALSE, B_FALSE, bookmarks_deps);
        }
+
+       zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
+           "com.delphix:embedded_data", "embedded_data",
+           "Blocks which compress very well use even less space.",
+           B_FALSE, B_TRUE, B_TRUE, NULL);
 }
index 409d2c737a5b26f98d571e655b128e32e618a20b..491ba492c4a9b5395abfa3143f05a149598b7c9b 100644 (file)
@@ -4238,6 +4238,7 @@ out:
  * zc_fromobj  objsetid of incremental fromsnap (may be zero)
  * zc_guid     if set, estimate size of stream only.  zc_cookie is ignored.
  *             output size in zc_objset_type.
+ * zc_flags    if bit 0 is set, WRITE_EMBEDDED records are permitted
  *
  * outputs:
  * zc_objset_type      estimated size, if zc_guid is set
@@ -4248,6 +4249,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
        int error;
        offset_t off;
        boolean_t estimate = (zc->zc_guid != 0);
+       boolean_t embedok = (zc->zc_flags & 0x1);
 
        if (zc->zc_obj != 0) {
                dsl_pool_t *dp;
@@ -4308,7 +4310,7 @@ zfs_ioc_send(zfs_cmd_t *zc)
 
                off = fp->f_offset;
                error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
-                   zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off);
+                   zc->zc_fromobj, embedok, zc->zc_cookie, fp->f_vnode, &off);
 
                if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                        fp->f_offset = off;
@@ -5174,6 +5176,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
  * innvl: {
  *     "fd" -> file descriptor to write stream to (int32)
  *     (optional) "fromsnap" -> full snap name to send an incremental from
+ *     (optional) "embedok" -> (value ignored)
+ *         presence indicates DRR_WRITE_EMBEDDED records are permitted
  * }
  *
  * outnvl is unused
@@ -5187,6 +5191,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
        char *fromname = NULL;
        int fd;
        file_t *fp;
+       boolean_t embedok;
 
        error = nvlist_lookup_int32(innvl, "fd", &fd);
        if (error != 0)
@@ -5194,11 +5199,13 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
 
        (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
 
+       embedok = nvlist_exists(innvl, "embedok");
+
        if ((fp = getf(fd)) == NULL)
                return (SET_ERROR(EBADF));
 
        off = fp->f_offset;
-       error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off);
+       error = dmu_send(snapname, fromname, embedok, fd, fp->f_vnode, &off);
 
        if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                fp->f_offset = off;
index b5ee395d1f63cc5931d58517b9a310dfda8a18b2..7c3c6592b51f59a007546271356b9242a0058f47 100644 (file)
@@ -159,10 +159,15 @@ int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
        avl_tree_t *t = &zilog->zl_bp_tree;
-       const dva_t *dva = BP_IDENTITY(bp);
+       const dva_t *dva;
        zil_bp_node_t *zn;
        avl_index_t where;
 
+       if (BP_IS_EMBEDDED(bp))
+               return (0);
+
+       dva = BP_IDENTITY(bp);
+
        if (avl_find(t, dva, &where) != NULL)
                return (SET_ERROR(EEXIST));
 
@@ -863,7 +868,7 @@ zil_lwb_write_done(zio_t *zio)
        ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
        ASSERT(!BP_IS_GANG(zio->io_bp));
        ASSERT(!BP_IS_HOLE(zio->io_bp));
-       ASSERT(zio->io_bp->blk_fill == 0);
+       ASSERT(BP_GET_FILL(zio->io_bp) == 0);
 
        /*
         * Ensure the lwb buffer pointer is cleared before releasing
index 6352ab3a3fe27b3231a8d897b0abafcaef9918ba..ad97ef5dbd02bf77295cff007236f58817d5a04e 100644 (file)
@@ -36,6 +36,7 @@
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
+#include <sys/blkptr.h>
 #include <sys/zfeature.h>
 
 /*
@@ -243,7 +244,7 @@ zio_buf_alloc(size_t size)
 {
        size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 
-       ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+       ASSERT3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 
        return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE | KM_NODEBUG));
 }
@@ -711,6 +712,16 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
        zio->io_physdone = physdone;
        zio->io_prop = *zp;
 
+       /*
+        * Data can be NULL if we are going to call zio_write_override() to
+        * provide the already-allocated BP.  But we may need the data to
+        * verify a dedup hit (if requested).  In this case, don't try to
+        * dedup (just take the already-allocated BP verbatim).
+        */
+       if (data == NULL && zio->io_prop.zp_dedup_verify) {
+               zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+       }
+
        return (zio);
 }
 
@@ -750,6 +761,14 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
+
+       /*
+        * The check for EMBEDDED is a performance optimization.  We
+        * process the free here (by ignoring it) rather than
+        * putting it on the list and then processing it in zio_free_sync().
+        */
+       if (BP_IS_EMBEDDED(bp))
+               return;
        metaslab_check_free(spa, bp);
 
        /*
@@ -774,13 +793,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
        zio_t *zio;
        enum zio_stage stage = ZIO_FREE_PIPELINE;
 
-       dprintf_bp(bp, "freeing in txg %llu, pass %u",
-           (longlong_t)txg, spa->spa_sync_pass);
-
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(spa_syncing_txg(spa) == txg);
        ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 
+       if (BP_IS_EMBEDDED(bp))
+               return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
        metaslab_check_free(spa, bp);
        arc_freed(spa, bp);
 
@@ -805,6 +824,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 {
        zio_t *zio;
 
+       dprintf_bp(bp, "claiming in txg %llu", txg);
+
+       if (BP_IS_EMBEDDED(bp))
+               return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
        /*
         * A claim is an allocation of a specific block.  Claims are needed
         * to support immediate writes in the intent log.  The issue is that
@@ -1011,12 +1035,20 @@ zio_read_bp_init(zio_t *zio)
        if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
            zio->io_child_type == ZIO_CHILD_LOGICAL &&
            !(zio->io_flags & ZIO_FLAG_RAW)) {
-               uint64_t psize = BP_GET_PSIZE(bp);
+               uint64_t psize =
+                   BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
                void *cbuf = zio_buf_alloc(psize);
 
                zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
        }
 
+       if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+               zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+               decode_embedded_bp_compressed(bp, zio->io_data);
+       } else {
+               ASSERT(!BP_IS_EMBEDDED(bp));
+       }
+
        if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
                zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
@@ -1060,6 +1092,9 @@ zio_write_bp_init(zio_t *zio)
                *bp = *zio->io_bp_override;
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+               if (BP_IS_EMBEDDED(bp))
+                       return (ZIO_PIPELINE_CONTINUE);
+
                /*
                 * If we've been overridden and nopwrite is set then
                 * set the flag accordingly to indicate that a nopwrite
@@ -1108,7 +1143,7 @@ zio_write_bp_init(zio_t *zio)
                        compress = ZIO_COMPRESS_OFF;
 
                /* Make sure someone doesn't change their mind on overwrites */
-               ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+               ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
                    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
        }
 
@@ -1118,9 +1153,38 @@ zio_write_bp_init(zio_t *zio)
                if (psize == 0 || psize == lsize) {
                        compress = ZIO_COMPRESS_OFF;
                        zio_buf_free(cbuf, lsize);
+               } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+                   zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+                   spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+                       encode_embedded_bp_compressed(bp,
+                           cbuf, compress, lsize, psize);
+                       BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+                       BP_SET_TYPE(bp, zio->io_prop.zp_type);
+                       BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+                       zio_buf_free(cbuf, lsize);
+                       bp->blk_birth = zio->io_txg;
+                       zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+                       ASSERT(spa_feature_is_active(spa,
+                           SPA_FEATURE_EMBEDDED_DATA));
+                       return (ZIO_PIPELINE_CONTINUE);
                } else {
-                       ASSERT(psize < lsize);
-                       zio_push_transform(zio, cbuf, psize, lsize, NULL);
+                       /*
+                        * Round up compressed size to MINBLOCKSIZE and
+                        * zero the tail.
+                        */
+                       size_t rounded =
+                           P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
+                       if (rounded > psize) {
+                               bzero((char *)cbuf + psize, rounded - psize);
+                               psize = rounded;
+                       }
+                       if (psize == lsize) {
+                               compress = ZIO_COMPRESS_OFF;
+                               zio_buf_free(cbuf, lsize);
+                       } else {
+                               zio_push_transform(zio, cbuf,
+                                   psize, lsize, NULL);
+                       }
                }
        }
 
@@ -2873,7 +2937,7 @@ zio_checksum_verified(zio_t *zio)
 /*
  * ==========================================================================
  * Error rank.  Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
- * An error of 0 indictes success.  ENXIO indicates whole-device failure,
+ * An error of 0 indicates success.  ENXIO indicates whole-device failure,
  * which may be transient (e.g. unplugged) or permament.  ECKSUM and EIO
  * indicate errors that are specific to one I/O, and most likely permanent.
  * Any other error is presumed to be worse because we weren't expecting it.
@@ -2979,7 +3043,7 @@ zio_done(zio_t *zio)
                for (w = 0; w < ZIO_WAIT_TYPES; w++)
                        ASSERT(zio->io_children[c][w] == 0);
 
-       if (zio->io_bp != NULL) {
+       if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
                ASSERT(zio->io_bp->blk_pad[0] == 0);
                ASSERT(zio->io_bp->blk_pad[1] == 0);
                ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
@@ -3216,7 +3280,8 @@ zio_done(zio_t *zio)
        }
 
        if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
-           !BP_IS_HOLE(zio->io_bp) && !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
+           !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
+           !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
                metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
        }
 
index bc7331764c7bad41992eec891d8f0cfb0aa424eb..3a5c73a6a1e94cd7c9cb22ea95b703a44c3bf262 100644 (file)
@@ -126,7 +126,7 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
 static void
 zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
 {
-       dva_t *dva = BP_IDENTITY(bp);
+       const dva_t *dva = BP_IDENTITY(bp);
        uint64_t txg = BP_PHYSICAL_BIRTH(bp);
 
        ASSERT(BP_IS_GANG(bp));
index 5b63f0aa0c73a4fbf708c40910492c5523a6f379..07446234922b63e92d1214483b0251ef0230d417 100644 (file)
@@ -80,7 +80,7 @@ size_t
 zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
 {
        uint64_t *word, *word_end;
-       size_t c_len, d_len, r_len;
+       size_t c_len, d_len;
        zio_compress_info_t *ci = &zio_compress_table[c];
 
        ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
@@ -102,28 +102,13 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
                return (s_len);
 
        /* Compress at least 12.5% */
-       d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
-       if (d_len == 0)
-               return (s_len);
-
+       d_len = s_len - (s_len >> 3);
        c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
 
        if (c_len > d_len)
                return (s_len);
 
-       /*
-        * Cool.  We compressed at least as much as we were hoping to.
-        * For both security and repeatability, pad out the last sector.
-        */
-       r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
-       if (r_len > c_len) {
-               bzero((char *)dst + c_len, r_len - c_len);
-               c_len = r_len;
-       }
-
        ASSERT3U(c_len, <=, d_len);
-       ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
-
        return (c_len);
 }