granicus.if.org Git - zfs/commitdiff
Illumos 5746 - more checksumming in zfs send
author Matthew Ahrens <mahrens@delphix.com>
Mon, 6 Jul 2015 03:20:31 +0000 (05:20 +0200)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 30 Dec 2015 22:24:14 +0000 (14:24 -0800)
5746 more checksumming in zfs send
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Approved by: Albert Lee <trisk@omniti.com>

References:
  https://www.illumos.org/issues/5746
  https://github.com/illumos/illumos-gate/commit/98110f0
  https://github.com/zfsonlinux/zfs/issues/905

Porting notes:
- Minor conflicts due to:
  - https://github.com/zfsonlinux/zfs/commit/2024041
  - https://github.com/zfsonlinux/zfs/commit/044baf0
  - https://github.com/zfsonlinux/zfs/commit/88904bb
- Fix ISO C90 warnings (-Werror=declaration-after-statement)
  - arc_buf_t *abuf;
  - dmu_buf_t *bonus;
  - zio_cksum_t cksum_orig;
  - zio_cksum_t *cksump;
- Fix '%llx' format specifier warning
- Align message in zstreamdump safe_malloc() with upstream

Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3611

cmd/zstreamdump/zstreamdump.c
include/sys/spa.h
include/sys/zfs_ioctl.h
include/sys/zio_checksum.h
lib/libzfs/libzfs_sendrecv.c
module/zfs/dmu_send.c

index 176dd66b268af4c9c8b2becbe2d91bdd0cc7b530..f288d148e574b6211ec560b5d502378a98df8cf9 100644 (file)
@@ -27,7 +27,7 @@
  */
 
 /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
 
 #include <ctype.h>
@@ -36,6 +36,7 @@
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
+#include <stddef.h>
 
 #include <sys/dmu.h>
 #include <sys/zfs_ioctl.h>
@@ -73,8 +74,8 @@ safe_malloc(size_t size)
 {
        void *rv = malloc(size);
        if (rv == NULL) {
-               (void) fprintf(stderr, "ERROR; failed to allocate %u bytes\n",
-                   (unsigned)size);
+               (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n",
+                   size);
                abort();
        }
        return (rv);
@@ -85,7 +86,6 @@ safe_malloc(size_t size)
  *
  * Read while computing incremental checksum
  */
-
 static size_t
 ssread(void *buf, size_t len, zio_cksum_t *cksum)
 {
@@ -94,7 +94,7 @@ ssread(void *buf, size_t len, zio_cksum_t *cksum)
        if ((outlen = fread(buf, len, 1, send_stream)) == 0)
                return (0);
 
-       if (do_cksum && cksum) {
+       if (do_cksum) {
                if (do_byteswap)
                        fletcher_4_incremental_byteswap(buf, len, cksum);
                else
@@ -104,6 +104,34 @@ ssread(void *buf, size_t len, zio_cksum_t *cksum)
        return (outlen);
 }
 
+static size_t
+read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
+{
+       ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+           ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+       size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum);
+       if (r == 0)
+               return (0);
+       zio_cksum_t saved_cksum = *cksum;
+       r = ssread(&drr->drr_u.drr_checksum.drr_checksum,
+           sizeof (zio_cksum_t), cksum);
+       if (r == 0)
+               return (0);
+       if (!ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) &&
+           !ZIO_CHECKSUM_EQUAL(saved_cksum,
+           drr->drr_u.drr_checksum.drr_checksum)) {
+               fprintf(stderr, "invalid checksum\n");
+               (void) printf("Incorrect checksum in record header.\n");
+               (void) printf("Expected checksum = %llx/%llx/%llx/%llx\n",
+                   (longlong_t)saved_cksum.zc_word[0],
+                   (longlong_t)saved_cksum.zc_word[1],
+                   (longlong_t)saved_cksum.zc_word[2],
+                   (longlong_t)saved_cksum.zc_word[3]);
+               exit(1);
+       }
+       return (sizeof (*drr));
+}
+
 /*
  * Print part of a block in ASCII characters
  */
@@ -135,7 +163,7 @@ print_block(char *buf, int length)
         * Start printing ASCII characters at a constant offset, after
         * the hex prints. Leave 3 characters per byte on a line (2 digit
         * hex number plus 1 space) plus spaces between characters and
-        * groupings
+        * groupings.
         */
        int ascii_start = BYTES_PER_LINE * 3 +
            BYTES_PER_LINE / DUMP_GROUPING + 2;
@@ -185,8 +213,10 @@ main(int argc, char *argv[])
        struct drr_free *drrf = &thedrr.drr_u.drr_free;
        struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
        struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
+       struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum;
        char c;
        boolean_t verbose = B_FALSE;
+       boolean_t very_verbose = B_FALSE;
        boolean_t first = B_TRUE;
        /*
         * dump flag controls whether the contents of any modified data blocks
@@ -204,11 +234,14 @@ main(int argc, char *argv[])
                        do_cksum = B_FALSE;
                        break;
                case 'v':
+                       if (verbose)
+                               very_verbose = B_TRUE;
                        verbose = B_TRUE;
                        break;
                case 'd':
                        dump = B_TRUE;
                        verbose = B_TRUE;
+                       very_verbose = B_TRUE;
                        break;
                case ':':
                        (void) fprintf(stderr,
@@ -231,7 +264,7 @@ main(int argc, char *argv[])
        }
 
        send_stream = stdin;
-       while (ssread(drr, sizeof (dmu_replay_record_t), &zc)) {
+       while (read_hdr(drr, &zc)) {
 
                /*
                 * If this is the first DMU record being processed, check for
@@ -437,7 +470,7 @@ main(int argc, char *argv[])
                        if (verbose) {
                                (void) printf("WRITE object = %llu type = %u "
                                    "checksum type = %u\n"
-                                   "offset = %llu length = %llu "
+                                   "    offset = %llu length = %llu "
                                    "props = %llx\n",
                                    (u_longlong_t)drrw->drr_object,
                                    drrw->drr_type,
@@ -481,9 +514,9 @@ main(int argc, char *argv[])
                        if (verbose) {
                                (void) printf("WRITE_BYREF object = %llu "
                                    "checksum type = %u props = %llx\n"
-                                   "offset = %llu length = %llu\n"
+                                   "    offset = %llu length = %llu\n"
                                    "toguid = %llx refguid = %llx\n"
-                                   "refobject = %llu refoffset = %llu\n",
+                                   "    refobject = %llu refoffset = %llu\n",
                                    (u_longlong_t)drrwbr->drr_object,
                                    drrwbr->drr_checksumtype,
                                    (u_longlong_t)drrwbr->drr_key.ddk_prop,
@@ -544,7 +577,7 @@ main(int argc, char *argv[])
                        if (verbose) {
                                (void) printf("WRITE_EMBEDDED object = %llu "
                                    "offset = %llu length = %llu\n"
-                                   "toguid = %llx comp = %u etype = %u "
+                                   "    toguid = %llx comp = %u etype = %u "
                                    "lsize = %u psize = %u\n",
                                    (u_longlong_t)drrwe->drr_object,
                                    (u_longlong_t)drrwe->drr_offset,
@@ -562,6 +595,13 @@ main(int argc, char *argv[])
                        /* should never be reached */
                        exit(1);
                }
+               if (drr->drr_type != DRR_BEGIN && very_verbose) {
+                       (void) printf("    checksum = %llx/%llx/%llx/%llx\n",
+                           (longlong_t)drrc->drr_checksum.zc_word[0],
+                           (longlong_t)drrc->drr_checksum.zc_word[1],
+                           (longlong_t)drrc->drr_checksum.zc_word[2],
+                           (longlong_t)drrc->drr_checksum.zc_word[3]);
+               }
                pcksum = zc;
        }
        free(buf);
index 5dc9084dad6b5a5479abe399072db0d300e6ce33..c80e8337ea3a006d2e9ecd927faabca50da92a5c 100644 (file)
@@ -446,6 +446,19 @@ _NOTE(CONSTCOND) } while (0)
        ((zc1).zc_word[2] - (zc2).zc_word[2]) | \
        ((zc1).zc_word[3] - (zc2).zc_word[3])))
 
+#define        ZIO_CHECKSUM_IS_ZERO(zc) \
+       (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
+       (zc)->zc_word[2] | (zc)->zc_word[3]))
+
+#define        ZIO_CHECKSUM_BSWAP(zcp)                                 \
+{                                                              \
+       (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]);        \
+       (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]);        \
+       (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]);        \
+       (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]);        \
+}
+
+
 #define        DVA_IS_VALID(dva)       (DVA_GET_ASIZE(dva) != 0)
 
 #define        ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)   \
index 09a96c043bf0e234f7973fc051f535d215135616..601a9a70c5803f9b94055508c007e3e58f12477d 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 #ifndef        _SYS_ZFS_IOCTL_H
@@ -237,6 +237,22 @@ typedef struct dmu_replay_record {
                        uint32_t drr_psize; /* compr. (real) size of payload */
                        /* (possibly compressed) content follows */
                } drr_write_embedded;
+
+               /*
+                * Note: drr_checksum is overlaid with all record types
+                * except DRR_BEGIN.  Therefore its (non-pad) members
+                * must not overlap with members from the other structs.
+                * We accomplish this by putting its members at the very
+                * end of the struct.
+                */
+               struct drr_checksum {
+                       uint64_t drr_pad[34];
+                       /*
+                        * fletcher-4 checksum of everything preceding the
+                        * checksum.
+                        */
+                       zio_cksum_t drr_checksum;
+               } drr_checksum;
        } drr_u;
 } dmu_replay_record_t;
 
index de89bc9a796726cffe2b51005f63d9f3a6810776..56b83b559377050bdf3c32b84f6cdec66a202151 100644 (file)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_ZIO_CHECKSUM_H
@@ -34,13 +35,13 @@ extern "C" {
 /*
  * Signature for checksum functions.
  */
-typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+typedef void zio_checksum_func_t(const void *, uint64_t, zio_cksum_t *);
 
 /*
  * Information about each checksum function.
  */
 typedef const struct zio_checksum_info {
-       zio_checksum_t  *ci_func[2]; /* checksum function for each byteorder */
+       zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
        int             ci_correctable; /* number of correctable bits   */
        int             ci_eck;         /* uses zio embedded checksum? */
        int             ci_dedup;       /* strong enough for dedup? */
@@ -61,7 +62,7 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
 /*
  * Checksum routines.
  */
-extern zio_checksum_t zio_checksum_SHA256;
+extern zio_checksum_func_t zio_checksum_SHA256;
 
 extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     void *data, uint64_t size);
index bd2fd294706d661e721207f357a98a8a9b07dd8d..b35428f907cda374bd78cacbbb6a1dca163759df 100644 (file)
@@ -187,10 +187,28 @@ ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
 }
 
 static int
-cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd)
+dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
+    zio_cksum_t *zc, int outfd)
 {
-       fletcher_4_incremental_native(buf, len, zc);
-       return (write(outfd, buf, len));
+       ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+           ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+       fletcher_4_incremental_native(drr,
+           offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
+       if (drr->drr_type != DRR_BEGIN) {
+               ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
+                   drr_checksum.drr_checksum));
+               drr->drr_u.drr_checksum.drr_checksum = *zc;
+       }
+       fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
+           sizeof (zio_cksum_t), zc);
+       if (write(outfd, drr, sizeof (*drr)) == -1)
+               return (errno);
+       if (payload_len != 0) {
+               fletcher_4_incremental_native(payload, payload_len, zc);
+               if (write(outfd, payload, payload_len) == -1)
+                       return (errno);
+       }
+       return (0);
 }
 
 /*
@@ -217,26 +235,18 @@ cksummer(void *arg)
        char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
        dmu_replay_record_t thedrr;
        dmu_replay_record_t *drr = &thedrr;
-       struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
-       struct drr_end *drre = &thedrr.drr_u.drr_end;
-       struct drr_object *drro = &thedrr.drr_u.drr_object;
-       struct drr_write *drrw = &thedrr.drr_u.drr_write;
-       struct drr_spill *drrs = &thedrr.drr_u.drr_spill;
-       struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded;
        FILE *ofp;
        int outfd;
-       dmu_replay_record_t wbr_drr = {0};
-       struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref;
        dedup_table_t ddt;
        zio_cksum_t stream_cksum;
        uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
        uint64_t numbuckets;
 
        ddt.max_ddt_size =
-           MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100,
-           SMALLEST_POSSIBLE_MAX_DDT_MB<<20);
+           MAX((physmem * MAX_DDT_PHYSMEM_PERCENT) / 100,
+           SMALLEST_POSSIBLE_MAX_DDT_MB << 20);
 
-       numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t));
+       numbuckets = ddt.max_ddt_size / (sizeof (dedup_entry_t));
 
        /*
         * numbuckets must be a power of 2.  Increase number to
@@ -252,32 +262,29 @@ cksummer(void *arg)
        ddt.numhashbits = high_order_bit(numbuckets) - 1;
        ddt.ddt_full = B_FALSE;
 
-       /* Initialize the write-by-reference block. */
-       wbr_drr.drr_type = DRR_WRITE_BYREF;
-       wbr_drr.drr_payloadlen = 0;
-
        outfd = dda->outputfd;
        ofp = fdopen(dda->inputfd, "r");
-       while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) {
+       while (ssread(drr, sizeof (*drr), ofp) != 0) {
 
                switch (drr->drr_type) {
                case DRR_BEGIN:
                {
-                       int     fflags;
+                       struct drr_begin *drrb = &drr->drr_u.drr_begin;
+                       int fflags;
+                       int sz = 0;
                        ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
 
+                       ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+
                        /* set the DEDUP feature flag for this stream */
                        fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
                        fflags |= (DMU_BACKUP_FEATURE_DEDUP |
                            DMU_BACKUP_FEATURE_DEDUPPROPS);
                        DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
 
-                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-                           &stream_cksum, outfd) == -1)
-                               goto out;
                        if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
                            DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
-                               int sz = drr->drr_payloadlen;
+                               sz = drr->drr_payloadlen;
 
                                if (sz > SPA_MAXBLOCKSIZE) {
                                        buf = zfs_realloc(dda->dedup_hdl, buf,
@@ -286,64 +293,60 @@ cksummer(void *arg)
                                (void) ssread(buf, sz, ofp);
                                if (ferror(stdin))
                                        perror("fread");
-                               if (cksum_and_write(buf, sz, &stream_cksum,
-                                   outfd) == -1)
-                                       goto out;
                        }
+                       if (dump_record(drr, buf, sz, &stream_cksum,
+                           outfd) != 0)
+                               goto out;
                        break;
                }
 
                case DRR_END:
                {
+                       struct drr_end *drre = &drr->drr_u.drr_end;
                        /* use the recalculated checksum */
-                       ZIO_SET_CHECKSUM(&drre->drr_checksum,
-                           stream_cksum.zc_word[0], stream_cksum.zc_word[1],
-                           stream_cksum.zc_word[2], stream_cksum.zc_word[3]);
-                       if ((write(outfd, drr,
-                           sizeof (dmu_replay_record_t))) == -1)
+                       drre->drr_checksum = stream_cksum;
+                       if (dump_record(drr, NULL, 0, &stream_cksum,
+                           outfd) != 0)
                                goto out;
                        break;
                }
 
                case DRR_OBJECT:
                {
-                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-                           &stream_cksum, outfd) == -1)
-                               goto out;
+                       struct drr_object *drro = &drr->drr_u.drr_object;
                        if (drro->drr_bonuslen > 0) {
                                (void) ssread(buf,
                                    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
                                    ofp);
-                               if (cksum_and_write(buf,
-                                   P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
-                                   &stream_cksum, outfd) == -1)
-                                       goto out;
                        }
+                       if (dump_record(drr, buf,
+                           P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
+                           &stream_cksum, outfd) != 0)
+                               goto out;
                        break;
                }
 
                case DRR_SPILL:
                {
-                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-                           &stream_cksum, outfd) == -1)
-                               goto out;
+                       struct drr_spill *drrs = &drr->drr_u.drr_spill;
                        (void) ssread(buf, drrs->drr_length, ofp);
-                       if (cksum_and_write(buf, drrs->drr_length,
-                           &stream_cksum, outfd) == -1)
+                       if (dump_record(drr, buf, drrs->drr_length,
+                           &stream_cksum, outfd) != 0)
                                goto out;
                        break;
                }
 
                case DRR_FREEOBJECTS:
                {
-                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-                           &stream_cksum, outfd) == -1)
+                       if (dump_record(drr, NULL, 0, &stream_cksum,
+                           outfd) != 0)
                                goto out;
                        break;
                }
 
                case DRR_WRITE:
                {
+                       struct drr_write *drrw = &drr->drr_u.drr_write;
                        dataref_t       dataref;
 
                        (void) ssread(buf, drrw->drr_length, ofp);
@@ -380,7 +383,13 @@ cksummer(void *arg)
                        if (ddt_update(dda->dedup_hdl, &ddt,
                            &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop,
                            &dataref)) {
+                               dmu_replay_record_t wbr_drr = {0};
+                               struct drr_write_byref *wbr_drrr =
+                                   &wbr_drr.drr_u.drr_write_byref;
+
                                /* block already present in stream */
+                               wbr_drr.drr_type = DRR_WRITE_BYREF;
+
                                wbr_drrr->drr_object = drrw->drr_object;
                                wbr_drrr->drr_offset = drrw->drr_offset;
                                wbr_drrr->drr_length = drrw->drr_length;
@@ -400,19 +409,13 @@ cksummer(void *arg)
                                wbr_drrr->drr_key.ddk_prop =
                                    drrw->drr_key.ddk_prop;
 
-                               if (cksum_and_write(&wbr_drr,
-                                   sizeof (dmu_replay_record_t), &stream_cksum,
-                                   outfd) == -1)
+                               if (dump_record(&wbr_drr, NULL, 0,
+                                   &stream_cksum, outfd) != 0)
                                        goto out;
                        } else {
                                /* block not previously seen */
-                               if (cksum_and_write(drr,
-                                   sizeof (dmu_replay_record_t), &stream_cksum,
-                                   outfd) == -1)
-                                       goto out;
-                               if (cksum_and_write(buf,
-                                   drrw->drr_length,
-                                   &stream_cksum, outfd) == -1)
+                               if (dump_record(drr, buf, drrw->drr_length,
+                                   &stream_cksum, outfd) != 0)
                                        goto out;
                        }
                        break;
@@ -420,28 +423,27 @@ cksummer(void *arg)
 
                case DRR_WRITE_EMBEDDED:
                {
-                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-                           &stream_cksum, outfd) == -1)
-                               goto out;
+                       struct drr_write_embedded *drrwe =
+                           &drr->drr_u.drr_write_embedded;
                        (void) ssread(buf,
                            P2ROUNDUP((uint64_t)drrwe->drr_psize, 8), ofp);
-                       if (cksum_and_write(buf,
+                       if (dump_record(drr, buf,
                            P2ROUNDUP((uint64_t)drrwe->drr_psize, 8),
-                           &stream_cksum, outfd) == -1)
+                           &stream_cksum, outfd) != 0)
                                goto out;
                        break;
                }
 
                case DRR_FREE:
                {
-                       if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
-                           &stream_cksum, outfd) == -1)
+                       if (dump_record(drr, NULL, 0, &stream_cksum,
+                           outfd) != 0)
                                goto out;
                        break;
                }
 
                default:
-                       (void) printf("INVALID record type 0x%x\n",
+                       (void) fprintf(stderr, "INVALID record type 0x%x\n",
                            drr->drr_type);
                        /* should never happen, so assert */
                        assert(B_FALSE);
@@ -1491,18 +1493,11 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                            sizeof (drr.drr_u.drr_begin.drr_toname),
                            "%s@%s", zhp->zfs_name, tosnap);
                        drr.drr_payloadlen = buflen;
-                       err = cksum_and_write(&drr, sizeof (drr), &zc, outfd);
 
-                       /* write header nvlist */
-                       if (err != -1 && packbuf != NULL) {
-                               err = cksum_and_write(packbuf, buflen, &zc,
-                                   outfd);
-                       }
+                       err = dump_record(&drr, packbuf, buflen, &zc, outfd);
                        free(packbuf);
-                       if (err == -1) {
-                               err = errno;
+                       if (err != 0)
                                goto stderr_out;
-                       }
 
                        /* write end record */
                        bzero(&drr, sizeof (drr));
@@ -1736,6 +1731,8 @@ recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
        int rv;
        int len = ilen;
 
+       assert(ilen <= SPA_MAXBLOCKSIZE);
+
        do {
                rv = read(fd, cp, len);
                cp += rv;
index b028e5ba4fa53e4469407a104477b9640dca7a5a..2502b2c97cc5b570d33f98e34c93b77d55507302 100644 (file)
@@ -75,7 +75,6 @@ dump_bytes_cb(void *arg)
        ssize_t resid; /* have to get resid to get detailed errno */
        ASSERT0(dbi->dbi_len % 8);
 
-       fletcher_4_incremental_native(dbi->dbi_buf, dbi->dbi_len, &dsp->dsa_zc);
        dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
            (caddr_t)dbi->dbi_buf, dbi->dbi_len,
            0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
@@ -110,6 +109,38 @@ dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
        return (dsp->dsa_err);
 }
 
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
+static int
+dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
+{
+       ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+           ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+       fletcher_4_incremental_native(dsp->dsa_drr,
+           offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+           &dsp->dsa_zc);
+       if (dsp->dsa_drr->drr_type != DRR_BEGIN) {
+               ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
+                   drr_checksum.drr_checksum));
+               dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
+       }
+       fletcher_4_incremental_native(&dsp->dsa_drr->
+           drr_u.drr_checksum.drr_checksum,
+           sizeof (zio_cksum_t), &dsp->dsa_zc);
+       if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+               return (SET_ERROR(EINTR));
+       if (payload_len != 0) {
+               fletcher_4_incremental_native(payload, payload_len,
+                   &dsp->dsa_zc);
+               if (dump_bytes(dsp, payload, payload_len) != 0)
+                       return (SET_ERROR(EINTR));
+       }
+       return (0);
+}
+
 static int
 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
     uint64_t length)
@@ -154,8 +185,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
         */
        if (dsp->dsa_pending_op != PENDING_NONE &&
            dsp->dsa_pending_op != PENDING_FREE) {
-               if (dump_bytes(dsp, dsp->dsa_drr,
-                   sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }
@@ -178,8 +208,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
                        return (0);
                } else {
                        /* not a continuation.  Push out pending record */
-                       if (dump_bytes(dsp, dsp->dsa_drr,
-                           sizeof (dmu_replay_record_t)) != 0)
+                       if (dump_record(dsp, NULL, 0) != 0)
                                return (SET_ERROR(EINTR));
                        dsp->dsa_pending_op = PENDING_NONE;
                }
@@ -192,8 +221,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
        drrf->drr_length = length;
        drrf->drr_toguid = dsp->dsa_toguid;
        if (length == -1ULL) {
-               if (dump_bytes(dsp, dsp->dsa_drr,
-                   sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        return (SET_ERROR(EINTR));
        } else {
                dsp->dsa_pending_op = PENDING_FREE;
@@ -225,12 +253,11 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
         * of different types.
         */
        if (dsp->dsa_pending_op != PENDING_NONE) {
-               if (dump_bytes(dsp, dsp->dsa_drr,
-                   sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }
-       /* write a DATA record */
+       /* write a WRITE record */
        bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
        dsp->dsa_drr->drr_type = DRR_WRITE;
        drrw->drr_object = object;
@@ -256,9 +283,7 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
                drrw->drr_key.ddk_cksum = bp->blk_cksum;
        }
 
-       if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
-               return (SET_ERROR(EINTR));
-       if (dump_bytes(dsp, data, blksz) != 0)
+       if (dump_record(dsp, data, blksz) != 0)
                return (SET_ERROR(EINTR));
        return (0);
 }
@@ -272,8 +297,7 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
            &(dsp->dsa_drr->drr_u.drr_write_embedded);
 
        if (dsp->dsa_pending_op != PENDING_NONE) {
-               if (dump_bytes(dsp, dsp->dsa_drr,
-                   sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        return (EINTR);
                dsp->dsa_pending_op = PENDING_NONE;
        }
@@ -293,9 +317,7 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
 
        decode_embedded_bp_compressed(bp, buf);
 
-       if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
-               return (EINTR);
-       if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+       if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
                return (EINTR);
        return (0);
 }
@@ -306,8 +328,7 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
        struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
 
        if (dsp->dsa_pending_op != PENDING_NONE) {
-               if (dump_bytes(dsp, dsp->dsa_drr,
-                   sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }
@@ -319,9 +340,7 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
        drrs->drr_length = blksz;
        drrs->drr_toguid = dsp->dsa_toguid;
 
-       if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
-               return (SET_ERROR(EINTR));
-       if (dump_bytes(dsp, data, blksz))
+       if (dump_record(dsp, data, blksz) != 0)
                return (SET_ERROR(EINTR));
        return (0);
 }
@@ -344,8 +363,7 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
         */
        if (dsp->dsa_pending_op != PENDING_NONE &&
            dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
-               if (dump_bytes(dsp, dsp->dsa_drr,
-                   sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }
@@ -359,8 +377,7 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
                        return (0);
                } else {
                        /* can't be aggregated.  Push out pending record */
-                       if (dump_bytes(dsp, dsp->dsa_drr,
-                           sizeof (dmu_replay_record_t)) != 0)
+                       if (dump_record(dsp, NULL, 0) != 0)
                                return (SET_ERROR(EINTR));
                        dsp->dsa_pending_op = PENDING_NONE;
                }
@@ -387,8 +404,7 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
                return (dump_freeobjects(dsp, object, 1));
 
        if (dsp->dsa_pending_op != PENDING_NONE) {
-               if (dump_bytes(dsp, dsp->dsa_drr,
-                   sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        return (SET_ERROR(EINTR));
                dsp->dsa_pending_op = PENDING_NONE;
        }
@@ -409,11 +425,10 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
            drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
                drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
 
-       if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
-               return (SET_ERROR(EINTR));
-
-       if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
+       if (dump_record(dsp, DN_BONUS(dnp),
+           P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
                return (SET_ERROR(EINTR));
+       }
 
        /* Free anything past the end of the file. */
        if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
@@ -657,7 +672,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        dsp->dsa_os = os;
        dsp->dsa_off = off;
        dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
-       ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
        dsp->dsa_pending_op = PENDING_NONE;
        dsp->dsa_incremental = (fromzb != NULL);
        dsp->dsa_featureflags = featureflags;
@@ -669,7 +683,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        dsl_dataset_long_hold(ds, FTAG);
        dsl_pool_rele(dp, tag);
 
-       if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
+       if (dump_record(dsp, NULL, 0) != 0) {
                err = dsp->dsa_err;
                goto out;
        }
@@ -678,7 +692,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
            backup_cb, dsp);
 
        if (dsp->dsa_pending_op != PENDING_NONE)
-               if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
+               if (dump_record(dsp, NULL, 0) != 0)
                        err = SET_ERROR(EINTR);
 
        if (err != 0) {
@@ -692,7 +706,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
        drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
        drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 
-       if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
+       if (dump_record(dsp, NULL, 0) != 0) {
                err = dsp->dsa_err;
                goto out;
        }
@@ -1300,13 +1314,19 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
 }
 
 struct restorearg {
+       objset_t *os;
        int err;
        boolean_t byteswap;
        vnode_t *vp;
-       char *buf;
        uint64_t voff;
        int bufsize; /* amount of memory allocated for buf */
+
+       dmu_replay_record_t *drr;
+       dmu_replay_record_t *next_drr;
+       char *buf;
        zio_cksum_t cksum;
+       zio_cksum_t prev_cksum;
+
        avl_tree_t *guid_to_ds_map;
 };
 
@@ -1345,14 +1365,11 @@ free_guid_map_onexit(void *arg)
        kmem_free(ca, sizeof (avl_tree_t));
 }
 
-static void *
-restore_read(struct restorearg *ra, int len, char *buf)
+static int
+restore_read(struct restorearg *ra, int len, void *buf)
 {
        int done = 0;
 
-       if (buf == NULL)
-               buf = ra->buf;
-
        /* some things will require 8-byte alignment, so everything must */
        ASSERT0(len % 8);
        ASSERT3U(len, <=, ra->bufsize);
@@ -1361,7 +1378,7 @@ restore_read(struct restorearg *ra, int len, char *buf)
                ssize_t resid;
 
                ra->err = vn_rdwr(UIO_READ, ra->vp,
-                   buf + done, len - done,
+                   (char *)buf + done, len - done,
                    ra->voff, UIO_SYSSPACE, FAPPEND,
                    RLIM64_INFINITY, CRED(), &resid);
 
@@ -1370,24 +1387,21 @@ restore_read(struct restorearg *ra, int len, char *buf)
                ra->voff += len - done - resid;
                done = len - resid;
                if (ra->err != 0)
-                       return (NULL);
+                       return (ra->err);
        }
 
        ASSERT3U(done, ==, len);
-       if (ra->byteswap)
-               fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
-       else
-               fletcher_4_incremental_native(buf, len, &ra->cksum);
-       return (buf);
+       return (0);
 }
 
 noinline static void
-backup_byteswap(dmu_replay_record_t *drr)
+byteswap_record(dmu_replay_record_t *drr)
 {
 #define        DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 #define        DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
        drr->drr_type = BSWAP_32(drr->drr_type);
        drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
        switch (drr->drr_type) {
        case DRR_BEGIN:
                DO64(drr_begin.drr_magic);
@@ -1417,10 +1431,7 @@ backup_byteswap(dmu_replay_record_t *drr)
                DO64(drr_write.drr_offset);
                DO64(drr_write.drr_length);
                DO64(drr_write.drr_toguid);
-               DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
-               DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
-               DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
-               DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+               ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
                DO64(drr_write.drr_key.ddk_prop);
                break;
        case DRR_WRITE_BYREF:
@@ -1431,10 +1442,8 @@ backup_byteswap(dmu_replay_record_t *drr)
                DO64(drr_write_byref.drr_refguid);
                DO64(drr_write_byref.drr_refobject);
                DO64(drr_write_byref.drr_refoffset);
-               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
-               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
-               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
-               DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+               ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
+                   drr_key.ddk_cksum);
                DO64(drr_write_byref.drr_key.ddk_prop);
                break;
        case DRR_WRITE_EMBEDDED:
@@ -1457,15 +1466,17 @@ backup_byteswap(dmu_replay_record_t *drr)
                DO64(drr_spill.drr_toguid);
                break;
        case DRR_END:
-               DO64(drr_end.drr_checksum.zc_word[0]);
-               DO64(drr_end.drr_checksum.zc_word[1]);
-               DO64(drr_end.drr_checksum.zc_word[2]);
-               DO64(drr_end.drr_checksum.zc_word[3]);
                DO64(drr_end.drr_toguid);
+               ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
                break;
        default:
                break;
        }
+
+       if (drr->drr_type != DRR_BEGIN) {
+               ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+       }
+
 #undef DO64
 #undef DO32
 }
@@ -1482,11 +1493,10 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
 }
 
 noinline static int
-restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 {
        dmu_object_info_t doi;
        dmu_tx_t *tx;
-       void *data = NULL;
        uint64_t object;
        int err;
 
@@ -1497,23 +1507,17 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
            drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
            P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
            drro->drr_blksz < SPA_MINBLOCKSIZE ||
-           drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
+           drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) ||
            drro->drr_bonuslen > DN_MAX_BONUSLEN) {
                return (SET_ERROR(EINVAL));
        }
 
-       err = dmu_object_info(os, drro->drr_object, &doi);
+       err = dmu_object_info(ra->os, drro->drr_object, &doi);
 
        if (err != 0 && err != ENOENT)
                return (SET_ERROR(EINVAL));
        object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
-       if (drro->drr_bonuslen) {
-               data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL);
-               if (ra->err != 0)
-                       return (ra->err);
-       }
-
        /*
         * If we are losing blkptrs or changing the block size this must
         * be a new file instance.  We must clear out the previous file
@@ -1527,14 +1531,14 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 
                if (drro->drr_blksz != doi.doi_data_block_size ||
                    nblkptr < doi.doi_nblkptr) {
-                       err = dmu_free_long_range(os, drro->drr_object,
+                       err = dmu_free_long_range(ra->os, drro->drr_object,
                            0, DMU_OBJECT_END);
                        if (err != 0)
                                return (SET_ERROR(EINVAL));
                }
        }
 
-       tx = dmu_tx_create(os);
+       tx = dmu_tx_create(ra->os);
        dmu_tx_hold_bonus(tx, object);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
@@ -1544,7 +1548,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 
        if (object == DMU_NEW_OBJECT) {
                /* currently free, want to be allocated */
-               err = dmu_object_claim(os, drro->drr_object,
+               err = dmu_object_claim(ra->os, drro->drr_object,
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen, tx);
        } else if (drro->drr_type != doi.doi_type ||
@@ -1552,7 +1556,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
            drro->drr_bonustype != doi.doi_bonus_type ||
            drro->drr_bonuslen != doi.doi_bonus_size) {
                /* currently allocated, but with different properties */
-               err = dmu_object_reclaim(os, drro->drr_object,
+               err = dmu_object_reclaim(ra->os, drro->drr_object,
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen, tx);
        }
@@ -1561,14 +1565,15 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
                return (SET_ERROR(EINVAL));
        }
 
-       dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
-           tx);
-       dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
+       dmu_object_set_checksum(ra->os, drro->drr_object,
+           drro->drr_checksumtype, tx);
+       dmu_object_set_compress(ra->os, drro->drr_object,
+           drro->drr_compress, tx);
 
        if (data != NULL) {
                dmu_buf_t *db;
 
-               VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
+               VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db));
                dmu_buf_will_dirty(db, tx);
 
                ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
@@ -1587,7 +1592,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
 
 /* ARGSUSED */
 noinline static int
-restore_freeobjects(struct restorearg *ra, objset_t *os,
+restore_freeobjects(struct restorearg *ra,
     struct drr_freeobjects *drrfo)
 {
        uint64_t obj;
@@ -1597,13 +1602,13 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
 
        for (obj = drrfo->drr_firstobj;
            obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
-           (void) dmu_object_next(os, &obj, FALSE, 0)) {
+           (void) dmu_object_next(ra->os, &obj, FALSE, 0)) {
                int err;
 
-               if (dmu_object_info(os, obj, NULL) != 0)
+               if (dmu_object_info(ra->os, obj, NULL) != 0)
                        continue;
 
-               err = dmu_free_long_object(os, obj);
+               err = dmu_free_long_object(ra->os, obj);
                if (err != 0)
                        return (err);
        }
@@ -1611,50 +1616,37 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
 }
 
 noinline static int
-restore_write(struct restorearg *ra, objset_t *os,
-    struct drr_write *drrw)
+restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
 {
        dmu_tx_t *tx;
        dmu_buf_t *bonus;
-       arc_buf_t *abuf;
-       void *data;
        int err;
 
        if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
            !DMU_OT_IS_VALID(drrw->drr_type))
                return (SET_ERROR(EINVAL));
 
-       if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
-               return (SET_ERROR(EINVAL));
-
-       if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0)
+       if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0)
                return (SET_ERROR(EINVAL));
 
-       abuf = dmu_request_arcbuf(bonus, drrw->drr_length);
-
-       data = restore_read(ra, drrw->drr_length, abuf->b_data);
-       if (data == NULL) {
-               dmu_return_arcbuf(abuf);
-               dmu_buf_rele(bonus, FTAG);
-               return (ra->err);
-       }
-
-       tx = dmu_tx_create(os);
+       tx = dmu_tx_create(ra->os);
 
        dmu_tx_hold_write(tx, drrw->drr_object,
            drrw->drr_offset, drrw->drr_length);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
-               dmu_return_arcbuf(abuf);
-               dmu_buf_rele(bonus, FTAG);
                dmu_tx_abort(tx);
                return (err);
        }
        if (ra->byteswap) {
                dmu_object_byteswap_t byteswap =
                    DMU_OT_BYTESWAP(drrw->drr_type);
-               dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
+               dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
+                   drrw->drr_length);
        }
+
+       if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0)
+               return (SET_ERROR(EINVAL));
        dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
        dmu_tx_commit(tx);
        dmu_buf_rele(bonus, FTAG);
@@ -1669,8 +1661,7 @@ restore_write(struct restorearg *ra, objset_t *os,
  * data from the stream to fulfill this write.
  */
 static int
-restore_write_byref(struct restorearg *ra, objset_t *os,
-    struct drr_write_byref *drrwbr)
+restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 {
        dmu_tx_t *tx;
        int err;
@@ -1696,7 +1687,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
                if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
                        return (SET_ERROR(EINVAL));
        } else {
-               ref_os = os;
+               ref_os = ra->os;
        }
 
        err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
@@ -1704,7 +1695,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
        if (err != 0)
                return (err);
 
-       tx = dmu_tx_create(os);
+       tx = dmu_tx_create(ra->os);
 
        dmu_tx_hold_write(tx, drrwbr->drr_object,
            drrwbr->drr_offset, drrwbr->drr_length);
@@ -1713,7 +1704,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
                dmu_tx_abort(tx);
                return (err);
        }
-       dmu_write(os, drrwbr->drr_object,
+       dmu_write(ra->os, drrwbr->drr_object,
            drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
        dmu_buf_rele(dbp, FTAG);
        dmu_tx_commit(tx);
@@ -1721,12 +1712,11 @@ restore_write_byref(struct restorearg *ra, objset_t *os,
 }
 
 static int
-restore_write_embedded(struct restorearg *ra, objset_t *os,
-    struct drr_write_embedded *drrwnp)
+restore_write_embedded(struct restorearg *ra,
+    struct drr_write_embedded *drrwnp, void *data)
 {
        dmu_tx_t *tx;
        int err;
-       void *data;
 
        if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
                return (EINVAL);
@@ -1739,11 +1729,7 @@ restore_write_embedded(struct restorearg *ra, objset_t *os,
        if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
                return (EINVAL);
 
-       data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL);
-       if (data == NULL)
-               return (ra->err);
-
-       tx = dmu_tx_create(os);
+       tx = dmu_tx_create(ra->os);
 
        dmu_tx_hold_write(tx, drrwnp->drr_object,
            drrwnp->drr_offset, drrwnp->drr_length);
@@ -1753,7 +1739,7 @@ restore_write_embedded(struct restorearg *ra, objset_t *os,
                return (err);
        }
 
-       dmu_write_embedded(os, drrwnp->drr_object,
+       dmu_write_embedded(ra->os, drrwnp->drr_object,
            drrwnp->drr_offset, data, drrwnp->drr_etype,
            drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
            ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
@@ -1763,31 +1749,26 @@ restore_write_embedded(struct restorearg *ra, objset_t *os,
 }
 
 static int
-restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
+restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
 {
        dmu_tx_t *tx;
-       void *data;
        dmu_buf_t *db, *db_spill;
        int err;
 
        if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-           drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
+           drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os)))
                return (SET_ERROR(EINVAL));
 
-       data = restore_read(ra, drrs->drr_length, NULL);
-       if (data == NULL)
-               return (ra->err);
-
-       if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
+       if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0)
                return (SET_ERROR(EINVAL));
 
-       VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
+       VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db));
        if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
                dmu_buf_rele(db, FTAG);
                return (err);
        }
 
-       tx = dmu_tx_create(os);
+       tx = dmu_tx_create(ra->os);
 
        dmu_tx_hold_spill(tx, db->db_object);
 
@@ -1814,8 +1795,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
 
 /* ARGSUSED */
 noinline static int
-restore_free(struct restorearg *ra, objset_t *os,
-    struct drr_free *drrf)
+restore_free(struct restorearg *ra, struct drr_free *drrf)
 {
        int err;
 
@@ -1823,10 +1803,10 @@ restore_free(struct restorearg *ra, objset_t *os,
            drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
                return (SET_ERROR(EINVAL));
 
-       if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
+       if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0)
                return (SET_ERROR(EINVAL));
 
-       err = dmu_free_long_range(os, drrf->drr_object,
+       err = dmu_free_long_range(ra->os, drrf->drr_object,
            drrf->drr_offset, drrf->drr_length);
        return (err);
 }
@@ -1841,6 +1821,157 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
        (void) dsl_destroy_head(name);
 }
 
+/* Add len bytes of buf to ra->cksum (byteswapped variant if ra->byteswap). */
+static void
+restore_cksum(struct restorearg *ra, int len, void *buf)
+{
+       if (!ra->byteswap)
+               fletcher_4_incremental_native(buf, len, &ra->cksum);
+       else
+               fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+}
+
+/*
+ * If len != 0, read payload into buf.  Read the next record's header into
+ * ra->next_drr and byteswap it if needed.  Unless the header's checksum is
+ * zero, verify it against the running checksum (ECKSUM on mismatch).
+ */
+static int
+restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
+{
+       int err;
+       zio_cksum_t cksum_orig;
+       zio_cksum_t *cksump;
+
+       if (len != 0) {
+               ASSERT3U(len, <=, ra->bufsize);
+               err = restore_read(ra, len, buf);
+               if (err != 0)
+                       return (err);
+               restore_cksum(ra, len, buf);
+       }
+       /* Checksum of everything before the next header; DRR_END checks it. */
+       ra->prev_cksum = ra->cksum;
+
+       err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr);
+       if (err != 0)
+               return (err);
+       if (ra->next_drr->drr_type == DRR_BEGIN)
+               return (SET_ERROR(EINVAL));
+
+       /*
+        * Note: checksum is of everything up to but not including the
+        * checksum itself.
+        */
+       ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+           ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+       restore_cksum(ra,
+           offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+           ra->next_drr);
+       /* Save the raw checksum: byteswap_record() may swap it in place. */
+       cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum;
+       cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum;
+
+       if (ra->byteswap)
+               byteswap_record(ra->next_drr);
+       /* A zero checksum in the header is not verified. */
+       if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+           !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump))
+               return (SET_ERROR(ECKSUM));
+       /* Fold the raw checksum field into the running checksum as well. */
+       restore_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+
+       return (0);
+}
+
+/* Apply ra->drr, first reading its payload (if any) and the next header. */
+static int
+restore_process_record(struct restorearg *ra)
+{
+       int err;
+
+       switch (ra->drr->drr_type) {
+       case DRR_OBJECT:
+       {
+               struct drr_object *drro = &ra->drr->drr_u.drr_object;
+               err = restore_read_payload_and_next_header(ra,
+                   P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf);
+               if (err != 0)
+                       return (err);
+               return (restore_object(ra, drro, ra->buf));
+       }
+       case DRR_FREEOBJECTS:
+       {
+               struct drr_freeobjects *drrfo =
+                   &ra->drr->drr_u.drr_freeobjects;
+               err = restore_read_payload_and_next_header(ra, 0, NULL);
+               if (err != 0)
+                       return (err);
+               return (restore_freeobjects(ra, drrfo));
+       }
+       case DRR_WRITE:
+       {
+               struct drr_write *drrw = &ra->drr->drr_u.drr_write;
+               arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+                   drrw->drr_length);
+               err = restore_read_payload_and_next_header(ra,
+                   drrw->drr_length, abuf->b_data);
+               if (err == 0)
+                       err = restore_write(ra, drrw, abuf);
+               /* only a successful restore_write() consumes the arc_buf */
+               if (err != 0)
+                       dmu_return_arcbuf(abuf);
+               return (err);
+       }
+       case DRR_WRITE_BYREF:
+       {
+               struct drr_write_byref *drrwbr =
+                   &ra->drr->drr_u.drr_write_byref;
+               err = restore_read_payload_and_next_header(ra, 0, NULL);
+               if (err != 0)
+                       return (err);
+               return (restore_write_byref(ra, drrwbr));
+       }
+       case DRR_WRITE_EMBEDDED:
+       {
+               struct drr_write_embedded *drrwe =
+                   &ra->drr->drr_u.drr_write_embedded;
+               err = restore_read_payload_and_next_header(ra,
+                   P2ROUNDUP(drrwe->drr_psize, 8), ra->buf);
+               if (err != 0)
+                       return (err);
+               return (restore_write_embedded(ra, drrwe, ra->buf));
+       }
+       case DRR_FREE:
+       {
+               struct drr_free *drrf = &ra->drr->drr_u.drr_free;
+               err = restore_read_payload_and_next_header(ra, 0, NULL);
+               if (err != 0)
+                       return (err);
+               return (restore_free(ra, drrf));
+       }
+       case DRR_END:
+       {
+               struct drr_end *drre = &ra->drr->drr_u.drr_end;
+               /* the checksum of all prior records must match DRR_END's */
+               if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
+                       return (SET_ERROR(ECKSUM));
+               return (0);
+       }
+       case DRR_SPILL:
+       {
+               struct drr_spill *drrs = &ra->drr->drr_u.drr_spill;
+               err = restore_read_payload_and_next_header(ra,
+                   drrs->drr_length, ra->buf);
+               if (err != 0)
+                       return (err);
+               return (restore_spill(ra, drrs, ra->buf));
+       }
+       default:
+               return (SET_ERROR(EINVAL));
+       }
+}
+
 /*
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
@@ -1848,10 +1979,8 @@ int
 dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
     int cleanup_fd, uint64_t *action_handlep)
 {
+       int err = 0;
        struct restorearg ra = { 0 };
-       dmu_replay_record_t *drr;
-       objset_t *os;
-       zio_cksum_t pcksum;
        int featureflags;
 
        ra.byteswap = drc->drc_byteswap;
@@ -1859,7 +1988,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
        ra.vp = vp;
        ra.voff = *voffp;
        ra.bufsize = SPA_MAXBLOCKSIZE;
+       ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
        ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
+       ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);
 
        /* these were verified in dmu_recv_begin */
        ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -1869,7 +2000,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
        /*
         * Open the objset we are modifying.
         */
-       VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
+       VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
 
        ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
 
@@ -1895,13 +2026,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
                        avl_create(ra.guid_to_ds_map, guid_compare,
                            sizeof (guid_map_entry_t),
                            offsetof(guid_map_entry_t, avlnode));
-                       ra.err = zfs_onexit_add_cb(minor,
+                       err = zfs_onexit_add_cb(minor,
                            free_guid_map_onexit, ra.guid_to_ds_map,
                            action_handlep);
-                       if (ra.err != 0)
+                       if (err != 0)
                                goto out;
                } else {
-                       ra.err = zfs_onexit_cb_data(minor, *action_handlep,
+                       err = zfs_onexit_cb_data(minor, *action_handlep,
                            (void **)&ra.guid_to_ds_map);
-                       if (ra.err != 0)
+                       if (err != 0)
                                goto out;
@@ -1910,96 +2041,34 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
                drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
        }
 
-       /*
-        * Read records and process them.
-        */
-       pcksum = ra.cksum;
-       while (ra.err == 0 &&
-           NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) {
+       err = restore_read_payload_and_next_header(&ra, 0, NULL);
+       if (err != 0)
+               goto out;
+       for (;;) {
+               void *tmp;
+
                if (issig(JUSTLOOKING) && issig(FORREAL)) {
-                       ra.err = SET_ERROR(EINTR);
-                       goto out;
+                       err = SET_ERROR(EINTR);
+                       break;
                }
 
-               if (ra.byteswap)
-                       backup_byteswap(drr);
+               tmp = ra.next_drr;
+               ra.next_drr = ra.drr;
+               ra.drr = tmp;
 
-               switch (drr->drr_type) {
-               case DRR_OBJECT:
-               {
-                       /*
-                        * We need to make a copy of the record header,
-                        * because restore_{object,write} may need to
-                        * restore_read(), which will invalidate drr.
-                        */
-                       struct drr_object drro = drr->drr_u.drr_object;
-                       ra.err = restore_object(&ra, os, &drro);
-                       break;
-               }
-               case DRR_FREEOBJECTS:
-               {
-                       struct drr_freeobjects drrfo =
-                           drr->drr_u.drr_freeobjects;
-                       ra.err = restore_freeobjects(&ra, os, &drrfo);
-                       break;
-               }
-               case DRR_WRITE:
-               {
-                       struct drr_write drrw = drr->drr_u.drr_write;
-                       ra.err = restore_write(&ra, os, &drrw);
-                       break;
-               }
-               case DRR_WRITE_BYREF:
-               {
-                       struct drr_write_byref drrwbr =
-                           drr->drr_u.drr_write_byref;
-                       ra.err = restore_write_byref(&ra, os, &drrwbr);
-                       break;
-               }
-               case DRR_WRITE_EMBEDDED:
-               {
-                       struct drr_write_embedded drrwe =
-                           drr->drr_u.drr_write_embedded;
-                       ra.err = restore_write_embedded(&ra, os, &drrwe);
-                       break;
-               }
-               case DRR_FREE:
-               {
-                       struct drr_free drrf = drr->drr_u.drr_free;
-                       ra.err = restore_free(&ra, os, &drrf);
+               /* process ra.drr, read in ra.next_drr */
+               err = restore_process_record(&ra);
+               if (err != 0)
                        break;
-               }
-               case DRR_END:
-               {
-                       struct drr_end drre = drr->drr_u.drr_end;
-                       /*
-                        * We compare against the *previous* checksum
-                        * value, because the stored checksum is of
-                        * everything before the DRR_END record.
-                        */
-                       if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
-                               ra.err = SET_ERROR(ECKSUM);
-                       goto out;
-               }
-               case DRR_SPILL:
-               {
-                       struct drr_spill drrs = drr->drr_u.drr_spill;
-                       ra.err = restore_spill(&ra, os, &drrs);
+               if (ra.drr->drr_type == DRR_END)
                        break;
-               }
-               default:
-                       ra.err = SET_ERROR(EINVAL);
-                       goto out;
-               }
-               pcksum = ra.cksum;
        }
-       ASSERT(ra.err != 0);
 
 out:
        if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
                zfs_onexit_fd_rele(cleanup_fd);
 
-       if (ra.err != 0) {
+       if (err != 0) {
                /*
                 * destroy what we created, so we don't leave it in the
                 * inconsistent restoring state.
@@ -2007,9 +2076,11 @@ out:
                dmu_recv_cleanup_ds(drc);
        }
 
+       kmem_free(ra.drr, sizeof (*ra.drr));
        vmem_free(ra.buf, ra.bufsize);
+       kmem_free(ra.next_drr, sizeof (*ra.next_drr));
        *voffp = ra.voff;
-       return (ra.err);
+       return (err);
 }
 
 static int