Fix handling of maxblkid for raw sends

author Tom Caputi <tcaputi@datto.com>

Wed, 13 Mar 2019 17:52:01 +0000 (13:52 -0400)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Wed, 13 Mar 2019 17:52:01 +0000 (10:52 -0700)
author Tom Caputi <tcaputi@datto.com>
Wed, 13 Mar 2019 17:52:01 +0000 (13:52 -0400)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 13 Mar 2019 17:52:01 +0000 (10:52 -0700)
diff --git a/include/sys/dmu.h b/include/sys/dmu.h

index e4c2ebc2f2d7e62b9cb520245a221b7d33742b48..93d05aac42e16fe995b8544b77339d36c479de95 100644 (file)
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -477,7 +477,8 @@ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
  
  /*
   * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly
- * to accommodate the change.
+ * to accommodate the change. When calling this function, the caller must
+ * ensure that the object's nlevels can sufficiently support the new maxblkid.
   */
  int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
      dmu_tx_t *tx);
diff --git a/include/sys/dnode.h b/include/sys/dnode.h

index 48ef927d4ad848eaf59aa7d9e0adf3f8d909568f..6355aaa5020880c0ad8a6d34c91fa332e4e534bd 100644 (file)
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -371,6 +371,12 @@ struct dnode {
         struct zfetch   dn_zfetch;
  };
  
+/*
+ * We use this (otherwise unused) bit to indicate if the value of
+ * dn_next_maxblkid[txgoff] is valid to use in dnode_sync().
+ */
+#define        DMU_NEXT_MAXBLKID_SET           (1ULL << 63)
+
  /*
   * Adds a level of indirection between the dbuf and the dnode to avoid
   * iterating descendent dbufs in dnode_move(). Handles are not allocated
@@ -423,7 +429,8 @@ int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx);
  int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
  void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
  void dnode_diduse_space(dnode_t *dn, int64_t space);
-void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
+void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx,
+    boolean_t have_read, boolean_t force);
  uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid);
  void dnode_init(void);
  void dnode_fini(void);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c

index 28ff5fc7e4ad1565d2035f3fcc3ec7bd186efd53..e6ce2bca5218c42a1c122556ae5f82c1d57f0310 100644 (file)
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -2116,7 +2116,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         if (db->db_level == 0) {
                 ASSERT(!db->db_objset->os_raw_receive ||
                     dn->dn_maxblkid >= db->db_blkid);
-               dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
+               dnode_new_blkid(dn, db->db_blkid, tx,
+                   drop_struct_lock, B_FALSE);
                 ASSERT(dn->dn_maxblkid >= db->db_blkid);
         }
  
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c

index 36fc8815cd07317a1e0d98e466f6f5c110c188a8..1621772840cc437ff81c99d714d47aa7b1b70af7 100644 (file)
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2155,7 +2155,7 @@ dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
         if (err)
                 return (err);
         rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-       dnode_new_blkid(dn, maxblkid, tx, B_FALSE);
+       dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
         rw_exit(&dn->dn_struct_rwlock);
         dnode_rele(dn, FTAG);
         return (0);
diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c

index e05b5ad821f0ec81299a057babd16addd88a43a1..87946031854b9a00812773b1630d064b475bac73 100644 (file)
--- a/module/zfs/dmu_recv.c
+++ b/module/zfs/dmu_recv.c
@@ -1179,10 +1179,22 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
  
                 object = drro->drr_object;
  
-               /* nblkptr will be bounded by the bonus size and type */
+               /* nblkptr should be bounded by the bonus size and type */
                 if (rwa->raw && nblkptr != drro->drr_nblkptr)
                         return (SET_ERROR(EINVAL));
  
+               /*
+                * Check for indicators that the object was freed and
+                * reallocated. For all sends, these indicators are:
+                *     - A changed block size
+                *     - A smaller nblkptr
+                *     - A changed dnode size
+                * For raw sends we also check a few other fields to
+                * ensure we are preserving the objset structure exactly
+                * as it was on the receive side:
+                *     - A changed indirect block size
+                *     - A smaller nlevels
+                */
                 if (drro->drr_blksz != doi.doi_data_block_size ||
                     nblkptr < doi.doi_nblkptr ||
                     dn_slots != doi.doi_dnodesize >> DNODE_SHIFT ||
@@ -1197,13 +1209,14 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
  
                 /*
                  * The dmu does not currently support decreasing nlevels
-                * on an object. For non-raw sends, this does not matter
-                * and the new object can just use the previous one's nlevels.
-                * For raw sends, however, the structure of the received dnode
-                * (including nlevels) must match that of the send side.
-                * Therefore, instead of using dmu_object_reclaim(), we must
-                * free the object completely and call dmu_object_claim_dnsize()
-                * instead.
+                * or changing the number of dnode slots on an object. For
+                * non-raw sends, this does not matter and the new object
+                * can just use the previous one's nlevels. For raw sends,
+                * however, the structure of the received dnode (including
+                * nlevels and dnode slots) must match that of the send
+                * side. Therefore, instead of using dmu_object_reclaim(),
+                * we must free the object completely and call
+                * dmu_object_claim_dnsize() instead.
                  */
                 if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
                     dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
@@ -1214,6 +1227,23 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                         txg_wait_synced(dmu_objset_pool(rwa->os), 0);
                         object = DMU_NEW_OBJECT;
                 }
+
+               /*
+                * For raw receives, free everything beyond the new incoming
+                * maxblkid. Normally this would be done with a DRR_FREE
+                * record that would come after this DRR_OBJECT record is
+                * processed. However, for raw receives we manually set the
+                * maxblkid from the drr_maxblkid and so we must first free
+                * everything above that blkid to ensure the DMU is always
+                * consistent with itself.
+                */
+               if (rwa->raw) {
+                       err = dmu_free_long_range(rwa->os, drro->drr_object,
+                           (drro->drr_maxblkid + 1) * drro->drr_blksz,
+                           DMU_OBJECT_END);
+                       if (err != 0)
+                               return (SET_ERROR(EINVAL));
+               }
         } else if (err == EEXIST) {
                 /*
                  * The object requested is currently an interior slot of a
@@ -1333,14 +1363,24 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
         /* handle more restrictive dnode structuring for raw recvs */
         if (rwa->raw) {
                 /*
-                * Set the indirect block shift and nlevels. This will not fail
-                * because we ensured all of the blocks were free earlier if
-                * this is a new object.
+                * Set the indirect block size, block shift, nlevels.
+                * This will not fail because we ensured all of the
+                * blocks were freed earlier if this is a new object.
+                * For non-new objects block size and indirect block
+                * shift cannot change and nlevels can only increase.
                  */
                 VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
                     drro->drr_blksz, drro->drr_indblkshift, tx));
                 VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
                     drro->drr_nlevels, tx));
+
+               /*
+                * Set the maxblkid. We will never free the first block of
+                * an object here because a maxblkid of 0 could indicate
+                * an object with a single block or one with no blocks.
+                * This will always succeed because we freed all blocks
+                * beyond the new maxblkid above.
+                */
                 VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
                     drro->drr_maxblkid, tx));
         }
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c

index 9a19ddabba5ea3094dcb5f9e694c079cfab5b5fe..35aefa7cb93e154388ebe57622d2e15cd4afdeee 100644 (file)
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1828,7 +1828,8 @@ out:
  
  /* read-holding callers must not rely on the lock being continuously held */
  void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
+    boolean_t force)
  {
         int epbs, new_nlevels;
         uint64_t sz;
@@ -1853,14 +1854,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
                 }
         }
  
-       if (blkid <= dn->dn_maxblkid)
+       /*
+        * Raw sends (indicated by the force flag) require that we take the
+        * given blkid even if the value is lower than the current value.
+        */
+       if (!force && blkid <= dn->dn_maxblkid)
                 goto out;
  
+       /*
+        * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
+        * to indicate that this field is set. This allows us to set the
+        * maxblkid to 0 on an existing object in dnode_sync().
+        */
         dn->dn_maxblkid = blkid;
-       dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = blkid;
+       dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
+           blkid | DMU_NEXT_MAXBLKID_SET;
  
         /*
          * Compute the number of levels necessary to support the new maxblkid.
+        * Raw sends will ensure nlevels is set correctly for us.
          */
         new_nlevels = 1;
         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -1870,8 +1882,12 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
  
         ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
  
-       if (new_nlevels > dn->dn_nlevels)
-               dnode_set_nlevels_impl(dn, new_nlevels, tx);
+       if (!force) {
+               if (new_nlevels > dn->dn_nlevels)
+                       dnode_set_nlevels_impl(dn, new_nlevels, tx);
+       } else {
+               ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
+       }
  
  out:
         if (have_read)
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c

index c4062beb3173590d7e011ad95582340befea69ba..581f812a14d15e41c4c87e1ccbdb124cacd3e202 100644 (file)
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -384,12 +384,7 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
                 }
         }
  
-       /*
-        * Do not truncate the maxblkid if we are performing a raw
-        * receive. The raw receive sets the mablkid manually and
-        * must not be overriden.
-        */
-       if (trunc && !dn->dn_objset->os_raw_receive) {
+       if (trunc) {
                 ASSERTV(uint64_t off);
                 dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
  
@@ -765,11 +760,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
  
         /*
          * This must be done after dnode_sync_free_range()
-        * and dnode_increase_indirection().
+        * and dnode_increase_indirection(). See dnode_new_blkid()
+        * for an explanation of the high bit being set.
          */
         if (dn->dn_next_maxblkid[txgoff]) {
                 mutex_enter(&dn->dn_mtx);
-               dnp->dn_maxblkid = dn->dn_next_maxblkid[txgoff];
+               dnp->dn_maxblkid =
+                   dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
                 dn->dn_next_maxblkid[txgoff] = 0;
                 mutex_exit(&dn->dn_mtx);
         }
diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c

index da2a126f2ebc4e05c6dfb608f8df88e72851aace..9271128b9639631a6069c6936d91b9560a82027f 100644 (file)
--- a/module/zfs/dsl_crypt.c
+++ b/module/zfs/dsl_crypt.c
@@ -2099,7 +2099,7 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
         mdn->dn_checksum = checksum;
  
         rw_enter(&mdn->dn_struct_rwlock, RW_WRITER);
-       dnode_new_blkid(mdn, maxblkid, tx, B_FALSE);
+       dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE);
         rw_exit(&mdn->dn_struct_rwlock);
  
         /*
diff --git a/tests/zfs-tests/cmd/xattrtest/xattrtest.c b/tests/zfs-tests/cmd/xattrtest/xattrtest.c

index 32a6b1d95b10069af3edf942874602d6786617e0..42c510ed082d035fdab4c9b58cacc0ea0c4ed37d 100644 (file)
--- a/tests/zfs-tests/cmd/xattrtest/xattrtest.c
+++ b/tests/zfs-tests/cmd/xattrtest/xattrtest.c
@@ -144,11 +144,6 @@ parse_args(int argc, char **argv)
                         break;
                 case 'y':
                         verify = 1;
-                       if (phase != PHASE_ALL) {
-                               fprintf(stderr,
-                                   "Error: -y and -o are incompatible.\n");
-                               rc = 1;
-                       }
                         break;
                 case 'n':
                         nth = strtol(optarg, NULL, 0);
@@ -201,11 +196,6 @@ parse_args(int argc, char **argv)
                                     PHASE_ALL, PHASE_INVAL);
                                 rc = 1;
                         }
-                       if (verify == 1) {
-                               fprintf(stderr,
-                                   "Error: -y and -o are incompatible.\n");
-                               rc = 1;
-                       }
                         break;
                 default:
                         rc = 1;
diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh

index 5bf25e27734b6beec03c2d7f2febcb3aa74d5437..d981aa3fd2103bf2d1076eae49ee2b17414e437e 100755 (executable)
--- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh
+++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh
@@ -30,12 +30,18 @@
  # 3. Add a small 512 byte file to the filesystem
  # 4. Add a larger 32M file to the filesystem
  # 5. Add a large sparse file to the filesystem
-# 6. Add a file truncated to 4M to the filesystem
-# 7. Add a sparse file with metadata compression disabled to the filesystem
-# 8. Add and remove 1000 empty files to the filesystem
-# 9. Add a file with a large xattr value
-# 10. Snapshot the filesystem
-# 11. Send and receive the filesystem, ensuring that it can be mounted
+# 6. Add a 3 files that are to be truncated later
+# 7. Add 1000 empty files to the filesystem
+# 8. Add a file with a large xattr value
+# 9. Use xattrtest to create files with random xattrs (with and without xattrs=on)
+# 10. Take a snapshot of the filesystem
+# 11. Truncate one of the files from 32M to 128k
+# 12. Truncate one of the files from 512k to 384k
+# 13. Truncate one of the files from 512k to 0 to 384k
+# 14. Remove the 1000 empty files to the filesystem
+# 15. Take another snapshot of the filesystem
+# 16. Send and receive both snapshots
+# 17. Mount the filesystem and check the contents
  #
  
  verify_runnable "both"
@@ -51,46 +57,89 @@ function cleanup
  }
  log_onexit cleanup
  
+function recursive_cksum
+{
+       find $1 -type f -exec sha256sum {} \; | \
+               sort -k 2 | awk '{ print $1 }' | sha256sum
+}
+
  log_assert "Verify 'zfs send -w' works with many different file layouts"
  
  typeset keyfile=/$TESTPOOL/pkey
  typeset sendfile=/$TESTPOOL/sendfile
+typeset sendfile2=/$TESTPOOL/sendfile2
  
+# Create an encrypted dataset
  log_must eval "echo 'password' > $keyfile"
  log_must zfs create -o encryption=on -o keyformat=passphrase \
         -o keylocation=file://$keyfile $TESTPOOL/$TESTFS2
  
+# Create files with vaired layouts on disk
  log_must touch /$TESTPOOL/$TESTFS2/empty
  log_must mkfile 512 /$TESTPOOL/$TESTFS2/small
  log_must mkfile 32M /$TESTPOOL/$TESTFS2/full
  log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/sparse \
-       bs=512 count=1 seek=10G >/dev/null 2>&1
+       bs=512 count=1 seek=1048576 >/dev/null 2>&1
  log_must mkfile 32M /$TESTPOOL/$TESTFS2/truncated
-log_must truncate -s 4M /$TESTPOOL/$TESTFS2/truncated
-sync
+log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated2
+log_must mkfile 524288 /$TESTPOOL/$TESTFS2/truncated3
  
  log_must mkdir -p /$TESTPOOL/$TESTFS2/dir
  for i in {1..1000}; do
         log_must mkfile 512 /$TESTPOOL/$TESTFS2/dir/file-$i
  done
-sync
  
+log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrondir
+log_must zfs set xattr=on $TESTPOOL/$TESTFS2
+log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrondir
+log_must mkdir -p /$TESTPOOL/$TESTFS2/xattrsadir
+log_must zfs set xattr=sa $TESTPOOL/$TESTFS2
+log_must xattrtest -f 10 -x 3 -s 32768 -r -k -p /$TESTPOOL/$TESTFS2/xattrsadir
+
+# ZoL issue #7432
+log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2
+log_must touch /$TESTPOOL/$TESTFS2/attrs
+log_must eval "python -c 'print \"a\" * 4096' | \
+       attr -s bigval /$TESTPOOL/$TESTFS2/attrs"
+
+log_must zfs snapshot $TESTPOOL/$TESTFS2@snap1
+
+#
+# Truncate files created in the first snapshot. The first tests
+# truncating a large file to a single block. The second tests
+# truncating one block off the end of a file without changing
+# the required nlevels to hold it. The last tests handling
+# of a maxblkid that is dropped and then raised again.
+#
+log_must truncate -s 131072 /$TESTPOOL/$TESTFS2/truncated
+log_must truncate -s 393216 /$TESTPOOL/$TESTFS2/truncated2
+log_must truncate -s 0 /$TESTPOOL/$TESTFS2/truncated3
+log_must zpool sync $TESTPOOL
+log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS2/truncated3 \
+       bs=128k count=3 iflag=fullblock
+
+# Remove the empty files created in the first snapshot
  for i in {1..1000}; do
         log_must rm /$TESTPOOL/$TESTFS2/dir/file-$i
  done
  sync
  
-# ZoL issue #7432
-log_must zfs set compression=on xattr=sa $TESTPOOL/$TESTFS2
-log_must touch /$TESTPOOL/$TESTFS2/attrs
-log_must eval "python -c 'print \"a\" * 4096' | attr -s bigval /$TESTPOOL/$TESTFS2/attrs"
+log_must zfs snapshot $TESTPOOL/$TESTFS2@snap2
+expected_cksum=$(recursive_cksum /$TESTPOOL/$TESTFS2)
  
-log_must zfs snapshot $TESTPOOL/$TESTFS2@now
-log_must eval "zfs send -wR $TESTPOOL/$TESTFS2@now > $sendfile"
+log_must eval "zfs send -wp $TESTPOOL/$TESTFS2@snap1 > $sendfile"
+log_must eval "zfs send -wp -i @snap1 $TESTPOOL/$TESTFS2@snap2 > $sendfile2"
  
  log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile"
+log_must eval "zfs recv -F $TESTPOOL/recv < $sendfile2"
  log_must zfs load-key $TESTPOOL/recv
  
  log_must zfs mount -a
+actual_cksum=$(recursive_cksum /$TESTPOOL/recv)
+[[ "$expected_cksum" != "$actual_cksum" ]] && \
+       log_fail "Recursive checksums differ ($expected_cksum != $actual_cksum)"
+
+log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrondir
+log_must xattrtest -f 10 -o3 -y -p /$TESTPOOL/recv/xattrsadir
  
  log_pass "Verified 'zfs send -w' works with many different file layouts"
author	Tom Caputi <tcaputi@datto.com>
	Wed, 13 Mar 2019 17:52:01 +0000 (13:52 -0400)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Wed, 13 Mar 2019 17:52:01 +0000 (10:52 -0700)
include/sys/dmu.h		patch \| blob \| history
include/sys/dnode.h		patch \| blob \| history
module/zfs/dbuf.c		patch \| blob \| history
module/zfs/dmu.c		patch \| blob \| history
module/zfs/dmu_recv.c		patch \| blob \| history
module/zfs/dnode.c		patch \| blob \| history
module/zfs/dnode_sync.c		patch \| blob \| history
module/zfs/dsl_crypt.c		patch \| blob \| history
tests/zfs-tests/cmd/xattrtest/xattrtest.c		patch \| blob \| history
tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh		patch \| blob \| history