granicus.if.org Git - zfs/commitdiff
Account for ashift when gathering buffers to be written to l2arc device
author Andriy Gapon <avg@freebsd.org>
Fri, 12 Jun 2015 19:20:29 +0000 (21:20 +0200)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 25 Jun 2015 15:57:16 +0000 (08:57 -0700)
If we don't account for that, then we might end up overwriting disk
area of buffers that have not been evicted yet, because l2arc_evict
operates in terms of disk addresses.

The discrepancy between the write size calculation and the actual
increment to l2ad_hand was introduced in commit 3a17a7a9.

The change that introduced l2ad_hand alignment was almost correct
as the write size was accumulated as a sum of rounded buffer sizes.
See commit illumos/illumos-gate@e14bb32.

Also, we now consistently use asize / a_sz for the allocated size and
psize / p_sz for the physical size.  The latter accounts for a
possible size reduction due to compression, whereas the former
accounts for a possible subsequent size expansion due to alignment
requirements.

The code still assumes that either the underlying storage subsystem or
the hardware is able to do read-modify-write when an L2ARC buffer size is
not a multiple of the disk's block size.  This is true for 4KB sector disks
that provide 512B sector emulation, but may not be true in general.
In other words, we currently do not have any code to make sure that
an L2ARC buffer, whether compressed or not, which is used for physical
I/O has a suitable size.

Note that currently the cache device utilization is calculated based
on the physical size, not the allocated size.  The same applies to
l2_asize kstat. That is wrong, but this commit does not fix that.
The accounting problem was introduced partially in commit 3a17a7a9
and partially in 3038a2b (accounting became consistent but in favour
of the wrong size).

Porting Notes:

Reworked to be C90 compatible and the 'write_psize' variable was
removed because it is now unused.

References:
  https://reviews.csiden.org/r/229/
  https://reviews.freebsd.org/D2764

Ported-by: kernelOfTruth <kerneloftruth@gmail.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3400
Closes #3433
Closes #3451

module/zfs/arc.c

index 5d5bcbe2be2949545804cc13d98ffb93e71d1e61..16d9706729f404661faa850e61780124ac05899f 100644 (file)
@@ -5789,8 +5789,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
     boolean_t *headroom_boost)
 {
        arc_buf_hdr_t *hdr, *hdr_prev, *head;
-       uint64_t write_asize, write_psize, write_sz, headroom,
-           buf_compress_minsz;
+       uint64_t write_asize, write_sz, headroom, buf_compress_minsz,
+           stats_size;
        void *buf_data;
        boolean_t full;
        l2arc_write_callback_t *cb;
@@ -5805,7 +5805,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
        *headroom_boost = B_FALSE;
 
        pio = NULL;
-       write_sz = write_asize = write_psize = 0;
+       write_sz = write_asize = 0;
        full = B_FALSE;
        head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
        head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
@@ -5842,6 +5842,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                for (; hdr; hdr = hdr_prev) {
                        kmutex_t *hash_lock;
                        uint64_t buf_sz;
+                       uint64_t buf_a_sz;
 
                        if (arc_warm == B_FALSE)
                                hdr_prev = multilist_sublist_next(mls, hdr);
@@ -5870,7 +5871,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                                continue;
                        }
 
-                       if ((write_sz + hdr->b_size) > target_sz) {
+                       /*
+                        * Assume that the buffer is not going to be compressed
+                        * and could take more space on disk because of a larger
+                        * disk block size.
+                        */
+                       buf_sz = hdr->b_size;
+                       buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+
+                       if ((write_asize + buf_a_sz) > target_sz) {
                                full = B_TRUE;
                                mutex_exit(hash_lock);
                                break;
@@ -5935,8 +5944,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                         * using it to denote the header's state change.
                         */
                        hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
-
-                       buf_sz = hdr->b_size;
                        hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
 
                        mutex_enter(&dev->l2ad_mtx);
@@ -5953,6 +5960,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                        mutex_exit(hash_lock);
 
                        write_sz += buf_sz;
+                       write_asize += buf_a_sz;
                }
 
                multilist_sublist_unlock(mls);
@@ -5971,6 +5979,19 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
 
        mutex_enter(&dev->l2ad_mtx);
 
+       /*
+        * Note that elsewhere in this file arcstat_l2_asize
+        * and the used space on l2ad_vdev are updated using b_asize,
+        * which is not necessarily rounded up to the device block size.
+        * To keep accounting consistent we do the same here as well:
+        * stats_size accumulates the sum of b_asize of the written buffers,
+        * while write_asize accumulates the sum of b_asize rounded up
+        * to the device block size.
+        * The latter sum is used only to validate the correctness of the code.
+        */
+       stats_size = 0;
+       write_asize = 0;
+
        /*
         * Now start writing the buffers. We're starting at the write head
         * and work backwards, retracing the course of the buffer selector
@@ -6024,7 +6045,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
 
                /* Compression may have squashed the buffer to zero length. */
                if (buf_sz != 0) {
-                       uint64_t buf_p_sz;
+                       uint64_t buf_a_sz;
 
                        wzio = zio_write_phys(pio, dev->l2ad_vdev,
                            dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
@@ -6035,14 +6056,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
                            zio_t *, wzio);
                        (void) zio_nowait(wzio);
 
-                       write_asize += buf_sz;
+                       stats_size += buf_sz;
 
                        /*
                         * Keep the clock hand suitably device-aligned.
                         */
-                       buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-                       write_psize += buf_p_sz;
-                       dev->l2ad_hand += buf_p_sz;
+                       buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+                       write_asize += buf_a_sz;
+                       dev->l2ad_hand += buf_a_sz;
                }
        }
 
@@ -6052,8 +6073,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
        ARCSTAT_BUMP(arcstat_l2_writes_sent);
        ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
        ARCSTAT_INCR(arcstat_l2_size, write_sz);
-       ARCSTAT_INCR(arcstat_l2_asize, write_asize);
-       vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
+       ARCSTAT_INCR(arcstat_l2_asize, stats_size);
+       vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0);
 
        /*
         * Bump device hand to the device start if it is approaching the end.