/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <linux/mod_compat.h>
#include <linux/msdos_fs.h>
char *zfs_vdev_scheduler = VDEV_SCHEDULER;
static void *zfs_vdev_holder = VDEV_HOLDER;

/* size of the "reserved" partition, in blocks */
#define	EFI_MIN_RESV_SIZE	(16 * 1024)
/*
 * Virtual device vector for disks.
 */
typedef struct dio_request {
	zio_t			*dr_zio;	/* Parent ZIO */
	atomic_t		dr_ref;		/* References */
	int			dr_error;	/* Bio error */
	int			dr_bio_count;	/* Count of bio's */
	struct bio		*dr_bio[0];	/* Attached bio's */
} dio_request_t;
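
/*
 * Note: dr_bio[0] is a zero-length (flexible) array member; each
 * dio_request is allocated with room for 'dr_bio_count' trailing bio
 * pointers, e.g.:
 *
 *	dr = kmem_zalloc(sizeof (dio_request_t) +
 *	    sizeof (struct bio *) * bio_count, KM_SLEEP);
 *
 * See vdev_disk_dio_alloc() below.
 */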
#ifdef HAVE_OPEN_BDEV_EXCLUSIVE
static fmode_t
vdev_bdev_mode(int smode)
{
	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	return (((smode & FREAD) ? FMODE_READ : 0) |
	    ((smode & FWRITE) ? FMODE_WRITE : 0));
}
#else
static int
vdev_bdev_mode(int smode)
{
	ASSERT3S(smode & (FREAD | FWRITE), !=, 0);

	/* SB_RDONLY is the kernel's read-only mount/open flag */
	return (((smode & FREAD) && !(smode & FWRITE)) ? SB_RDONLY : 0);
}
#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
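
/*
 * Illustrative example: a read-write pool open passes (FREAD | FWRITE),
 * which the exclusive variant maps to FMODE_READ | FMODE_WRITE, while a
 * read-only open passes FREAD alone and maps to FMODE_READ (or to the
 * read-only flag in the legacy variant).
 */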
/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}
/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * 'zpool online -e'.
 */
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	uint64_t psize;
	int64_t available;

	if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over-reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 */
		available = i_size_read(bdev->bd_contains->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);
		if (available > 0)
			psize = available;
		else
			psize = bdev_capacity(bdev);
	} else {
		psize = bdev_capacity(bdev);
	}

	return (psize);
}
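
/*
 * Illustrative arithmetic (SECTOR_BITS is 9, i.e. 512-byte sectors): the
 * deduction above reserves (EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
 * PARTITION_END_ALIGNMENT) << 9 bytes, i.e. the 8 MiB reserved partition
 * plus the first-partition offset and end alignment, so a wholedisk vdev
 * never reports expansion space it could not actually use for its data
 * partition.
 */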
static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
	 */
	printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
	    "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
	    zio->io_vd->vdev_path, zio->io_error, zio->io_type,
	    (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
	    zio->io_flags);
}
/*
 * Use the Linux 'noop' elevator for zfs managed block devices.  This
 * strikes the ideal balance by allowing the zfs elevator to do all
 * request ordering and prioritization, while allowing the Linux
 * elevator to do the maximum front/back merging allowed by the
 * physical device.  This yields the largest possible requests for
 * the device with the lowest total overhead.
 */
static void
vdev_elevator_switch(vdev_t *v, char *elevator)
{
	vdev_disk_t *vd = v->vdev_tsd;
	struct request_queue *q;
	char *device;
	int error;

	for (int c = 0; c < v->vdev_children; c++)
		vdev_elevator_switch(v->vdev_child[c], elevator);

	if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
		return;

	q = bdev_get_queue(vd->vd_bdev);
	device = vd->vd_bdev->bd_disk->disk_name;

	/*
	 * Skip devices which are not whole disks (partitions).
	 * Device-mapper devices are excepted since they may be whole
	 * disks despite the vdev_wholedisk flag, in which case we can
	 * and should switch the elevator.  If the device-mapper device
	 * does not have an elevator (e.g. dm-raid, dm-crypt) the
	 * "Skip devices without schedulers" check below will fail.
	 */
	if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
		return;

	/* Leave existing scheduler when set to "none" */
	if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
		return;

	/*
	 * The elevator_change() function was available in kernels from
	 * 2.6.36 to 4.11.  When it is not available, fall back to using
	 * the usermode helper to set the elevator via sysfs.  This
	 * requires /bin/sh and sysfs to be mounted, which may not be
	 * true early in the boot process.
	 */
#ifdef HAVE_ELEVATOR_CHANGE
	error = elevator_change(q, elevator);
#else
#define	SET_SCHEDULER_CMD \
	"exec 0</dev/null " \
	"     1>/sys/block/%s/queue/scheduler " \
	"     2>/dev/null; " \
	"echo %s"

	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };

	argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
#endif /* HAVE_ELEVATOR_CHANGE */
	if (error) {
		zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d\n",
		    elevator, v->vdev_path, device, error);
	}
}
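
/*
 * For reference, the usermode helper above is roughly equivalent to
 * running the following from a shell once sysfs is mounted (sdb being
 * an illustrative whole-disk vdev):
 *
 *	echo noop >/sys/block/sdb/queue/scheduler
 */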
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	struct block_device *bdev;
	fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
	int count = 0, block_size;
	int bdev_retry_count = 50;
	vdev_disk_t *vd;

	/* Must have a pathname and it must be absolute. */
	if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
		v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		vdev_dbgmsg(v, "invalid vdev_path");
		return (SET_ERROR(EINVAL));
	}
	/*
	 * Reopen the device if it is currently open.  When expanding a
	 * partition force re-scanning the partition table while closed
	 * in order to get an accurate updated block device size.  Then,
	 * since udev may need to recreate the device links, increase the
	 * open retry count before reporting the device as unavailable.
	 */
	vd = v->vdev_tsd;
	if (vd) {
		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
		boolean_t reread_part = B_FALSE;

		rw_enter(&vd->vd_lock, RW_WRITER);
		bdev = vd->vd_bdev;
		vd->vd_bdev = NULL;

		if (bdev) {
			if (v->vdev_expanding && bdev != bdev->bd_contains) {
				bdevname(bdev->bd_contains, disk_name + 5);
				reread_part = B_TRUE;
			}

			vdev_bdev_close(bdev, mode);
		}

		if (reread_part) {
			bdev = vdev_bdev_open(disk_name, mode,
			    zfs_vdev_holder);
			if (!IS_ERR(bdev)) {
				int error = vdev_bdev_reread_part(bdev);
				vdev_bdev_close(bdev, mode);
				if (error == 0)
					bdev_retry_count = 100;
			}
		}
	} else {
		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
		rw_enter(&vd->vd_lock, RW_WRITER);
	}
	/*
	 * Devices are always opened by the path provided at configuration
	 * time.  This means that if the provided path is a udev by-id path
	 * then drives may be re-cabled without an issue.  If the provided
	 * path is a udev by-path path, then the physical location information
	 * will be preserved.  This can be critical for more complicated
	 * configurations where drives are located in specific physical
	 * locations to maximize the system's tolerance to component failure.
	 *
	 * Alternatively, you can provide your own udev rule to flexibly map
	 * the drives as you see fit.  It is not advised that you use the
	 * /dev/[hd]d devices which may be reordered due to probing order.
	 * Devices in the wrong locations will be detected by the higher
	 * level vdev validation.
	 *
	 * The specified paths may be briefly removed and recreated in
	 * response to udev events.  This should be exceptionally unlikely
	 * because the zpool command makes every effort to verify these paths
	 * have already settled prior to reaching this point.  Therefore,
	 * an ENOENT failure at this point is highly likely to be transient
	 * and it is reasonable to sleep and retry before giving up.  In
	 * practice delays have been observed to be on the order of 100ms.
	 */
	bdev = ERR_PTR(-ENXIO);
	while (IS_ERR(bdev) && count < bdev_retry_count) {
		bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
			schedule_timeout(MSEC_TO_TICK(10));
			count++;
		} else if (IS_ERR(bdev)) {
			break;
		}
	}

	if (IS_ERR(bdev)) {
		int error = -PTR_ERR(bdev);
		vdev_dbgmsg(v, "open error=%d count=%d\n", error, count);
		vd->vd_bdev = NULL;
		v->vdev_tsd = vd;
		rw_exit(&vd->vd_lock);
		return (SET_ERROR(error));
	}

	v->vdev_tsd = vd;
	vd->vd_bdev = bdev;
	rw_exit(&vd->vd_lock);
	/* Determine the physical block size */
	block_size = vdev_bdev_block_size(vd->vd_bdev);

	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
	v->vdev_nowritecache = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev));

	/* Physical volume size in bytes for the partition */
	*psize = bdev_capacity(vd->vd_bdev);

	/* Physical volume size in bytes including possible expansion space */
	*max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);

	/* Based on the minimum sector size set the block size */
	*ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
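	/*
	 * For example, a device reporting 4096-byte sectors yields
	 * highbit64(4096) - 1 = 12 (ashift=12), while a 512-byte sector
	 * device yields ashift=9 (SPA_MINBLOCKSIZE).
	 */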
	/* Try to set the io scheduler elevator algorithm */
	(void) vdev_elevator_switch(v, zfs_vdev_scheduler);

	return (0);
}
static void
vdev_disk_close(vdev_t *v)
{
	vdev_disk_t *vd = v->vdev_tsd;

	if (v->vdev_reopening || vd == NULL)
		return;

	if (vd->vd_bdev != NULL) {
		vdev_bdev_close(vd->vd_bdev,
		    vdev_bdev_mode(spa_mode(v->vdev_spa)));
	}

	rw_destroy(&vd->vd_lock);
	kmem_free(vd, sizeof (vdev_disk_t));
	v->vdev_tsd = NULL;
}
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
	dio_request_t *dr;
	int i;

	dr = kmem_zalloc(sizeof (dio_request_t) +
	    sizeof (struct bio *) * bio_count, KM_SLEEP);

	atomic_set(&dr->dr_ref, 0);
	dr->dr_bio_count = bio_count;
	dr->dr_error = 0;

	for (i = 0; i < dr->dr_bio_count; i++)
		dr->dr_bio[i] = NULL;

	return (dr);
}
static void
vdev_disk_dio_free(dio_request_t *dr)
{
	int i;

	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			bio_put(dr->dr_bio[i]);

	kmem_free(dr, sizeof (dio_request_t) +
	    sizeof (struct bio *) * dr->dr_bio_count);
}
static void
vdev_disk_dio_get(dio_request_t *dr)
{
	atomic_inc(&dr->dr_ref);
}

static int
vdev_disk_dio_put(dio_request_t *dr)
{
	int rc = atomic_dec_return(&dr->dr_ref);

	/*
	 * Free the dio_request when the last reference is dropped and
	 * ensure zio_interpret is called only once with the correct zio
	 */
	if (rc == 0) {
		zio_t *zio = dr->dr_zio;
		int error = dr->dr_error;

		vdev_disk_dio_free(dr);

		if (zio) {
			zio->io_error = error;
			ASSERT3S(zio->io_error, >=, 0);
			if (zio->io_error)
				vdev_disk_error(zio);

			zio_delay_interrupt(zio);
		}
	}

	return (rc);
}
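
/*
 * Reference protocol, as used by __vdev_disk_physio() below: one reference
 * is taken per attached bio plus one extra held across submission.  Each
 * completion callback drops one and the submitter drops the extra, so
 * whichever put reaches zero last interprets the zio exactly once,
 * regardless of completion order.
 */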
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;
	int rc;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	rc = vdev_disk_dio_put(dr);
}
static unsigned int
bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(bio_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(bio_ptr))
			page = vmalloc_to_page(bio_ptr);
		else
			page = virt_to_page(bio_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well with 0-count pages; this is a safety
		 * net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		bio_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	/* Any unmapped residual is returned to the caller */
	return (bio_size);
}
static unsigned int
bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
{
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));

	return (abd_scatter_bio_map_off(bio, abd, size, off));
}
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	submit_bio(bio);
#else
	submit_bio(0, bio);
#endif
}
#ifndef HAVE_BIO_SET_DEV
static inline void
bio_set_dev(struct bio *bio, struct block_device *bdev)
{
	bio->bi_bdev = bdev;
}
#endif /* !HAVE_BIO_SET_DEV */
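
/*
 * Submit a bio for I/O while temporarily clearing the calling task's
 * bio_tail/bio_list.  A non-NULL list indicates we are already inside
 * generic_make_request(), which would queue (rather than dispatch) any
 * recursively submitted bio; clearing it forces immediate dispatch.
 * (This is the presumed rationale; the original code does not spell it
 * out.)
 */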
static void
vdev_submit_bio(struct bio *bio)
{
#ifdef HAVE_CURRENT_BIO_TAIL
	struct bio **bio_tail = current->bio_tail;
	current->bio_tail = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_tail = bio_tail;
#else
	struct bio_list *bio_list = current->bio_list;
	current->bio_list = NULL;
	vdev_submit_bio_impl(bio);
	current->bio_list = bio_list;
#endif
}
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
    size_t io_size, uint64_t io_offset, int rw, int flags)
{
	dio_request_t *dr;
	uint64_t abd_offset;
	uint64_t bio_offset;
	int bio_size, bio_count = 16;
	int i = 0, error = 0;
#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	struct blk_plug plug;
#endif

	/*
	 * Accessing outside the block device is never allowed.
	 */
	if (io_offset + io_size > bdev->bd_inode->i_size) {
		vdev_dbgmsg(zio->io_vd,
		    "Illegal access %llu size %llu, device size %llu",
		    io_offset, io_size, i_size_read(bdev->bd_inode));
		return (SET_ERROR(EIO));
	}

retry:
	dr = vdev_disk_dio_alloc(bio_count);
	if (dr == NULL)
		return (SET_ERROR(ENOMEM));

	if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bio_set_flags_failfast(bdev, &flags);

	dr->dr_zio = zio;
	/*
	 * When the IO size exceeds the maximum bio size for the request
	 * queue we are forced to break the IO into multiple bio's and wait
	 * for them all to complete.  Ideally, all pool users will set
	 * their volume block size to match the maximum request size and
	 * the common case will be one bio per vdev IO request.
	 */
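	/*
	 * Illustrative example: a 1 MiB zio submitted to a queue whose
	 * bio's can each carry 128 KiB is built as 8 bio's, well within
	 * the initial allocation of 16; a request needing more than 16
	 * doubles 'bio_count' and retries (see below).
	 */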
	abd_offset = 0;
	bio_offset = io_offset;
	bio_size = io_size;
	for (i = 0; i <= dr->dr_bio_count; i++) {

		/* Finished constructing bio's for given buffer */
		if (bio_size <= 0)
			break;

		/*
		 * By default only 'bio_count' bio's per dio are allowed.
		 * However, if we find ourselves in a situation where more
		 * are needed we allocate a larger dio and warn the user.
		 */
		if (dr->dr_bio_count == i) {
			vdev_disk_dio_free(dr);
			bio_count *= 2;
			goto retry;
		}

		/* bio_alloc() with __GFP_WAIT never returns NULL */
		dr->dr_bio[i] = bio_alloc(GFP_NOIO,
		    MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
		    BIO_MAX_PAGES));
		if (unlikely(dr->dr_bio[i] == NULL)) {
			vdev_disk_dio_free(dr);
			return (SET_ERROR(ENOMEM));
		}

		/* Matching put called by vdev_disk_physio_completion */
		vdev_disk_dio_get(dr);

		bio_set_dev(dr->dr_bio[i], bdev);
		BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
		dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
		dr->dr_bio[i]->bi_private = dr;
		bio_set_op_attrs(dr->dr_bio[i], rw, flags);

		/* Remaining size is returned to become the new size */
		bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
		    bio_size, abd_offset);

		/* Advance in buffer and construct another bio if needed */
		abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
		bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
	}
	/* Extra reference to protect dio_request during vdev_submit_bio */
	vdev_disk_dio_get(dr);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_start_plug(&plug);
#endif

	/* Submit all bio's associated with this dio */
	for (i = 0; i < dr->dr_bio_count; i++)
		if (dr->dr_bio[i])
			vdev_submit_bio(dr->dr_bio[i]);

#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
	if (dr->dr_bio_count > 1)
		blk_finish_plug(&plug);
#endif

	(void) vdev_disk_dio_put(dr);

	return (error);
}
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
	zio_t *zio = bio->bi_private;
#ifdef HAVE_1ARG_BIO_END_IO_T
	zio->io_error = BIO_END_IO_ERROR(bio);
#else
	zio->io_error = -error;
#endif

	if (zio->io_error == EOPNOTSUPP)
		zio->io_vd->vdev_nowritecache = B_TRUE;

	bio_put(bio);
	ASSERT3S(zio->io_error, >=, 0);
	if (zio->io_error)
		vdev_disk_error(zio);
	zio_interrupt(zio);
}
static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
	struct request_queue *q;
	struct bio *bio;

	q = bdev_get_queue(bdev);
	if (!q)
		return (SET_ERROR(ENXIO));

	bio = bio_alloc(GFP_NOIO, 0);
	/* bio_alloc() with __GFP_WAIT never returns NULL */
	if (unlikely(bio == NULL))
		return (SET_ERROR(ENOMEM));

	bio->bi_end_io = vdev_disk_io_flush_completion;
	bio->bi_private = zio;
	bio_set_dev(bio, bdev);
	bio_set_flush(bio);
	vdev_submit_bio(bio);
	invalidate_bdev(bdev);

	return (0);
}
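
/*
 * Note: the zero-length bio above, flagged as a flush request, asks the
 * device to empty its volatile write cache; completion (or EOPNOTSUPP for
 * devices with no cache) is reported asynchronously through
 * vdev_disk_io_flush_completion().
 */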
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *v = zio->io_vd;
	vdev_disk_t *vd = v->vdev_tsd;
	int rw, flags, error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (vd == NULL) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	rw_enter(&vd->vd_lock, RW_READER);

	/*
	 * If the vdev is closed, it's likely due to a failed reopen and is
	 * in the UNAVAIL state.  Nothing to be done here but return failure.
	 */
	if (vd->vd_bdev == NULL) {
		rw_exit(&vd->vd_lock);
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}
	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:

		if (!vdev_readable(v)) {
			rw_exit(&vd->vd_lock);
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (v->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			error = vdev_disk_io_flush(vd->vd_bdev, zio);
			if (error == 0) {
				rw_exit(&vd->vd_lock);
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		rw_exit(&vd->vd_lock);
		zio_execute(zio);
		return;
	case ZIO_TYPE_WRITE:
		rw = WRITE;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	case ZIO_TYPE_READ:
		rw = READ;
#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
		flags = (1 << BIO_RW_UNPLUG);
#elif defined(REQ_UNPLUG)
		flags = REQ_UNPLUG;
#else
		flags = 0;
#endif
		break;

	default:
		rw_exit(&vd->vd_lock);
		zio->io_error = SET_ERROR(ENOTSUP);
		zio_interrupt(zio);
		return;
	}
	zio->io_target_timestamp = zio_handle_io_delay(zio);
	error = __vdev_disk_physio(vd->vd_bdev, zio,
	    zio->io_size, zio->io_offset, rw, flags);
	rw_exit(&vd->vd_lock);

	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return;
	}
}
static void
vdev_disk_io_done(zio_t *zio)
{
	/*
	 * If the device returned EIO, we revalidate the media.  If it is
	 * determined the media has changed, this triggers the asynchronous
	 * removal of the device from the configuration.
	 */
	if (zio->io_error == EIO) {
		vdev_t *v = zio->io_vd;
		vdev_disk_t *vd = v->vdev_tsd;

		if (check_disk_change(vd->vd_bdev)) {
			vdev_bdev_invalidate(vd->vd_bdev);
			v->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		}
	}
}
static void
vdev_disk_hold(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* We must have a pathname, and it must be absolute. */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	/* XXX: Implement me as a vnode lookup for the device */
	vd->vdev_name_vp = NULL;
	vd->vdev_devid_vp = NULL;
}
static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	spa_t *spa = NULL;
	char *p;

	if (val == NULL)
		return (SET_ERROR(-EINVAL));

	if ((p = strchr(val, '\n')) != NULL)
		*p = '\0';

	if (spa_mode_global != 0) {
		mutex_enter(&spa_namespace_lock);
		while ((spa = spa_next(spa)) != NULL) {
			if (spa_state(spa) != POOL_STATE_ACTIVE ||
			    !spa_writeable(spa) || spa_suspended(spa))
				continue;

			spa_open_ref(spa, FTAG);
			mutex_exit(&spa_namespace_lock);
			vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
			mutex_enter(&spa_namespace_lock);
			spa_close(spa, FTAG);
		}
		mutex_exit(&spa_namespace_lock);
	}

	return (param_set_charp(val, kp));
}
vdev_ops_t vdev_disk_ops = {
	vdev_disk_open,
	vdev_disk_close,
	vdev_default_asize,
	vdev_disk_io_start,
	vdev_disk_io_done,
	NULL,			/* vdev_op_state_change */
	NULL,			/* vdev_op_need_resilver */
	vdev_disk_hold,
	vdev_disk_rele,
	NULL,			/* vdev_op_remap */
	vdev_default_xlate,
	VDEV_TYPE_DISK,		/* name of this vdev type */
	B_TRUE			/* leaf vdev */
};
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
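
/*
 * Example usage (illustrative): the scheduler can be chosen at module
 * load time, e.g. "modprobe zfs zfs_vdev_scheduler=noop", or updated at
 * runtime by writing to /sys/module/zfs/parameters/zfs_vdev_scheduler,
 * which invokes param_set_vdev_scheduler() above for any active pools.
 */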