]> granicus.if.org Git - zfs/commitdiff
deadlock between mm_sem and tx assign in zfs_write() and page fault
authorilbsmart <wgqimut@gmail.com>
Tue, 16 Oct 2018 18:11:24 +0000 (02:11 +0800)
committerMatthew Ahrens <mahrens@delphix.com>
Tue, 16 Oct 2018 18:11:24 +0000 (11:11 -0700)
The bug time sequence:
1. thread #1, `zfs_write` assign a txg "n".
2. In a same process, thread #2, mmap page fault (which means the
   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
   and wait previous txg "n" completed.
3. thread #1 call `uiomove` to write, however page fault is occurred
   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
   thread #2, so it stuck and can't complete,  then txg "n" will
   not complete.

So thread #1 and thread #2 are deadlocked.

Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
Signed-off-by: Grady Wong <grady.w@xtaotech.com>
Closes #7939

include/spl/sys/uio.h
include/sys/uio_impl.h
module/zcommon/zfs_uio.c
module/zfs/zfs_vnops.c
tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh

index 64c452b8d17f89cbe0a6a67b8a41e3705db36a77..fac26079d7bc667772bb5e3cc937ca5954351ff1 100644 (file)
@@ -53,6 +53,7 @@ typedef struct uio {
        int             uio_iovcnt;
        offset_t        uio_loffset;
        uio_seg_t       uio_segflg;
+       boolean_t       uio_fault_disable;
        uint16_t        uio_fmode;
        uint16_t        uio_extflg;
        offset_t        uio_limit;
index 37e283da0f8ba5f20ae3de8a035dca53f5aeffe6..cfef0b95dbb95cedbf0baa14bcbcd82a9ff8ae27 100644 (file)
@@ -42,7 +42,7 @@
 #include <sys/uio.h>
 
 extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
-extern void uio_prefaultpages(ssize_t, uio_t *);
+extern int uio_prefaultpages(ssize_t, uio_t *);
 extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
 extern void uioskip(uio_t *, size_t);
 
index af9716126f6d3f3f729631c5c2827fb6d6c2b8e2..a2c1b5c3aafe347bfedae18f3d2307da63cb7c73 100644 (file)
@@ -52,6 +52,7 @@
 #include <sys/sysmacros.h>
 #include <sys/strings.h>
 #include <linux/kmap_compat.h>
+#include <linux/uaccess.h>
 
 /*
  * Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -79,8 +80,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
                                if (copy_to_user(iov->iov_base+skip, p, cnt))
                                        return (EFAULT);
                        } else {
-                               if (copy_from_user(p, iov->iov_base+skip, cnt))
-                                       return (EFAULT);
+                               if (uio->uio_fault_disable) {
+                                       if (!access_ok(VERIFY_READ,
+                                           (iov->iov_base + skip), cnt)) {
+                                               return (EFAULT);
+                                       }
+
+                                       pagefault_disable();
+                                       if (__copy_from_user_inatomic(p,
+                                           (iov->iov_base + skip), cnt)) {
+                                               pagefault_enable();
+                                               return (EFAULT);
+                                       }
+                                       pagefault_enable();
+                               } else {
+                                       if (copy_from_user(p,
+                                           (iov->iov_base + skip), cnt))
+                                               return (EFAULT);
+                               }
                        }
                        break;
                case UIO_SYSSPACE:
@@ -158,7 +175,7 @@ EXPORT_SYMBOL(uiomove);
  * error will terminate the process as this is only a best attempt to get
  * the pages resident.
  */
-void
+int
 uio_prefaultpages(ssize_t n, struct uio *uio)
 {
        const struct iovec *iov;
@@ -172,7 +189,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
        switch (uio->uio_segflg) {
                case UIO_SYSSPACE:
                case UIO_BVEC:
-                       return;
+                       return (0);
                case UIO_USERSPACE:
                case UIO_USERISPACE:
                        break;
@@ -196,7 +213,7 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
                p = iov->iov_base + skip;
                while (cnt) {
                        if (fuword8((uint8_t *)p, &tmp))
-                               return;
+                               return (EFAULT);
                        incr = MIN(cnt, PAGESIZE);
                        p += incr;
                        cnt -= incr;
@@ -206,8 +223,10 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
                 */
                p--;
                if (fuword8((uint8_t *)p, &tmp))
-                       return;
+                       return (EFAULT);
        }
+
+       return (0);
 }
 EXPORT_SYMBOL(uio_prefaultpages);
 
index 36f47e77a01cbb28e26a93ccfa8a444fa56a2226..f4e650dee30e90ec8204d45547c0c24ff1ad801e 100644 (file)
@@ -650,7 +650,10 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                xuio = (xuio_t *)uio;
        else
 #endif
-               uio_prefaultpages(MIN(n, max_blksz), uio);
+               if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+                       ZFS_EXIT(zfsvfs);
+                       return (SET_ERROR(EFAULT));
+               }
 
        /*
         * If in append mode, set the io offset pointer to eof.
@@ -808,8 +811,19 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                ssize_t tx_bytes;
                if (abuf == NULL) {
                        tx_bytes = uio->uio_resid;
+                       uio->uio_fault_disable = B_TRUE;
                        error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
                            uio, nbytes, tx);
+                       if (error == EFAULT) {
+                               dmu_tx_commit(tx);
+                               if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+                                       break;
+                               }
+                               continue;
+                       } else if (error != 0) {
+                               dmu_tx_commit(tx);
+                               break;
+                       }
                        tx_bytes -= uio->uio_resid;
                } else {
                        tx_bytes = nbytes;
@@ -909,8 +923,12 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
                ASSERT(tx_bytes == nbytes);
                n -= nbytes;
 
-               if (!xuio && n > 0)
-                       uio_prefaultpages(MIN(n, max_blksz), uio);
+               if (!xuio && n > 0) {
+                       if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+                               error = EFAULT;
+                               break;
+                       }
+               }
        }
 
        zfs_inode_update(zp);
index 190d31af3aaaf6d3acd0d7cc9bc546ca16c4658b..b9915d5d31eb9dec5df54c8e030f38fd4d9902d5 100644 (file)
 #include <string.h>
 #include <sys/mman.h>
 #include <pthread.h>
+#include <errno.h>
+#include <err.h>
 
 /*
  * --------------------------------------------------------------------
- * Bug Id: 5032643
+ * Bug Issue Id: #7512
+ * The bug time sequence:
+ * 1. context #1, zfs_write assign a txg "n".
+ * 2. In the same process, context #2, mmap page fault (which means the mm_sem
+ *    is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous
+ *    txg "n" completed.
+ * 3. context #1 call uiomove to write, however page fault is occurred in
+ *    uiomove, which means it need mm_sem, but mm_sem is hold by
+ *    context #2, so it stuck and can't complete, then txg "n" will not
+ *    complete.
  *
- * Simply writing to a file and mmaping that file at the same time can
- * result in deadlock.  Nothing perverse like writing from the file's
- * own mapping is required.
+ * So context #1 and context #2 trap into the "dead lock".
  * --------------------------------------------------------------------
  */
 
+#define        NORMAL_WRITE_TH_NUM     2
+
 static void *
-mapper(void *fdp)
+normal_writer(void *filename)
 {
-       void *addr;
-       int fd = *(int *)fdp;
+       char *file_path = filename;
+       int fd = -1;
+       ssize_t write_num = 0;
+       int page_size = getpagesize();
 
-       if ((addr =
-           mmap(0, 8192, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
-               perror("mmap");
-               exit(1);
+       fd = open(file_path, O_RDWR | O_CREAT, 0777);
+       if (fd == -1) {
+               err(1, "failed to open %s", file_path);
        }
-       for (;;) {
-               if (mmap(addr, 8192, PROT_READ,
-                   MAP_SHARED|MAP_FIXED, fd, 0) == MAP_FAILED) {
-                       perror("mmap");
-                       exit(1);
+
+       char *buf = malloc(1);
+       while (1) {
+               write_num = write(fd, buf, 1);
+               if (write_num == 0) {
+                       err(1, "write failed!");
+                       break;
                }
+               lseek(fd, page_size, SEEK_CUR);
+       }
+
+       if (buf) {
+               free(buf);
        }
-       /* NOTREACHED */
-       return ((void *)1);
 }
 
-int
-main(int argc, char **argv)
+static void *
+map_writer(void *filename)
 {
-       int fd;
-       char buf[1024];
-       pthread_t tid;
+       int fd = -1;
+       int ret = 0;
+       char *buf = NULL;
+       int page_size = getpagesize();
+       int op_errno = 0;
+       char *file_path = filename;
 
-       memset(buf, 'a', sizeof (buf));
+       while (1) {
+               ret = access(file_path, F_OK);
+               if (ret) {
+                       op_errno = errno;
+                       if (op_errno == ENOENT) {
+                               fd = open(file_path, O_RDWR | O_CREAT, 0777);
+                               if (fd == -1) {
+                                       err(1, "open file failed");
+                               }
 
-       if (argc != 2) {
-               (void) printf("usage: %s <file name>\n", argv[0]);
-               exit(1);
-       }
+                               ret = ftruncate(fd, page_size);
+                               if (ret == -1) {
+                                       err(1, "truncate file failed");
+                               }
+                       } else {
+                               err(1, "access file failed!");
+                       }
+               } else {
+                       fd = open(file_path, O_RDWR, 0777);
+                       if (fd == -1) {
+                               err(1, "open file failed");
+                       }
+               }
 
-       if ((fd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666)) == -1) {
-               perror("open");
-               exit(1);
+               if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+                   MAP_SHARED, fd, 0)) == MAP_FAILED) {
+                       err(1, "map file failed");
+               }
+
+               if (fd != -1)
+                       close(fd);
+
+               char s[10] = {0, };
+               memcpy(buf, s, 10);
+               ret = munmap(buf, page_size);
+               if (ret != 0) {
+                       err(1, "unmap file failed");
+               }
        }
+}
 
-       (void) pthread_setconcurrency(2);
-       if (pthread_create(&tid, NULL, mapper, &fd) != 0) {
-               perror("pthread_create");
-               close(fd);
+int
+main(int argc, char **argv)
+{
+       pthread_t map_write_tid;
+       pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM];
+       int i = 0;
+
+       if (argc != 3) {
+               (void) printf("usage: %s <normal write file name>"
+                   "<map write file name>\n", argv[0]);
                exit(1);
        }
-       for (;;) {
-               if (write(fd, buf, sizeof (buf)) == -1) {
-                       perror("write");
-                       close(fd);
-                       exit(1);
+
+       for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) {
+               if (pthread_create(&normal_write_tid[i], NULL, normal_writer,
+                   argv[1])) {
+                       err(1, "pthread_create normal_writer failed.");
                }
        }
 
-       close(fd);
+       if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) {
+               err(1, "pthread_create map_writer failed.");
+       }
 
        /* NOTREACHED */
+       pthread_join(map_write_tid, NULL);
        return (0);
 }
index 1eda971041dc73a2a9005b2cef35b2478a55f336..24150b827f8f9c6229a47d70e8619edb364b441c 100755 (executable)
@@ -53,12 +53,14 @@ if ! is_mp; then
 fi
 
 log_must chmod 777 $TESTDIR
-mmapwrite $TESTDIR/test-write-file &
+mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file &
 PID_MMAPWRITE=$!
-log_note "mmapwrite $TESTDIR/test-write-file pid: $PID_MMAPWRITE"
+log_note "mmapwrite $TESTDIR/normal_write_file $TESTDIR/map_write_file"\
+        "pid: $PID_MMAPWRITE"
 log_must sleep 30
 
 log_must kill -9 $PID_MMAPWRITE
-log_must ls -l $TESTDIR/test-write-file
+log_must ls -l $TESTDIR/normal_write_file
+log_must ls -l $TESTDIR/map_write_file
 
 log_pass "write(2) a mmap(2)'ing file succeeded."