From dde471ef5a07bd569deeadd3e9a88655db3e10ab Mon Sep 17 00:00:00 2001 From: Prasad Joshi Date: Sat, 28 May 2011 02:53:07 +0100 Subject: [PATCH] MMAP Optimization Enable zfs_getpage, zfs_fillpage, zfs_putpage, zfs_putapage functions. The functions have been modified to make them Linux friendly. ZFS uses these functions to read/write the mmapped pages. Using them from readpage/writepage results in cleaner code. The patch also adds readpages and writepages interface functions to read/write a list of pages in one function call. The code change handles the first mmap optimization mentioned on https://github.com/behlendorf/zfs/issues/225 Signed-off-by: Prasad Joshi Signed-off-by: Brian Behlendorf Issue #255 --- include/sys/zfs_vnops.h | 3 + include/sys/zpl.h | 1 + module/zfs/zfs_vnops.c | 360 ++++++++++------------------------------ module/zfs/zpl_file.c | 141 +++++++++------- 4 files changed, 178 insertions(+), 327 deletions(-) diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index b1c7c9f53..23218a45b 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -71,6 +71,9 @@ extern int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr); extern int zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr); +extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); +extern int zfs_putpage(struct page *page, struct writeback_control *wbc, + void *data); #ifdef __cplusplus } diff --git a/include/sys/zpl.h b/include/sys/zpl.h index 721347585..906b5a73d 100644 --- a/include/sys/zpl.h +++ b/include/sys/zpl.h @@ -29,6 +29,7 @@ #include #include #include +#include /* zpl_inode.c */ extern const struct inode_operations zpl_inode_operations; diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 86bef25fb..ff761854c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -3725,53 +3725,31 @@ zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, pvn_write_done(pp, 
B_INVAL|B_FORCE|B_ERROR); return (0); } +#endif /* HAVE_MMAP */ /* - * Push a page out to disk, klustering if possible. + * Push a page out to disk * * IN: vp - file to push page to. * pp - page to push. - * flags - additional flags. - * cr - credentials of caller. + * off - start of range pushed. + * len - len of range pushed. * - * OUT: offp - start of range pushed. - * lenp - len of range pushed. * * RETURN: 0 if success * error code if failure * - * NOTE: callers must have locked the page to be pushed. On - * exit, the page (and all other pages in the kluster) must be - * unlocked. + * NOTE: callers must have locked the page to be pushed. */ /* ARGSUSED */ static int -zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, - size_t *lenp, int flags, cred_t *cr) +zfs_putapage(struct inode *ip, struct page *pp, u_offset_t off, size_t len) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - dmu_tx_t *tx; - u_offset_t off, koff; - size_t len, klen; - int err; - - off = pp->p_offset; - len = PAGESIZE; - /* - * If our blocksize is bigger than the page size, try to kluster - * multiple pages so that we write a full block (thus avoiding - * a read-modify-write). - */ - if (off < zp->z_size && zp->z_blksz > PAGESIZE) { - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0; - ASSERT(koff <= zp->z_size); - if (koff + klen > zp->z_size) - klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE); - pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags); - } - ASSERT3U(btop(len), ==, btopr(len)); + znode_t *zp = ITOZ(ip); + zfs_sb_t *zsb = ITOZSB(ip); + dmu_tx_t *tx; + caddr_t va; + int err; /* * Can't push pages past end-of-file. 
@@ -3780,24 +3758,16 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, /* ignore all pages */ err = 0; goto out; - } else if (off + len > zp->z_size) { - int npages = btopr(zp->z_size - off); - page_t *trunc; - - page_list_break(&pp, &trunc, npages); - /* ignore pages past end of file */ - if (trunc) - pvn_write_done(trunc, flags); + } else if (off + len > zp->z_size) len = zp->z_size - off; - } - if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) || - zfs_owner_overquota(zfsvfs, zp, B_TRUE)) { + if (zfs_owner_overquota(zsb, zp, B_FALSE) || + zfs_owner_overquota(zsb, zp, B_TRUE)) { err = EDQUOT; goto out; } top: - tx = dmu_tx_create(zfsvfs->z_os); + tx = dmu_tx_create(zsb->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); @@ -3813,52 +3783,38 @@ top: goto out; } - if (zp->z_blksz <= PAGESIZE) { - caddr_t va = zfs_map_page(pp, S_READ); - ASSERT3U(len, <=, PAGESIZE); - dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx); - zfs_unmap_page(pp, va); - } else { - err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx); - } + va = kmap(pp); + ASSERT3U(len, <=, PAGESIZE); + dmu_write(zsb->z_os, zp->z_id, off, len, va, tx); + kunmap(pp); if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE); - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0); + zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, off, len, 0); } dmu_tx_commit(tx); out: - pvn_write_done(pp, (err ? 
B_ERROR : 0) | flags); - if (offp) - *offp = off; - if (lenp) - *lenp = len; - return (err); } /* - * Copy the portion of the file indicated from pages into the file. - * The pages are stored in a page list attached to the files vnode. + * Copy the portion of the file indicated from page into the file. * - * IN: vp - vnode of file to push page data to. - * off - position in file to put data. - * len - amount of data to write. - * flags - flags to control the operation. - * cr - credentials of caller. - * ct - caller context. + * IN: ip - inode of file to push page data to. + * wbc - Unused parameter + * data - pointer to address_space * * RETURN: 0 if success * error code if failure @@ -3867,87 +3823,45 @@ out: * vp - ctime|mtime updated */ /*ARGSUSED*/ -static int -zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) +int +zfs_putpage(struct page *page, struct writeback_control *wbc, void *data) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp; - size_t io_len; - u_offset_t io_off; - uint_t blksz; - rl_t *rl; - int error = 0; + struct address_space *mapping = data; + struct inode *ip = mapping->host; + znode_t *zp = ITOZ(ip); + zfs_sb_t *zsb = ITOZSB(ip); + rl_t *rl; + u_offset_t io_off; + size_t io_len; + size_t len; + int error; + + io_off = page_offset(page); + io_len = PAGESIZE; - ZFS_ENTER(zfsvfs); + ZFS_ENTER(zsb); ZFS_VERIFY_ZP(zp); - /* - * Align this request to the file block size in case we kluster. - * XXX - this can result in pretty aggresive locking, which can - * impact simultanious read/write access. One option might be - * to break up long requests (len == 0) into block-by-block - * operations to get narrower locking. 
- */ - blksz = zp->z_blksz; - if (ISP2(blksz)) - io_off = P2ALIGN_TYPED(off, blksz, u_offset_t); - else - io_off = 0; - if (len > 0 && ISP2(blksz)) - io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t); - else - io_len = 0; - - if (io_len == 0) { - /* - * Search the entire vp list for pages >= io_off. - */ - rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER); - error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr); - goto out; - } rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER); - if (off > zp->z_size) { + if (io_off > zp->z_size) { /* past end of file */ zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); + ZFS_EXIT(zsb); return (0); } len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off); - for (off = io_off; io_off < off + len; io_off += io_len) { - if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { - pp = page_lookup(vp, io_off, - (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED); - } else { - pp = page_lookup_nowait(vp, io_off, - (flags & B_FREE) ? SE_EXCL : SE_SHARED); - } - - if (pp != NULL && pvn_getdirty(pp, flags)) { - int err; - - /* - * Found a dirty page to push - */ - err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr); - if (err) - error = err; - } else { - io_len = PAGESIZE; - } - } -out: + error = zfs_putapage(ip, page, io_off, len); zfs_range_unlock(rl); - if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); + + if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zsb->z_log, zp->z_id); + ZFS_EXIT(zsb); return (error); } -#endif /* HAVE_MMAP */ +EXPORT_SYMBOL(zfs_putpage); /*ARGSUSED*/ void @@ -4039,102 +3953,62 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, ZFS_EXIT(zfsvfs); return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); } +#endif /* HAVE_MMAP */ /* - * If we can't find a page in the cache, we will create a new page - * and fill it with file data. 
For efficiency, we may try to fill - * multiple pages at once (klustering) to fill up the supplied page - * list. Note that the pages to be filled are held with an exclusive - * lock to prevent access by other threads while they are being filled. + * Fill pages with data from the disk. */ static int -zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, - caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw) +zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) { - znode_t *zp = VTOZ(vp); - page_t *pp, *cur_pp; - objset_t *os = zp->z_zfsvfs->z_os; - u_offset_t io_off, total; - size_t io_len; - int err; - - if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { - /* - * We only have a single page, don't bother klustering - */ - io_off = off; - io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, - PG_EXCL | PG_WAIT, seg, addr); - } else { - /* - * Try to find enough pages to fill the page list - */ - pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, off, plsz, 0); - } - if (pp == NULL) { - /* - * The page already exists, nothing to do here. - */ - *pl = NULL; - return (0); - } + znode_t *zp = ITOZ(ip); + zfs_sb_t *zsb = ITOZSB(ip); + objset_t *os; + struct page *cur_pp; + u_offset_t io_off, total; + size_t io_len; + loff_t i_size; + unsigned page_idx; + int err; + + os = zsb->z_os; + io_len = nr_pages << PAGE_CACHE_SHIFT; + i_size = i_size_read(ip); + io_off = page_offset(pl[0]); + + if (io_off + io_len > i_size) + io_len = i_size - io_off; /* - * Fill the pages in the kluster. + * Iterate over list of pages and read each page individually. 
*/ - cur_pp = pp; + page_idx = 0; + cur_pp = pl[0]; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; - ASSERT3U(io_off, ==, cur_pp->p_offset); - va = zfs_map_page(cur_pp, S_WRITE); + va = kmap(cur_pp); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); - zfs_unmap_page(cur_pp, va); + kunmap(cur_pp); if (err) { - /* On error, toss the entire kluster */ - pvn_read_done(pp, B_ERROR); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = EIO; return (err); } - cur_pp = cur_pp->p_next; + cur_pp = pl[++page_idx]; } - /* - * Fill in the page list array from the kluster starting - * from the desired offset `off'. - * NOTE: the page list will always be null terminated. - */ - pvn_plist_init(pp, pl, plsz, off, io_len, rw); - ASSERT(pl == NULL || (*pl)->p_offset == off); - return (0); } /* - * Return pointers to the pages for the file region [off, off + len] - * in the pl array. If plsz is greater than len, this function may - * also return page pointers from after the specified region - * (i.e. the region [off, off + plsz]). These additional pages are - * only returned if they are already in the cache, or were created as - * part of a klustered read. - * - * IN: vp - vnode of file to get data from. - * off - position in file to get data from. - * len - amount of data to retrieve. - * plsz - length of provided page list. - * seg - segment to obtain pages for. - * addr - virtual address of fault. - * rw - mode of created pages. - * cr - credentials of caller. - * ct - caller context. + * Uses zfs_fillpage to read data from the file and fill the pages. * - * OUT: protp - protection mode of created pages. - * pl - list of pages created. + * IN: ip - inode of file to get data from. 
+ * pl - list of pages to read + * nr_pages - number of pages to read * * RETURN: 0 if success * error code if failure @@ -4143,80 +4017,30 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, * vp - atime updated */ /* ARGSUSED */ -static int -zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, - page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, - enum seg_rw rw, cred_t *cr) +int +zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t **pl0 = pl; - int err = 0; + znode_t *zp = ITOZ(ip); + zfs_sb_t *zsb = ITOZSB(ip); + int err; - /* we do our own caching, faultahead is unnecessary */ if (pl == NULL) return (0); - else if (len > plsz) - len = plsz; - else - len = P2ROUNDUP(len, PAGESIZE); - ASSERT(plsz >= len); - ZFS_ENTER(zfsvfs); + ZFS_ENTER(zsb); ZFS_VERIFY_ZP(zp); - if (protp) - *protp = PROT_ALL; + err = zfs_fillpage(ip, pl, nr_pages); - /* - * Loop through the requested range [off, off + len) looking - * for pages. If we don't find a page, we will need to create - * a new page and fill it with data from the file. - */ - while (len > 0) { - if (*pl = page_lookup(vp, off, SE_SHARED)) - *(pl+1) = NULL; - else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) - goto out; - while (*pl) { - ASSERT3U((*pl)->p_offset, ==, off); - off += PAGESIZE; - addr += PAGESIZE; - if (len > 0) { - ASSERT3U(len, >=, PAGESIZE); - len -= PAGESIZE; - } - ASSERT3U(plsz, >=, PAGESIZE); - plsz -= PAGESIZE; - pl++; - } - } + if (!err) + ZFS_ACCESSTIME_STAMP(zsb, zp); - /* - * Fill out the page array with any pages already in the cache. - */ - while (plsz > 0 && - (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { - off += PAGESIZE; - plsz -= PAGESIZE; - } -out: - if (err) { - /* - * Release any pages we have previously locked. 
- */ - while (pl > pl0) - page_unlock(*--pl); - } else { - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - } - - *pl = NULL; - - ZFS_EXIT(zfsvfs); + ZFS_EXIT(zsb); return (err); } +EXPORT_SYMBOL(zfs_getpage); +#ifdef HAVE_MMAP /* * Request a memory map for a section of a file. This code interacts * with common code and the VM system as follows: diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c index de66ff4b4..0e90b7803 100644 --- a/module/zfs/zpl_file.c +++ b/module/zfs/zpl_file.c @@ -254,6 +254,60 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) return (error); } +static struct page ** +pages_vector_from_list(struct list_head *pages, unsigned nr_pages) +{ + struct page **pl; + struct page *t; + unsigned page_idx; + + pl = kmalloc(sizeof(*pl) * nr_pages, GFP_NOFS); + if (!pl) + return ERR_PTR(-ENOMEM); + + page_idx = 0; + list_for_each_entry_reverse(t, pages, lru) { + pl[page_idx] = t; + page_idx++; + } + + return pl; +} + +static int +zpl_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *ip; + struct page **pl; + struct page *p, *n; + int error; + + ip = mapping->host; + + pl = pages_vector_from_list(pages, nr_pages); + if (IS_ERR(pl)) + return PTR_ERR(pl); + + error = -zfs_getpage(ip, pl, nr_pages); + if (error) + goto error; + + list_for_each_entry_safe_reverse(p, n, pages, lru) { + + list_del(&p->lru); + + flush_dcache_page(p); + SetPageUptodate(p); + unlock_page(p); + page_cache_release(p); + } + +error: + kfree(pl); + return error; +} + /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). 
There will be an identical copy of the @@ -267,33 +321,34 @@ static int zpl_readpage(struct file *filp, struct page *pp) { struct inode *ip; - loff_t off, i_size; - size_t len, wrote; - cred_t *cr = CRED(); - void *pb; + struct page *pl[1]; int error = 0; ASSERT(PageLocked(pp)); ip = pp->mapping->host; - off = page_offset(pp); - i_size = i_size_read(ip); - ASSERT3S(off, <, i_size); + pl[0] = pp; - crhold(cr); - len = MIN(PAGE_CACHE_SIZE, i_size - off); + error = -zfs_getpage(ip, pl, 1); - pb = kmap(pp); + if (error) { + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); + flush_dcache_page(pp); + } - /* O_DIRECT is passed to bypass the page cache and avoid deadlock. */ - wrote = zpl_read_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr); - if (wrote != len) - error = -EIO; + unlock_page(pp); + return error; +} - if (!error && (len < PAGE_CACHE_SIZE)) - memset(pb + len, 0, PAGE_CACHE_SIZE - len); +int +zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) +{ + int error; - kunmap(pp); - crfree(cr); + error = -zfs_putpage(pp, wbc, data); if (error) { SetPageError(pp); @@ -305,8 +360,13 @@ zpl_readpage(struct file *filp, struct page *pp) } unlock_page(pp); + return error; +} - return (error); +static int +zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + return write_cache_pages(mapping, wbc, zpl_putpage, mapping); } /* @@ -314,55 +374,18 @@ zpl_readpage(struct file *filp, struct page *pp) * support mmap(2). Mapped pages may be dirtied by memory operations * which never call .write(). These dirty pages are kept in sync with * the ARC buffers via this hook. - * - * Currently this function relies on zpl_write_common() and the O_DIRECT - * flag to push out the page. This works but the more correct way is - * to update zfs_putapage() to be Linux friendly and use that interface. 
*/ static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { - struct inode *ip; - loff_t off, i_size; - size_t len, read; - cred_t *cr = CRED(); - void *pb; - int error = 0; - - ASSERT(PageLocked(pp)); - ip = pp->mapping->host; - off = page_offset(pp); - i_size = i_size_read(ip); - - crhold(cr); - len = MIN(PAGE_CACHE_SIZE, i_size - off); - - pb = kmap(pp); - - /* O_DIRECT is passed to bypass the page cache and avoid deadlock. */ - read = zpl_write_common(ip, pb, len, off, UIO_SYSSPACE, O_DIRECT, cr); - if (read != len) - error = -EIO; - - kunmap(pp); - crfree(cr); - - if (error) { - SetPageError(pp); - ClearPageUptodate(pp); - } else { - ClearPageError(pp); - SetPageUptodate(pp); - } - - unlock_page(pp); - - return (error); + return zpl_putpage(pp, wbc, pp->mapping); } const struct address_space_operations zpl_address_space_operations = { + .readpages = zpl_readpages, .readpage = zpl_readpage, .writepage = zpl_writepage, + .writepages = zpl_writepages, }; const struct file_operations zpl_file_operations = { -- 2.40.0