 * (2) Fragmentation is less of an issue because, when we are at the limit of
* allocatable space, we won't have to search around for a long free
* hole in the VA space for large ARC allocations. Each chunk is mapped in
- * individually, so even if we weren't using segkpm (see next point) we
+ * individually, so even if we are using HIGHMEM (see next point) we
* wouldn't need to worry about finding a contiguous address range.
*
- * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
- * on each ABD access. (If segkpm isn't available then we use all linear
- * ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ * mapped into the kernel's address space, so we also avoid the map /
+ * unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space. See abd_alloc_pages() for details.
*
* It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
- * available, which is the case on all 32-bit systems and any 64-bit systems
- * where kpm_enable is turned off.
+ * B_FALSE.
*
* In addition to directly allocating a linear or scattered ABD, it is also
* possible to create an ABD by requesting the "sub-ABD" starting at an offset
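To make point (3) in the comment above concrete, here is a minimal sketch (not part of the patch) of the access-cost difference being described. chunk_data() is a hypothetical helper written only for illustration; kmap(), kunmap() and page_address() are the standard Linux primitives involved.

    #include <linux/mm.h>        /* page_address() */
    #include <linux/highmem.h>   /* kmap() / kunmap() */

    /* Hypothetical helper, for illustration only. */
    static void *
    chunk_data(struct page *page)
    {
    #ifdef CONFIG_HIGHMEM
        /*
         * The page may have no permanent kernel mapping, so it has to be
         * mapped here and kunmap()'ed by the caller after each access.
         */
        return (kmap(page));
    #else
        /* All physical memory is permanently mapped: no map/unmap cost. */
        return (page_address(page));
    #endif
    }
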
#define __GFP_RECLAIM __GFP_WAIT
#endif
-static unsigned long
-abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
-{
- struct page *page;
-
- page = alloc_pages_node(nid, gfp, order);
- if (!page)
- return (0);
-
- return ((unsigned long) page_address(page));
-}
-
/*
* The goal is to minimize fragmentation by preferentially populating ABDs
* with higher order compound pages from a single zone. Allocation size is
size_t remaining_size;
int nid = NUMA_NO_NODE;
int alloc_pages = 0;
- int order;
INIT_LIST_HEAD(&pages);
while (alloc_pages < nr_pages) {
- unsigned long paddr;
unsigned chunk_pages;
+ int order;
order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
chunk_pages = (1U << order);
- paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
- if (paddr == 0) {
+ page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+ if (page == NULL) {
if (order == 0) {
ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
schedule_timeout_interruptible(1);
continue;
}
- page = virt_to_page(paddr);
list_add_tail(&page->lru, &pages);
if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
list_del(&page->lru);
}
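A worked example of the order computation above, assuming 4K pages: a 128K ABD needs nr_pages = 32, so the first pass computes highbit64(32) - 1 = 5 and requests one order-5 compound chunk (32 pages, the entire 128K), subject to the max_order cap. Since highbit64(x) - 1 is floor(log2(x)), each pass asks for the largest power-of-two chunk that still fits in the remaining request; if a pass ends up covering only 8 of the 32 pages, the next pass sees nr_pages - alloc_pages = 24 and computes highbit64(24) - 1 = 4 (an order-4, 16-page chunk), leaving an order-3, 8-page chunk for the remainder.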
- if (chunks > 1) {
+ /*
+ * These conditions ensure that a possible transformation to a linear
+ * ABD would be valid.
+ */
+ ASSERT(!PageHighMem(sg_page(table.sgl)));
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+ if (table.nents == 1) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABDs can be
+ * represented this way. Some multi-page ABDs can also be
+ * represented this way, if we were able to allocate a single
+ * "chunk" (higher-order "page" which represents a power-of-2
+ * series of physically-contiguous pages). This is often the
+ * case for 2-page (8K) ABDs.
+ *
+ * Representing a single-entry scatter ABD as a linear ABD
+ * has the performance advantage of avoiding the copy (and
+ * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+ * A performance increase of around 5% has been observed for
+ * ARC-cached reads (of small blocks which can take advantage
+ * of this).
+ *
+ * Note that this optimization is only possible because the
+ * pages are always mapped into the kernel's address space.
+ * This is not the case for highmem pages, so the
+ * optimization cannot be made there.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+ abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ abd->abd_u.abd_linear.abd_buf =
+ page_address(sg_page(table.sgl));
+ } else if (table.nents > 1) {
ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
}
- }
- ABD_SCATTER(abd).abd_sgl = table.sgl;
- ABD_SCATTER(abd).abd_nents = table.nents;
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+ }
}
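The abd_borrow_buf_copy / abd_return_buf_copy savings mentioned in the block comment above can be illustrated with the following sketch. borrow_sketch() is hypothetical and deliberately simplified, not the real abd_borrow_buf_copy(); it assumes only the existing abd_is_linear(), abd_to_buf(), zio_buf_alloc() and abd_copy_to_buf() interfaces.

    /* Hypothetical sketch: why a linear ABD avoids an allocation and a copy. */
    static void *
    borrow_sketch(abd_t *abd, size_t n)
    {
        void *buf;

        if (abd_is_linear(abd)) {
            /* Single-entry scatter ABDs now take this zero-copy path. */
            return (abd_to_buf(abd));
        }

        /* Scatter case: allocate a temporary buffer and copy into it. */
        buf = zio_buf_alloc(n);
        abd_copy_to_buf(buf, abd, n);
        return (buf);
    }

On the return path, abd_return_buf_copy() presumably has to copy the data back into the scatter ABD and free the temporary buffer, so the linear representation saves work in both directions.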
#else
/*
struct page;
-#define kpm_enable 1
-#define abd_alloc_chunk(o) \
- ((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
-#define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o))
#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
#define local_irq_save(flags) do { (void)(flags); } while (0)
sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
abd_for_each_sg(abd, sg, nr_pages, i) {
- struct page *p = abd_alloc_chunk(0);
+ struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
sg_set_page(sg, p, PAGESIZE, 0);
}
ABD_SCATTER(abd).abd_nents = nr_pages;
{
int i, n = ABD_SCATTER(abd).abd_nents;
struct scatterlist *sg;
- int j;
abd_for_each_sg(abd, sg, n, i) {
- for (j = 0; j < sg->length; j += PAGESIZE) {
- struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
- abd_free_chunk(p, 0);
+ for (int j = 0; j < sg->length; j += PAGESIZE) {
+ struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+ umem_free(p, PAGESIZE);
}
}
ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
- ABD_FLAG_MULTI_CHUNK));
+ ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
if (abd_is_linear(abd)) {
abd_t *abd = abd_alloc_struct();
abd->abd_flags = ABD_FLAG_OWNER;
+ abd->abd_u.abd_scatter.abd_offset = 0;
abd_alloc_pages(abd, size);
if (is_metadata) {
abd->abd_parent = NULL;
zfs_refcount_create(&abd->abd_children);
- abd->abd_u.abd_scatter.abd_offset = 0;
-
ABDSTAT_BUMP(abdstat_scatter_cnt);
ABDSTAT_INCR(abdstat_scatter_data_size, size);
ABDSTAT_INCR(abdstat_scatter_chunk_waste,
static void
abd_free_linear(abd_t *abd)
{
+ if (abd_is_linear_page(abd)) {
+ /* Transform it back into a scatter ABD for freeing */
+ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+ ABD_SCATTER(abd).abd_nents = 1;
+ ABD_SCATTER(abd).abd_offset = 0;
+ ABD_SCATTER(abd).abd_sgl = sg;
+ abd_free_scatter(abd);
+ return;
+ }
if (abd->abd_flags & ABD_FLAG_META) {
zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
} else {
abd_alloc_sametype(abd_t *sabd, size_t size)
{
boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
- if (abd_is_linear(sabd)) {
+ if (abd_is_linear(sabd) &&
+ !abd_is_linear_page(sabd)) {
return (abd_alloc_linear(size, is_metadata));
} else {
return (abd_alloc(size, is_metadata));
{
ASSERT(abd_is_linear(abd));
ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+ /*
+ * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+ * Since that flag does not survive the
+ * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+ * abd_take_ownership_of_buf() sequence, we don't allow releasing
+ * these "linear but not zio_[data_]buf_alloc()'ed" ABDs.
+ */
+ ASSERT(!abd_is_linear_page(abd));
+
abd_verify(abd);
abd->abd_flags &= ~ABD_FLAG_OWNER;
/*
* If the hdr's data can be shared then we share the data buffer and
* set the appropriate bit in the hdr's b_flags to indicate the hdr is
- * allocate a new buffer to store the buf's data.
+ * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
+ * buffer to store the buf's data.
*
* There are two additional restrictions here because we're sharing
* hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
* an arc_write() then the hdr's data buffer will be released when the
* write completes, even though the L2ARC write might still be using it.
* Second, the hdr's ABD must be linear so that the buf's user doesn't
- * need to be ABD-aware.
- */
- boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
- hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
+ * need to be ABD-aware. It must be allocated via
+ * zio_[data_]buf_alloc(), not as a page, because we need to be able
+ * to abd_release_ownership_of_buf(), which isn't allowed on "linear
+ * page" buffers because the ABD code needs to handle freeing them
+ * specially.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) &&
+ !HDR_L2_WRITING(hdr) &&
+ hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(hdr->b_l1hdr.b_pabd) &&
+ !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
/* Set up b_data and sharing */
if (can_share) {
* disk, it's easiest if we just set up sharing between the
* buf and the hdr.
*/
- ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
arc_hdr_free_abd(hdr, B_FALSE);
arc_share_buf(hdr, buf);
}