OpenZFS 8484 - Implement aggregate sum and use for arc counters

author Paul Dagnelie <pcd@delphix.com>

Thu, 25 May 2017 18:32:40 +0000 (11:32 -0700)

committer Brian Behlendorf <behlendorf1@llnl.gov>

Wed, 6 Jun 2018 16:35:59 +0000 (09:35 -0700)
author Paul Dagnelie <pcd@delphix.com>
Thu, 25 May 2017 18:32:40 +0000 (11:32 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 6 Jun 2018 16:35:59 +0000 (09:35 -0700)
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am

index 06306b87643ea0b0e28df0e2d32a733bc871a1bf..50c21831d2fd0cd9293eb9d593562a15d013ae6d 100644 (file)
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -2,6 +2,7 @@ SUBDIRS = fm fs crypto lua sysevent
  
  COMMON_H = \
         $(top_srcdir)/include/sys/abd.h \
+       $(top_srcdir)/include/sys/aggsum.h \
         $(top_srcdir)/include/sys/arc.h \
         $(top_srcdir)/include/sys/arc_impl.h \
         $(top_srcdir)/include/sys/avl.h \
@@ -11,6 +12,7 @@ COMMON_H = \
         $(top_srcdir)/include/sys/bpobj.h \
         $(top_srcdir)/include/sys/bptree.h \
         $(top_srcdir)/include/sys/bqueue.h \
+       $(top_srcdir)/include/sys/cityhash.h \
         $(top_srcdir)/include/sys/dbuf.h \
         $(top_srcdir)/include/sys/ddt.h \
         $(top_srcdir)/include/sys/dmu.h \
diff --git a/include/sys/aggsum.h b/include/sys/aggsum.h

new file mode 100644 (file)

index 0000000..caa08d7
--- /dev/null
+++ b/include/sys/aggsum.h
@@ -0,0 +1,59 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef        _SYS_AGGSUM_H
+#define        _SYS_AGGSUM_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct aggsum_bucket aggsum_bucket_t;
+
+struct aggsum_bucket {
+       kmutex_t asc_lock;      /* protects the contents of this bucket */
+       int64_t asc_delta;      /* change not yet folded into the core */
+       uint64_t asc_borrowed;  /* capacity borrowed from the core bounds */
+} ____cacheline_aligned;
+
+/*
+ * Core counter.  Updates fan out over as_numbuckets buckets; aggsum_init()
+ * allocates boot_ncpus of them.
+ */
+typedef struct aggsum {
+       kmutex_t as_lock;               /* held to flush or borrow */
+       int64_t as_lower_bound;         /* value is never below this */
+       int64_t as_upper_bound;         /* value is never above this */
+       uint64_t as_numbuckets;         /* entries in as_buckets */
+       aggsum_bucket_t *as_buckets;    /* array of per-bucket state */
+} aggsum_t;
+
+void aggsum_init(aggsum_t *, uint64_t);        /* second arg: initial value */
+void aggsum_fini(aggsum_t *);
+int64_t aggsum_lower_bound(aggsum_t *);        /* reads the bound unlocked */
+int64_t aggsum_upper_bound(aggsum_t *);        /* reads the bound unlocked */
+int aggsum_compare(aggsum_t *, uint64_t);      /* -1, 0 or 1 vs. target */
+uint64_t aggsum_value(aggsum_t *);             /* exact; flushes buckets */
+void aggsum_add(aggsum_t *, int64_t);          /* signed delta */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_AGGSUM_H */
diff --git a/include/sys/cityhash.h b/include/sys/cityhash.h

new file mode 100644 (file)

index 0000000..33c3b7b
--- /dev/null
+++ b/include/sys/cityhash.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef        _SYS_CITYHASH_H
+#define        _SYS_CITYHASH_H
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CITYHASH_H */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h

index 6ac16dc1ee855815d34a46f238afdc0df8466a2c..31a72ba70c86ca04612a76ccf0c40a2f341ecde2 100644 (file)
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -750,5 +750,7 @@ extern fstrans_cookie_t spl_fstrans_mark(void);
  extern void spl_fstrans_unmark(fstrans_cookie_t);
  extern int __spl_pf_fstrans_check(void);
  
+#define        ____cacheline_aligned
+
  #endif /* _KERNEL */
  #endif /* _SYS_ZFS_CONTEXT_H */
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am

index ec80ccf543f184906dadd79e807c39a07ed90dbf..f2a7a00a85d9b65d81416fce92a5480891fa1150 100644 (file)
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -40,12 +40,14 @@ KERNEL_C = \
         zpool_prop.c \
         zprop_common.c \
         abd.c \
+       aggsum.c \
         arc.c \
         blkptr.c \
         bplist.c \
         bpobj.c \
         bptree.c \
         bqueue.c \
+       cityhash.c \
         dbuf.c \
         dbuf_stats.c \
         ddt.c \
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in

index fe50107731a9a9f8def3ceb344671ff48a4f1274..1c2187c56cea6127b26652fe9509419392a73592 100644 (file)
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -17,10 +17,12 @@ endif
  ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE)
  
  $(MODULE)-objs += abd.o
+$(MODULE)-objs += aggsum.o
  $(MODULE)-objs += arc.o
  $(MODULE)-objs += blkptr.o
  $(MODULE)-objs += bplist.o
  $(MODULE)-objs += bpobj.o
+$(MODULE)-objs += cityhash.o
  $(MODULE)-objs += dbuf.o
  $(MODULE)-objs += dbuf_stats.o
  $(MODULE)-objs += bptree.o
diff --git a/module/zfs/THIRDPARTYLICENSE.cityhash b/module/zfs/THIRDPARTYLICENSE.cityhash

new file mode 100644 (file)

index 0000000..e558b2a
--- /dev/null
+++ b/module/zfs/THIRDPARTYLICENSE.cityhash
@@ -0,0 +1,19 @@
+Copyright (c) 2011 Google, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/module/zfs/THIRDPARTYLICENSE.cityhash.descrip b/module/zfs/THIRDPARTYLICENSE.cityhash.descrip

new file mode 100644 (file)

index 0000000..f98cb76
--- /dev/null
+++ b/module/zfs/THIRDPARTYLICENSE.cityhash.descrip
@@ -0,0 +1 @@
+CITYHASH CHECKSUM FUNCTIONALITY IN ZFS
diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c

new file mode 100644 (file)

index 0000000..171d0ff
--- /dev/null
+++ b/module/zfs/aggsum.c
@@ -0,0 +1,233 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/aggsum.h>
+
+/*
+ * Aggregate-sum counters are a form of fanned-out counter, used when atomic
+ * instructions on a single field cause enough CPU cache line contention to
+ * slow system performance. Due to their increased overhead and the expense
+ * involved with precisely reading from them, they should only be used in cases
+ * where the write rate (increment/decrement) is much higher than the read rate
+ * (get value).
+ *
+ * Aggregate sum counters are comprised of two basic parts, the core and the
+ * buckets. The core counter contains a lock for the entire counter, as well
+ * as the current upper and lower bounds on the value of the counter. The
+ * aggsum_bucket structure contains a per-bucket lock to protect the contents of
+ * the bucket, the current amount that this bucket has changed from the global
+ * counter (called the delta), and the amount of increment and decrement we have
+ * "borrowed" from the core counter.
+ *
+ * The basic operation of an aggsum is simple. Threads that wish to modify the
+ * counter will modify one bucket's counter (determined by their current CPU, to
+ * help minimize lock and cache contention). If the bucket already has
+ * sufficient capacity borrowed from the core structure to handle their request,
+ * they simply modify the delta and return.  If the bucket does not, we clear
+ * the bucket's current state (to prevent the borrowed amounts from getting too
+ * large), and borrow more from the core counter. Borrowing is done by adding to
+ * the upper bound (or subtracting from the lower bound) of the core counter,
+ * and setting the borrow value for the bucket to the amount added (or
+ * subtracted).  Clearing the bucket is the opposite; we add the current delta
+ * to both the lower and upper bounds of the core counter, subtract the borrowed
+ * increment from the upper bound, and add the borrowed decrement back to the
+ * lower bound.  Note that only borrowing and clearing require access to the
+ * core counter; since all other operations access CPU-local resources,
+ * performance can be much higher than a traditional counter.
+ *
+ * Threads that wish to read from the counter have a slightly more challenging
+ * task. It is fast to determine the upper and lower bounds of the aggsum; this
+ * does not require grabbing any locks. This suffices for cases where an
+ * approximation of the aggsum's value is acceptable. However, if one needs to
+ * know whether some specific value is above or below the current value in the
+ * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
+ * comparing the target value to the upper and lower bounds of the aggsum, and
+ * then clearing a bucket. This proceeds until the target is outside of the
+ * upper and lower bounds and we return a response, or the last bucket has been
+ * cleared and we know that the target is equal to the aggsum's value. Finally,
+ * the most expensive operation is determining the precise value of the aggsum.
+ * To do this, we clear every bucket and then return the upper bound (which must
+ * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
+ * expensive is clearing buckets. This involves grabbing the global lock
+ * (serializing against themselves and borrow operations), grabbing a bucket's
+ * lock (preventing threads on those CPUs from modifying their delta), and
+ * zeroing out the borrowed value (forcing that thread to borrow on its next
+ * request, which will also be expensive).  This is what makes aggsums well
+ * suited for write-many read-rarely operations.
+ */
+
+/*
+ * We will borrow aggsum_borrow_multiplier times the current request, so we will
+ * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
+ * aggsum_add().
+ */
+static uint_t aggsum_borrow_multiplier = 10;
+
+/*
+ * Prepare an aggsum for use.  The structure is zeroed, both bounds start at
+ * the supplied value, and one zero-filled bucket (each with its own lock) is
+ * allocated per boot_ncpus.
+ */
+void
+aggsum_init(aggsum_t *as, uint64_t value)
+{
+       bzero(as, sizeof (*as));
+
+       /* Core state: exact starting value, so the two bounds coincide. */
+       as->as_upper_bound = value;
+       as->as_lower_bound = as->as_upper_bound;
+       mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
+
+       /* Bucket state: kmem_zalloc() leaves delta and borrowed at zero. */
+       as->as_numbuckets = boot_ncpus;
+       as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
+           KM_SLEEP);
+       for (int i = 0; i < as->as_numbuckets; i++) {
+               aggsum_bucket_t *asb = &as->as_buckets[i];
+
+               mutex_init(&asb->asc_lock, NULL, MUTEX_DEFAULT, NULL);
+       }
+}
+
+/*
+ * Tear down an aggsum: destroy every bucket lock, free the bucket array, and
+ * finally destroy the core lock.
+ */
+void
+aggsum_fini(aggsum_t *as)
+{
+       aggsum_bucket_t *buckets = as->as_buckets;
+       uint64_t nbuckets = as->as_numbuckets;
+
+       for (int i = 0; i < nbuckets; i++)
+               mutex_destroy(&buckets[i].asc_lock);
+
+       kmem_free(buckets, nbuckets * sizeof (aggsum_bucket_t));
+       mutex_destroy(&as->as_lock);
+}
+
+/*
+ * Return the current lower bound on the aggsum's value.  The field is read
+ * directly, without taking as_lock.
+ */
+int64_t
+aggsum_lower_bound(aggsum_t *as)
+{
+       int64_t lower = as->as_lower_bound;
+
+       return (lower);
+}
+
+/*
+ * Return the current upper bound on the aggsum's value.  The field is read
+ * directly, without taking as_lock.
+ */
+int64_t
+aggsum_upper_bound(aggsum_t *as)
+{
+       int64_t upper = as->as_upper_bound;
+
+       return (upper);
+}
+
+/*
+ * Fold a bucket back into the core counter: apply the bucket's delta to both
+ * bounds, hand back its borrowed capacity, and leave the bucket empty.  The
+ * caller must hold both the core lock and the bucket lock.
+ */
+static void
+aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
+{
+       volatile uint64_t *lower = (volatile uint64_t *)&as->as_lower_bound;
+       volatile uint64_t *upper = (volatile uint64_t *)&as->as_upper_bound;
+
+       ASSERT(MUTEX_HELD(&as->as_lock));
+       ASSERT(MUTEX_HELD(&asb->asc_lock));
+
+       /*
+        * The bounds are read elsewhere without the lock, so every store to
+        * them goes through an atomic instruction.
+        */
+       atomic_add_64(lower, asb->asc_delta);
+       atomic_add_64(upper, asb->asc_delta);
+       asb->asc_delta = 0;
+
+       atomic_add_64(upper, -asb->asc_borrowed);
+       atomic_add_64(lower, asb->asc_borrowed);
+       asb->asc_borrowed = 0;
+}
+
+/*
+ * Return the exact value of the aggsum.  With as_lock held, every bucket is
+ * flushed back into the core counter unless the bounds already coincide;
+ * either way both bounds then equal the value.
+ */
+uint64_t
+aggsum_value(aggsum_t *as)
+{
+       int64_t rv;
+
+       mutex_enter(&as->as_lock);
+       if (as->as_lower_bound != as->as_upper_bound) {
+               /* Collapse the bounds by flushing each bucket in turn. */
+               for (int i = 0; i < as->as_numbuckets; i++) {
+                       struct aggsum_bucket *asb = &as->as_buckets[i];
+
+                       mutex_enter(&asb->asc_lock);
+                       aggsum_flush_bucket(as, asb);
+                       mutex_exit(&asb->asc_lock);
+               }
+               VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+       } else {
+               /* Bounds already agree; debug builds check buckets are empty */
+               for (int i = 0; i < as->as_numbuckets; i++) {
+                       ASSERT0(as->as_buckets[i].asc_delta);
+                       ASSERT0(as->as_buckets[i].asc_borrowed);
+               }
+       }
+       rv = as->as_lower_bound;
+       mutex_exit(&as->as_lock);
+
+       return (rv);
+}
+
+/*
+ * Flush the bucket and then borrow |delta| from the core counter on its
+ * behalf: the upper bound grows and the lower bound shrinks by that amount,
+ * and the amount is recorded as the bucket's borrowed capacity.
+ */
+static void
+aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb)
+{
+       volatile uint64_t *lower = (volatile uint64_t *)&as->as_lower_bound;
+       volatile uint64_t *upper = (volatile uint64_t *)&as->as_upper_bound;
+       int64_t amount = delta;
+
+       if (amount < 0)
+               amount = -amount;
+
+       /* Core lock first, then the bucket lock. */
+       mutex_enter(&as->as_lock);
+       mutex_enter(&asb->asc_lock);
+
+       aggsum_flush_bucket(as, asb);
+
+       atomic_add_64(upper, amount);
+       atomic_add_64(lower, -amount);
+       asb->asc_borrowed = amount;
+
+       mutex_exit(&asb->asc_lock);
+       mutex_exit(&as->as_lock);
+}
+
+/*
+ * Add delta (which may be negative) to the aggsum.  The bucket is picked
+ * from CPU_SEQID.  If the bucket's borrowed capacity covers the new delta
+ * the update stays bucket-local; otherwise more is borrowed from the core
+ * counter and the check is repeated.
+ */
+void
+aggsum_add(aggsum_t *as, int64_t delta)
+{
+       struct aggsum_bucket *asb =
+           &as->as_buckets[CPU_SEQID % as->as_numbuckets];
+
+       mutex_enter(&asb->asc_lock);
+       while (asb->asc_delta + delta > (int64_t)asb->asc_borrowed ||
+           asb->asc_delta + delta < -(int64_t)asb->asc_borrowed) {
+               /*
+                * Not enough capacity.  Drop the bucket lock before
+                * borrowing, since aggsum_borrow() takes the core lock
+                * followed by the bucket lock.
+                */
+               mutex_exit(&asb->asc_lock);
+               aggsum_borrow(as, delta * aggsum_borrow_multiplier, asb);
+               mutex_enter(&asb->asc_lock);
+       }
+       asb->asc_delta += delta;
+       mutex_exit(&asb->asc_lock);
+}
+
+/*
+ * Compare the aggsum value to target efficiently. Returns -1 if the value
+ * represented by the aggsum is less than target, 1 if it's greater, and 0 if
+ * they are equal.
+ *
+ * The bounds are signed, and the lower bound goes negative whenever a bucket
+ * has borrowed more than the current value, but target is unsigned.  Each
+ * bound is therefore checked for sign before it is compared as a uint64_t;
+ * a direct signed-vs-unsigned comparison would convert a negative bound to
+ * a huge unsigned value and report the wrong ordering.
+ */
+int
+aggsum_compare(aggsum_t *as, uint64_t target)
+{
+       int64_t lb, ub;
+
+       /* Lockless fast path: snapshot each bound once, then test it. */
+       ub = as->as_upper_bound;
+       if (ub < 0 || (uint64_t)ub < target)
+               return (-1);
+       lb = as->as_lower_bound;
+       if (lb > 0 && (uint64_t)lb > target)
+               return (1);
+
+       mutex_enter(&as->as_lock);
+       for (int i = 0; i < as->as_numbuckets; i++) {
+               struct aggsum_bucket *asb = &as->as_buckets[i];
+
+               /* Flushing a bucket tightens the bounds; then re-test. */
+               mutex_enter(&asb->asc_lock);
+               aggsum_flush_bucket(as, asb);
+               mutex_exit(&asb->asc_lock);
+
+               ub = as->as_upper_bound;
+               if (ub < 0 || (uint64_t)ub < target) {
+                       mutex_exit(&as->as_lock);
+                       return (-1);
+               }
+               lb = as->as_lower_bound;
+               if (lb > 0 && (uint64_t)lb > target) {
+                       mutex_exit(&as->as_lock);
+                       return (1);
+               }
+       }
+       /* Every bucket is flushed, so the bounds have met at the value. */
+       VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+       ASSERT3U(as->as_lower_bound, ==, target);
+       mutex_exit(&as->as_lock);
+       return (0);
+}
diff --git a/module/zfs/arc.c b/module/zfs/arc.c

index be9964bff21895c0e89fe4438368994de72033ce..71bebf27746c59a285ebb15983408b82991e3fa1 100644 (file)
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -303,6 +303,8 @@
  #include <zfs_fletcher.h>
  #include <sys/arc_impl.h>
  #include <sys/trace_arc.h>
+#include <sys/aggsum.h>
+#include <sys/cityhash.h>
  
  #ifndef _KERNEL
  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
@@ -475,6 +477,7 @@ typedef struct arc_stats {
         kstat_named_t arcstat_c;
         kstat_named_t arcstat_c_min;
         kstat_named_t arcstat_c_max;
+       /* Not updated directly; only synced in arc_kstat_update. */
         kstat_named_t arcstat_size;
         /*
          * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
@@ -503,12 +506,14 @@ typedef struct arc_stats {
          * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
          * caches), and arc_buf_t structures (allocated via arc_buf_t
          * cache).
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_hdr_size;
         /*
          * Number of bytes consumed by ARC buffers of type equal to
          * ARC_BUFC_DATA. This is generally consumed by buffers backing
          * on disk user data (e.g. plain file contents).
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_data_size;
         /*
@@ -516,18 +521,22 @@ typedef struct arc_stats {
          * ARC_BUFC_METADATA. This is generally consumed by buffers
          * backing on disk data that is used for internal ZFS
          * structures (e.g. ZAP, dnode, indirect blocks, etc).
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_metadata_size;
         /*
          * Number of bytes consumed by dmu_buf_impl_t objects.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_dbuf_size;
         /*
          * Number of bytes consumed by dnode_t objects.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_dnode_size;
         /*
          * Number of bytes consumed by bonus buffers.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_bonus_size;
         /*
@@ -535,6 +544,7 @@ typedef struct arc_stats {
          * arc_anon state. This includes *all* buffers in the arc_anon
          * state; e.g. data, metadata, evictable, and unevictable buffers
          * are all included in this value.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_anon_size;
         /*
@@ -542,6 +552,7 @@ typedef struct arc_stats {
          * following criteria: backing buffers of type ARC_BUFC_DATA,
          * residing in the arc_anon state, and are eligible for eviction
          * (e.g. have no outstanding holds on the buffer).
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_anon_evictable_data;
         /*
@@ -549,6 +560,7 @@ typedef struct arc_stats {
          * following criteria: backing buffers of type ARC_BUFC_METADATA,
          * residing in the arc_anon state, and are eligible for eviction
          * (e.g. have no outstanding holds on the buffer).
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_anon_evictable_metadata;
         /*
@@ -556,6 +568,7 @@ typedef struct arc_stats {
          * arc_mru state. This includes *all* buffers in the arc_mru
          * state; e.g. data, metadata, evictable, and unevictable buffers
          * are all included in this value.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mru_size;
         /*
@@ -563,6 +576,7 @@ typedef struct arc_stats {
          * following criteria: backing buffers of type ARC_BUFC_DATA,
          * residing in the arc_mru state, and are eligible for eviction
          * (e.g. have no outstanding holds on the buffer).
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mru_evictable_data;
         /*
@@ -570,6 +584,7 @@ typedef struct arc_stats {
          * following criteria: backing buffers of type ARC_BUFC_METADATA,
          * residing in the arc_mru state, and are eligible for eviction
          * (e.g. have no outstanding holds on the buffer).
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mru_evictable_metadata;
         /*
@@ -580,18 +595,21 @@ typedef struct arc_stats {
          * don't actually have ARC buffers linked off of these headers.
          * Thus, *if* the headers had associated ARC buffers, these
          * buffers *would have* consumed this number of bytes.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mru_ghost_size;
         /*
          * Number of bytes that *would have been* consumed by ARC
          * buffers that are eligible for eviction, of type
          * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mru_ghost_evictable_data;
         /*
          * Number of bytes that *would have been* consumed by ARC
          * buffers that are eligible for eviction, of type
          * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mru_ghost_evictable_metadata;
         /*
@@ -599,36 +617,42 @@ typedef struct arc_stats {
          * arc_mfu state. This includes *all* buffers in the arc_mfu
          * state; e.g. data, metadata, evictable, and unevictable buffers
          * are all included in this value.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mfu_size;
         /*
          * Number of bytes consumed by ARC buffers that are eligible for
          * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
          * state.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mfu_evictable_data;
         /*
          * Number of bytes consumed by ARC buffers that are eligible for
          * eviction, of type ARC_BUFC_METADATA, and reside in the
          * arc_mfu state.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mfu_evictable_metadata;
         /*
          * Total number of bytes that *would have been* consumed by ARC
          * buffers in the arc_mfu_ghost state. See the comment above
          * arcstat_mru_ghost_size for more details.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mfu_ghost_size;
         /*
          * Number of bytes that *would have been* consumed by ARC
          * buffers that are eligible for eviction, of type
          * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mfu_ghost_evictable_data;
         /*
          * Number of bytes that *would have been* consumed by ARC
          * buffers that are eligible for eviction, of type
          * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+        * Not updated directly; only synced in arc_kstat_update.
          */
         kstat_named_t arcstat_mfu_ghost_evictable_metadata;
         kstat_named_t arcstat_l2_hits;
@@ -650,6 +674,7 @@ typedef struct arc_stats {
         kstat_named_t arcstat_l2_io_error;
         kstat_named_t arcstat_l2_lsize;
         kstat_named_t arcstat_l2_psize;
+       /* Not updated directly; only synced in arc_kstat_update. */
         kstat_named_t arcstat_l2_hdr_size;
         kstat_named_t arcstat_memory_throttle_count;
         kstat_named_t arcstat_memory_direct_count;
@@ -661,6 +686,7 @@ typedef struct arc_stats {
         kstat_named_t arcstat_tempreserve;
         kstat_named_t arcstat_loaned_bytes;
         kstat_named_t arcstat_prune;
+       /* Not updated directly; only synced in arc_kstat_update. */
         kstat_named_t arcstat_meta_used;
         kstat_named_t arcstat_meta_limit;
         kstat_named_t arcstat_dnode_limit;
@@ -829,7 +855,6 @@ static arc_state_t  *arc_l2c_only;
   * the possibility of inconsistency by having shadow copies of the variables,
   * while still allowing the code to be readable.
   */
-#define        arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
  #define        arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
  #define        arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
  #define        arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
@@ -840,11 +865,7 @@ static arc_state_t *arc_l2c_only;
  #define        arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
  #define        arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
  #define        arc_meta_min    ARCSTAT(arcstat_meta_min) /* min size for metadata */
-#define        arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
  #define        arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
-#define        arc_dbuf_size   ARCSTAT(arcstat_dbuf_size) /* dbuf metadata */
-#define        arc_dnode_size  ARCSTAT(arcstat_dnode_size) /* dnode metadata */
-#define        arc_bonus_size  ARCSTAT(arcstat_bonus_size) /* bonus buffer metadata */
  #define        arc_need_free   ARCSTAT(arcstat_need_free) /* bytes to be freed */
  #define        arc_sys_free    ARCSTAT(arcstat_sys_free) /* target system free bytes */
  
@@ -857,6 +878,24 @@ static arc_state_t *arc_l2c_only;
  /* number of bytes in the arc from arc_buf_t's */
  #define        arc_overhead_size       ARCSTAT(arcstat_overhead_size)
  
+/*
+ * There are also some ARC variables that we want to export, but that are
+ * updated so often that having the canonical representation be the statistic
+ * variable causes a performance bottleneck. We want to use aggsum_t's for these
+ * instead, but still be able to export the kstat in the same way as before.
+ * The solution is to always use the aggsum version, except in the kstat update
+ * callback.
+ */
+aggsum_t arc_size;
+aggsum_t arc_meta_used;
+aggsum_t astat_data_size;
+aggsum_t astat_metadata_size;
+aggsum_t astat_dbuf_size;
+aggsum_t astat_dnode_size;
+aggsum_t astat_bonus_size;
+aggsum_t astat_hdr_size;
+aggsum_t astat_l2_hdr_size;
+
  static list_t arc_prune_list;
  static kmutex_t arc_prune_mtx;
  static taskq_t *arc_prune_taskq;
@@ -1050,21 +1089,15 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
  static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
  static void l2arc_read_done(zio_t *);
  
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
  static uint64_t
  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
  {
-       uint8_t *vdva = (uint8_t *)dva;
-       uint64_t crc = -1ULL;
-       int i;
-
-       ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-
-       for (i = 0; i < sizeof (dva_t); i++)
-               crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
-
-       crc ^= (spa>>8) ^ birth;
-
-       return (crc);
+       return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
  }
  
  #define        HDR_EMPTY(hdr)                                          \
@@ -2676,32 +2709,32 @@ arc_space_consume(uint64_t space, arc_space_type_t type)
         default:
                 break;
         case ARC_SPACE_DATA:
-               ARCSTAT_INCR(arcstat_data_size, space);
+               aggsum_add(&astat_data_size, space);
                 break;
         case ARC_SPACE_META:
-               ARCSTAT_INCR(arcstat_metadata_size, space);
+               aggsum_add(&astat_metadata_size, space);
                 break;
         case ARC_SPACE_BONUS:
-               ARCSTAT_INCR(arcstat_bonus_size, space);
+               aggsum_add(&astat_bonus_size, space);
                 break;
         case ARC_SPACE_DNODE:
-               ARCSTAT_INCR(arcstat_dnode_size, space);
+               aggsum_add(&astat_dnode_size, space);
                 break;
         case ARC_SPACE_DBUF:
-               ARCSTAT_INCR(arcstat_dbuf_size, space);
+               aggsum_add(&astat_dbuf_size, space);
                 break;
         case ARC_SPACE_HDRS:
-               ARCSTAT_INCR(arcstat_hdr_size, space);
+               aggsum_add(&astat_hdr_size, space);
                 break;
         case ARC_SPACE_L2HDRS:
-               ARCSTAT_INCR(arcstat_l2_hdr_size, space);
+               aggsum_add(&astat_l2_hdr_size, space);
                 break;
         }
  
         if (type != ARC_SPACE_DATA)
-               ARCSTAT_INCR(arcstat_meta_used, space);
+               aggsum_add(&arc_meta_used, space);
  
-       atomic_add_64(&arc_size, space);
+       aggsum_add(&arc_size, space);
  }
  
  void
@@ -2713,37 +2746,42 @@ arc_space_return(uint64_t space, arc_space_type_t type)
         default:
                 break;
         case ARC_SPACE_DATA:
-               ARCSTAT_INCR(arcstat_data_size, -space);
+               aggsum_add(&astat_data_size, -space);
                 break;
         case ARC_SPACE_META:
-               ARCSTAT_INCR(arcstat_metadata_size, -space);
+               aggsum_add(&astat_metadata_size, -space);
                 break;
         case ARC_SPACE_BONUS:
-               ARCSTAT_INCR(arcstat_bonus_size, -space);
+               aggsum_add(&astat_bonus_size, -space);
                 break;
         case ARC_SPACE_DNODE:
-               ARCSTAT_INCR(arcstat_dnode_size, -space);
+               aggsum_add(&astat_dnode_size, -space);
                 break;
         case ARC_SPACE_DBUF:
-               ARCSTAT_INCR(arcstat_dbuf_size, -space);
+               aggsum_add(&astat_dbuf_size, -space);
                 break;
         case ARC_SPACE_HDRS:
-               ARCSTAT_INCR(arcstat_hdr_size, -space);
+               aggsum_add(&astat_hdr_size, -space);
                 break;
         case ARC_SPACE_L2HDRS:
-               ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
+               aggsum_add(&astat_l2_hdr_size, -space);
                 break;
         }
  
         if (type != ARC_SPACE_DATA) {
-               ASSERT(arc_meta_used >= space);
-               if (arc_meta_max < arc_meta_used)
-                       arc_meta_max = arc_meta_used;
-               ARCSTAT_INCR(arcstat_meta_used, -space);
+               ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
+               /*
+                * We use the upper bound here rather than the precise value
+                * because the arc_meta_max value doesn't need to be
+                * precise. It's only consumed by humans via arcstats.
+                */
+               if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
+                       arc_meta_max = aggsum_upper_bound(&arc_meta_used);
+               aggsum_add(&arc_meta_used, -space);
         }
  
-       ASSERT(arc_size >= space);
-       atomic_add_64(&arc_size, -space);
+       ASSERT(aggsum_compare(&arc_size, space) >= 0);
+       aggsum_add(&arc_size, -space);
  }
  
  /*
@@ -4073,9 +4111,12 @@ arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
                  * Request that 10% of the LRUs be scanned by the superblock
                  * shrinker.
                  */
-               if (type == ARC_BUFC_DATA && arc_dnode_size > arc_dnode_limit)
-                       arc_prune_async((arc_dnode_size - arc_dnode_limit) /
-                           sizeof (dnode_t) / zfs_arc_dnode_reduce_percent);
+               if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+                   arc_dnode_limit) > 0) {
+                       arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+                           arc_dnode_limit) / sizeof (dnode_t) /
+                           zfs_arc_dnode_reduce_percent);
+               }
  
                 /*
                  * Start eviction using a randomly selected sublist,
@@ -4257,14 +4298,14 @@ arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
   *
   * Therefore, this function has been updated to make alternating passes
   * over the ARC releasing data buffers and then newly unheld meta data
- * buffers.  This ensures forward progress is maintained and arc_meta_used
+ * buffers.  This ensures forward progress is maintained and meta_used
   * will decrease.  Normally this is sufficient, but if required the ARC
   * will call the registered prune callbacks causing dentry and inodes to
   * be dropped from the VFS cache.  This will make dnode meta data buffers
   * available for reclaim.
   */
  static uint64_t
-arc_adjust_meta_balanced(void)
+arc_adjust_meta_balanced(uint64_t meta_used)
  {
         int64_t delta, prune = 0, adjustmnt;
         uint64_t total_evicted = 0;
@@ -4280,7 +4321,7 @@ restart:
          * metadata from the MFU. I think we probably need to implement a
          * "metadata arc_p" value to do this properly.
          */
-       adjustmnt = arc_meta_used - arc_meta_limit;
+       adjustmnt = meta_used - arc_meta_limit;
  
         if (adjustmnt > 0 && refcount_count(&arc_mru->arcs_esize[type]) > 0) {
                 delta = MIN(refcount_count(&arc_mru->arcs_esize[type]),
@@ -4305,7 +4346,7 @@ restart:
                 total_evicted += arc_adjust_impl(arc_mfu, 0, delta, type);
         }
  
-       adjustmnt = arc_meta_used - arc_meta_limit;
+       adjustmnt = meta_used - arc_meta_limit;
  
         if (adjustmnt > 0 &&
             refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
@@ -4329,7 +4370,7 @@ restart:
          * meta buffers.  Requests to the upper layers will be made with
          * increasingly large scan sizes until the ARC is below the limit.
          */
-       if (arc_meta_used > arc_meta_limit) {
+       if (meta_used > arc_meta_limit) {
                 if (type == ARC_BUFC_DATA) {
                         type = ARC_BUFC_METADATA;
                 } else {
@@ -4354,7 +4395,7 @@ restart:
   * capped by the arc_meta_limit tunable.
   */
  static uint64_t
-arc_adjust_meta_only(void)
+arc_adjust_meta_only(uint64_t meta_used)
  {
         uint64_t total_evicted = 0;
         int64_t target;
@@ -4366,7 +4407,7 @@ arc_adjust_meta_only(void)
          * we're over the meta limit more than we're over arc_p, we
          * evict some from the MRU here, and some from the MFU below.
          */
-       target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+       target = MIN((int64_t)(meta_used - arc_meta_limit),
             (int64_t)(refcount_count(&arc_anon->arcs_size) +
             refcount_count(&arc_mru->arcs_size) - arc_p));
  
@@ -4377,8 +4418,9 @@ arc_adjust_meta_only(void)
          * below the meta limit, but not so much as to drop us below the
          * space allotted to the MFU (which is defined as arc_c - arc_p).
          */
-       target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
-           (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
+       target = MIN((int64_t)(meta_used - arc_meta_limit),
+           (int64_t)(refcount_count(&arc_mfu->arcs_size) -
+           (arc_c - arc_p)));
  
         total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
  
@@ -4386,12 +4428,12 @@ arc_adjust_meta_only(void)
  }
  
  static uint64_t
-arc_adjust_meta(void)
+arc_adjust_meta(uint64_t meta_used)
  {
         if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
-               return (arc_adjust_meta_only());
+               return (arc_adjust_meta_only(meta_used));
         else
-               return (arc_adjust_meta_balanced());
+               return (arc_adjust_meta_balanced(meta_used));
  }
  
  /*
@@ -4478,12 +4520,14 @@ arc_adjust(void)
         uint64_t total_evicted = 0;
         uint64_t bytes;
         int64_t target;
+       uint64_t asize = aggsum_value(&arc_size);
+       uint64_t ameta = aggsum_value(&arc_meta_used);
  
         /*
          * If we're over arc_meta_limit, we want to correct that before
          * potentially evicting data buffers below.
          */
-       total_evicted += arc_adjust_meta();
+       total_evicted += arc_adjust_meta(ameta);
  
         /*
          * Adjust MRU size
@@ -4495,9 +4539,9 @@ arc_adjust(void)
          * the MRU is over arc_p, we'll evict enough to get back to
          * arc_p here, and then evict more from the MFU below.
          */
-       target = MIN((int64_t)(arc_size - arc_c),
+       target = MIN((int64_t)(asize - arc_c),
             (int64_t)(refcount_count(&arc_anon->arcs_size) +
-           refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
+           refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
  
         /*
          * If we're below arc_meta_min, always prefer to evict data.
@@ -4508,7 +4552,7 @@ arc_adjust(void)
          * type, spill over into the next type.
          */
         if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
-           arc_meta_used > arc_meta_min) {
+           ameta > arc_meta_min) {
                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
                 total_evicted += bytes;
  
@@ -4541,10 +4585,10 @@ arc_adjust(void)
          * size back to arc_p, if we're still above the target cache
          * size, we evict the rest from the MFU.
          */
-       target = arc_size - arc_c;
+       target = asize - arc_c;
  
         if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
-           arc_meta_used > arc_meta_min) {
+           ameta > arc_meta_min) {
                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
                 total_evicted += bytes;
  
@@ -4645,13 +4689,14 @@ arc_flush(spa_t *spa, boolean_t retry)
  void
  arc_shrink(int64_t to_free)
  {
+       uint64_t asize = aggsum_value(&arc_size);
         uint64_t c = arc_c;
  
         if (c > to_free && c - to_free > arc_c_min) {
                 arc_c = c - to_free;
                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
-               if (arc_c > arc_size)
-                       arc_c = MAX(arc_size, arc_c_min);
+               if (asize < arc_c)
+                       arc_c = MAX(asize, arc_c_min);
                 if (arc_p > arc_c)
                         arc_p = (arc_c >> 1);
                 ASSERT(arc_c >= arc_c_min);
@@ -4660,7 +4705,7 @@ arc_shrink(int64_t to_free)
                 arc_c = arc_c_min;
         }
  
-       if (arc_size > arc_c)
+       if (asize > arc_c)
                 (void) arc_adjust();
  }
  
@@ -4877,7 +4922,8 @@ arc_kmem_reap_now(void)
         extern kmem_cache_t     *range_seg_cache;
  
  #ifdef _KERNEL
-       if ((arc_meta_used >= arc_meta_limit) && zfs_arc_meta_prune) {
+       if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
+           zfs_arc_meta_prune) {
                 /*
                  * We are exceeding our meta-data cache limit.
                  * Prune some entries to release holds on meta-data.
@@ -5022,7 +5068,7 @@ arc_reclaim_thread(void *unused)
                  * be helpful and could potentially cause us to enter an
                  * infinite loop.
                  */
-               if (arc_size <= arc_c || evicted == 0) {
+               if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) {
                         /*
                          * We're either no longer overflowing, or we
                          * can't evict anything more, so we should wake
@@ -5101,12 +5147,13 @@ arc_reclaim_thread(void *unused)
  static uint64_t
  arc_evictable_memory(void)
  {
+       int64_t asize = aggsum_value(&arc_size);
         uint64_t arc_clean =
             refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
             refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
             refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
             refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
-       uint64_t arc_dirty = MAX((int64_t)arc_size - (int64_t)arc_clean, 0);
+       uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
  
         /*
          * Scale reported evictable memory in proportion to page cache, cap
@@ -5118,7 +5165,7 @@ arc_evictable_memory(void)
         if (arc_dirty >= min)
                 return (arc_clean);
  
-       return (MAX((int64_t)arc_size - (int64_t)min, 0));
+       return (MAX((int64_t)asize - (int64_t)min, 0));
  }
  
  /*
@@ -5261,7 +5308,8 @@ arc_adapt(int bytes, arc_state_t *state)
          * cache size, increment the target cache size
          */
         ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
-       if (arc_size >= arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+       if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >=
+           0) {
                 atomic_add_64(&arc_c, (int64_t)bytes);
                 if (arc_c > arc_c_max)
                         arc_c = arc_c_max;
@@ -5284,7 +5332,16 @@ arc_is_overflowing(void)
         uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
             arc_c >> zfs_arc_overflow_shift);
  
-       return (arc_size >= arc_c + overflow);
+       /*
+        * We just compare the lower bound here for performance reasons. Our
+        * primary goals are to make sure that the arc never grows without
+        * bound, and that it can reach its maximum size. This check
+        * accomplishes both goals. The maximum amount we could run over by is
+        * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
+        * in the ARC. In practice, that's in the tens of MB, which is low
+        * enough to be safe.
+        */
+       return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
  }
  
  static abd_t *
@@ -5399,7 +5456,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
                  * If we are growing the cache, and we are adding anonymous
                  * data, and we have outgrown arc_p, update arc_p
                  */
-               if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+               if (aggsum_compare(&arc_size, arc_c) < 0 &&
+                   hdr->b_l1hdr.b_state == arc_anon &&
                     (refcount_count(&arc_anon->arcs_size) +
                     refcount_count(&arc_mru->arcs_size) > arc_p))
                         arc_p = MIN(arc_c, arc_p + size);
@@ -7213,6 +7271,17 @@ arc_kstat_update(kstat_t *ksp, int rw)
                     &as->arcstat_mfu_ghost_evictable_data,
                     &as->arcstat_mfu_ghost_evictable_metadata);
  
+               ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
+               ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
+               ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
+               ARCSTAT(arcstat_metadata_size) =
+                   aggsum_value(&astat_metadata_size);
+               ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
+               ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
+               ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
+               ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
+               ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
+
                 as->arcstat_memory_all_bytes.value.ui64 =
                     arc_all_memory();
                 as->arcstat_memory_free_bytes.value.ui64 =
@@ -7424,6 +7493,16 @@ arc_state_init(void)
         refcount_create(&arc_mfu_ghost->arcs_size);
         refcount_create(&arc_l2c_only->arcs_size);
  
+       aggsum_init(&arc_meta_used, 0);
+       aggsum_init(&arc_size, 0);
+       aggsum_init(&astat_data_size, 0);
+       aggsum_init(&astat_metadata_size, 0);
+       aggsum_init(&astat_hdr_size, 0);
+       aggsum_init(&astat_l2_hdr_size, 0);
+       aggsum_init(&astat_bonus_size, 0);
+       aggsum_init(&astat_dnode_size, 0);
+       aggsum_init(&astat_dbuf_size, 0);
+
         arc_anon->arcs_state = ARC_STATE_ANON;
         arc_mru->arcs_state = ARC_STATE_MRU;
         arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
@@ -7465,6 +7544,16 @@ arc_state_fini(void)
         multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
         multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
         multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+
+       aggsum_fini(&arc_meta_used);
+       aggsum_fini(&arc_size);
+       aggsum_fini(&astat_data_size);
+       aggsum_fini(&astat_metadata_size);
+       aggsum_fini(&astat_hdr_size);
+       aggsum_fini(&astat_l2_hdr_size);
+       aggsum_fini(&astat_bonus_size);
+       aggsum_fini(&astat_dnode_size);
+       aggsum_fini(&astat_dbuf_size);
  }
  
  uint64_t
@@ -7516,7 +7605,6 @@ arc_init(void)
  
         arc_c = arc_c_max;
         arc_p = (arc_c >> 1);
-       arc_size = 0;
  
         /* Set min to 1/2 of arc_c_min */
         arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
diff --git a/module/zfs/cityhash.c b/module/zfs/cityhash.c

new file mode 100644 (file)

index 0000000..2b62eda
--- /dev/null
+++ b/module/zfs/cityhash.c
@@ -0,0 +1,63 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/cityhash.h>
+
+#define        HASH_K1 0xb492b66fbe98f273ULL
+#define        HASH_K2 0x9ae16a3b2f90404fULL
+
+/*
+ * Rotate val right by shift bits (normally a single instruction).  Only
+ * shift == 0 is special-cased, so callers must keep shift within [0, 63].
+ */
+static inline uint64_t
+rotate(uint64_t val, int shift)
+{
+       /* With shift == 0 the left shift below would be by 64, which is UB. */
+       return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
+}
+
+static inline uint64_t
+cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
+{      /* Mixes u and v into a single 64-bit value, using mul throughout. */
+       uint64_t a = (u ^ v) * mul;  /* combine both inputs, then scale */
+       a ^= (a >> 47);              /* fold top 17 bits into the low bits */
+       uint64_t b = (v ^ a) * mul;  /* mix v in again and rescale */
+       b ^= (b >> 47);              /* same fold, applied to b */
+       b *= mul;                    /* final multiply before returning */
+       return (b);
+}
+
+uint64_t
+cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
+{      /* Hashes the four 64-bit words w1..w4 down to one 64-bit value. */
+       uint64_t mul = HASH_K2 + 64;  /* multiplier for c and the final mix */
+       uint64_t a = w1 * HASH_K1;    /* w1 scaled by HASH_K1 */
+       uint64_t b = w2;              /* w2 enters unscaled */
+       uint64_t c = w4 * mul;        /* w4 (not w3) scaled by mul */
+       uint64_t d = w3 * HASH_K2;    /* w3 scaled by HASH_K2 */
+       return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
+           a + rotate(b + HASH_K2, 18) + c, mul));
+
+}
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c

index a8c48167ad1c8ff9c51e20eca0f3301254aa79be..6e2f20e507c8d8b847394f5ecc242d65060f2ebd 100644 (file)
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -48,6 +48,7 @@
  #include <sys/callb.h>
  #include <sys/abd.h>
  #include <sys/vdev.h>
+#include <sys/cityhash.h>
  
  kstat_t *dbuf_ksp;
  
@@ -270,23 +271,14 @@ static dbuf_hash_table_t dbuf_hash_table;
  
  static uint64_t dbuf_hash_count;
  
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
  static uint64_t
  dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
  {
-       uintptr_t osv = (uintptr_t)os;
-       uint64_t crc = -1ULL;
-
-       ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
-       crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
-       crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
-       crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
-       crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
-       crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
-       crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
-
-       crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
-
-       return (crc);
+       return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
  }
  
  #define        DBUF_EQUAL(dbuf, os, obj, level, blkid)         \
author	Paul Dagnelie <pcd@delphix.com>
	Thu, 25 May 2017 18:32:40 +0000 (11:32 -0700)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Wed, 6 Jun 2018 16:35:59 +0000 (09:35 -0700)
include/sys/Makefile.am		patch \| blob \| history
include/sys/aggsum.h	[new file with mode: 0644]	patch \| blob
include/sys/cityhash.h	[new file with mode: 0644]	patch \| blob
include/sys/zfs_context.h		patch \| blob \| history
lib/libzpool/Makefile.am		patch \| blob \| history
module/zfs/Makefile.in		patch \| blob \| history
module/zfs/THIRDPARTYLICENSE.cityhash	[new file with mode: 0644]	patch \| blob
module/zfs/THIRDPARTYLICENSE.cityhash.descrip	[new file with mode: 0644]	patch \| blob
module/zfs/aggsum.c	[new file with mode: 0644]	patch \| blob
module/zfs/arc.c		patch \| blob \| history
module/zfs/cityhash.c	[new file with mode: 0644]	patch \| blob
module/zfs/dbuf.c		patch \| blob \| history