#define _SPL_KMEM_H
#include <linux/slab.h>
+#include <linux/sched.h>
extern int kmem_debugging(void);
extern char *kmem_vasprintf(const char *fmt, va_list ap);
/*
* Memory allocation interfaces
*/
-#define KM_SLEEP GFP_KERNEL /* Can sleep, never fails */
-#define KM_NOSLEEP GFP_ATOMIC /* Can not sleep, may fail */
-#define KM_PUSHPAGE (GFP_NOIO | __GFP_HIGH) /* Use reserved memory */
-#define KM_NODEBUG __GFP_NOWARN /* Suppress warnings */
-#define KM_FLAGS __GFP_BITS_MASK
-#define KM_VMFLAGS GFP_LEVEL_MASK
+#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */
+#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */
+#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */
+#define KM_ZERO 0x1000 /* zero the allocation */
+#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */
-/*
- * Used internally, the kernel does not need to support this flag
- */
-#ifndef __GFP_ZERO
-#define __GFP_ZERO 0x8000
-#endif
+#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)
/*
- * __GFP_NOFAIL looks like it will be removed from the kernel perhaps as
- * early as 2.6.32. To avoid this issue when it occurs in upstream kernels
- * we retry the allocation here as long as it is not __GFP_WAIT (GFP_ATOMIC).
- * I would prefer the caller handle the failure case cleanly but we are
- * trying to emulate Solaris and those are not the Solaris semantics.
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion
+ * function is context aware which means that KM_SLEEP allocations can be
+ * safely used in syncing contexts which have set PF_FSTRANS.
*/
-static inline void *
-kmalloc_nofail(size_t size, gfp_t flags)
-{
- void *ptr;
-
- do {
- ptr = kmalloc(size, flags);
- } while (ptr == NULL && (flags & __GFP_WAIT));
-
- return (ptr);
-}
-
-static inline void *
-kzalloc_nofail(size_t size, gfp_t flags)
+static inline gfp_t
+kmem_flags_convert(int flags)
{
- void *ptr;
-
- do {
- ptr = kzalloc(size, flags);
- } while (ptr == NULL && (flags & __GFP_WAIT));
+ gfp_t lflags = __GFP_NOWARN | __GFP_COMP;
- return (ptr);
-}
+ if (flags & KM_NOSLEEP) {
+ lflags |= GFP_ATOMIC | __GFP_NORETRY;
+ } else {
+ lflags |= GFP_KERNEL;
+ if ((current->flags & PF_FSTRANS))
+ lflags &= ~(__GFP_IO|__GFP_FS);
+ }
-static inline void *
-kmalloc_node_nofail(size_t size, gfp_t flags, int node)
-{
- void *ptr;
+ if (flags & KM_PUSHPAGE)
+ lflags |= __GFP_HIGH;
- do {
- ptr = kmalloc_node(size, flags, node);
- } while (ptr == NULL && (flags & __GFP_WAIT));
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
- return (ptr);
+ return (lflags);
}
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
#ifdef HAVE_ATOMIC64_T
#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
extern unsigned long long kmem_alloc_max;
#endif /* HAVE_ATOMIC64_T */
-#ifdef DEBUG_KMEM_TRACKING
-/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
- *
- * The maximum level of memory debugging. All memory will be accounted
- * for and each allocation will be explicitly tracked. Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported. This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging. This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define kmem_alloc(sz, fl) kmem_alloc_track((sz), (fl), \
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_zalloc(sz, fl) kmem_alloc_track((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_alloc_node(sz, fl, nd) kmem_alloc_track((sz), (fl), \
- __FUNCTION__, __LINE__, 1, nd)
-#define kmem_free(ptr, sz) kmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * The default build will set DEBUG_KEM. This provides basic memory
- * accounting with little to no impact on performance. When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console. To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define kmem_alloc(sz, fl) kmem_alloc_debug((sz), (fl), \
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_zalloc(sz, fl) kmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__, 0, 0)
-#define kmem_alloc_node(sz, fl, nd) kmem_alloc_debug((sz), (fl), \
- __FUNCTION__, __LINE__, 1, nd)
-#define kmem_free(ptr, sz) kmem_free_debug((ptr), (sz))
-
-extern void *kmem_alloc_debug(size_t, int, const char *, int, int, int);
-extern void kmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
- *
- * All debugging is disabled. There will be no overhead even for
- * minimal memory accounting. To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
- */
-#define kmem_alloc(sz, fl) kmalloc_nofail((sz), (fl))
-#define kmem_zalloc(sz, fl) kzalloc_nofail((sz), (fl))
-#define kmem_alloc_node(sz, fl, nd) kmalloc_node_nofail((sz), (fl), (nd))
-#define kmem_free(ptr, sz) ((void)(sz), kfree(ptr))
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
-#endif /* DEBUG_KMEM */
+#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))
-int spl_kmem_init(void);
-void spl_kmem_fini(void);
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
-#define kmem_virt(ptr) (((ptr) >= (void *)VMALLOC_START) && \
- ((ptr) < (void *)VMALLOC_END))
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
#endif /* _SPL_KMEM_H */
extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
extern void spl_kmem_reap(void);
#define kmem_cache_reap_now(skc) \
spl_kmem_cache_reap_now(skc, skc->skc_reap)
#define kmem_reap() spl_kmem_reap()
-#define kmem_virt(ptr) \
- (((ptr) >= (void *)VMALLOC_START) && \
- ((ptr) < (void *)VMALLOC_END))
-
-/*
- * Allow custom slab allocation flags to be set for KMC_SLAB based caches.
- * One use for this function is to ensure the __GFP_COMP flag is part of
- * the default allocation mask which ensures higher order allocations are
- * properly refcounted. This flag was added to the default ->allocflags
- * as of Linux 3.11.
- */
-static inline void
-kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags)
-{
- if (skc->skc_linux_cache == NULL)
- return;
-
-#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
- skc->skc_linux_cache->allocflags |= flags;
-#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
- skc->skc_linux_cache->gfpflags |= flags;
-#endif
-}
/*
* The following functions are only available for internal use.
#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
#endif
-static inline void *
-vmalloc_nofail(size_t size, gfp_t flags)
-{
- void *ptr;
-
- /*
- * Retry failed __vmalloc() allocations once every second. The
- * rational for the delay is that the likely failure modes are:
- *
- * 1) The system has completely exhausted memory, in which case
- * delaying 1 second for the memory reclaim to run is reasonable
- * to avoid thrashing the system.
- * 2) The system has memory but has exhausted the small virtual
- * address space available on 32-bit systems. Retrying the
- * allocation immediately will only result in spinning on the
- * virtual address space lock. It is better delay a second and
- * hope that another process will free some of the address space.
- * But the bottom line is there is not much we can actually do
- * since we can never safely return a failure and honor the
- * Solaris semantics.
- */
- while (1) {
- ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
- if (unlikely((ptr == NULL) && (flags & __GFP_WAIT))) {
- set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ);
- } else {
- break;
- }
- }
-
- return (ptr);
-}
-
-static inline void *
-vzalloc_nofail(size_t size, gfp_t flags)
-{
- void *ptr;
-
- ptr = vmalloc_nofail(size, flags);
- if (ptr)
- memset(ptr, 0, (size));
-
- return (ptr);
-}
-
-#ifdef DEBUG_KMEM
-
-/*
- * Memory accounting functions to be used only when DEBUG_KMEM is set.
- */
-#ifdef HAVE_ATOMIC64_T
-
-#define vmem_alloc_used_add(size) atomic64_add(size, &vmem_alloc_used)
-#define vmem_alloc_used_sub(size) atomic64_sub(size, &vmem_alloc_used)
-#define vmem_alloc_used_read() atomic64_read(&vmem_alloc_used)
-#define vmem_alloc_used_set(size) atomic64_set(&vmem_alloc_used, size)
-
-extern atomic64_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#else /* HAVE_ATOMIC64_T */
-
-#define vmem_alloc_used_add(size) atomic_add(size, &vmem_alloc_used)
-#define vmem_alloc_used_sub(size) atomic_sub(size, &vmem_alloc_used)
-#define vmem_alloc_used_read() atomic_read(&vmem_alloc_used)
-#define vmem_alloc_used_set(size) atomic_set(&vmem_alloc_used, size)
-
-extern atomic_t vmem_alloc_used;
-extern unsigned long long vmem_alloc_max;
-
-#endif /* HAVE_ATOMIC64_T */
-
-#ifdef DEBUG_KMEM_TRACKING
/*
- * DEBUG_KMEM && DEBUG_KMEM_TRACKING
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space. The kmem SLAB
+ * allocator allocates slabs from it. Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
*
- * The maximum level of memory debugging. All memory will be accounted
- * for and each allocation will be explicitly tracked. Any allocation
- * which is leaked will be reported on module unload and the exact location
- * where that memory was allocation will be reported. This level of memory
- * tracking will have a significant impact on performance and should only
- * be enabled for debugging. This feature may be enabled by passing
- * --enable-debug-kmem-tracking to configure.
- */
-#define vmem_alloc(sz, fl) vmem_alloc_track((sz), (fl), \
- __FUNCTION__, __LINE__)
-#define vmem_zalloc(sz, fl) vmem_alloc_track((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__)
-#define vmem_free(ptr, sz) vmem_free_track((ptr), (sz))
-
-extern void *kmem_alloc_track(size_t, int, const char *, int, int, int);
-extern void kmem_free_track(const void *, size_t);
-extern void *vmem_alloc_track(size_t, int, const char *, int);
-extern void vmem_free_track(const void *, size_t);
-
-#else /* DEBUG_KMEM_TRACKING */
-/*
- * DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator. The
+ * buddy allocator is not available to kernel modules, it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
*
- * The default build will set DEBUG_KEM. This provides basic memory
- * accounting with little to no impact on performance. When the module
- * is unloaded in any memory was leaked the total number of leaked bytes
- * will be reported on the console. To disable this basic accounting
- * pass the --disable-debug-kmem option to configure.
- */
-#define vmem_alloc(sz, fl) vmem_alloc_debug((sz), (fl), \
- __FUNCTION__, __LINE__)
-#define vmem_zalloc(sz, fl) vmem_alloc_debug((sz), (fl)|__GFP_ZERO,\
- __FUNCTION__, __LINE__)
-#define vmem_free(ptr, sz) vmem_free_debug((ptr), (sz))
-
-extern void *vmem_alloc_debug(size_t, int, const char *, int);
-extern void vmem_free_debug(const void *, size_t);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#else /* DEBUG_KMEM */
-/*
- * !DEBUG_KMEM && !DEBUG_KMEM_TRACKING
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc(). It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator. However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ * vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ * only a few 100MB are available. The kernel will handle it by spinning
+ * when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ * lock which serializes all allocations.
*
- * All debugging is disabled. There will be no overhead even for
- * minimal memory accounting. To enable basic accounting pass the
- * --enable-debug-kmem option to configure.
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ * list. The former will sum the allocations while the latter will print
+ * them to user space in a way that user space can keep the lock held
+ * indefinitely. When the total number of mapped allocations is large
+ * (several 100,000) a large amount of time will be spent waiting on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ * memory is used, it simply does not work on virtual memory. Certain
+ * Linux structures (e.g. the superblock) use them and might be embedded
+ * into a structure from Illumos. This makes using Linux virtual memory
+ * unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
*/
-#define vmem_alloc(sz, fl) vmalloc_nofail((sz), (fl))
-#define vmem_zalloc(sz, fl) vzalloc_nofail((sz), (fl))
-#define vmem_free(ptr, sz) ((void)(sz), vfree(ptr))
-#endif /* DEBUG_KMEM */
+#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz))
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
int spl_vmem_init(void);
void spl_vmem_fini(void);
Default value: \fB0\fR.
.RE
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_warn\fR (uint)
+.ad
+.RS 12n
+As a general rule kmem_alloc() allocations should be small, preferably
+just a few pages since they must by physically contiguous. Therefore, a
+rate limited warning will be printed to the console for any kmem_alloc()
+which exceeds a reasonable threshold.
+
+The default warning threshold is set to eight pages but capped at 32K to
+accommodate systems using large pages. This value was selected to be small
+enough to ensure the largest allocations are quickly noticed and fixed.
+But large enough to avoid logging any warnings when a allocation size is
+larger than optimal but not a serious concern. Since this value is tunable,
+developers are encouraged to set it lower when testing so any new largish
+allocations are quickly caught. These warnings may be disabled by setting
+the threshold to zero.
+.sp
+Default value: \fB32K\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_max\fR (uint)
+.ad
+.RS 12n
+Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+Allocations which are marginally smaller than this limit may succeed but
+should still be avoided due to the expense of locating a contiguous range
+of free pages. Therefore, a maximum kmem size with reasonable safely
+margin of 4x is set. Kmem_alloc() allocations larger than this maximum
+will quickly fail. Vmem_alloc() allocations less than or equal to this
+value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+.sp
+Default value: \fBKMALLOC_MAX_SIZE/4\fR.
+.RE
+
.sp
.ne 2
.na
* One serious concern I do have about this method is the relatively
* small virtual address space on 32bit arches. This will seriously
* constrain the size of the slab caches and their performance.
- *
- * XXX: Improve the partial slab list by carefully maintaining a
- * strict ordering of fullest to emptiest slabs based on
- * the slab reference count. This guarantees that when freeing
- * slabs back to the system we need only linearly traverse the
- * last N slabs in the list to discover all the freeable slabs.
- *
- * XXX: NUMA awareness for optionally allocating memory close to a
- * particular core. This can be advantageous if you know the slab
- * object will be short lived and primarily accessed from one core.
- *
- * XXX: Slab coloring may also yield performance improvements and would
- * be desirable to implement.
*/
struct list_head spl_kmem_cache_list; /* List of caches */
static void *
kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
{
+ gfp_t lflags = kmem_flags_convert(flags);
void *ptr;
ASSERT(ISP2(size));
if (skc->skc_flags & KMC_KMEM)
- ptr = (void *)__get_free_pages(flags | __GFP_COMP,
- get_order(size));
+ ptr = (void *)__get_free_pages(lflags, get_order(size));
else
- ptr = __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
/* Resulting allocated memory will be page aligned */
ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
}
/*
- * Traverse all the partial slabs attached to a cache and free those
- * which which are currently empty, and have not been touched for
- * skc_delay seconds to avoid thrashing. The count argument is
- * passed to optionally cap the number of slabs reclaimed, a count
- * of zero means try and reclaim everything. When flag is set we
- * always free an available slab regardless of age.
+ * Traverse all the partial slabs attached to a cache and free those which
+ * are currently empty, and have not been touched for skc_delay seconds to
+ * avoid thrashing. The count argument is passed to optionally cap the
+ * number of slabs reclaimed, a count of zero means try and reclaim
+ * everything. When flag the is set available slabs freed regardless of age.
*/
static void
spl_slab_reclaim(spl_kmem_cache_t *skc, int count, int flag)
static int
spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
{
+ gfp_t lflags = kmem_flags_convert(flags);
spl_kmem_emergency_t *ske;
int empty;
if (!empty)
return (-EEXIST);
- ske = kmalloc(sizeof (*ske), flags);
+ ske = kmalloc(sizeof (*ske), lflags);
if (ske == NULL)
return (-ENOMEM);
- ske->ske_obj = kmalloc(skc->skc_obj_size, flags);
+ ske->ske_obj = kmalloc(skc->skc_obj_size, lflags);
if (ske->ske_obj == NULL) {
kfree(ske);
return (-ENOMEM);
int size = sizeof (spl_kmem_magazine_t) +
sizeof (void *) * skc->skc_mag_size;
- skm = kmem_alloc_node(size, KM_SLEEP, cpu_to_node(cpu));
+ skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
if (skm) {
skm->skm_magic = SKM_MAGIC;
skm->skm_avail = 0;
static void
spl_magazine_free(spl_kmem_magazine_t *skm)
{
- int size = sizeof (spl_kmem_magazine_t) +
- sizeof (void *) * skm->skm_size;
-
ASSERT(skm->skm_magic == SKM_MAGIC);
ASSERT(skm->skm_avail == 0);
-
- kmem_free(skm, size);
+ kfree(skm);
}
/*
spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
void *priv, void *vmp, int flags)
{
+ gfp_t lflags = kmem_flags_convert(KM_SLEEP);
spl_kmem_cache_t *skc;
int rc;
* Allocate memory for a new cache and initialize it. Unfortunately,
* this usually ends up being a large allocation of ~32k because
* we need to allocate enough memory for the worst case number of
- * cpus in the magazine, skc_mag[NR_CPUS]. Because of this we
- * explicitly pass KM_NODEBUG to suppress the kmem warning
+ * cpus in the magazine, skc_mag[NR_CPUS].
*/
- skc = kmem_zalloc(sizeof (*skc), KM_SLEEP| KM_NODEBUG);
+ skc = kzalloc(sizeof (*skc), lflags);
if (skc == NULL)
return (NULL);
skc->skc_magic = SKC_MAGIC;
skc->skc_name_size = strlen(name) + 1;
- skc->skc_name = (char *)kmem_alloc(skc->skc_name_size, KM_SLEEP);
+ skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
if (skc->skc_name == NULL) {
- kmem_free(skc, sizeof (*skc));
+ kfree(skc);
return (NULL);
}
strncpy(skc->skc_name, name, skc->skc_name_size);
goto out;
}
- kmem_cache_set_allocflags(skc, __GFP_COMP);
+#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
+ skc->skc_linux_cache->allocflags |= __GFP_COMP;
+#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
+ skc->skc_linux_cache->gfpflags |= __GFP_COMP;
+#endif
skc->skc_flags |= KMC_NOMAGAZINE;
}
return (skc);
out:
- kmem_free(skc->skc_name, skc->skc_name_size);
- kmem_free(skc, sizeof (*skc));
+ kfree(skc->skc_name);
+ kfree(skc);
return (NULL);
}
EXPORT_SYMBOL(spl_kmem_cache_create);
ASSERT3U(skc->skc_obj_emergency, ==, 0);
ASSERT(list_empty(&skc->skc_complete_list));
- kmem_free(skc->skc_name, skc->skc_name_size);
spin_unlock(&skc->skc_lock);
- kmem_free(skc, sizeof (*skc));
+ kfree(skc->skc_name);
+ kfree(skc);
}
EXPORT_SYMBOL(spl_kmem_cache_destroy);
spl_kmem_cache_t *skc = ska->ska_cache;
spl_kmem_slab_t *sks;
- sks = spl_slab_alloc(skc, ska->ska_flags | __GFP_NORETRY | KM_NODEBUG);
+#if defined(PF_MEMALLOC_NOIO)
+ unsigned noio_flag = memalloc_noio_save();
+ sks = spl_slab_alloc(skc, ska->ska_flags);
+ memalloc_noio_restore(noio_flag);
+#else
+ sks = spl_slab_alloc(skc, ska->ska_flags);
+#endif
spin_lock(&skc->skc_lock);
if (sks) {
skc->skc_slab_total++;
static int
spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
{
- int remaining, rc;
+ int remaining, rc = 0;
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT((skc->skc_flags & KMC_SLAB) == 0);
might_sleep();
if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
spl_kmem_alloc_t *ska;
- ska = kmalloc(sizeof (*ska), flags);
+ ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
if (ska == NULL) {
clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
wake_up_all(&skc->skc_waitq);
atomic_inc(&skc->skc_ref);
ska->ska_cache = skc;
- ska->ska_flags = flags & ~__GFP_FS;
+ ska->ska_flags = flags;
taskq_init_ent(&ska->ska_tqe);
taskq_dispatch_ent(spl_kmem_cache_taskq,
spl_cache_grow_work, ska, 0, &ska->ska_tqe);
spl_kmem_magazine_t *skm;
void *obj = NULL;
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
- ASSERT(flags & KM_SLEEP);
atomic_inc(&skc->skc_ref);
*/
if (skc->skc_flags & KMC_SLAB) {
struct kmem_cache *slc = skc->skc_linux_cache;
-
do {
- obj = kmem_cache_alloc(slc, flags | __GFP_COMP);
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
} while ((obj == NULL) && !(flags & KM_NOSLEEP));
goto ret;
* are guaranteed to have physical addresses. They must be removed
* from the tree of emergency objects and the freed.
*/
- if ((skc->skc_flags & KMC_VMEM) && !kmem_virt(obj)) {
+ if ((skc->skc_flags & KMC_VMEM) && !is_vmalloc_addr(obj)) {
spl_emergency_free(skc, obj);
goto out;
}
*/
#include <sys/debug.h>
+#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages since they must by physically contiguous. Therefore, a
+ * rate limited warning will be printed to the console for any kmem_alloc()
+ * which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to eight pages but capped at 32K to
+ * accommodate systems using large pages. This value was selected to be small
+ * enough to ensure the largest allocations are quickly noticed and fixed.
+ * But large enough to avoid logging any warnings when a allocation size is
+ * larger than optimal but not a serious concern. Since this value is tunable,
+ * developers are encouraged to set it lower when testing so any new largish
+ * allocations are quickly caught. These warnings may be disabled by setting
+ * the threshold to zero.
+ */
+unsigned int spl_kmem_alloc_warn = MAX(8 * PAGE_SIZE, 32 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+ "Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages. Therefore, a maximum kmem size with reasonable safely
+ * margin of 4x is set. Kmem_alloc() allocations larger than this maximum
+ * will quickly fail. Vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+ "Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
int
kmem_debugging(void)
int n;
n = strlen(str);
- ptr = kmalloc_nofail(n + 1, flags);
+ ptr = kmalloc(n + 1, kmem_flags_convert(flags));
if (ptr)
memcpy(ptr, str, n + 1);
EXPORT_SYMBOL(strfree);
/*
- * Memory allocation interfaces and debugging for basic kmem_*
- * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
- * the SPL will keep track of the total memory allocated, and
- * report any memory leaked when the module is unloaded.
+ * Limit the number of large allocation stack traces dumped to not more than
+ * 5 every 60 seconds to prevent denial-of-service attacks from debug code.
+ */
+DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5);
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable. Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ /*
+ * Log abnormally large allocations and rate limit the console output.
+ * Allocations larger than spl_kmem_alloc_warn should be performed
+ * through the vmem_alloc()/vmem_zalloc() interfaces.
+ */
+ if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+ !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) {
+ printk(KERN_WARNING
+ "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+ "https://github.com/zfsonlinux/zfs/issues/new\n",
+ (unsigned long)size, flags);
+ dump_stack();
+ }
+
+ /*
+ * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
+ * unlike kmem_alloc() with KM_SLEEP on Illumos.
+ */
+ do {
+ /*
+ * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
+ * is unsafe. This must fail for all for kmem_alloc() and
+ * kmem_zalloc() callers.
+ *
+ * For vmem_alloc() and vmem_zalloc() callers it is permissible
+ * to use __vmalloc(). However, in general use of __vmalloc()
+ * is strongly discouraged because a global lock must be
+ * acquired. Contention on this lock can significantly
+ * impact performance so frequently manipulating the virtual
+ * address space is strongly discouraged.
+ */
+ if (unlikely(size > spl_kmem_alloc_max)) {
+ if (flags & KM_VMEM) {
+ ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+ } else {
+ return (NULL);
+ }
+ } else {
+ ptr = kmalloc_node(size, lflags, node);
+ }
+
+ if (likely(ptr) || (flags & KM_NOSLEEP))
+ return (ptr);
+
+ if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) {
+ printk(KERN_WARNING
+ "Possible memory allocation deadlock: "
+ "size=%lu lflags=0x%x",
+ (unsigned long)size, lflags);
+ dump_stack();
+ }
+
+ /*
+ * Use cond_resched() instead of congestion_wait() to avoid
+ * deadlocking systems where there are no block devices.
+ */
+ cond_resched();
+ } while (1);
+
+ return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+ if (is_vmalloc_addr(buf))
+ vfree(buf);
+ else
+ kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* * style allocations. When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
*/
#ifdef DEBUG_KMEM
EXPORT_SYMBOL(kmem_alloc_used);
EXPORT_SYMBOL(kmem_alloc_max);
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+ void *ptr;
+
+ ptr = spl_kmem_alloc_impl(size, flags, node);
+ if (ptr) {
+ kmem_alloc_used_add(size);
+ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+ kmem_alloc_max = kmem_alloc_used_read();
+ }
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+ kmem_alloc_used_sub(size);
+ spl_kmem_free_impl(ptr, size);
+}
+
/*
* When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
* but also the location of every alloc and free. When the SPL module is
* contended particularly on xfree(). If we want to run with this detailed
* debugging enabled for anything other than debugging we need to minimize
* the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
*/
#ifdef DEBUG_KMEM_TRACKING
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
#define KMEM_HASH_BITS 10
#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
int kd_line; /* Allocation line */
} kmem_debug_t;
-spinlock_t kmem_lock;
-struct hlist_head kmem_table[KMEM_TABLE_SIZE];
-struct list_head kmem_list;
-
-EXPORT_SYMBOL(kmem_lock);
-EXPORT_SYMBOL(kmem_table);
-EXPORT_SYMBOL(kmem_list);
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
static kmem_debug_t *
kmem_del_init(spinlock_t *lock, struct hlist_head *table,
return (NULL);
}
-void *
-kmem_alloc_track(size_t size, int flags, const char *func, int line,
- int node_alloc, int node)
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node)
{
void *ptr = NULL;
kmem_debug_t *dptr;
unsigned long irq_flags;
- /* Function may be called with KM_NOSLEEP so failure is possible */
- dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t),
- flags & ~__GFP_ZERO);
+ dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+ if (dptr == NULL)
+ return (NULL);
- if (unlikely(dptr == NULL)) {
- printk(KERN_WARNING "debug kmem_alloc(%ld, 0x%x) at %s:%d "
- "failed (%lld/%llu)\n", sizeof (kmem_debug_t), flags,
- func, line, kmem_alloc_used_read(), kmem_alloc_max);
- } else {
- /*
- * Marked unlikely because we should never be doing this,
- * we tolerate to up 2 pages but a single page is best.
- */
- if (unlikely((size > PAGE_SIZE*2) && !(flags & KM_NODEBUG))) {
- printk(KERN_WARNING "large kmem_alloc(%llu, 0x%x) "
- "at %s:%d failed (%lld/%llu)\n",
- (unsigned long long)size, flags, func, line,
- kmem_alloc_used_read(), kmem_alloc_max);
- spl_dumpstack();
- }
-
- /*
- * We use __strdup() below because the string pointed to by
- * __FUNCTION__ might not be available by the time we want
- * to print it since the module might have been unloaded.
- * This can only fail in the KM_NOSLEEP case.
- */
- dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
- if (unlikely(dptr->kd_func == NULL)) {
- kfree(dptr);
- printk(KERN_WARNING "debug __strdup() at %s:%d "
- "failed (%lld/%llu)\n", func, line,
- kmem_alloc_used_read(), kmem_alloc_max);
- goto out;
- }
-
- /* Use the correct allocator */
- if (node_alloc) {
- ASSERT(!(flags & __GFP_ZERO));
- ptr = kmalloc_node_nofail(size, flags, node);
- } else if (flags & __GFP_ZERO) {
- ptr = kzalloc_nofail(size, flags & ~__GFP_ZERO);
- } else {
- ptr = kmalloc_nofail(size, flags);
- }
+ dptr->kd_func = __strdup(func, flags);
+ if (dptr->kd_func == NULL) {
+ kfree(dptr);
+ return (NULL);
+ }
- if (unlikely(ptr == NULL)) {
- kfree(dptr->kd_func);
- kfree(dptr);
- printk(KERN_WARNING "kmem_alloc(%llu, 0x%x) "
- "at %s:%d failed (%lld/%llu)\n",
- (unsigned long long) size, flags, func, line,
- kmem_alloc_used_read(), kmem_alloc_max);
- goto out;
- }
+ ptr = spl_kmem_alloc_debug(size, flags, node);
+ if (ptr == NULL) {
+ kfree(dptr->kd_func);
+ kfree(dptr);
+ return (NULL);
+ }
- kmem_alloc_used_add(size);
- if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
- kmem_alloc_max = kmem_alloc_used_read();
+ INIT_HLIST_NODE(&dptr->kd_hlist);
+ INIT_LIST_HEAD(&dptr->kd_list);
- INIT_HLIST_NODE(&dptr->kd_hlist);
- INIT_LIST_HEAD(&dptr->kd_list);
+ dptr->kd_addr = ptr;
+ dptr->kd_size = size;
+ dptr->kd_line = line;
- dptr->kd_addr = ptr;
- dptr->kd_size = size;
- dptr->kd_line = line;
+ spin_lock_irqsave(&kmem_lock, irq_flags);
+ hlist_add_head(&dptr->kd_hlist,
+ &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+ list_add_tail(&dptr->kd_list, &kmem_list);
+ spin_unlock_irqrestore(&kmem_lock, irq_flags);
- spin_lock_irqsave(&kmem_lock, irq_flags);
- hlist_add_head(&dptr->kd_hlist,
- &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
- list_add_tail(&dptr->kd_list, &kmem_list);
- spin_unlock_irqrestore(&kmem_lock, irq_flags);
- }
-out:
return (ptr);
}
-EXPORT_SYMBOL(kmem_alloc_track);
-void
-kmem_free_track(const void *ptr, size_t size)
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
{
kmem_debug_t *dptr;
- ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
- (unsigned long long) size);
-
/* Must exist in hash due to kmem_alloc() */
dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
- ASSERT(dptr);
+ ASSERT3P(dptr, !=, NULL);
+ ASSERT3S(dptr->kd_size, ==, size);
- /* Size must match */
- ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
- "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
- (unsigned long long) size, dptr->kd_func, dptr->kd_line);
-
- kmem_alloc_used_sub(size);
kfree(dptr->kd_func);
-
- memset((void *)dptr, 0x5a, sizeof (kmem_debug_t));
kfree(dptr);
- memset((void *)ptr, 0x5a, size);
- kfree(ptr);
+ spl_kmem_free_debug(ptr, size);
}
-EXPORT_SYMBOL(kmem_free_track);
-
-#else /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
void *
-kmem_alloc_debug(size_t size, int flags, const char *func, int line,
- int node_alloc, int node)
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
{
- void *ptr;
-
- /*
- * Marked unlikely because we should never be doing this,
- * we tolerate to up 2 pages but a single page is best.
- */
- if (unlikely((size > PAGE_SIZE * 2) && !(flags & KM_NODEBUG))) {
- printk(KERN_WARNING
- "large kmem_alloc(%llu, 0x%x) at %s:%d (%lld/%llu)\n",
- (unsigned long long)size, flags, func, line,
- (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
- spl_dumpstack();
- }
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
- /* Use the correct allocator */
- if (node_alloc) {
- ASSERT(!(flags & __GFP_ZERO));
- ptr = kmalloc_node_nofail(size, flags, node);
- } else if (flags & __GFP_ZERO) {
- ptr = kzalloc_nofail(size, flags & (~__GFP_ZERO));
- } else {
- ptr = kmalloc_nofail(size, flags);
- }
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
- if (unlikely(ptr == NULL)) {
- printk(KERN_WARNING
- "kmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
- (unsigned long long)size, flags, func, line,
- (unsigned long long)kmem_alloc_used_read(), kmem_alloc_max);
- } else {
- kmem_alloc_used_add(size);
- if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
- kmem_alloc_max = kmem_alloc_used_read();
- }
+ flags |= KM_ZERO;
- return (ptr);
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
}
-EXPORT_SYMBOL(kmem_alloc_debug);
+EXPORT_SYMBOL(spl_kmem_zalloc);
void
-kmem_free_debug(const void *ptr, size_t size)
+spl_kmem_free(const void *buf, size_t size)
{
- ASSERT(ptr || size > 0);
- kmem_alloc_used_sub(size);
- kfree(ptr);
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
}
-EXPORT_SYMBOL(kmem_free_debug);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#endif /* DEBUG_KMEM */
+EXPORT_SYMBOL(spl_kmem_free);
#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
static char *
spin_unlock_irqrestore(lock, flags);
}
-#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
-#define spl_kmem_init_tracking(list, lock, size)
-#define spl_kmem_fini_tracking(list, lock)
#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
int
spl_kmem_init(void)
{
- int rc = 0;
-
#ifdef DEBUG_KMEM
kmem_alloc_used_set(0);
+
+#ifdef DEBUG_KMEM_TRACKING
spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
-#endif
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
- return (rc);
+ return (0);
}
void
*/
if (kmem_alloc_used_read() != 0)
printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
- kmem_alloc_used_read(), kmem_alloc_max);
+ (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+#ifdef DEBUG_KMEM_TRACKING
spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
#endif /* DEBUG_KMEM */
}
.mode = 0444,
.proc_handler = &proc_doulongvec_minmax,
},
- {
- .procname = "vmem_used",
- .data = &vmem_alloc_used,
-# ifdef HAVE_ATOMIC64_T
- .maxlen = sizeof(atomic64_t),
-# else
- .maxlen = sizeof(atomic_t),
-# endif /* HAVE_ATOMIC64_T */
- .mode = 0444,
- .proc_handler = &proc_domemused,
- },
- {
- .procname = "vmem_max",
- .data = &vmem_alloc_max,
- .maxlen = sizeof(unsigned long),
- .extra1 = &table_min,
- .extra2 = &table_max,
- .mode = 0444,
- .proc_handler = &proc_doulongvec_minmax,
- },
{
.procname = "slab_kmem_total",
.data = (void *)(KMC_KMEM | KMC_TOTAL),
if (table == NULL)
return (NULL);
- table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size,
- KM_SLEEP | KM_NODEBUG);
+ table->ht_bins = kmem_zalloc(sizeof(tsd_hash_bin_t) * size, KM_SLEEP);
if (table->ht_bins == NULL) {
kmem_free(table, sizeof(tsd_hash_table_t));
return (NULL);
#include <sys/debug.h>
#include <sys/vmem.h>
+#include <linux/mm_compat.h>
#include <linux/module.h>
vmem_t *heap_arena = NULL;
EXPORT_SYMBOL(vmem_size);
/*
- * Memory allocation interfaces and debugging for basic kmem_*
- * and vmem_* style memory allocation. When DEBUG_KMEM is enabled
- * the SPL will keep track of the total memory allocated, and
- * report any memory leaked when the module is unloaded.
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
*/
-#ifdef DEBUG_KMEM
-
-/* Shim layer memory accounting */
-#ifdef HAVE_ATOMIC64_T
-atomic64_t vmem_alloc_used = ATOMIC64_INIT(0);
-unsigned long long vmem_alloc_max = 0;
-#else /* HAVE_ATOMIC64_T */
-atomic_t vmem_alloc_used = ATOMIC_INIT(0);
-unsigned long long vmem_alloc_max = 0;
-#endif /* HAVE_ATOMIC64_T */
-
-EXPORT_SYMBOL(vmem_alloc_used);
-EXPORT_SYMBOL(vmem_alloc_max);
-
-/*
- * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
- * but also the location of every alloc and free. When the SPL module is
- * unloaded a list of all leaked addresses and where they were allocated
- * will be dumped to the console. Enabling this feature has a significant
- * impact on performance but it makes finding memory leaks straight forward.
- *
- * Not surprisingly with debugging enabled the xmem_locks are very highly
- * contended particularly on xfree(). If we want to run with this detailed
- * debugging enabled for anything other than debugging we need to minimize
- * the contention by moving to a lock per xmem_table entry model.
- */
-#ifdef DEBUG_KMEM_TRACKING
-
-#define VMEM_HASH_BITS 10
-#define VMEM_TABLE_SIZE (1 << VMEM_HASH_BITS)
-
-typedef struct kmem_debug {
- struct hlist_node kd_hlist; /* Hash node linkage */
- struct list_head kd_list; /* List of all allocations */
- void *kd_addr; /* Allocation pointer */
- size_t kd_size; /* Allocation size */
- const char *kd_func; /* Allocation function */
- int kd_line; /* Allocation line */
-} kmem_debug_t;
-
-spinlock_t vmem_lock;
-struct hlist_head vmem_table[VMEM_TABLE_SIZE];
-struct list_head vmem_list;
-
-EXPORT_SYMBOL(vmem_lock);
-EXPORT_SYMBOL(vmem_table);
-EXPORT_SYMBOL(vmem_list);
-
void *
-vmem_alloc_track(size_t size, int flags, const char *func, int line)
-{
- void *ptr = NULL;
- kmem_debug_t *dptr;
- unsigned long irq_flags;
-
- ASSERT(flags & KM_SLEEP);
-
- /* Function may be called with KM_NOSLEEP so failure is possible */
- dptr = (kmem_debug_t *) kmalloc_nofail(sizeof (kmem_debug_t),
- flags & ~__GFP_ZERO);
- if (unlikely(dptr == NULL)) {
- printk(KERN_WARNING "debug vmem_alloc(%ld, 0x%x) "
- "at %s:%d failed (%lld/%llu)\n",
- sizeof (kmem_debug_t), flags, func, line,
- vmem_alloc_used_read(), vmem_alloc_max);
- } else {
- /*
- * We use __strdup() below because the string pointed to by
- * __FUNCTION__ might not be available by the time we want
- * to print it, since the module might have been unloaded.
- * This can never fail because we have already asserted
- * that flags is KM_SLEEP.
- */
- dptr->kd_func = __strdup(func, flags & ~__GFP_ZERO);
- if (unlikely(dptr->kd_func == NULL)) {
- kfree(dptr);
- printk(KERN_WARNING "debug __strdup() at %s:%d "
- "failed (%lld/%llu)\n", func, line,
- vmem_alloc_used_read(), vmem_alloc_max);
- goto out;
- }
-
- /* Use the correct allocator */
- if (flags & __GFP_ZERO) {
- ptr = vzalloc_nofail(size, flags & ~__GFP_ZERO);
- } else {
- ptr = vmalloc_nofail(size, flags);
- }
-
- if (unlikely(ptr == NULL)) {
- kfree(dptr->kd_func);
- kfree(dptr);
- printk(KERN_WARNING "vmem_alloc (%llu, 0x%x) "
- "at %s:%d failed (%lld/%llu)\n",
- (unsigned long long) size, flags, func, line,
- vmem_alloc_used_read(), vmem_alloc_max);
- goto out;
- }
-
- vmem_alloc_used_add(size);
- if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
- vmem_alloc_max = vmem_alloc_used_read();
-
- INIT_HLIST_NODE(&dptr->kd_hlist);
- INIT_LIST_HEAD(&dptr->kd_list);
-
- dptr->kd_addr = ptr;
- dptr->kd_size = size;
- dptr->kd_line = line;
-
- spin_lock_irqsave(&vmem_lock, irq_flags);
- hlist_add_head(&dptr->kd_hlist,
- &vmem_table[hash_ptr(ptr, VMEM_HASH_BITS)]);
- list_add_tail(&dptr->kd_list, &vmem_list);
- spin_unlock_irqrestore(&vmem_lock, irq_flags);
- }
-out:
- return (ptr);
-}
-EXPORT_SYMBOL(vmem_alloc_track);
-
-void
-vmem_free_track(const void *ptr, size_t size)
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
{
- kmem_debug_t *dptr;
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
- ASSERTF(ptr || size > 0, "ptr: %p, size: %llu", ptr,
- (unsigned long long) size);
+ flags |= KM_VMEM;
- /* Must exist in hash due to vmem_alloc() */
- dptr = kmem_del_init(&vmem_lock, vmem_table, VMEM_HASH_BITS, ptr);
- ASSERT(dptr);
-
- /* Size must match */
- ASSERTF(dptr->kd_size == size, "kd_size (%llu) != size (%llu), "
- "kd_func = %s, kd_line = %d\n", (unsigned long long) dptr->kd_size,
- (unsigned long long) size, dptr->kd_func, dptr->kd_line);
-
- vmem_alloc_used_sub(size);
- kfree(dptr->kd_func);
-
- memset((void *)dptr, 0x5a, sizeof (kmem_debug_t));
- kfree(dptr);
-
- memset((void *)ptr, 0x5a, size);
- vfree(ptr);
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
}
-EXPORT_SYMBOL(vmem_free_track);
-
-#else /* DEBUG_KMEM_TRACKING */
+EXPORT_SYMBOL(spl_vmem_alloc);
void *
-vmem_alloc_debug(size_t size, int flags, const char *func, int line)
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
{
- void *ptr;
-
- ASSERT(flags & KM_SLEEP);
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
- /* Use the correct allocator */
- if (flags & __GFP_ZERO) {
- ptr = vzalloc_nofail(size, flags & (~__GFP_ZERO));
- } else {
- ptr = vmalloc_nofail(size, flags);
- }
+ flags |= (KM_VMEM | KM_ZERO);
- if (unlikely(ptr == NULL)) {
- printk(KERN_WARNING
- "vmem_alloc(%llu, 0x%x) at %s:%d failed (%lld/%llu)\n",
- (unsigned long long)size, flags, func, line,
- (unsigned long long)vmem_alloc_used_read(), vmem_alloc_max);
- } else {
- vmem_alloc_used_add(size);
- if (unlikely(vmem_alloc_used_read() > vmem_alloc_max))
- vmem_alloc_max = vmem_alloc_used_read();
- }
-
- return (ptr);
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
}
-EXPORT_SYMBOL(vmem_alloc_debug);
+EXPORT_SYMBOL(spl_vmem_zalloc);
void
-vmem_free_debug(const void *ptr, size_t size)
-{
- ASSERT(ptr || size > 0);
- vmem_alloc_used_sub(size);
- vfree(ptr);
-}
-EXPORT_SYMBOL(vmem_free_debug);
-
-#endif /* DEBUG_KMEM_TRACKING */
-#endif /* DEBUG_KMEM */
-
-#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
-static char *
-spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
-{
- int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
- int i, flag = 1;
-
- ASSERT(str != NULL && len >= 17);
- memset(str, 0, len);
-
- /*
- * Check for a fully printable string, and while we are at
- * it place the printable characters in the passed buffer.
- */
- for (i = 0; i < size; i++) {
- str[i] = ((char *)(kd->kd_addr))[i];
- if (isprint(str[i])) {
- continue;
- } else {
- /*
- * Minimum number of printable characters found
- * to make it worthwhile to print this as ascii.
- */
- if (i > min)
- break;
-
- flag = 0;
- break;
- }
- }
-
- if (!flag) {
- sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
- *((uint8_t *)kd->kd_addr),
- *((uint8_t *)kd->kd_addr + 2),
- *((uint8_t *)kd->kd_addr + 4),
- *((uint8_t *)kd->kd_addr + 6),
- *((uint8_t *)kd->kd_addr + 8),
- *((uint8_t *)kd->kd_addr + 10),
- *((uint8_t *)kd->kd_addr + 12),
- *((uint8_t *)kd->kd_addr + 14));
- }
-
- return (str);
-}
-
-static int
-spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
-{
- int i;
-
- spin_lock_init(lock);
- INIT_LIST_HEAD(list);
-
- for (i = 0; i < size; i++)
- INIT_HLIST_HEAD(&kmem_table[i]);
-
- return (0);
-}
-
-static void
-spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
+spl_vmem_free(const void *buf, size_t size)
{
- unsigned long flags;
- kmem_debug_t *kd;
- char str[17];
-
- spin_lock_irqsave(lock, flags);
- if (!list_empty(list))
- printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
- "size", "data", "func", "line");
-
- list_for_each_entry(kd, list, kd_list)
- printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
- (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
- kd->kd_func, kd->kd_line);
-
- spin_unlock_irqrestore(lock, flags);
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
}
-#else /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
-#define spl_kmem_init_tracking(list, lock, size)
-#define spl_kmem_fini_tracking(list, lock)
-#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
+EXPORT_SYMBOL(spl_vmem_free);
int
spl_vmem_init(void)
{
- int rc = 0;
-
-#ifdef DEBUG_KMEM
- vmem_alloc_used_set(0);
- spl_kmem_init_tracking(&vmem_list, &vmem_lock, VMEM_TABLE_SIZE);
-#endif
-
- return (rc);
+ return (0);
}
void
spl_vmem_fini(void)
{
-#ifdef DEBUG_KMEM
- /*
- * Display all unreclaimed memory addresses, including the
- * allocation size and the first few bytes of what's located
- * at that address to aid in debugging. Performance is not
- * a serious concern here since it is module unload time.
- */
- if (vmem_alloc_used_read() != 0)
- printk(KERN_WARNING "vmem leaked %ld/%llu bytes\n",
- vmem_alloc_used_read(), vmem_alloc_max);
-
- spl_kmem_fini_tracking(&vmem_list, &vmem_lock);
-#endif /* DEBUG_KMEM */
}
int size = PAGE_SIZE;
int i, count, rc = 0;
- while ((!rc) && (size <= (PAGE_SIZE * 32))) {
+ while ((!rc) && (size <= spl_kmem_alloc_warn)) {
count = 0;
for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) {
- ptr[i] = kmem_alloc(size, KM_SLEEP | KM_NODEBUG);
+ ptr[i] = kmem_alloc(size, KM_SLEEP);
if (ptr[i])
count++;
}
int size = PAGE_SIZE;
int i, j, count, rc = 0;
- while ((!rc) && (size <= (PAGE_SIZE * 32))) {
+ while ((!rc) && (size <= spl_kmem_alloc_warn)) {
count = 0;
for (i = 0; i < SPLAT_KMEM_ALLOC_COUNT; i++) {
- ptr[i] = kmem_zalloc(size, KM_SLEEP | KM_NODEBUG);
+ ptr[i] = kmem_zalloc(size, KM_SLEEP);
if (ptr[i])
count++;
}
int size = PAGE_SIZE;
int i, count, rc = 0;
- while ((!rc) && (size <= (PAGE_SIZE * 1024))) {
+ /*
+ * Test up to 4x the maximum kmem_alloc() size to ensure both
+ * the kmem_alloc() and vmem_alloc() call paths are used.
+ */
+ while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) {
count = 0;
for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) {
int size = PAGE_SIZE;
int i, j, count, rc = 0;
- while ((!rc) && (size <= (PAGE_SIZE * 1024))) {
+ /*
+ * Test up to 4x the maximum kmem_zalloc() size to ensure both
+ * the kmem_zalloc() and vmem_zalloc() call paths are used.
+ */
+ while ((!rc) && (size <= (4 * spl_kmem_alloc_max))) {
count = 0;
for (i = 0; i < SPLAT_VMEM_ALLOC_COUNT; i++) {