granicus.if.org Git - zfs/blob - include/sys/arc_impl.h

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27
  28 #ifndef _SYS_ARC_IMPL_H
  29 #define _SYS_ARC_IMPL_H
  30
  31 #include <sys/arc.h>
  32
  33 #ifdef __cplusplus
  34 extern "C" {
  35 #endif
  36
  37 /*
  38  * Note that buffers can be in one of 6 states:
  39  *      ARC_anon        - anonymous (discussed below)
  40  *      ARC_mru         - recently used, currently cached
  41  *      ARC_mru_ghost   - recently used, no longer in cache
  42  *      ARC_mfu         - frequently used, currently cached
  43  *      ARC_mfu_ghost   - frequently used, no longer in cache
  44  *      ARC_l2c_only    - exists in L2ARC but not other states
  45  * When there are no active references to the buffer, they are
  46  * linked onto a list in one of these arc states.  These are
  47  * the only buffers that can be evicted or deleted.  Within each
  48  * state there are multiple lists, one for meta-data and one for
  49  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
  50  * etc.) is tracked separately so that it can be managed more
  51  * explicitly: favored over data, limited explicitly.
  52  *
  53  * Anonymous buffers are buffers that are not associated with
  54  * a DVA.  These are buffers that hold dirty block copies
  55  * before they are written to stable storage.  By definition,
  56  * they are "ref'd" and are considered part of arc_mru
  57  * that cannot be freed.  Generally, they will acquire a DVA
  58  * as they are written and migrate onto the arc_mru list.
  59  *
  60  * The ARC_l2c_only state is for buffers that are in the second
  61  * level ARC but no longer in any of the ARC_m* lists.  The second
  62  * level ARC itself may also contain buffers that are in any of
  63  * the ARC_m* states - meaning that a buffer can exist in two
  64  * places.  The reason for the ARC_l2c_only state is to keep the
  65  * buffer header in the hash table, so that reads that hit the
  66  * second level ARC benefit from these fast lookups.
  67  */
  68
  69 typedef struct arc_state {      /* bookkeeping for one ARC state */
  70         /*
  71          * lists of evictable buffers, one list per buffer type
  72          */
  73         multilist_t arcs_list[ARC_BUFC_NUMTYPES];
  74         /*
  75          * total amount of evictable data in this state, per buffer type
  76          */
  77         refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
  78         /*
  79          * total amount of data in this state; this includes: evictable,
  80          * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
  81          */
  82         refcount_t arcs_size;
  83         /*
  84          * supports the "dbufs" kstat (presumably identifies this state; confirm)
  85          */
  86         arc_state_type_t arcs_state;
  87 } arc_state_t;
  88
  89 typedef struct arc_callback arc_callback_t;
  90
  91 struct arc_callback {   /* one node in a singly linked chain of callbacks */
  92         void                    *acb_private;   /* opaque; presumably passed to acb_done (confirm) */
  93         arc_done_func_t         *acb_done;      /* callback function pointer */
  94         arc_buf_t               *acb_buf;       /* buffer associated with this callback */
  95         boolean_t               acb_compressed; /* presumably: compressed data requested (confirm) */
  96         zio_t                   *acb_zio_dummy; /* NOTE(review): purpose not visible in this header */
  97         arc_callback_t          *acb_next;      /* next callback in the chain */
  98 };
  99
 100 typedef struct arc_write_callback arc_write_callback_t;
 101
 102 struct arc_write_callback {     /* callback set for an ARC write, going by the name */
 103         void            *awcb_private;  /* opaque pointer; its consumer is not visible here */
 104         arc_done_func_t *awcb_ready;    /* hook named for the "ready" stage (confirm call point) */
 105         arc_done_func_t *awcb_children_ready;   /* hook named for "children ready" (confirm) */
 106         arc_done_func_t *awcb_physdone; /* hook named for "physdone" (confirm call point) */
 107         arc_done_func_t *awcb_done;     /* hook named for "done" (confirm call point) */
 108         arc_buf_t       *awcb_buf;      /* presumably the buffer being written (confirm) */
 109 };
 110
 111 /*
 112  * ARC buffers are separated into multiple structs as a memory saving measure:
 113  *   - Common fields struct, always defined, and embedded within it:
 114  *       - L2-only fields, always allocated but undefined when not in L2ARC
 115  *       - L1-only fields, only allocated when in L1ARC
 116  *
 117  *           Buffer in L1                     Buffer only in L2
 118  *    +------------------------+          +------------------------+
 119  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 120  *    |                        |          |                        |
 121  *    |                        |          |                        |
 122  *    |                        |          |                        |
 123  *    +------------------------+          +------------------------+
 124  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 125  *    | (undefined if L1-only) |          |                        |
 126  *    +------------------------+          +------------------------+
 127  *    | l1arc_buf_hdr_t        |
 128  *    |                        |
 129  *    |                        |
 130  *    |                        |
 131  *    |                        |
 132  *    +------------------------+
 133  *
 134  * Because it's possible for the L2ARC to become extremely large, we can wind
 135  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 136  * is minimized by only allocating the fields necessary for an L1-cached buffer
 137  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 138  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 139  * words in pointers. arc_hdr_realloc() is used to switch a header between
 140  * these two allocation states.
 141  */
 142 typedef struct l1arc_buf_hdr {  /* L1-only part of a header; see the layout comment above */
 143         kmutex_t                b_freeze_lock;  /* presumably guards b_freeze_cksum (confirm) */
 144         zio_cksum_t             *b_freeze_cksum;
 145
 146         arc_buf_t               *b_buf;
 147         uint32_t                b_bufcnt;       /* presumably the number of bufs on b_buf (confirm) */
 148         /* for waiting on writes to complete */
 149         kcondvar_t              b_cv;
 150         uint8_t                 b_byteswap;
 151
 152
 153         /* protected by arc state mutex (NOTE(review): arc_state_t shows no mutex member here; confirm which lock) */
 154         arc_state_t             *b_state;       /* ARC state this header is currently in */
 155         multilist_node_t        b_arc_node;     /* presumably linkage on a state's arcs_list */
 156
 157         /* updated atomically (access timestamp and hit counters, going by the names) */
 158         clock_t                 b_arc_access;
 159         uint32_t                b_mru_hits;
 160         uint32_t                b_mru_ghost_hits;
 161         uint32_t                b_mfu_hits;
 162         uint32_t                b_mfu_ghost_hits;
 163         uint32_t                b_l2_hits;
 164
 165         /* self protecting */
 166         refcount_t              b_refcnt;       /* presumably the active references to the buffer */
 167
 168         arc_callback_t          *b_acb;         /* head of a callback chain (linked via acb_next) */
 169         void                    *b_pdata;
 170 } l1arc_buf_hdr_t;
 171
 172 typedef struct l2arc_dev {      /* state kept for one L2ARC device */
 173         vdev_t                  *l2ad_vdev;     /* vdev */
 174         spa_t                   *l2ad_spa;      /* spa */
 175         uint64_t                l2ad_hand;      /* next write location */
 176         uint64_t                l2ad_start;     /* first addr on device */
 177         uint64_t                l2ad_end;       /* last addr on device */
 178         boolean_t               l2ad_first;     /* first sweep through */
 179         boolean_t               l2ad_writing;   /* currently writing */
 180         kmutex_t                l2ad_mtx;       /* lock for l2ad_buflist */
 181         list_t                  l2ad_buflist;   /* buffer list (guarded by l2ad_mtx) */
 182         list_node_t             l2ad_node;      /* device list node */
 183         refcount_t              l2ad_alloc;     /* allocated bytes */
 184 } l2arc_dev_t;
 185
 186 typedef struct l2arc_buf_hdr {  /* L2ARC fields, embedded in arc_buf_hdr as b_l2hdr */
 187         /* protected by arc_buf_hdr mutex */
 188         l2arc_dev_t             *b_dev;         /* L2ARC device */
 189         uint64_t                b_daddr;        /* disk address, offset byte */
 190         uint32_t                b_hits;         /* hit counter, presumably L2ARC hits (confirm) */
 191
 192         list_node_t             b_l2node;       /* list linkage, presumably on l2ad_buflist (confirm) */
 193 } l2arc_buf_hdr_t;
 194
 195 typedef struct l2arc_write_callback {   /* data for an L2ARC write callback, going by the name */
 196         l2arc_dev_t     *l2wcb_dev;             /* device info */
 197         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 198 } l2arc_write_callback_t;
 199
 200 struct arc_buf_hdr {    /* common header fields plus embedded L2 and L1 sub-headers */
 201         /* protected by hash lock */
 202         dva_t                   b_dva;
 203         uint64_t                b_birth;
 204
 205         arc_buf_contents_t      b_type;         /* presumably an ARC_BUFC_* buffer type (confirm) */
 206         arc_buf_hdr_t           *b_hash_next;   /* presumably the next header on a hash chain (confirm) */
 207         arc_flags_t             b_flags;
 208
 209         /*
 210          * This field stores the size of the data buffer after
 211          * compression, and is set in the arc's zio completion handlers.
 212          * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
 213          *
 214          * While the block pointers can store up to 32MB in their psize
 215          * field, we can only store up to 32MB minus 512B. This is due
 216          * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
 217          * a field of zeros represents 512B in the bp). We can't use a
 218          * bias of 1 since we need to reserve a psize of zero, here, to
 219          * represent holes and embedded blocks.
 220          *
 221          * This isn't a problem in practice, since the maximum size of a
 222          * buffer is limited to 16MB, so we never need to store 32MB in
 223          * this field. Even in the upstream illumos code base, the
 224          * maximum size of a buffer is limited to 16MB.
 225          */
 226         uint16_t                b_psize;        /* max 0xffff * 512B == 32MB - 512B */
 227
 228         /*
 229          * This field stores the size of the data buffer before
 230          * compression, and cannot change once set. It is in units
 231          * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
 232          */
 233         uint16_t                b_lsize;        /* immutable */
 234         uint64_t                b_spa;          /* immutable */
 235
 236         /* L2ARC fields. Undefined when not in L2ARC. */
 237         l2arc_buf_hdr_t         b_l2hdr;
 238         /* L1ARC fields. Undefined when in l2arc_only state (only allocated when in L1ARC, per the layout comment above) */
 239         l1arc_buf_hdr_t         b_l1hdr;
 240 };
 241 #ifdef __cplusplus
 242 }
 243 #endif
 244
 245 #endif /* _SYS_ARC_IMPL_H */