1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26 
  27 /*
  28  * DVA-based Adjustable Replacement Cache
  29  *
  30  * While much of the theory of operation used here is
  31  * based on the self-tuning, low overhead replacement cache
  32  * presented by Megiddo and Modha at FAST 2003, there are some
  33  * significant differences:
  34  *
  35  * 1. The Megiddo and Modha model assumes any page is evictable.
  36  * Pages in its cache cannot be "locked" into memory.  This makes
  37  * the eviction algorithm simple: evict the last page in the list.
  38  * This also makes the performance characteristics easy to reason
  39  * about.  Our cache is not so simple.  At any given moment, some
  40  * subset of the blocks in the cache are un-evictable because we
  41  * have handed out a reference to them.  Blocks are only evictable
  42  * when there are no external references active.  This makes
  43  * eviction far more problematic:  we choose to evict the evictable
  44  * blocks that are the "lowest" in the list.
  45  *
  46  * There are times when it is not possible to evict the requested
  47  * space.  In these circumstances we are unable to adjust the cache
  48  * size.  To prevent the cache from growing unbounded at these times, we
  49  * implement a "cache throttle" that slows the flow of new data
  50  * into the cache until we can make space available.
  51  *
  52  * 2. The Megiddo and Modha model assumes a fixed cache size.
  53  * Pages are evicted when the cache is full and there is a cache
  54  * miss.  Our model has a variable sized cache.  It grows with
  55  * high use, but also tries to react to memory pressure from the
  56  * operating system: decreasing its size when system memory is
  57  * tight.
  58  *
  59  * 3. The Megiddo and Modha model assumes a fixed page size. All
  60  * elements of the cache are therefore exactly the same size.  So
  61  * when adjusting the cache size following a cache miss, it's simply
  62  * a matter of choosing a single page to evict.  In our model, we
  63  * have variable sized cache blocks (ranging from 512 bytes to
  64  * 128K bytes).  We therefore choose a set of blocks to evict to make
  65  * space for a cache miss that approximates as closely as possible
  66  * the space used by the new block.
  67  *
  68  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  69  * by N. Megiddo & D. Modha, FAST 2003
  70  */
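/*
 * As an illustrative sketch only (the real eviction code in this file,
 * arc_evict(), additionally honors buffer type, recycling, the ghost
 * states and lock ordering), making room for a miss of "needed" bytes
 * amounts to walking an evictable list from its tail and accumulating
 * victims until enough space has been recovered:
 *
 *	uint64_t evicted = 0;
 *	for (ab = list_tail(list); ab != NULL && evicted < needed;
 *	    ab = list_prev(list, ab)) {
 *		if (refcount_count(&ab->b_refcnt) == 0) {
 *			evicted += ab->b_size;
 *			(... release ab's data buffers ...)
 *		}
 *	}
 */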
  71 
  72 /*
  73  * The locking model:
  74  *
  75  * A new reference to a cache buffer can be obtained in two
  76  * ways: 1) via a hash table lookup using the DVA as a key,
  77  * or 2) via one of the ARC lists.  The arc_read() interface
  78  * uses method 1, while the internal arc algorithms for
  79  * adjusting the cache use method 2.  We therefore provide two
  80  * types of locks: 1) the hash table lock array, and 2) the
  81  * arc list locks.
  82  *
  83  * Buffers do not have their own mutexes, rather they rely on the
  84  * hash table mutexes for the bulk of their protection (i.e. most
  85  * fields in the arc_buf_hdr_t are protected by these mutexes).
  86  *
  87  * buf_hash_find() returns the appropriate mutex (held) when it
  88  * locates the requested buffer in the hash table.  It returns
  89  * NULL for the mutex if the buffer was not in the table.
  90  *
  91  * buf_hash_remove() expects the appropriate hash mutex to be
  92  * already held before it is invoked.
  93  *
  94  * Each arc state also has a mutex which is used to protect the
  95  * buffer list associated with the state.  When attempting to
  96  * obtain a hash table lock while holding an arc list lock you
  97  * must use mutex_tryenter() to avoid deadlock.  Also note that
  98  * the active state mutex must be held before the ghost state mutex.
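 *
 * For example (an illustrative sketch, not code lifted from this file),
 * a routine that walks an arc list and needs the hash lock for a buffer
 * found there would do something like:
 *
 *	mutex_enter(&state->arcs_mtx);
 *	for (ab = list_tail(list); ab != NULL; ab = list_prev(list, ab)) {
 *		hash_lock = HDR_LOCK(ab);
 *		if (!mutex_tryenter(hash_lock))
 *			continue;	(skip it rather than risk deadlock)
 *		(... work on ab ...)
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(&state->arcs_mtx);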
  99  *
 100  * Arc buffers may have an associated eviction callback function.
 101  * This function will be invoked prior to removing the buffer (e.g.
 102  * in arc_do_user_evicts()).  Note however that the data associated
 103  * with the buffer may be evicted prior to the callback.  The callback
 104  * must be made with *no locks held* (to prevent deadlock).  Additionally,
 105  * the users of callbacks must ensure that their private data is
 106  * protected from simultaneous callbacks from arc_buf_evict()
 107  * and arc_do_user_evicts().
 108  *
 109  * Note that the majority of the performance stats are manipulated
 110  * with atomic operations.
 111  *
 112  * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 113  *
 114  *      - L2ARC buflist creation
 115  *      - L2ARC buflist eviction
 116  *      - L2ARC write completion, which walks L2ARC buflists
 117  *      - ARC header destruction, as it removes from L2ARC buflists
 118  *      - ARC header release, as it removes from L2ARC buflists
 119  */
 120 
 121 #include <sys/spa.h>
 122 #include <sys/zio.h>
 123 #include <sys/zfs_context.h>
 124 #include <sys/arc.h>
 125 #include <sys/refcount.h>
 126 #include <sys/vdev.h>
 127 #include <sys/vdev_impl.h>
 128 #ifdef _KERNEL
 129 #include <sys/vmsystm.h>
 130 #include <vm/anon.h>
 131 #include <sys/fs/swapnode.h>
 132 #include <sys/dnlc.h>
 133 #endif
 134 #include <sys/callb.h>
 135 #include <sys/kstat.h>
 136 #include <zfs_fletcher.h>
 137 
 138 #ifndef _KERNEL
 139 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 140 boolean_t arc_watch = B_FALSE;
 141 int arc_procfd;
 142 #endif
 143 
 144 static kmutex_t         arc_reclaim_thr_lock;
 145 static kcondvar_t       arc_reclaim_thr_cv;     /* used to signal reclaim thr */
 146 static uint8_t          arc_thread_exit;
 147 
 148 extern int zfs_write_limit_shift;
 149 extern uint64_t zfs_write_limit_max;
 150 extern kmutex_t zfs_write_limit_lock;
 151 
 152 #define ARC_REDUCE_DNLC_PERCENT 3
 153 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 154 
 155 typedef enum arc_reclaim_strategy {
 156         ARC_RECLAIM_AGGR,               /* Aggressive reclaim strategy */
 157         ARC_RECLAIM_CONS                /* Conservative reclaim strategy */
 158 } arc_reclaim_strategy_t;
 159 
 160 /* number of seconds before growing cache again */
 161 static int              arc_grow_retry = 60;
 162 
 163 /* shift of arc_c for calculating both min and max arc_p */
 164 static int              arc_p_min_shift = 4;
 165 
 166 /* log2(fraction of arc to reclaim) */
 167 static int              arc_shrink_shift = 5;
 168 
 169 /*
 170  * minimum lifespan of a prefetch block in clock ticks
 171  * (initialized in arc_init())
 172  */
 173 static int              arc_min_prefetch_lifespan;
 174 
 175 static int arc_dead;
 176 
 177 /*
 178  * The arc has filled available memory and has now warmed up.
 179  */
 180 static boolean_t arc_warm;
 181 
 182 /*
 183  * These tunables are for performance analysis.
 184  */
 185 uint64_t zfs_arc_max;
 186 uint64_t zfs_arc_min;
 187 uint64_t zfs_arc_meta_limit = 0;
 188 int zfs_arc_grow_retry = 0;
 189 int zfs_arc_shrink_shift = 0;
 190 int zfs_arc_p_min_shift = 0;
 191 int zfs_disable_dup_eviction = 0;
 192 
 193 /*
 194  * Note that buffers can be in one of 6 states:
 195  *      ARC_anon        - anonymous (discussed below)
 196  *      ARC_mru         - recently used, currently cached
 197  *      ARC_mru_ghost   - recently used, no longer in cache
 198  *      ARC_mfu         - frequently used, currently cached
 199  *      ARC_mfu_ghost   - frequently used, no longer in cache
 200  *      ARC_l2c_only    - exists in L2ARC but not other states
 201  * When there are no active references to a buffer, it is linked
 202  * onto a list in one of these arc states.  These are
 203  * the only buffers that can be evicted or deleted.  Within each
 204  * state there are multiple lists, one for meta-data and one for
 205  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 206  * etc.) is tracked separately so that it can be managed more
 207  * explicitly: favored over data, limited explicitly.
 208  *
 209  * Anonymous buffers are buffers that are not associated with
 210  * a DVA.  These are buffers that hold dirty block copies
 211  * before they are written to stable storage.  By definition,
 212  * they are "ref'd" and are considered part of arc_mru
 213  * that cannot be freed.  Generally, they will acquire a DVA
 214  * as they are written and migrate onto the arc_mru list.
 215  *
 216  * The ARC_l2c_only state is for buffers that are in the second
 217  * level ARC but no longer in any of the ARC_m* lists.  The second
 218  * level ARC itself may also contain buffers that are in any of
 219  * the ARC_m* states - meaning that a buffer can exist in two
 220  * places.  The reason for the ARC_l2c_only state is to keep the
 221  * buffer header in the hash table, so that reads that hit the
 222  * second level ARC benefit from these fast lookups.
 223  */
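/*
 * To make the flow concrete (a simplified sketch; arc_access() below is
 * the authoritative version): a demand-read buffer is inserted into
 * arc_mru on its first access and moved to arc_mfu if it is accessed
 * again after a short interval.  When its data is evicted, the header
 * moves to the matching ghost state (arc_mru_ghost or arc_mfu_ghost);
 * a later hit on that ghost header is used to adjust the cache target
 * sizes, and the re-read buffer lands in arc_mfu.  Anonymous (dirty)
 * buffers enter this scheme once they are written and acquire a DVA.
 */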
 224 
 225 typedef struct arc_state {
 226         list_t  arcs_list[ARC_BUFC_NUMTYPES];   /* list of evictable buffers */
 227         uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
 228         uint64_t arcs_size;     /* total amount of data in this state */
 229         kmutex_t arcs_mtx;
 230 } arc_state_t;
 231 
 232 /* The 6 states: */
 233 static arc_state_t ARC_anon;
 234 static arc_state_t ARC_mru;
 235 static arc_state_t ARC_mru_ghost;
 236 static arc_state_t ARC_mfu;
 237 static arc_state_t ARC_mfu_ghost;
 238 static arc_state_t ARC_l2c_only;
 239 
 240 typedef struct arc_stats {
 241         kstat_named_t arcstat_hits;
 242         kstat_named_t arcstat_misses;
 243         kstat_named_t arcstat_demand_data_hits;
 244         kstat_named_t arcstat_demand_data_misses;
 245         kstat_named_t arcstat_demand_metadata_hits;
 246         kstat_named_t arcstat_demand_metadata_misses;
 247         kstat_named_t arcstat_prefetch_data_hits;
 248         kstat_named_t arcstat_prefetch_data_misses;
 249         kstat_named_t arcstat_prefetch_metadata_hits;
 250         kstat_named_t arcstat_prefetch_metadata_misses;
 251         kstat_named_t arcstat_mru_hits;
 252         kstat_named_t arcstat_mru_ghost_hits;
 253         kstat_named_t arcstat_mfu_hits;
 254         kstat_named_t arcstat_mfu_ghost_hits;
 255         kstat_named_t arcstat_deleted;
 256         kstat_named_t arcstat_recycle_miss;
 257         kstat_named_t arcstat_mutex_miss;
 258         kstat_named_t arcstat_evict_skip;
 259         kstat_named_t arcstat_evict_l2_cached;
 260         kstat_named_t arcstat_evict_l2_eligible;
 261         kstat_named_t arcstat_evict_l2_ineligible;
 262         kstat_named_t arcstat_hash_elements;
 263         kstat_named_t arcstat_hash_elements_max;
 264         kstat_named_t arcstat_hash_collisions;
 265         kstat_named_t arcstat_hash_chains;
 266         kstat_named_t arcstat_hash_chain_max;
 267         kstat_named_t arcstat_p;
 268         kstat_named_t arcstat_c;
 269         kstat_named_t arcstat_c_min;
 270         kstat_named_t arcstat_c_max;
 271         kstat_named_t arcstat_size;
 272         kstat_named_t arcstat_hdr_size;
 273         kstat_named_t arcstat_data_size;
 274         kstat_named_t arcstat_other_size;
 275         kstat_named_t arcstat_l2_hits;
 276         kstat_named_t arcstat_l2_misses;
 277         kstat_named_t arcstat_l2_feeds;
 278         kstat_named_t arcstat_l2_rw_clash;
 279         kstat_named_t arcstat_l2_read_bytes;
 280         kstat_named_t arcstat_l2_write_bytes;
 281         kstat_named_t arcstat_l2_writes_sent;
 282         kstat_named_t arcstat_l2_writes_done;
 283         kstat_named_t arcstat_l2_writes_error;
 284         kstat_named_t arcstat_l2_writes_hdr_miss;
 285         kstat_named_t arcstat_l2_evict_lock_retry;
 286         kstat_named_t arcstat_l2_evict_reading;
 287         kstat_named_t arcstat_l2_free_on_write;
 288         kstat_named_t arcstat_l2_abort_lowmem;
 289         kstat_named_t arcstat_l2_cksum_bad;
 290         kstat_named_t arcstat_l2_io_error;
 291         kstat_named_t arcstat_l2_size;
 292         kstat_named_t arcstat_l2_hdr_size;
 293         kstat_named_t arcstat_memory_throttle_count;
 294         kstat_named_t arcstat_duplicate_buffers;
 295         kstat_named_t arcstat_duplicate_buffers_size;
 296         kstat_named_t arcstat_duplicate_reads;
 297         kstat_named_t arcstat_meta_used;
 298         kstat_named_t arcstat_meta_limit;
 299         kstat_named_t arcstat_meta_max;
 300 } arc_stats_t;
 301 
 302 static arc_stats_t arc_stats = {
 303         { "hits",                       KSTAT_DATA_UINT64 },
 304         { "misses",                     KSTAT_DATA_UINT64 },
 305         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 306         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 307         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 308         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 309         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 310         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 311         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 312         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 313         { "mru_hits",                   KSTAT_DATA_UINT64 },
 314         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 315         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 316         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 317         { "deleted",                    KSTAT_DATA_UINT64 },
 318         { "recycle_miss",               KSTAT_DATA_UINT64 },
 319         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 320         { "evict_skip",                 KSTAT_DATA_UINT64 },
 321         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 322         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 323         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 324         { "hash_elements",              KSTAT_DATA_UINT64 },
 325         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 326         { "hash_collisions",            KSTAT_DATA_UINT64 },
 327         { "hash_chains",                KSTAT_DATA_UINT64 },
 328         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 329         { "p",                          KSTAT_DATA_UINT64 },
 330         { "c",                          KSTAT_DATA_UINT64 },
 331         { "c_min",                      KSTAT_DATA_UINT64 },
 332         { "c_max",                      KSTAT_DATA_UINT64 },
 333         { "size",                       KSTAT_DATA_UINT64 },
 334         { "hdr_size",                   KSTAT_DATA_UINT64 },
 335         { "data_size",                  KSTAT_DATA_UINT64 },
 336         { "other_size",                 KSTAT_DATA_UINT64 },
 337         { "l2_hits",                    KSTAT_DATA_UINT64 },
 338         { "l2_misses",                  KSTAT_DATA_UINT64 },
 339         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 340         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 341         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 342         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 343         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 344         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 345         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 346         { "l2_writes_hdr_miss",         KSTAT_DATA_UINT64 },
 347         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 348         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 349         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 350         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 351         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 352         { "l2_io_error",                KSTAT_DATA_UINT64 },
 353         { "l2_size",                    KSTAT_DATA_UINT64 },
 354         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 355         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 356         { "duplicate_buffers",          KSTAT_DATA_UINT64 },
 357         { "duplicate_buffers_size",     KSTAT_DATA_UINT64 },
 358         { "duplicate_reads",            KSTAT_DATA_UINT64 },
 359         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 360         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 361         { "arc_meta_max",               KSTAT_DATA_UINT64 }
 362 };
 363 
 364 #define ARCSTAT(stat)   (arc_stats.stat.value.ui64)
 365 
 366 #define ARCSTAT_INCR(stat, val) \
 367         atomic_add_64(&arc_stats.stat.value.ui64, (val));
 368 
 369 #define ARCSTAT_BUMP(stat)      ARCSTAT_INCR(stat, 1)
 370 #define ARCSTAT_BUMPDOWN(stat)  ARCSTAT_INCR(stat, -1)
 371 
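/*
 * ARCSTAT_MAX() maintains a maximum without taking a lock: it re-reads
 * the currently stored maximum and retries atomic_cas_64() until either
 * the stored value is already >= val or its compare-and-swap wins.
 */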
 372 #define ARCSTAT_MAX(stat, val) {                                        \
 373         uint64_t m;                                                     \
 374         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 375             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 376                 continue;                                               \
 377 }
 378 
 379 #define ARCSTAT_MAXSTAT(stat) \
 380         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 381 
 382 /*
 383  * We define a macro to allow ARC hits/misses to be easily broken down by
 384  * two separate conditions, giving a total of four different subtypes for
 385  * each of hits and misses (so eight statistics total).
 386  */
 387 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 388         if (cond1) {                                                    \
 389                 if (cond2) {                                            \
 390                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 391                 } else {                                                \
 392                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 393                 }                                                       \
 394         } else {                                                        \
 395                 if (cond2) {                                            \
 396                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 397                 } else {                                                \
 398                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 399                 }                                                       \
 400         }
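/*
 * For example, arc_buf_add_ref() below records a hit with:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */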
 401 
 402 kstat_t                 *arc_ksp;
 403 static arc_state_t      *arc_anon;
 404 static arc_state_t      *arc_mru;
 405 static arc_state_t      *arc_mru_ghost;
 406 static arc_state_t      *arc_mfu;
 407 static arc_state_t      *arc_mfu_ghost;
 408 static arc_state_t      *arc_l2c_only;
 409 
 410 /*
 411  * There are several ARC variables that are critical to export as kstats --
 412  * but we don't want to have to grovel around in the kstat whenever we wish to
 413  * manipulate them.  For these variables, we therefore define them to be in
 414  * terms of the statistic variable.  This assures that we are not introducing
 415  * the possibility of inconsistency by having shadow copies of the variables,
 416  * while still allowing the code to be readable.
 417  */
 418 #define arc_size        ARCSTAT(arcstat_size)   /* actual total arc size */
 419 #define arc_p           ARCSTAT(arcstat_p)      /* target size of MRU */
 420 #define arc_c           ARCSTAT(arcstat_c)      /* target size of cache */
 421 #define arc_c_min       ARCSTAT(arcstat_c_min)  /* min target cache size */
 422 #define arc_c_max       ARCSTAT(arcstat_c_max)  /* max target cache size */
 423 #define arc_meta_limit  ARCSTAT(arcstat_meta_limit) /* max size for metadata */
 424 #define arc_meta_used   ARCSTAT(arcstat_meta_used) /* size of metadata */
 425 #define arc_meta_max    ARCSTAT(arcstat_meta_max) /* max size of metadata */
 426 
 427 static int              arc_no_grow;    /* Don't try to grow cache size */
 428 static uint64_t         arc_tempreserve;
 429 static uint64_t         arc_loaned_bytes;
 430 
 431 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
 432 
 433 typedef struct arc_callback arc_callback_t;
 434 
 435 struct arc_callback {
 436         void                    *acb_private;
 437         arc_done_func_t         *acb_done;
 438         arc_buf_t               *acb_buf;
 439         zio_t                   *acb_zio_dummy;
 440         arc_callback_t          *acb_next;
 441 };
 442 
 443 typedef struct arc_write_callback arc_write_callback_t;
 444 
 445 struct arc_write_callback {
 446         void            *awcb_private;
 447         arc_done_func_t *awcb_ready;
 448         arc_done_func_t *awcb_done;
 449         arc_buf_t       *awcb_buf;
 450 };
 451 
 452 struct arc_buf_hdr {
 453         /* protected by hash lock */
 454         dva_t                   b_dva;
 455         uint64_t                b_birth;
 456         uint64_t                b_cksum0;
 457 
 458         kmutex_t                b_freeze_lock;
 459         zio_cksum_t             *b_freeze_cksum;
 460         void                    *b_thawed;
 461 
 462         arc_buf_hdr_t           *b_hash_next;
 463         arc_buf_t               *b_buf;
 464         uint32_t                b_flags;
 465         uint32_t                b_datacnt;
 466 
 467         arc_callback_t          *b_acb;
 468         kcondvar_t              b_cv;
 469 
 470         /* immutable */
 471         arc_buf_contents_t      b_type;
 472         uint64_t                b_size;
 473         uint64_t                b_spa;
 474 
 475         /* protected by arc state mutex */
 476         arc_state_t             *b_state;
 477         list_node_t             b_arc_node;
 478 
 479         /* updated atomically */
 480         clock_t                 b_arc_access;
 481 
 482         /* self protecting */
 483         refcount_t              b_refcnt;
 484 
 485         l2arc_buf_hdr_t         *b_l2hdr;
 486         list_node_t             b_l2node;
 487 };
 488 
 489 static arc_buf_t *arc_eviction_list;
 490 static kmutex_t arc_eviction_mtx;
 491 static arc_buf_hdr_t arc_eviction_hdr;
 492 static void arc_get_data_buf(arc_buf_t *buf);
 493 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
 494 static int arc_evict_needed(arc_buf_contents_t type);
 495 static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
 496 static void arc_buf_watch(arc_buf_t *buf);
 497 
 498 static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
 499 
 500 #define GHOST_STATE(state)      \
 501         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 502         (state) == arc_l2c_only)
 503 
 504 /*
 505  * Private ARC flags.  These flags are private, ARC-only flags that will show up
 506  * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 507  * be passed in as arc_flags in things like arc_read.  However, these flags
 508  * should never be passed and should only be set by ARC code.  When adding new
 509  * public flags, make sure not to smash the private ones.
 510  */
 511 
 512 #define ARC_IN_HASH_TABLE       (1 << 9)  /* this buffer is hashed */
 513 #define ARC_IO_IN_PROGRESS      (1 << 10) /* I/O in progress for buf */
 514 #define ARC_IO_ERROR            (1 << 11) /* I/O failed for buf */
 515 #define ARC_FREED_IN_READ       (1 << 12) /* buf freed while in read */
 516 #define ARC_BUF_AVAILABLE       (1 << 13) /* block not in active use */
 517 #define ARC_INDIRECT            (1 << 14) /* this is an indirect block */
 518 #define ARC_FREE_IN_PROGRESS    (1 << 15) /* hdr about to be freed */
 519 #define ARC_L2_WRITING          (1 << 16) /* L2ARC write in progress */
 520 #define ARC_L2_EVICTED          (1 << 17) /* evicted during I/O */
 521 #define ARC_L2_WRITE_HEAD       (1 << 18) /* head of write list */
 522 
 523 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 524 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 525 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 526 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_PREFETCH)
 527 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
 528 #define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 529 #define HDR_FREE_IN_PROGRESS(hdr)       ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
 530 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_L2CACHE)
 531 #define HDR_L2_READING(hdr)     ((hdr)->b_flags & ARC_IO_IN_PROGRESS &&  \
 532                                     (hdr)->b_l2hdr != NULL)
 533 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_L2_WRITING)
 534 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_L2_EVICTED)
 535 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
 536 
 537 /*
 538  * Other sizes
 539  */
 540 
 541 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 542 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
 543 
 544 /*
 545  * Hash table routines
 546  */
 547 
 548 struct ht_table {
 549         arc_buf_hdr_t   *hdr;
 550         kmutex_t        lock;
 551 };
 552 
 553 typedef struct buf_hash_table {
 554         uint64_t ht_mask;
 555         struct ht_table *ht_table;
 556 } buf_hash_table_t;
 557 
 558 static buf_hash_table_t buf_hash_table;
 559 
 560 #define BUF_HASH_INDEX(spa, dva, birth) \
 561         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 562 #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_table[idx].lock)
 563 #define HDR_LOCK(hdr) \
 564         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 565 
 566 uint64_t zfs_crc64_table[256];
 567 
 568 /*
 569  * Level 2 ARC
 570  */
 571 
 572 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 573 #define L2ARC_HEADROOM          2               /* num of writes */
 574 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 575 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 576 
 577 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 578 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 579 
 580 /*
 581  * L2ARC Performance Tunables
 582  */
 583 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 584 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 585 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 586 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 587 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 588 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 589 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 590 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 591 
 592 /*
 593  * L2ARC Internals
 594  */
 595 typedef struct l2arc_dev {
 596         vdev_t                  *l2ad_vdev;     /* vdev */
 597         spa_t                   *l2ad_spa;      /* spa */
 598         uint64_t                l2ad_hand;      /* next write location */
 599         uint64_t                l2ad_write;     /* desired write size, bytes */
 600         uint64_t                l2ad_boost;     /* warmup write boost, bytes */
 601         uint64_t                l2ad_start;     /* first addr on device */
 602         uint64_t                l2ad_end;       /* last addr on device */
 603         uint64_t                l2ad_evict;     /* last addr eviction reached */
 604         boolean_t               l2ad_first;     /* first sweep through */
 605         boolean_t               l2ad_writing;   /* currently writing */
 606         list_t                  *l2ad_buflist;  /* buffer list */
 607         list_node_t             l2ad_node;      /* device list node */
 608 } l2arc_dev_t;
 609 
 610 static list_t L2ARC_dev_list;                   /* device list */
 611 static list_t *l2arc_dev_list;                  /* device list pointer */
 612 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 613 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 614 static kmutex_t l2arc_buflist_mtx;              /* mutex for all buflists */
 615 static list_t L2ARC_free_on_write;              /* free after write buf list */
 616 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 617 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 618 static uint64_t l2arc_ndev;                     /* number of devices */
 619 
 620 typedef struct l2arc_read_callback {
 621         arc_buf_t       *l2rcb_buf;             /* read buffer */
 622         spa_t           *l2rcb_spa;             /* spa */
 623         blkptr_t        l2rcb_bp;               /* original blkptr */
 624         zbookmark_t     l2rcb_zb;               /* original bookmark */
 625         int             l2rcb_flags;            /* original flags */
 626 } l2arc_read_callback_t;
 627 
 628 typedef struct l2arc_write_callback {
 629         l2arc_dev_t     *l2wcb_dev;             /* device info */
 630         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 631 } l2arc_write_callback_t;
 632 
 633 struct l2arc_buf_hdr {
 634         /* protected by arc_buf_hdr  mutex */
 635         l2arc_dev_t     *b_dev;                 /* L2ARC device */
 636         uint64_t        b_daddr;                /* disk address, offset byte */
 637 };
 638 
 639 typedef struct l2arc_data_free {
 640         /* protected by l2arc_free_on_write_mtx */
 641         void            *l2df_data;
 642         size_t          l2df_size;
 643         void            (*l2df_func)(void *, size_t);
 644         list_node_t     l2df_list_node;
 645 } l2arc_data_free_t;
 646 
 647 static kmutex_t l2arc_feed_thr_lock;
 648 static kcondvar_t l2arc_feed_thr_cv;
 649 static uint8_t l2arc_thread_exit;
 650 
 651 static void l2arc_read_done(zio_t *zio);
 652 static void l2arc_hdr_stat_add(void);
 653 static void l2arc_hdr_stat_remove(void);
 654 
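/*
 * Hash a (spa guid, DVA, birth txg) identity into a 64-bit value using
 * the ZFS CRC-64 table; BUF_HASH_INDEX() masks this down to a bucket
 * index in buf_hash_table.
 */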
 655 static uint64_t
 656 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 657 {
 658         uint8_t *vdva = (uint8_t *)dva;
 659         uint64_t crc = -1ULL;
 660         int i;
 661 
 662         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 663 
 664         for (i = 0; i < sizeof (dva_t); i++)
 665                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
 666 
 667         crc ^= (spa>>8) ^ birth;
 668 
 669         return (crc);
 670 }
 671 
 672 #define BUF_EMPTY(buf)                                          \
 673         ((buf)->b_dva.dva_word[0] == 0 &&                    \
 674         (buf)->b_dva.dva_word[1] == 0 &&                     \
 675         (buf)->b_birth == 0)
 676 
 677 #define BUF_EQUAL(spa, dva, birth, buf)                         \
 678         ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
 679         ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
 680         ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 681 
 682 static void
 683 buf_discard_identity(arc_buf_hdr_t *hdr)
 684 {
 685         hdr->b_dva.dva_word[0] = 0;
 686         hdr->b_dva.dva_word[1] = 0;
 687         hdr->b_birth = 0;
 688         hdr->b_cksum0 = 0;
 689 }
 690 
 691 static arc_buf_hdr_t *
 692 buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 693 {
 694         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 695         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 696         arc_buf_hdr_t *buf;
 697 
 698         mutex_enter(hash_lock);
 699         for (buf = buf_hash_table.ht_table[idx].hdr; buf != NULL;
 700             buf = buf->b_hash_next) {
 701                 if (BUF_EQUAL(spa, dva, birth, buf)) {
 702                         *lockp = hash_lock;
 703                         return (buf);
 704                 }
 705         }
 706         mutex_exit(hash_lock);
 707         *lockp = NULL;
 708         return (NULL);
 709 }
 710 
 711 /*
 712  * Insert an entry into the hash table.  If there is already an element
 713  * equal to the new one in the hash table, the existing element is
 714  * returned and the new element is not inserted.  Otherwise returns NULL.
 715  * In both cases *lockp is set to the bucket's hash mutex, which is left held.
 716  */
 717 static arc_buf_hdr_t *
 718 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 719 {
 720         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 721         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 722         arc_buf_hdr_t *fbuf;
 723         uint32_t i;
 724 
 725         ASSERT(!HDR_IN_HASH_TABLE(buf));
 726         *lockp = hash_lock;
 727         mutex_enter(hash_lock);
 728         for (fbuf = buf_hash_table.ht_table[idx].hdr, i = 0; fbuf != NULL;
 729             fbuf = fbuf->b_hash_next, i++) {
 730                 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
 731                         return (fbuf);
 732         }
 733 
 734         buf->b_hash_next = buf_hash_table.ht_table[idx].hdr;
 735         buf_hash_table.ht_table[idx].hdr = buf;
 736         buf->b_flags |= ARC_IN_HASH_TABLE;
 737 
 738         /* collect some hash table performance data */
 739         if (i > 0) {
 740                 ARCSTAT_BUMP(arcstat_hash_collisions);
 741                 if (i == 1)
 742                         ARCSTAT_BUMP(arcstat_hash_chains);
 743 
 744                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 745         }
 746 
 747         ARCSTAT_BUMP(arcstat_hash_elements);
 748         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 749 
 750         return (NULL);
 751 }
 752 
 753 static void
 754 buf_hash_remove(arc_buf_hdr_t *buf)
 755 {
 756         arc_buf_hdr_t *fbuf, **bufp;
 757         uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 758 
 759         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 760         ASSERT(HDR_IN_HASH_TABLE(buf));
 761 
 762         bufp = &buf_hash_table.ht_table[idx].hdr;
 763         while ((fbuf = *bufp) != buf) {
 764                 ASSERT(fbuf != NULL);
 765                 bufp = &fbuf->b_hash_next;
 766         }
 767         *bufp = buf->b_hash_next;
 768         buf->b_hash_next = NULL;
 769         buf->b_flags &= ~ARC_IN_HASH_TABLE;
 770 
 771         /* collect some hash table performance data */
 772         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 773 
 774         if (buf_hash_table.ht_table[idx].hdr &&
 775             buf_hash_table.ht_table[idx].hdr->b_hash_next == NULL)
 776                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 777 }
 778 
 779 /*
 780  * Global data structures and functions for the buf kmem cache.
 781  */
 782 static kmem_cache_t *hdr_cache;
 783 static kmem_cache_t *buf_cache;
 784 
 785 static void
 786 buf_fini(void)
 787 {
 788         int i;
 789 
 790         for (i = 0; i < buf_hash_table.ht_mask + 1; i++)
 791                 mutex_destroy(&buf_hash_table.ht_table[i].lock);
 792         kmem_free(buf_hash_table.ht_table,
 793             (buf_hash_table.ht_mask + 1) * sizeof (struct ht_table));
 794         kmem_cache_destroy(hdr_cache);
 795         kmem_cache_destroy(buf_cache);
 796 }
 797 
 798 /*
 799  * Constructor callback - called when the cache is empty
 800  * and a new buf is requested.
 801  */
 802 /* ARGSUSED */
 803 static int
 804 hdr_cons(void *vbuf, void *unused, int kmflag)
 805 {
 806         arc_buf_hdr_t *buf = vbuf;
 807 
 808         bzero(buf, sizeof (arc_buf_hdr_t));
 809         refcount_create(&buf->b_refcnt);
 810         cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 811         mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 812         arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 813 
 814         return (0);
 815 }
 816 
 817 /* ARGSUSED */
 818 static int
 819 buf_cons(void *vbuf, void *unused, int kmflag)
 820 {
 821         arc_buf_t *buf = vbuf;
 822 
 823         bzero(buf, sizeof (arc_buf_t));
 824         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 825         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 826 
 827         return (0);
 828 }
 829 
 830 /*
 831  * Destructor callback - called when a cached buf is
 832  * no longer required.
 833  */
 834 /* ARGSUSED */
 835 static void
 836 hdr_dest(void *vbuf, void *unused)
 837 {
 838         arc_buf_hdr_t *buf = vbuf;
 839 
 840         ASSERT(BUF_EMPTY(buf));
 841         refcount_destroy(&buf->b_refcnt);
 842         cv_destroy(&buf->b_cv);
 843         mutex_destroy(&buf->b_freeze_lock);
 844         arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
 845 }
 846 
 847 /* ARGSUSED */
 848 static void
 849 buf_dest(void *vbuf, void *unused)
 850 {
 851         arc_buf_t *buf = vbuf;
 852 
 853         mutex_destroy(&buf->b_evict_lock);
 854         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 855 }
 856 
 857 /*
 858  * Reclaim callback -- invoked when memory is low.
 859  */
 860 /* ARGSUSED */
 861 static void
 862 hdr_recl(void *unused)
 863 {
 864         dprintf("hdr_recl called\n");
 865         /*
 866          * umem calls the reclaim func when we destroy the buf cache,
 867          * which is after we do arc_fini().
 868          */
 869         if (!arc_dead)
 870                 cv_signal(&arc_reclaim_thr_cv);
 871 }
 872 
 873 static void
 874 buf_init(void)
 875 {
 876         uint64_t *ct;
 877         uint64_t hsize = 1ULL << 12;
 878         int i, j;
 879 
 880         /*
 881          * The hash table is big enough to fill all of physical memory
 882          * with an average 64K block size.  The table takes up roughly one
 883          * struct ht_table (bucket pointer plus lock) per 64K of physical memory.
 884          */
 885         while (hsize * 65536 < physmem * PAGESIZE)
 886                 hsize <<= 1;
 887 retry:
 888         buf_hash_table.ht_mask = hsize - 1;
 889         buf_hash_table.ht_table =
 890             kmem_zalloc(hsize * sizeof (struct ht_table), KM_NOSLEEP);
 891         if (buf_hash_table.ht_table == NULL) {
 892                 ASSERT(hsize > (1ULL << 8));
 893                 hsize >>= 1;
 894                 goto retry;
 895         }
 896 
 897         hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
 898             0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
 899         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 900             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 901 
 902         for (i = 0; i < 256; i++)
 903                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 904                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 905 
 906         for (i = 0; i < hsize; i++) {
 907                 mutex_init(&buf_hash_table.ht_table[i].lock,
 908                     NULL, MUTEX_DEFAULT, NULL);
 909         }
 910 }
 911 
 912 #define ARC_MINTIME     (hz>>4) /* 62 ms */
 913 
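/*
 * The freeze/thaw machinery below is a debugging aid: when ZFS_DEBUG_MODIFY
 * is set in zfs_flags, a fletcher-2 checksum of each buffer is recorded when
 * the buffer is "frozen" (arc_cksum_compute()) and checked again later
 * (arc_cksum_verify()) to catch modification of buffers that are supposed
 * to be stable.  arc_buf_thaw() discards the checksum once a buffer becomes
 * legitimately writable again (i.e. anonymous).
 */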
 914 static void
 915 arc_cksum_verify(arc_buf_t *buf)
 916 {
 917         zio_cksum_t zc;
 918 
 919         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 920                 return;
 921 
 922         mutex_enter(&buf->b_hdr->b_freeze_lock);
 923         if (buf->b_hdr->b_freeze_cksum == NULL ||
 924             (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
 925                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 926                 return;
 927         }
 928         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 929         if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
 930                 panic("buffer modified while frozen!");
 931         mutex_exit(&buf->b_hdr->b_freeze_lock);
 932 }
 933 
 934 static int
 935 arc_cksum_equal(arc_buf_t *buf)
 936 {
 937         zio_cksum_t zc;
 938         int equal;
 939 
 940         mutex_enter(&buf->b_hdr->b_freeze_lock);
 941         fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
 942         equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
 943         mutex_exit(&buf->b_hdr->b_freeze_lock);
 944 
 945         return (equal);
 946 }
 947 
 948 static void
 949 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
 950 {
 951         if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
 952                 return;
 953 
 954         mutex_enter(&buf->b_hdr->b_freeze_lock);
 955         if (buf->b_hdr->b_freeze_cksum != NULL) {
 956                 mutex_exit(&buf->b_hdr->b_freeze_lock);
 957                 return;
 958         }
 959         buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 960         fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
 961             buf->b_hdr->b_freeze_cksum);
 962         mutex_exit(&buf->b_hdr->b_freeze_lock);
 963         arc_buf_watch(buf);
 964 }
 965 
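/*
 * In user-level builds (e.g. ztest), arc_buf_watch() and arc_buf_unwatch()
 * use /proc PCWATCH requests written to arc_procfd to place or clear a
 * watched area over a frozen buffer, so that a stray write to it faults
 * immediately.  This is only active when arc_watch is set (ZFS_DEBUG=watch).
 */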
 966 #ifndef _KERNEL
 967 typedef struct procctl {
 968         long cmd;
 969         prwatch_t prwatch;
 970 } procctl_t;
 971 #endif
 972 
 973 /* ARGSUSED */
 974 static void
 975 arc_buf_unwatch(arc_buf_t *buf)
 976 {
 977 #ifndef _KERNEL
 978         if (arc_watch) {
 979                 int result;
 980                 procctl_t ctl;
 981                 ctl.cmd = PCWATCH;
 982                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
 983                 ctl.prwatch.pr_size = 0;
 984                 ctl.prwatch.pr_wflags = 0;
 985                 result = write(arc_procfd, &ctl, sizeof (ctl));
 986                 ASSERT3U(result, ==, sizeof (ctl));
 987         }
 988 #endif
 989 }
 990 
 991 /* ARGSUSED */
 992 static void
 993 arc_buf_watch(arc_buf_t *buf)
 994 {
 995 #ifndef _KERNEL
 996         if (arc_watch) {
 997                 int result;
 998                 procctl_t ctl;
 999                 ctl.cmd = PCWATCH;
1000                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1001                 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1002                 ctl.prwatch.pr_wflags = WA_WRITE;
1003                 result = write(arc_procfd, &ctl, sizeof (ctl));
1004                 ASSERT3U(result, ==, sizeof (ctl));
1005         }
1006 #endif
1007 }
1008 
1009 void
1010 arc_buf_thaw(arc_buf_t *buf)
1011 {
1012         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1013                 if (buf->b_hdr->b_state != arc_anon)
1014                         panic("modifying non-anon buffer!");
1015                 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1016                         panic("modifying buffer while i/o in progress!");
1017                 arc_cksum_verify(buf);
1018         }
1019 
1020         mutex_enter(&buf->b_hdr->b_freeze_lock);
1021         if (buf->b_hdr->b_freeze_cksum != NULL) {
1022                 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1023                 buf->b_hdr->b_freeze_cksum = NULL;
1024         }
1025 
1026         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1027                 if (buf->b_hdr->b_thawed)
1028                         kmem_free(buf->b_hdr->b_thawed, 1);
1029                 buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1030         }
1031 
1032         mutex_exit(&buf->b_hdr->b_freeze_lock);
1033 
1034         arc_buf_unwatch(buf);
1035 }
1036 
1037 void
1038 arc_buf_freeze(arc_buf_t *buf)
1039 {
1040         kmutex_t *hash_lock;
1041 
1042         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1043                 return;
1044 
1045         hash_lock = HDR_LOCK(buf->b_hdr);
1046         mutex_enter(hash_lock);
1047 
1048         ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1049             buf->b_hdr->b_state == arc_anon);
1050         arc_cksum_compute(buf, B_FALSE);
1051         mutex_exit(hash_lock);
1052 
1053 }
1054 
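/*
 * Taking the first reference on a buffer that is in one of the arc states
 * removes it from that state's evictable list (and subtracts its size from
 * arcs_lsize), since referenced buffers may not be evicted; the prefetch
 * flag is also cleared, because the buffer has now been demanded.
 * Dropping the last reference (remove_reference()) puts the buffer back on
 * the appropriate evictable list.
 */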
1055 static void
1056 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1057 {
1058         ASSERT(MUTEX_HELD(hash_lock));
1059 
1060         if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1061             (ab->b_state != arc_anon)) {
1062                 uint64_t delta = ab->b_size * ab->b_datacnt;
1063                 list_t *list = &ab->b_state->arcs_list[ab->b_type];
1064                 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1065 
1066                 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
1067                 mutex_enter(&ab->b_state->arcs_mtx);
1068                 ASSERT(list_link_active(&ab->b_arc_node));
1069                 list_remove(list, ab);
1070                 if (GHOST_STATE(ab->b_state)) {
1071                         ASSERT0(ab->b_datacnt);
1072                         ASSERT3P(ab->b_buf, ==, NULL);
1073                         delta = ab->b_size;
1074                 }
1075                 ASSERT(delta > 0);
1076                 ASSERT3U(*size, >=, delta);
1077                 atomic_add_64(size, -delta);
1078                 mutex_exit(&ab->b_state->arcs_mtx);
1079                 /* remove the prefetch flag if we get a reference */
1080                 if (ab->b_flags & ARC_PREFETCH)
1081                         ab->b_flags &= ~ARC_PREFETCH;
1082         }
1083 }
1084 
1085 static int
1086 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1087 {
1088         int cnt;
1089         arc_state_t *state = ab->b_state;
1090 
1091         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1092         ASSERT(!GHOST_STATE(state));
1093 
1094         if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1095             (state != arc_anon)) {
1096                 uint64_t *size = &state->arcs_lsize[ab->b_type];
1097 
1098                 ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1099                 mutex_enter(&state->arcs_mtx);
1100                 ASSERT(!list_link_active(&ab->b_arc_node));
1101                 list_insert_head(&state->arcs_list[ab->b_type], ab);
1102                 ASSERT(ab->b_datacnt > 0);
1103                 atomic_add_64(size, ab->b_size * ab->b_datacnt);
1104                 mutex_exit(&state->arcs_mtx);
1105         }
1106         return (cnt);
1107 }
1108 
1109 /*
1110  * Move the supplied buffer to the indicated state.  The mutex
1111  * for the buffer must be held by the caller.
1112  */
1113 static void
1114 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1115 {
1116         arc_state_t *old_state = ab->b_state;
1117         int64_t refcnt = refcount_count(&ab->b_refcnt);
1118         uint64_t from_delta, to_delta;
1119 
1120         ASSERT(MUTEX_HELD(hash_lock));
1121         ASSERT(new_state != old_state);
1122         ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1123         ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1124         ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1125 
1126         from_delta = to_delta = ab->b_datacnt * ab->b_size;
1127 
1128         /*
1129          * If this buffer is evictable, transfer it from the
1130          * old state list to the new state list.
1131          */
1132         if (refcnt == 0) {
1133                 if (old_state != arc_anon) {
1134                         int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1135                         uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1136 
1137                         if (use_mutex)
1138                                 mutex_enter(&old_state->arcs_mtx);
1139 
1140                         ASSERT(list_link_active(&ab->b_arc_node));
1141                         list_remove(&old_state->arcs_list[ab->b_type], ab);
1142 
1143                         /*
1144                          * If prefetching out of the ghost cache,
1145                          * we will have a non-zero datacnt.
1146                          */
1147                         if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1148                                 /* ghost elements have a ghost size */
1149                                 ASSERT(ab->b_buf == NULL);
1150                                 from_delta = ab->b_size;
1151                         }
1152                         ASSERT3U(*size, >=, from_delta);
1153                         atomic_add_64(size, -from_delta);
1154 
1155                         if (use_mutex)
1156                                 mutex_exit(&old_state->arcs_mtx);
1157                 }
1158                 if (new_state != arc_anon) {
1159                         int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1160                         uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1161 
1162                         if (use_mutex)
1163                                 mutex_enter(&new_state->arcs_mtx);
1164 
1165                         list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1166 
1167                         /* ghost elements have a ghost size */
1168                         if (GHOST_STATE(new_state)) {
1169                                 ASSERT(ab->b_datacnt == 0);
1170                                 ASSERT(ab->b_buf == NULL);
1171                                 to_delta = ab->b_size;
1172                         }
1173                         atomic_add_64(size, to_delta);
1174 
1175                         if (use_mutex)
1176                                 mutex_exit(&new_state->arcs_mtx);
1177                 }
1178         }
1179 
1180         ASSERT(!BUF_EMPTY(ab));
1181         if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1182                 buf_hash_remove(ab);
1183 
1184         /* adjust state sizes */
1185         if (to_delta)
1186                 atomic_add_64(&new_state->arcs_size, to_delta);
1187         if (from_delta) {
1188                 ASSERT3U(old_state->arcs_size, >=, from_delta);
1189                 atomic_add_64(&old_state->arcs_size, -from_delta);
1190         }
1191         ab->b_state = new_state;
1192 
1193         /* adjust l2arc hdr stats */
1194         if (new_state == arc_l2c_only)
1195                 l2arc_hdr_stat_add();
1196         else if (old_state == arc_l2c_only)
1197                 l2arc_hdr_stat_remove();
1198 }
1199 
1200 void
1201 arc_space_consume(uint64_t space, arc_space_type_t type)
1202 {
1203         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1204 
1205         switch (type) {
1206         case ARC_SPACE_DATA:
1207                 ARCSTAT_INCR(arcstat_data_size, space);
1208                 break;
1209         case ARC_SPACE_OTHER:
1210                 ARCSTAT_INCR(arcstat_other_size, space);
1211                 break;
1212         case ARC_SPACE_HDRS:
1213                 ARCSTAT_INCR(arcstat_hdr_size, space);
1214                 break;
1215         case ARC_SPACE_L2HDRS:
1216                 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1217                 break;
1218         }
1219 
1220         ARCSTAT_INCR(arcstat_meta_used, space);
1221         atomic_add_64(&arc_size, space);
1222 }
1223 
1224 void
1225 arc_space_return(uint64_t space, arc_space_type_t type)
1226 {
1227         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1228 
1229         switch (type) {
1230         case ARC_SPACE_DATA:
1231                 ARCSTAT_INCR(arcstat_data_size, -space);
1232                 break;
1233         case ARC_SPACE_OTHER:
1234                 ARCSTAT_INCR(arcstat_other_size, -space);
1235                 break;
1236         case ARC_SPACE_HDRS:
1237                 ARCSTAT_INCR(arcstat_hdr_size, -space);
1238                 break;
1239         case ARC_SPACE_L2HDRS:
1240                 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1241                 break;
1242         }
1243 
1244         ASSERT(arc_meta_used >= space);
1245         if (arc_meta_max < arc_meta_used)
1246                 arc_meta_max = arc_meta_used;
1247         ARCSTAT_INCR(arcstat_meta_used, -space);
1248         ASSERT(arc_size >= space);
1249         atomic_add_64(&arc_size, -space);
1250 }
1251 
1252 void *
1253 arc_data_buf_alloc(uint64_t size)
1254 {
1255         if (arc_evict_needed(ARC_BUFC_DATA))
1256                 cv_signal(&arc_reclaim_thr_cv);
1257         atomic_add_64(&arc_size, size);
1258         return (zio_data_buf_alloc(size));
1259 }
1260 
1261 void
1262 arc_data_buf_free(void *buf, uint64_t size)
1263 {
1264         zio_data_buf_free(buf, size);
1265         ASSERT(arc_size >= size);
1266         atomic_add_64(&arc_size, -size);
1267 }
1268 
1269 arc_buf_t *
1270 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1271 {
1272         arc_buf_hdr_t *hdr;
1273         arc_buf_t *buf;
1274 
1275         ASSERT3U(size, >, 0);
1276         hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1277         ASSERT(BUF_EMPTY(hdr));
1278         hdr->b_size = size;
1279         hdr->b_type = type;
1280         hdr->b_spa = spa_load_guid(spa);
1281         hdr->b_state = arc_anon;
1282         hdr->b_arc_access = 0;
1283         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1284         buf->b_hdr = hdr;
1285         buf->b_data = NULL;
1286         buf->b_efunc = NULL;
1287         buf->b_private = NULL;
1288         buf->b_next = NULL;
1289         hdr->b_buf = buf;
1290         arc_get_data_buf(buf);
1291         hdr->b_datacnt = 1;
1292         hdr->b_flags = 0;
1293         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1294         (void) refcount_add(&hdr->b_refcnt, tag);
1295 
1296         return (buf);
1297 }
1298 
1299 static char *arc_onloan_tag = "onloan";
1300 
1301 /*
1302  * Loan out an anonymous arc buffer. Loaned buffers are not counted as
1303  * in-flight data by arc_tempreserve_space() until they are "returned". Loaned
1304  * buffers must be returned to the arc before they can be used by the DMU or
1305  * freed.
1306  */
1307 arc_buf_t *
1308 arc_loan_buf(spa_t *spa, int size)
1309 {
1310         arc_buf_t *buf;
1311 
1312         buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1313 
1314         atomic_add_64(&arc_loaned_bytes, size);
1315         return (buf);
1316 }
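
/*
 * Illustrative usage of the loan interface (the "tag" below is whatever
 * reference holder the caller would normally pass to the refcount
 * routines; this is a sketch, not code from this file):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);
 *	(... fill buf->b_data ...)
 *	arc_return_buf(buf, tag);
 */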
1317 
1318 /*
1319  * Return a loaned arc buffer to the arc.
1320  */
1321 void
1322 arc_return_buf(arc_buf_t *buf, void *tag)
1323 {
1324         arc_buf_hdr_t *hdr = buf->b_hdr;
1325 
1326         ASSERT(buf->b_data != NULL);
1327         (void) refcount_add(&hdr->b_refcnt, tag);
1328         (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1329 
1330         atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1331 }
1332 
1333 /* Detach an arc_buf from a dbuf (tag) */
1334 void
1335 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1336 {
1337         arc_buf_hdr_t *hdr;
1338 
1339         ASSERT(buf->b_data != NULL);
1340         hdr = buf->b_hdr;
1341         (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1342         (void) refcount_remove(&hdr->b_refcnt, tag);
1343         buf->b_efunc = NULL;
1344         buf->b_private = NULL;
1345 
1346         atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1347 }
1348 
1349 static arc_buf_t *
1350 arc_buf_clone(arc_buf_t *from)
1351 {
1352         arc_buf_t *buf;
1353         arc_buf_hdr_t *hdr = from->b_hdr;
1354         uint64_t size = hdr->b_size;
1355 
1356         ASSERT(hdr->b_state != arc_anon);
1357 
1358         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1359         buf->b_hdr = hdr;
1360         buf->b_data = NULL;
1361         buf->b_efunc = NULL;
1362         buf->b_private = NULL;
1363         buf->b_next = hdr->b_buf;
1364         hdr->b_buf = buf;
1365         arc_get_data_buf(buf);
1366         bcopy(from->b_data, buf->b_data, size);
1367 
1368         /*
1369          * This buffer already exists in the arc so create a duplicate
1370          * copy for the caller.  If the buffer is associated with user data
1371          * then track the size and number of duplicates.  These stats will be
1372          * updated as duplicate buffers are created and destroyed.
1373          */
1374         if (hdr->b_type == ARC_BUFC_DATA) {
1375                 ARCSTAT_BUMP(arcstat_duplicate_buffers);
1376                 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1377         }
1378         hdr->b_datacnt += 1;
1379         return (buf);
1380 }
1381 
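/*
 * Take an additional hold (identified by `tag') on a buffer that may have
 * been evicted out from under us, and record the access with arc_access().
 * As noted below, callers must check b_data afterwards to learn whether
 * the hold was actually taken.
 */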
1382 void
1383 arc_buf_add_ref(arc_buf_t *buf, void* tag)
1384 {
1385         arc_buf_hdr_t *hdr;
1386         kmutex_t *hash_lock;
1387 
1388         /*
1389          * Check to see if this buffer is evicted.  Callers
1390          * must verify b_data != NULL to know if the add_ref
1391          * was successful.
1392          */
1393         mutex_enter(&buf->b_evict_lock);
1394         if (buf->b_data == NULL) {
1395                 mutex_exit(&buf->b_evict_lock);
1396                 return;
1397         }
1398         hash_lock = HDR_LOCK(buf->b_hdr);
1399         mutex_enter(hash_lock);
1400         hdr = buf->b_hdr;
1401         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1402         mutex_exit(&buf->b_evict_lock);
1403 
1404         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1405         add_reference(hdr, hash_lock, tag);
1406         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1407         arc_access(hdr, hash_lock);
1408         mutex_exit(hash_lock);
1409         ARCSTAT_BUMP(arcstat_hits);
1410         ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1411             demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1412             data, metadata, hits);
1413 }
1414 
1415 /*
1416  * Free the arc data buffer.  If it is an l2arc write in progress,
1417  * the buffer is placed on l2arc_free_on_write to be freed later.
1418  */
1419 static void
1420 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1421 {
1422         arc_buf_hdr_t *hdr = buf->b_hdr;
1423 
1424         if (HDR_L2_WRITING(hdr)) {
1425                 l2arc_data_free_t *df;
1426                 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1427                 df->l2df_data = buf->b_data;
1428                 df->l2df_size = hdr->b_size;
1429                 df->l2df_func = free_func;
1430                 mutex_enter(&l2arc_free_on_write_mtx);
1431                 list_insert_head(l2arc_free_on_write, df);
1432                 mutex_exit(&l2arc_free_on_write_mtx);
1433                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
1434         } else {
1435                 free_func(buf->b_data, hdr->b_size);
1436         }
1437 }
1438 
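/*
 * Release the resources associated with `buf'.  Unless `recycle' is set
 * (meaning the data block is being handed to another buffer), the data is
 * returned to the appropriate zio cache.  The owning state's size counters
 * and the header's b_datacnt are adjusted.  If `all' is set the arc_buf_t
 * itself is also unlinked from its header and returned to buf_cache.
 */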
1439 static void
1440 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1441 {
1442         arc_buf_t **bufp;
1443 
1444         /* free up data associated with the buf */
1445         if (buf->b_data) {
1446                 arc_state_t *state = buf->b_hdr->b_state;
1447                 uint64_t size = buf->b_hdr->b_size;
1448                 arc_buf_contents_t type = buf->b_hdr->b_type;
1449 
1450                 arc_cksum_verify(buf);
1451                 arc_buf_unwatch(buf);
1452 
1453                 if (!recycle) {
1454                         if (type == ARC_BUFC_METADATA) {
1455                                 arc_buf_data_free(buf, zio_buf_free);
1456                                 arc_space_return(size, ARC_SPACE_DATA);
1457                         } else {
1458                                 ASSERT(type == ARC_BUFC_DATA);
1459                                 arc_buf_data_free(buf, zio_data_buf_free);
1460                                 ARCSTAT_INCR(arcstat_data_size, -size);
1461                                 atomic_add_64(&arc_size, -size);
1462                         }
1463                 }
1464                 if (list_link_active(&buf->b_hdr->b_arc_node)) {
1465                         uint64_t *cnt = &state->arcs_lsize[type];
1466 
1467                         ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1468                         ASSERT(state != arc_anon);
1469 
1470                         ASSERT3U(*cnt, >=, size);
1471                         atomic_add_64(cnt, -size);
1472                 }
1473                 ASSERT3U(state->arcs_size, >=, size);
1474                 atomic_add_64(&state->arcs_size, -size);
1475                 buf->b_data = NULL;
1476 
1477                 /*
1478                  * If we're destroying a duplicate buffer make sure
1479                  * that the appropriate statistics are updated.
1480                  */
1481                 if (buf->b_hdr->b_datacnt > 1 &&
1482                     buf->b_hdr->b_type == ARC_BUFC_DATA) {
1483                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1484                         ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1485                 }
1486                 ASSERT(buf->b_hdr->b_datacnt > 0);
1487                 buf->b_hdr->b_datacnt -= 1;
1488         }
1489 
1490         /* only remove the buf if requested */
1491         if (!all)
1492                 return;
1493 
1494         /* remove the buf from the hdr list */
1495         for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1496                 continue;
1497         *bufp = buf->b_next;
1498         buf->b_next = NULL;
1499 
1500         ASSERT(buf->b_efunc == NULL);
1501 
1502         /* clean up the buf */
1503         buf->b_hdr = NULL;
1504         kmem_cache_free(buf_cache, buf);
1505 }
1506 
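/*
 * Destroy an anonymous, unreferenced header: detach any L2ARC state, free
 * or hand off each attached buffer (buffers with an eviction callback are
 * parked on arc_eviction_list for arc_do_user_evicts()), release the
 * freeze checksum, and return the header to hdr_cache.
 */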
1507 static void
1508 arc_hdr_destroy(arc_buf_hdr_t *hdr)
1509 {
1510         ASSERT(refcount_is_zero(&hdr->b_refcnt));
1511         ASSERT3P(hdr->b_state, ==, arc_anon);
1512         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1513         l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1514 
1515         if (l2hdr != NULL) {
1516                 boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1517                 /*
1518                  * To prevent arc_free() and l2arc_evict() from
1519                  * attempting to free the same buffer at the same time,
1520                  * a FREE_IN_PROGRESS flag is given to arc_free() to
1521                  * give it priority.  l2arc_evict() can't destroy this
1522                  * header while we are waiting on l2arc_buflist_mtx.
1523                  *
1524                  * The hdr may be removed from l2ad_buflist before we
1525                  * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1526                  */
1527                 if (!buflist_held) {
1528                         mutex_enter(&l2arc_buflist_mtx);
1529                         l2hdr = hdr->b_l2hdr;
1530                 }
1531 
1532                 if (l2hdr != NULL) {
1533                         list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1534                         ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1535                         kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1536                         if (hdr->b_state == arc_l2c_only)
1537                                 l2arc_hdr_stat_remove();
1538                         hdr->b_l2hdr = NULL;
1539                 }
1540 
1541                 if (!buflist_held)
1542                         mutex_exit(&l2arc_buflist_mtx);
1543         }
1544 
1545         if (!BUF_EMPTY(hdr)) {
1546                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1547                 buf_discard_identity(hdr);
1548         }
1549         while (hdr->b_buf) {
1550                 arc_buf_t *buf = hdr->b_buf;
1551 
1552                 if (buf->b_efunc) {
1553                         mutex_enter(&arc_eviction_mtx);
1554                         mutex_enter(&buf->b_evict_lock);
1555                         ASSERT(buf->b_hdr != NULL);
1556                         arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1557                         hdr->b_buf = buf->b_next;
1558                         buf->b_hdr = &arc_eviction_hdr;
1559                         buf->b_next = arc_eviction_list;
1560                         arc_eviction_list = buf;
1561                         mutex_exit(&buf->b_evict_lock);
1562                         mutex_exit(&arc_eviction_mtx);
1563                 } else {
1564                         arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1565                 }
1566         }
1567         if (hdr->b_freeze_cksum != NULL) {
1568                 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1569                 hdr->b_freeze_cksum = NULL;
1570         }
1571         if (hdr->b_thawed) {
1572                 kmem_free(hdr->b_thawed, 1);
1573                 hdr->b_thawed = NULL;
1574         }
1575 
1576         ASSERT(!list_link_active(&hdr->b_arc_node));
1577         ASSERT3P(hdr->b_hash_next, ==, NULL);
1578         ASSERT3P(hdr->b_acb, ==, NULL);
1579         kmem_cache_free(hdr_cache, hdr);
1580 }
1581 
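/*
 * Drop the hold identified by `tag' and free the buffer.  The buffer must
 * not have an eviction callback registered.  For hashed (non-anonymous)
 * headers the last remaining copy is kept and marked ARC_BUF_AVAILABLE
 * rather than destroyed; anonymous headers are torn down once no
 * references or in-progress writes remain.
 */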
1582 void
1583 arc_buf_free(arc_buf_t *buf, void *tag)
1584 {
1585         arc_buf_hdr_t *hdr = buf->b_hdr;
1586         int hashed = hdr->b_state != arc_anon;
1587 
1588         ASSERT(buf->b_efunc == NULL);
1589         ASSERT(buf->b_data != NULL);
1590 
1591         if (hashed) {
1592                 kmutex_t *hash_lock = HDR_LOCK(hdr);
1593 
1594                 mutex_enter(hash_lock);
1595                 hdr = buf->b_hdr;
1596                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1597 
1598                 (void) remove_reference(hdr, hash_lock, tag);
1599                 if (hdr->b_datacnt > 1) {
1600                         arc_buf_destroy(buf, FALSE, TRUE);
1601                 } else {
1602                         ASSERT(buf == hdr->b_buf);
1603                         ASSERT(buf->b_efunc == NULL);
1604                         hdr->b_flags |= ARC_BUF_AVAILABLE;
1605                 }
1606                 mutex_exit(hash_lock);
1607         } else if (HDR_IO_IN_PROGRESS(hdr)) {
1608                 int destroy_hdr;
1609                 /*
1610                  * We are in the middle of an async write.  Don't destroy
1611                  * this buffer unless the write completes before we finish
1612                  * decrementing the reference count.
1613                  */
1614                 mutex_enter(&arc_eviction_mtx);
1615                 (void) remove_reference(hdr, NULL, tag);
1616                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
1617                 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1618                 mutex_exit(&arc_eviction_mtx);
1619                 if (destroy_hdr)
1620                         arc_hdr_destroy(hdr);
1621         } else {
1622                 if (remove_reference(hdr, NULL, tag) > 0)
1623                         arc_buf_destroy(buf, FALSE, TRUE);
1624                 else
1625                         arc_hdr_destroy(hdr);
1626         }
1627 }
1628 
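/*
 * Drop the hold identified by `tag'.  If the buffer has no eviction
 * callback it is freed (or, for the last copy of a hashed header, marked
 * ARC_BUF_AVAILABLE).  Returns B_TRUE if no eviction callback was
 * registered.
 */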
1629 boolean_t
1630 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1631 {
1632         arc_buf_hdr_t *hdr = buf->b_hdr;
1633         kmutex_t *hash_lock = HDR_LOCK(hdr);
1634         boolean_t no_callback = (buf->b_efunc == NULL);
1635 
1636         if (hdr->b_state == arc_anon) {
1637                 ASSERT(hdr->b_datacnt == 1);
1638                 arc_buf_free(buf, tag);
1639                 return (no_callback);
1640         }
1641 
1642         mutex_enter(hash_lock);
1643         hdr = buf->b_hdr;
1644         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1645         ASSERT(hdr->b_state != arc_anon);
1646         ASSERT(buf->b_data != NULL);
1647 
1648         (void) remove_reference(hdr, hash_lock, tag);
1649         if (hdr->b_datacnt > 1) {
1650                 if (no_callback)
1651                         arc_buf_destroy(buf, FALSE, TRUE);
1652         } else if (no_callback) {
1653                 ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1654                 ASSERT(buf->b_efunc == NULL);
1655                 hdr->b_flags |= ARC_BUF_AVAILABLE;
1656         }
1657         ASSERT(no_callback || hdr->b_datacnt > 1 ||
1658             refcount_is_zero(&hdr->b_refcnt));
1659         mutex_exit(hash_lock);
1660         return (no_callback);
1661 }
1662 
1663 int
1664 arc_buf_size(arc_buf_t *buf)
1665 {
1666         return (buf->b_hdr->b_size);
1667 }
1668 
1669 /*
1670  * Called from the DMU to determine if the current buffer should be
1671  * evicted. In order to ensure proper locking, the eviction must be initiated
1672  * from the DMU. Return true if the buffer is associated with user data and
1673  * duplicate buffers still exist.
1674  */
1675 boolean_t
1676 arc_buf_eviction_needed(arc_buf_t *buf)
1677 {
1678         arc_buf_hdr_t *hdr;
1679         boolean_t evict_needed = B_FALSE;
1680 
1681         if (zfs_disable_dup_eviction)
1682                 return (B_FALSE);
1683 
1684         mutex_enter(&buf->b_evict_lock);
1685         hdr = buf->b_hdr;
1686         if (hdr == NULL) {
1687                 /*
1688                  * We are in arc_do_user_evicts(); let that function
1689                  * perform the eviction.
1690                  */
1691                 ASSERT(buf->b_data == NULL);
1692                 mutex_exit(&buf->b_evict_lock);
1693                 return (B_FALSE);
1694         } else if (buf->b_data == NULL) {
1695                 /*
1696                  * We have already been added to the arc eviction list;
1697                  * recommend eviction.
1698                  */
1699                 ASSERT3P(hdr, ==, &arc_eviction_hdr);
1700                 mutex_exit(&buf->b_evict_lock);
1701                 return (B_TRUE);
1702         }
1703 
1704         if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1705                 evict_needed = B_TRUE;
1706 
1707         mutex_exit(&buf->b_evict_lock);
1708         return (evict_needed);
1709 }
1710 
1711 /*
1712  * Evict buffers from list until we've removed the specified number of
1713  * bytes.  Move the removed buffers to the appropriate evict state.
1714  * If the recycle flag is set, then attempt to "recycle" a buffer:
1715  * - look for a buffer to evict that is `bytes' long.
1716  * - return the data block from this buffer rather than freeing it.
1717  * This flag is used by callers that are trying to make space for a
1718  * new buffer in a full arc cache.
1719  *
1720  * This function makes a "best effort".  It skips over any buffers
1721  * it can't get a hash_lock on, and so may not catch all candidates.
1722  * It may also return without evicting as much space as requested.
1723  */
1724 static void *
1725 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1726     arc_buf_contents_t type)
1727 {
1728         arc_state_t *evicted_state;
1729         uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1730         arc_buf_hdr_t *ab, *ab_prev = NULL;
1731         list_t *list = &state->arcs_list[type];
1732         kmutex_t *hash_lock;
1733         boolean_t have_lock;
1734         void *stolen = NULL;
1735 
1736         ASSERT(state == arc_mru || state == arc_mfu);
1737 
1738         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1739 
1740         mutex_enter(&state->arcs_mtx);
1741         mutex_enter(&evicted_state->arcs_mtx);
1742 
1743         for (ab = list_tail(list); ab; ab = ab_prev) {
1744                 ab_prev = list_prev(list, ab);
1745                 /* prefetch buffers have a minimum lifespan */
1746                 if (HDR_IO_IN_PROGRESS(ab) ||
1747                     (spa && ab->b_spa != spa) ||
1748                     (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1749                     ddi_get_lbolt() - ab->b_arc_access <
1750                     arc_min_prefetch_lifespan)) {
1751                         skipped++;
1752                         continue;
1753                 }
1754                 /* "lookahead" for better eviction candidate */
1755                 if (recycle && ab->b_size != bytes &&
1756                     ab_prev && ab_prev->b_size == bytes)
1757                         continue;
1758                 hash_lock = HDR_LOCK(ab);
1759                 have_lock = MUTEX_HELD(hash_lock);
1760                 if (have_lock || mutex_tryenter(hash_lock)) {
1761                         ASSERT0(refcount_count(&ab->b_refcnt));
1762                         ASSERT(ab->b_datacnt > 0);
1763                         while (ab->b_buf) {
1764                                 arc_buf_t *buf = ab->b_buf;
1765                                 if (!mutex_tryenter(&buf->b_evict_lock)) {
1766                                         missed += 1;
1767                                         break;
1768                                 }
1769                                 if (buf->b_data) {
1770                                         bytes_evicted += ab->b_size;
1771                                         if (recycle && ab->b_type == type &&
1772                                             ab->b_size == bytes &&
1773                                             !HDR_L2_WRITING(ab)) {
1774                                                 stolen = buf->b_data;
1775                                                 recycle = FALSE;
1776                                         }
1777                                 }
1778                                 if (buf->b_efunc) {
1779                                         mutex_enter(&arc_eviction_mtx);
1780                                         arc_buf_destroy(buf,
1781                                             buf->b_data == stolen, FALSE);
1782                                         ab->b_buf = buf->b_next;
1783                                         buf->b_hdr = &arc_eviction_hdr;
1784                                         buf->b_next = arc_eviction_list;
1785                                         arc_eviction_list = buf;
1786                                         mutex_exit(&arc_eviction_mtx);
1787                                         mutex_exit(&buf->b_evict_lock);
1788                                 } else {
1789                                         mutex_exit(&buf->b_evict_lock);
1790                                         arc_buf_destroy(buf,
1791                                             buf->b_data == stolen, TRUE);
1792                                 }
1793                         }
1794 
1795                         if (ab->b_l2hdr) {
1796                                 ARCSTAT_INCR(arcstat_evict_l2_cached,
1797                                     ab->b_size);
1798                         } else {
1799                                 if (l2arc_write_eligible(ab->b_spa, ab)) {
1800                                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
1801                                             ab->b_size);
1802                                 } else {
1803                                         ARCSTAT_INCR(
1804                                             arcstat_evict_l2_ineligible,
1805                                             ab->b_size);
1806                                 }
1807                         }
1808 
1809                         if (ab->b_datacnt == 0) {
1810                                 arc_change_state(evicted_state, ab, hash_lock);
1811                                 ASSERT(HDR_IN_HASH_TABLE(ab));
1812                                 ab->b_flags |= ARC_IN_HASH_TABLE;
1813                                 ab->b_flags &= ~ARC_BUF_AVAILABLE;
1814                                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1815                         }
1816                         if (!have_lock)
1817                                 mutex_exit(hash_lock);
1818                         if (bytes >= 0 && bytes_evicted >= bytes)
1819                                 break;
1820                 } else {
1821                         missed += 1;
1822                 }
1823         }
1824 
1825         mutex_exit(&evicted_state->arcs_mtx);
1826         mutex_exit(&state->arcs_mtx);
1827 
1828         if (bytes_evicted < bytes)
1829                 dprintf("only evicted %lld bytes from %p",
1830                     (longlong_t)bytes_evicted, state);
1831 
1832         if (skipped)
1833                 ARCSTAT_INCR(arcstat_evict_skip, skipped);
1834 
1835         if (missed)
1836                 ARCSTAT_INCR(arcstat_mutex_miss, missed);
1837 
1838         /*
1839          * We have just evicted some data into the ghost state; make
1840          * sure we also adjust the ghost state size if necessary.
1841          */
1842         if (arc_no_grow &&
1843             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1844                 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1845                     arc_mru_ghost->arcs_size - arc_c;
1846 
1847                 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1848                         int64_t todelete =
1849                             MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1850                         arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1851                 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1852                         int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1853                             arc_mru_ghost->arcs_size +
1854                             arc_mfu_ghost->arcs_size - arc_c);
1855                         arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1856                 }
1857         }
1858 
1859         return (stolen);
1860 }
1861 
1862 /*
1863  * Remove buffers from list until we've removed the specified number of
1864  * bytes.  Destroy the buffers that are removed.
1865  */
1866 static void
1867 arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1868 {
1869         arc_buf_hdr_t *ab, *ab_prev;
1870         arc_buf_hdr_t marker = { 0 };
1871         list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1872         kmutex_t *hash_lock;
1873         uint64_t bytes_deleted = 0;
1874         uint64_t bufs_skipped = 0;
1875 
1876         ASSERT(GHOST_STATE(state));
1877 top:
1878         mutex_enter(&state->arcs_mtx);
1879         for (ab = list_tail(list); ab; ab = ab_prev) {
1880                 ab_prev = list_prev(list, ab);
1881                 if (spa && ab->b_spa != spa)
1882                         continue;
1883 
1884                 /* ignore markers */
1885                 if (ab->b_spa == 0)
1886                         continue;
1887 
1888                 hash_lock = HDR_LOCK(ab);
1889                 /* caller may be trying to modify this buffer, skip it */
1890                 if (MUTEX_HELD(hash_lock))
1891                         continue;
1892                 if (mutex_tryenter(hash_lock)) {
1893                         ASSERT(!HDR_IO_IN_PROGRESS(ab));
1894                         ASSERT(ab->b_buf == NULL);
1895                         ARCSTAT_BUMP(arcstat_deleted);
1896                         bytes_deleted += ab->b_size;
1897 
1898                         if (ab->b_l2hdr != NULL) {
1899                                 /*
1900                                  * This buffer is cached on the 2nd Level ARC;
1901                                  * don't destroy the header.
1902                                  */
1903                                 arc_change_state(arc_l2c_only, ab, hash_lock);
1904                                 mutex_exit(hash_lock);
1905                         } else {
1906                                 arc_change_state(arc_anon, ab, hash_lock);
1907                                 mutex_exit(hash_lock);
1908                                 arc_hdr_destroy(ab);
1909                         }
1910 
1911                         DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1912                         if (bytes >= 0 && bytes_deleted >= bytes)
1913                                 break;
1914                 } else if (bytes < 0) {
1915                         /*
1916                          * Insert a list marker and then wait for the
1917                          * hash lock to become available. Once it's
1918                          * available, restart from where we left off.
1919                          */
1920                         list_insert_after(list, ab, &marker);
1921                         mutex_exit(&state->arcs_mtx);
1922                         mutex_enter(hash_lock);
1923                         mutex_exit(hash_lock);
1924                         mutex_enter(&state->arcs_mtx);
1925                         ab_prev = list_prev(list, &marker);
1926                         list_remove(list, &marker);
1927                 } else
1928                         bufs_skipped += 1;
1929         }
1930         mutex_exit(&state->arcs_mtx);
1931 
1932         if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1933             (bytes < 0 || bytes_deleted < bytes)) {
1934                 list = &state->arcs_list[ARC_BUFC_METADATA];
1935                 goto top;
1936         }
1937 
1938         if (bufs_skipped) {
1939                 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1940                 ASSERT(bytes >= 0);
1941         }
1942 
1943         if (bytes_deleted < bytes)
1944                 dprintf("only deleted %lld bytes from %p",
1945                     (longlong_t)bytes_deleted, state);
1946 }
1947 
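/*
 * Evict data from the MRU and MFU lists (data first, then metadata) until
 * the cache is back within its targets (arc_p for the MRU side, arc_c
 * overall), then trim the ghost lists back toward arc_c as well.
 */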
1948 static void
1949 arc_adjust(void)
1950 {
1951         int64_t adjustment, delta;
1952 
1953         /*
1954          * Adjust MRU size
1955          */
1956 
1957         adjustment = MIN((int64_t)(arc_size - arc_c),
1958             (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
1959             arc_p));
1960 
1961         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1962                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
1963                 (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA);
1964                 adjustment -= delta;
1965         }
1966 
1967         if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1968                 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
1969                 (void) arc_evict(arc_mru, NULL, delta, FALSE,
1970                     ARC_BUFC_METADATA);
1971         }
1972 
1973         /*
1974          * Adjust MFU size
1975          */
1976 
1977         adjustment = arc_size - arc_c;
1978 
1979         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1980                 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
1981                 (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA);
1982                 adjustment -= delta;
1983         }
1984 
1985         if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1986                 int64_t delta = MIN(adjustment,
1987                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
1988                 (void) arc_evict(arc_mfu, NULL, delta, FALSE,
1989                     ARC_BUFC_METADATA);
1990         }
1991 
1992         /*
1993          * Adjust ghost lists
1994          */
1995 
1996         adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
1997 
1998         if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
1999                 delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2000                 arc_evict_ghost(arc_mru_ghost, NULL, delta);
2001         }
2002 
2003         adjustment =
2004             arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2005 
2006         if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2007                 delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2008                 arc_evict_ghost(arc_mfu_ghost, NULL, delta);
2009         }
2010 }
2011 
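/*
 * Run the deferred eviction callbacks for buffers parked on
 * arc_eviction_list (their data has already been evicted), then free the
 * arc_buf_t structures themselves.
 */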
2012 static void
2013 arc_do_user_evicts(void)
2014 {
2015         mutex_enter(&arc_eviction_mtx);
2016         while (arc_eviction_list != NULL) {
2017                 arc_buf_t *buf = arc_eviction_list;
2018                 arc_eviction_list = buf->b_next;
2019                 mutex_enter(&buf->b_evict_lock);
2020                 buf->b_hdr = NULL;
2021                 mutex_exit(&buf->b_evict_lock);
2022                 mutex_exit(&arc_eviction_mtx);
2023 
2024                 if (buf->b_efunc != NULL)
2025                         VERIFY(buf->b_efunc(buf) == 0);
2026 
2027                 buf->b_efunc = NULL;
2028                 buf->b_private = NULL;
2029                 kmem_cache_free(buf_cache, buf);
2030                 mutex_enter(&arc_eviction_mtx);
2031         }
2032         mutex_exit(&arc_eviction_mtx);
2033 }
2034 
2035 /*
2036  * Flush all *evictable* data from the cache for the given spa.
2037  * NOTE: this will not touch "active" (i.e. referenced) data.
2038  */
2039 void
2040 arc_flush(spa_t *spa)
2041 {
2042         uint64_t guid = 0;
2043 
2044         if (spa)
2045                 guid = spa_load_guid(spa);
2046 
2047         while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
2048                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2049                 if (spa)
2050                         break;
2051         }
2052         while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
2053                 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2054                 if (spa)
2055                         break;
2056         }
2057         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
2058                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2059                 if (spa)
2060                         break;
2061         }
2062         while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
2063                 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2064                 if (spa)
2065                         break;
2066         }
2067 
2068         arc_evict_ghost(arc_mru_ghost, guid, -1);
2069         arc_evict_ghost(arc_mfu_ghost, guid, -1);
2070 
2071         mutex_enter(&arc_reclaim_thr_lock);
2072         arc_do_user_evicts();
2073         mutex_exit(&arc_reclaim_thr_lock);
2074         ASSERT(spa || arc_eviction_list == NULL);
2075 }
2076 
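/*
 * Reduce the target cache size arc_c (and proportionally arc_p) by
 * 1/2^arc_shrink_shift, or by however much the kernel says it needs,
 * without going below arc_c_min, then evict enough data to honor the new
 * target.
 */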
2077 void
2078 arc_shrink(void)
2079 {
2080         if (arc_c > arc_c_min) {
2081                 uint64_t to_free;
2082 
2083 #ifdef _KERNEL
2084                 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
2085 #else
2086                 to_free = arc_c >> arc_shrink_shift;
2087 #endif
2088                 if (arc_c > arc_c_min + to_free)
2089                         atomic_add_64(&arc_c, -to_free);
2090                 else
2091                         arc_c = arc_c_min;
2092 
2093                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2094                 if (arc_c > arc_size)
2095                         arc_c = MAX(arc_size, arc_c_min);
2096                 if (arc_p > arc_c)
2097                         arc_p = (arc_c >> 1);
2098                 ASSERT(arc_c >= arc_c_min);
2099                 ASSERT((int64_t)arc_p >= 0);
2100         }
2101 
2102         if (arc_size > arc_c)
2103                 arc_adjust();
2104 }
2105 
2106 /*
2107  * Determine if the system is under memory pressure and is asking
2108  * to reclaim memory. A return value of 1 indicates that the system
2109  * is under memory pressure and that the arc should adjust accordingly.
2110  */
2111 static int
2112 arc_reclaim_needed(void)
2113 {
2114         uint64_t extra;
2115 
2116 #ifdef _KERNEL
2117 
2118         if (needfree)
2119                 return (1);
2120 
2121         /*
2122          * take 'desfree' extra pages, so we reclaim sooner rather than later
2123          */
2124         extra = desfree;
2125 
2126         /*
2127          * check that we're out of range of the pageout scanner.  It starts to
2128          * schedule paging if freemem is less than lotsfree and needfree.
2129          * lotsfree is the high-water mark for pageout, and needfree is the
2130          * number of needed free pages.  We add extra pages here to make sure
2131          * the scanner doesn't start up while we're freeing memory.
2132          */
2133         if (freemem < lotsfree + needfree + extra)
2134                 return (1);
2135 
2136         /*
2137          * check to make sure that swapfs has enough space so that anon
2138          * reservations can still succeed. anon_resvmem() checks that the
2139          * availrmem is greater than swapfs_minfree, and the number of reserved
2140          * swap pages.  We also add a bit of extra here just to prevent
2141          * circumstances from getting really dire.
2142          */
2143         if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2144                 return (1);
2145 
2146 #if defined(__i386)
2147         /*
2148          * If we're on an i386 platform, it's possible that we'll exhaust the
2149          * kernel heap space before we ever run out of available physical
2150          * memory.  Most checks of the size of the heap_area compare against
2151          * tune.t_minarmem, which is the minimum available real memory that we
2152          * can have in the system.  However, this is generally fixed at 25 pages
2153          * which is so low that it's useless.  In this comparison, we seek to
2154          * calculate the total heap-size, and reclaim if more than 3/4ths of the
2155          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2156          * free)
2157          */
2158         if (vmem_size(heap_arena, VMEM_FREE) <
2159             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2))
2160                 return (1);
2161 #endif
2162 
2163         /*
2164  * If zio data pages are being allocated out of a separate heap segment,
2165  * then make sure that at least about 1/16th of that arena's vmem
2166  * remains free.
2167          *
2168          * Note: The 1/16th arena free requirement was put in place
2169          * to aggressively evict memory from the arc in order to avoid
2170          * memory fragmentation issues.
2171          */
2172         if (zio_arena != NULL &&
2173             vmem_size(zio_arena, VMEM_FREE) <
2174             (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2175                 return (1);
2176 #else
2177         if (spa_get_random(100) == 0)
2178                 return (1);
2179 #endif
2180         return (0);
2181 }
2182 
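/*
 * Return free memory to the system: purge DNLC entries if we are over the
 * meta-data limit, and reap the zio, buf and hdr kmem caches.  An
 * aggressive reclaim (ARC_RECLAIM_AGGR) also shrinks the cache targets and
 * reaps the zio arena's quantum caches.
 */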
2183 static void
2184 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2185 {
2186         size_t                  i;
2187         kmem_cache_t            *prev_cache = NULL;
2188         kmem_cache_t            *prev_data_cache = NULL;
2189         extern kmem_cache_t     *zio_buf_cache[];
2190         extern kmem_cache_t     *zio_data_buf_cache[];
2191 
2192 #ifdef _KERNEL
2193         if (arc_meta_used >= arc_meta_limit) {
2194                 /*
2195                  * We are exceeding our meta-data cache limit.
2196                  * Purge some DNLC entries to release holds on meta-data.
2197                  */
2198                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2199         }
2200 #if defined(__i386)
2201         /*
2202          * Reclaim unused memory from all kmem caches.
2203          */
2204         kmem_reap();
2205 #endif
2206 #endif
2207 
2208         /*
2209          * An aggressive reclamation will shrink the cache size as well as
2210          * reap free buffers from the arc kmem caches.
2211          */
2212         if (strat == ARC_RECLAIM_AGGR)
2213                 arc_shrink();
2214 
2215         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2216                 if (zio_buf_cache[i] != prev_cache) {
2217                         prev_cache = zio_buf_cache[i];
2218                         kmem_cache_reap_now(zio_buf_cache[i]);
2219                 }
2220                 if (zio_data_buf_cache[i] != prev_data_cache) {
2221                         prev_data_cache = zio_data_buf_cache[i];
2222                         kmem_cache_reap_now(zio_data_buf_cache[i]);
2223                 }
2224         }
2225         kmem_cache_reap_now(buf_cache);
2226         kmem_cache_reap_now(hdr_cache);
2227 
2228         /*
2229          * Ask the vmem arena to reclaim unused memory from its
2230          * quantum caches.
2231          */
2232         if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2233                 vmem_qcache_reap(zio_arena);
2234 }
2235 
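/*
 * The reclaim thread wakes up at least once a second (or when signalled)
 * to check for memory pressure.  Under pressure it disables cache growth,
 * reaps the kmem caches (alternating between conservative and aggressive
 * strategies), and re-enables growth arc_grow_retry seconds after the
 * pressure subsides.  It also drives arc_adjust() and the deferred user
 * eviction callbacks.
 */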
2236 static void
2237 arc_reclaim_thread(void)
2238 {
2239         clock_t                 growtime = 0;
2240         arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
2241         callb_cpr_t             cpr;
2242 
2243         CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2244 
2245         mutex_enter(&arc_reclaim_thr_lock);
2246         while (arc_thread_exit == 0) {
2247                 if (arc_reclaim_needed()) {
2248 
2249                         if (arc_no_grow) {
2250                                 if (last_reclaim == ARC_RECLAIM_CONS) {
2251                                         last_reclaim = ARC_RECLAIM_AGGR;
2252                                 } else {
2253                                         last_reclaim = ARC_RECLAIM_CONS;
2254                                 }
2255                         } else {
2256                                 arc_no_grow = TRUE;
2257                                 last_reclaim = ARC_RECLAIM_AGGR;
2258                                 membar_producer();
2259                         }
2260 
2261                         /* reset the growth delay for every reclaim */
2262                         growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2263 
2264                         arc_kmem_reap_now(last_reclaim);
2265                         arc_warm = B_TRUE;
2266 
2267                 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2268                         arc_no_grow = FALSE;
2269                 }
2270 
2271                 arc_adjust();
2272 
2273                 if (arc_eviction_list != NULL)
2274                         arc_do_user_evicts();
2275 
2276                 /* block until needed, or one second, whichever is shorter */
2277                 CALLB_CPR_SAFE_BEGIN(&cpr);
2278                 (void) cv_timedwait(&arc_reclaim_thr_cv,
2279                     &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz));
2280                 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2281         }
2282 
2283         arc_thread_exit = 0;
2284         cv_broadcast(&arc_reclaim_thr_cv);
2285         CALLB_CPR_EXIT(&cpr);               /* drops arc_reclaim_thr_lock */
2286         thread_exit();
2287 }
2288 
2289 /*
2290  * Adapt arc info given the number of bytes we are trying to add and
2291  * the state that we are coming from.  This function is only called
2292  * when we are adding new content to the cache.
2293  */
2294 static void
2295 arc_adapt(int bytes, arc_state_t *state)
2296 {
2297         int mult;
2298         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2299 
2300         if (state == arc_l2c_only)
2301                 return;
2302 
2303         ASSERT(bytes > 0);
2304         /*
2305          * Adapt the target size of the MRU list:
2306          *      - if we just hit in the MRU ghost list, then increase
2307          *        the target size of the MRU list.
2308          *      - if we just hit in the MFU ghost list, then increase
2309          *        the target size of the MFU list by decreasing the
2310          *        target size of the MRU list.
2311          */
2312         if (state == arc_mru_ghost) {
2313                 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2314                     1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2315                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2316 
2317                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2318         } else if (state == arc_mfu_ghost) {
2319                 uint64_t delta;
2320 
2321                 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2322                     1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2323                 mult = MIN(mult, 10);
2324 
2325                 delta = MIN(bytes * mult, arc_p);
2326                 arc_p = MAX(arc_p_min, arc_p - delta);
2327         }
2328         ASSERT((int64_t)arc_p >= 0);
2329 
2330         if (arc_reclaim_needed()) {
2331                 cv_signal(&arc_reclaim_thr_cv);
2332                 return;
2333         }
2334 
2335         if (arc_no_grow)
2336                 return;
2337 
2338         if (arc_c >= arc_c_max)
2339                 return;
2340 
2341         /*
2342          * If we're within (2 * maxblocksize) bytes of the target
2343          * cache size, increment the target cache size
2344          */
2345         if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2346                 atomic_add_64(&arc_c, (int64_t)bytes);
2347                 if (arc_c > arc_c_max)
2348                         arc_c = arc_c_max;
2349                 else if (state == arc_anon)
2350                         atomic_add_64(&arc_p, (int64_t)bytes);
2351                 if (arc_p > arc_c)
2352                         arc_p = arc_c;
2353         }
2354         ASSERT((int64_t)arc_p >= 0);
2355 }
2356 
2357 /*
2358  * Check if the cache has reached its limits and eviction is required
2359  * prior to insert.
2360  */
2361 static int
2362 arc_evict_needed(arc_buf_contents_t type)
2363 {
2364         if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2365                 return (1);
2366 
2367         if (arc_reclaim_needed())
2368                 return (1);
2369 
2370         return (arc_size > arc_c);
2371 }
2372 
2373 /*
2374  * The buffer, supplied as the first argument, needs a data block.
2375  * So, if we are at cache max, determine which cache should be victimized.
2376  * We have the following cases:
2377  *
2378  * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2379  * In this situation if we're out of space, but the resident size of the MFU is
2380  * under the limit, victimize the MFU cache to satisfy this insertion request.
2381  *
2382  * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2383  * Here, we've used up all of the available space for the MRU, so we need to
2384  * evict from our own cache instead.  Evict from the set of resident MRU
2385  * entries.
2386  *
2387  * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2388  * c minus p represents the MFU space in the cache, since p is the size of the
2389  * cache that is dedicated to the MRU.  In this situation there's still space on
2390  * the MFU side, so the MRU side needs to be victimized.
2391  *
2392  * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2393  * MFU's resident set is consuming more space than it has been allotted.  In
2394  * this situation, we must victimize our own cache, the MFU, for this insertion.
2395  */
2396 static void
2397 arc_get_data_buf(arc_buf_t *buf)
2398 {
2399         arc_state_t             *state = buf->b_hdr->b_state;
2400         uint64_t                size = buf->b_hdr->b_size;
2401         arc_buf_contents_t      type = buf->b_hdr->b_type;
2402 
2403         arc_adapt(size, state);
2404 
2405         /*
2406          * We have not yet reached cache maximum size,
2407          * just allocate a new buffer.
2408          */
2409         if (!arc_evict_needed(type)) {
2410                 if (type == ARC_BUFC_METADATA) {
2411                         buf->b_data = zio_buf_alloc(size);
2412                         arc_space_consume(size, ARC_SPACE_DATA);
2413                 } else {
2414                         ASSERT(type == ARC_BUFC_DATA);
2415                         buf->b_data = zio_data_buf_alloc(size);
2416                         ARCSTAT_INCR(arcstat_data_size, size);
2417                         atomic_add_64(&arc_size, size);
2418                 }
2419                 goto out;
2420         }
2421 
2422         /*
2423          * If we are prefetching from the mfu ghost list, this buffer
2424          * will end up on the mru list; so steal space from there.
2425          */
2426         if (state == arc_mfu_ghost)
2427                 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2428         else if (state == arc_mru_ghost)
2429                 state = arc_mru;
2430 
2431         if (state == arc_mru || state == arc_anon) {
2432                 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2433                 state = (arc_mfu->arcs_lsize[type] >= size &&
2434                     arc_p > mru_used) ? arc_mfu : arc_mru;
2435         } else {
2436                 /* MFU cases */
2437                 uint64_t mfu_space = arc_c - arc_p;
2438                 state = (arc_mru->arcs_lsize[type] >= size &&
2439                     mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2440         }
2441         if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2442                 if (type == ARC_BUFC_METADATA) {
2443                         buf->b_data = zio_buf_alloc(size);
2444                         arc_space_consume(size, ARC_SPACE_DATA);
2445                 } else {
2446                         ASSERT(type == ARC_BUFC_DATA);
2447                         buf->b_data = zio_data_buf_alloc(size);
2448                         ARCSTAT_INCR(arcstat_data_size, size);
2449                         atomic_add_64(&arc_size, size);
2450                 }
2451                 ARCSTAT_BUMP(arcstat_recycle_miss);
2452         }
2453         ASSERT(buf->b_data != NULL);
2454 out:
2455         /*
2456          * Update the state size.  Note that ghost states have a
2457          * "ghost size" and so don't need to be updated.
2458          */
2459         if (!GHOST_STATE(buf->b_hdr->b_state)) {
2460                 arc_buf_hdr_t *hdr = buf->b_hdr;
2461 
2462                 atomic_add_64(&hdr->b_state->arcs_size, size);
2463                 if (list_link_active(&hdr->b_arc_node)) {
2464                         ASSERT(refcount_is_zero(&hdr->b_refcnt));
2465                         atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2466                 }
2467                 /*
2468                  * If we are growing the cache, and we are adding anonymous
2469                  * data, and we have outgrown arc_p, update arc_p
2470                  */
2471                 if (arc_size < arc_c && hdr->b_state == arc_anon &&
2472                     arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2473                         arc_p = MIN(arc_c, arc_p + size);
2474         }
2475 }
2476 
2477 /*
2478  * This routine is called whenever a buffer is accessed.
2479  * NOTE: the hash lock is dropped in this function.
2480  */
2481 static void
2482 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2483 {
2484         clock_t now;
2485 
2486         ASSERT(MUTEX_HELD(hash_lock));
2487 
2488         if (buf->b_state == arc_anon) {
2489                 /*
2490                  * This buffer is not in the cache, and does not
2491                  * appear in our "ghost" list.  Add the new buffer
2492                  * to the MRU state.
2493                  */
2494 
2495                 ASSERT(buf->b_arc_access == 0);
2496                 buf->b_arc_access = ddi_get_lbolt();
2497                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2498                 arc_change_state(arc_mru, buf, hash_lock);
2499 
2500         } else if (buf->b_state == arc_mru) {
2501                 now = ddi_get_lbolt();
2502 
2503                 /*
2504                  * If this buffer is here because of a prefetch, then either:
2505                  * - clear the flag if this is a "referencing" read
2506                  *   (any subsequent access will bump this into the MFU state).
2507                  * or
2508                  * - move the buffer to the head of the list if this is
2509                  *   another prefetch (to make it less likely to be evicted).
2510                  */
2511                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2512                         if (refcount_count(&buf->b_refcnt) == 0) {
2513                                 ASSERT(list_link_active(&buf->b_arc_node));
2514                         } else {
2515                                 buf->b_flags &= ~ARC_PREFETCH;
2516                                 ARCSTAT_BUMP(arcstat_mru_hits);
2517                         }
2518                         buf->b_arc_access = now;
2519                         return;
2520                 }
2521 
2522                 /*
2523                  * This buffer has been "accessed" only once so far,
2524                  * but it is still in the cache.  If enough time has
2525                  * passed, move it to the MFU state.
2526                  */
2527                 if (now > buf->b_arc_access + ARC_MINTIME) {
2528                         /*
2529                          * More than ARC_MINTIME has passed since we
2530                          * instantiated this buffer.  Move it to the
2531                          * most frequently used state.
2532                          */
2533                         buf->b_arc_access = now;
2534                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2535                         arc_change_state(arc_mfu, buf, hash_lock);
2536                 }
2537                 ARCSTAT_BUMP(arcstat_mru_hits);
2538         } else if (buf->b_state == arc_mru_ghost) {
2539                 arc_state_t     *new_state;
2540                 /*
2541                  * This buffer has been "accessed" recently, but
2542                  * was evicted from the cache.  Move it to the
2543                  * MFU state.
2544                  */
2545 
2546                 if (buf->b_flags & ARC_PREFETCH) {
2547                         new_state = arc_mru;
2548                         if (refcount_count(&buf->b_refcnt) > 0)
2549                                 buf->b_flags &= ~ARC_PREFETCH;
2550                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2551                 } else {
2552                         new_state = arc_mfu;
2553                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2554                 }
2555 
2556                 buf->b_arc_access = ddi_get_lbolt();
2557                 arc_change_state(new_state, buf, hash_lock);
2558 
2559                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2560         } else if (buf->b_state == arc_mfu) {
2561                 /*
2562                  * This buffer has been accessed more than once and is
2563                  * still in the cache.  Keep it in the MFU state.
2564                  *
2565                  * NOTE: an add_reference() that occurred when we did
2566                  * the arc_read() will have kicked this off the list.
2567                  * If it was a prefetch, we will explicitly move it to
2568                  * the head of the list now.
2569                  */
2570                 if ((buf->b_flags & ARC_PREFETCH) != 0) {
2571                         ASSERT(refcount_count(&buf->b_refcnt) == 0);
2572                         ASSERT(list_link_active(&buf->b_arc_node));
2573                 }
2574                 ARCSTAT_BUMP(arcstat_mfu_hits);
2575                 buf->b_arc_access = ddi_get_lbolt();
2576         } else if (buf->b_state == arc_mfu_ghost) {
2577                 arc_state_t     *new_state = arc_mfu;
2578                 /*
2579                  * This buffer has been accessed more than once but has
2580                  * been evicted from the cache.  Move it back to the
2581                  * MFU state.
2582                  */
2583 
2584                 if (buf->b_flags & ARC_PREFETCH) {
2585                         /*
2586                          * This is a prefetch access...
2587                          * move this block back to the MRU state.
2588                          */
2589                         ASSERT0(refcount_count(&buf->b_refcnt));
2590                         new_state = arc_mru;
2591                 }
2592 
2593                 buf->b_arc_access = ddi_get_lbolt();
2594                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2595                 arc_change_state(new_state, buf, hash_lock);
2596 
2597                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2598         } else if (buf->b_state == arc_l2c_only) {
2599                 /*
2600                  * This buffer is on the 2nd Level ARC.
2601                  */
2602 
2603                 buf->b_arc_access = ddi_get_lbolt();
2604                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2605                 arc_change_state(arc_mfu, buf, hash_lock);
2606         } else {
2607                 ASSERT(!"invalid arc state");
2608         }
2609 }
2610 
2611 /* a generic arc_done_func_t which you can use */
2612 /* ARGSUSED */
2613 void
2614 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2615 {
2616         if (zio == NULL || zio->io_error == 0)
2617                 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2618         VERIFY(arc_buf_remove_ref(buf, arg));
2619 }
2620 
2621 /* a generic arc_done_func_t */
2622 void
2623 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2624 {
2625         arc_buf_t **bufp = arg;
2626         if (zio && zio->io_error) {
2627                 VERIFY(arc_buf_remove_ref(buf, arg));
2628                 *bufp = NULL;
2629         } else {
2630                 *bufp = buf;
2631                 ASSERT(buf->b_data);
2632         }
2633 }
2634 
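/*
 * I/O completion callback for ARC reads: re-find the header in the hash
 * table, byteswap and checksum the data, record the access for anonymous
 * buffers, hand a copy of the buffer to each registered callback, and
 * tear the header down if the block was freed while the read was in
 * flight.
 */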
2635 static void
2636 arc_read_done(zio_t *zio)
2637 {
2638         arc_buf_hdr_t   *hdr, *found;
2639         arc_buf_t       *buf;
2640         arc_buf_t       *abuf;  /* buffer we're assigning to callback */
2641         kmutex_t        *hash_lock;
2642         arc_callback_t  *callback_list, *acb;
2643         int             freeable = FALSE;
2644 
2645         buf = zio->io_private;
2646         hdr = buf->b_hdr;
2647 
2648         /*
2649          * The hdr was inserted into hash-table and removed from lists
2650          * prior to starting I/O.  We should find this header, since
2651          * it's in the hash table, and it should be legit since it's
2652          * not possible to evict it during the I/O.  The only possible
2653          * reason for it not to be found is if we were freed during the
2654          * read.
2655          */
2656         found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2657             &hash_lock);
2658 
2659         ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2660             (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2661             (found == hdr && HDR_L2_READING(hdr)));
2662 
2663         hdr->b_flags &= ~ARC_L2_EVICTED;
2664         if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2665                 hdr->b_flags &= ~ARC_L2CACHE;
2666 
2667         /* byteswap if necessary */
2668         callback_list = hdr->b_acb;
2669         ASSERT(callback_list != NULL);
2670         if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2671                 dmu_object_byteswap_t bswap =
2672                     DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2673                 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2674                     byteswap_uint64_array :
2675                     dmu_ot_byteswap[bswap].ob_func;
2676                 func(buf->b_data, hdr->b_size);
2677         }
2678 
2679         arc_cksum_compute(buf, B_FALSE);
2680         arc_buf_watch(buf);
2681 
2682         if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2683                 /*
2684                  * Only call arc_access on anonymous buffers.  This is because
2685                  * if we've issued an I/O for an evicted buffer, we've already
2686                  * called arc_access (to prevent any simultaneous readers from
2687                  * getting confused).
2688                  */
2689                 arc_access(hdr, hash_lock);
2690         }
2691 
2692         /* create copies of the data buffer for the callers */
2693         abuf = buf;
2694         for (acb = callback_list; acb; acb = acb->acb_next) {
2695                 if (acb->acb_done) {
2696                         if (abuf == NULL) {
2697                                 ARCSTAT_BUMP(arcstat_duplicate_reads);
2698                                 abuf = arc_buf_clone(buf);
2699                         }
2700                         acb->acb_buf = abuf;
2701                         abuf = NULL;
2702                 }
2703         }
2704         hdr->b_acb = NULL;
2705         hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2706         ASSERT(!HDR_BUF_AVAILABLE(hdr));
2707         if (abuf == buf) {
2708                 ASSERT(buf->b_efunc == NULL);
2709                 ASSERT(hdr->b_datacnt == 1);
2710                 hdr->b_flags |= ARC_BUF_AVAILABLE;
2711         }
2712 
2713         ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2714 
2715         if (zio->io_error != 0) {
2716                 hdr->b_flags |= ARC_IO_ERROR;
2717                 if (hdr->b_state != arc_anon)
2718                         arc_change_state(arc_anon, hdr, hash_lock);
2719                 if (HDR_IN_HASH_TABLE(hdr))
2720                         buf_hash_remove(hdr);
2721                 freeable = refcount_is_zero(&hdr->b_refcnt);
2722         }
2723 
2724         /*
2725          * Broadcast before we drop the hash_lock to avoid the possibility
2726          * that the hdr (and hence the cv) might be freed before we get to
2727          * the cv_broadcast().
2728          */
2729         cv_broadcast(&hdr->b_cv);
2730 
2731         if (hash_lock) {
2732                 mutex_exit(hash_lock);
2733         } else {
2734                 /*
2735                  * This block was freed while we waited for the read to
2736                  * complete.  It has been removed from the hash table and
2737                  * moved to the anonymous state (so that it won't show up
2738                  * in the cache).
2739                  */
2740                 ASSERT3P(hdr->b_state, ==, arc_anon);
2741                 freeable = refcount_is_zero(&hdr->b_refcnt);
2742         }
2743 
2744         /* execute each callback and free its structure */
2745         while ((acb = callback_list) != NULL) {
2746                 if (acb->acb_done)
2747                         acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2748 
2749                 if (acb->acb_zio_dummy != NULL) {
2750                         acb->acb_zio_dummy->io_error = zio->io_error;
2751                         zio_nowait(acb->acb_zio_dummy);
2752                 }
2753 
2754                 callback_list = acb->acb_next;
2755                 kmem_free(acb, sizeof (arc_callback_t));
2756         }
2757 
2758         if (freeable)
2759                 arc_hdr_destroy(hdr);
2760 }
2761 
2762 /*
2763  * "Read" the block at the specified DVA (in bp) via the
2764  * cache.  If the block is found in the cache, invoke the provided
2765  * callback immediately and return.  Note that the `zio' parameter
2766  * in the callback will be NULL in this case, since no IO was
2767  * required.  If the block is not in the cache, pass the read request
2768  * on to the spa with a substitute callback function, so that the
2769  * requested block will be added to the cache.
2770  *
2771  * If a read request arrives for a block that has a read in-progress,
2772  * either wait for the in-progress read to complete (and return the
2773  * results); or, if this is a read with a "done" func, add a record
2774  * to the read to invoke the "done" func when the read completes,
2775  * and return; otherwise just return.
2776  *
2777  * arc_read_done() will invoke all the requested "done" functions
2778  * for readers of this block.
2779  */
2780 int
2781 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
2782     void *private, int priority, int zio_flags, uint32_t *arc_flags,
2783     const zbookmark_t *zb)
2784 {
2785         arc_buf_hdr_t *hdr;
2786         arc_buf_t *buf = NULL;
2787         kmutex_t *hash_lock;
2788         zio_t *rzio;
2789         uint64_t guid = spa_load_guid(spa);
2790 
2791 top:
2792         hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2793             &hash_lock);
2794         if (hdr && hdr->b_datacnt > 0) {
2795 
2796                 *arc_flags |= ARC_CACHED;
2797 
2798                 if (HDR_IO_IN_PROGRESS(hdr)) {
2799 
2800                         if (*arc_flags & ARC_WAIT) {
2801                                 cv_wait(&hdr->b_cv, hash_lock);
2802                                 mutex_exit(hash_lock);
2803                                 goto top;
2804                         }
2805                         ASSERT(*arc_flags & ARC_NOWAIT);
2806 
2807                         if (done) {
2808                                 arc_callback_t  *acb = NULL;
2809 
2810                                 acb = kmem_zalloc(sizeof (arc_callback_t),
2811                                     KM_SLEEP);
2812                                 acb->acb_done = done;
2813                                 acb->acb_private = private;
2814                                 if (pio != NULL)
2815                                         acb->acb_zio_dummy = zio_null(pio,
2816                                             spa, NULL, NULL, NULL, zio_flags);
2817 
2818                                 ASSERT(acb->acb_done != NULL);
2819                                 acb->acb_next = hdr->b_acb;
2820                                 hdr->b_acb = acb;
2821                                 add_reference(hdr, hash_lock, private);
2822                                 mutex_exit(hash_lock);
2823                                 return (0);
2824                         }
2825                         mutex_exit(hash_lock);
2826                         return (0);
2827                 }
2828 
2829                 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2830 
2831                 if (done) {
2832                         add_reference(hdr, hash_lock, private);
2833                         /*
2834                          * If this block is already in use, create a new
2835                          * copy of the data so that we will be guaranteed
2836                          * that arc_release() will always succeed.
2837                          */
2838                         buf = hdr->b_buf;
2839                         ASSERT(buf);
2840                         ASSERT(buf->b_data);
2841                         if (HDR_BUF_AVAILABLE(hdr)) {
2842                                 ASSERT(buf->b_efunc == NULL);
2843                                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2844                         } else {
2845                                 buf = arc_buf_clone(buf);
2846                         }
2847 
2848                 } else if (*arc_flags & ARC_PREFETCH &&
2849                     refcount_count(&hdr->b_refcnt) == 0) {
2850                         hdr->b_flags |= ARC_PREFETCH;
2851                 }
2852                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2853                 arc_access(hdr, hash_lock);
2854                 if (*arc_flags & ARC_L2CACHE)
2855                         hdr->b_flags |= ARC_L2CACHE;
2856                 mutex_exit(hash_lock);
2857                 ARCSTAT_BUMP(arcstat_hits);
2858                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2859                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2860                     data, metadata, hits);
2861 
2862                 if (done)
2863                         done(NULL, buf, private);
2864         } else {
2865                 uint64_t size = BP_GET_LSIZE(bp);
2866                 arc_callback_t  *acb;
2867                 vdev_t *vd = NULL;
2868                 uint64_t addr = 0;
2869                 boolean_t devw = B_FALSE;
2870 
2871                 if (hdr == NULL) {
2872                         /* this block is not in the cache */
2873                         arc_buf_hdr_t   *exists;
2874                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2875                         buf = arc_buf_alloc(spa, size, private, type);
2876                         hdr = buf->b_hdr;
2877                         hdr->b_dva = *BP_IDENTITY(bp);
2878                         hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
2879                         hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2880                         exists = buf_hash_insert(hdr, &hash_lock);
2881                         if (exists) {
2882                                 /* somebody beat us to the hash insert */
2883                                 mutex_exit(hash_lock);
2884                                 buf_discard_identity(hdr);
2885                                 (void) arc_buf_remove_ref(buf, private);
2886                                 goto top; /* restart the IO request */
2887                         }
2888                         /* if this is a prefetch, we don't have a reference */
2889                         if (*arc_flags & ARC_PREFETCH) {
2890                                 (void) remove_reference(hdr, hash_lock,
2891                                     private);
2892                                 hdr->b_flags |= ARC_PREFETCH;
2893                         }
2894                         if (*arc_flags & ARC_L2CACHE)
2895                                 hdr->b_flags |= ARC_L2CACHE;
2896                         if (BP_GET_LEVEL(bp) > 0)
2897                                 hdr->b_flags |= ARC_INDIRECT;
2898                 } else {
2899                         /* this block is in the ghost cache */
2900                         ASSERT(GHOST_STATE(hdr->b_state));
2901                         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2902                         ASSERT0(refcount_count(&hdr->b_refcnt));
2903                         ASSERT(hdr->b_buf == NULL);
2904 
2905                         /* if this is a prefetch, we don't have a reference */
2906                         if (*arc_flags & ARC_PREFETCH)
2907                                 hdr->b_flags |= ARC_PREFETCH;
2908                         else
2909                                 add_reference(hdr, hash_lock, private);
2910                         if (*arc_flags & ARC_L2CACHE)
2911                                 hdr->b_flags |= ARC_L2CACHE;
2912                         buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2913                         buf->b_hdr = hdr;
2914                         buf->b_data = NULL;
2915                         buf->b_efunc = NULL;
2916                         buf->b_private = NULL;
2917                         buf->b_next = NULL;
2918                         hdr->b_buf = buf;
2919                         ASSERT(hdr->b_datacnt == 0);
2920                         hdr->b_datacnt = 1;
2921                         arc_get_data_buf(buf);
2922                         arc_access(hdr, hash_lock);
2923                 }
2924 
2925                 ASSERT(!GHOST_STATE(hdr->b_state));
2926 
2927                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2928                 acb->acb_done = done;
2929                 acb->acb_private = private;
2930 
2931                 ASSERT(hdr->b_acb == NULL);
2932                 hdr->b_acb = acb;
2933                 hdr->b_flags |= ARC_IO_IN_PROGRESS;
2934 
2935                 if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2936                     (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2937                         devw = hdr->b_l2hdr->b_dev->l2ad_writing;
2938                         addr = hdr->b_l2hdr->b_daddr;
2939                         /*
2940                          * Lock out device removal.
2941                          */
2942                         if (vdev_is_dead(vd) ||
2943                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2944                                 vd = NULL;
2945                 }
2946 
2947                 mutex_exit(hash_lock);
2948 
2949                 ASSERT3U(hdr->b_size, ==, size);
2950                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
2951                     uint64_t, size, zbookmark_t *, zb);
2952                 ARCSTAT_BUMP(arcstat_misses);
2953                 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2954                     demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2955                     data, metadata, misses);
2956 
2957                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
2958                         /*
2959                          * Read from the L2ARC if the following are true:
2960                          * 1. This buffer was previously cached to L2ARC.
2961                          * 2. This buffer still has L2ARC metadata.
2962                          * 3. This buffer isn't being written to the L2ARC.
2963                          * 4. The L2ARC entry wasn't evicted, which may
2964                          *    also have invalidated the vdev.
2965                          * 5. This isn't a prefetch with l2arc_noprefetch set.
2966                          */
2967                         if (hdr->b_l2hdr != NULL &&
2968                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
2969                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
2970                                 l2arc_read_callback_t *cb;
2971 
2972                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2973                                 ARCSTAT_BUMP(arcstat_l2_hits);
2974 
2975                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2976                                     KM_SLEEP);
2977                                 cb->l2rcb_buf = buf;
2978                                 cb->l2rcb_spa = spa;
2979                                 cb->l2rcb_bp = *bp;
2980                                 cb->l2rcb_zb = *zb;
2981                                 cb->l2rcb_flags = zio_flags;
2982 
2983                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
2984                                     addr + size < vd->vdev_psize -
2985                                     VDEV_LABEL_END_SIZE);
2986 
2987                                 /*
2988                                  * l2arc read.  The SCL_L2ARC lock will be
2989                                  * released by l2arc_read_done().
2990                                  */
2991                                 rzio = zio_read_phys(pio, vd, addr, size,
2992                                     buf->b_data, ZIO_CHECKSUM_OFF,
2993                                     l2arc_read_done, cb, priority, zio_flags |
2994                                     ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
2995                                     ZIO_FLAG_DONT_PROPAGATE |
2996                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
2997                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
2998                                     zio_t *, rzio);
2999                                 ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3000 
3001                                 if (*arc_flags & ARC_NOWAIT) {
3002                                         zio_nowait(rzio);
3003                                         return (0);
3004                                 }
3005 
3006                                 ASSERT(*arc_flags & ARC_WAIT);
3007                                 if (zio_wait(rzio) == 0)
3008                                         return (0);
3009 
3010                                 /* l2arc read error; fall through to zio_read() */
3011                         } else {
3012                                 DTRACE_PROBE1(l2arc__miss,
3013                                     arc_buf_hdr_t *, hdr);
3014                                 ARCSTAT_BUMP(arcstat_l2_misses);
3015                                 if (HDR_L2_WRITING(hdr))
3016                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
3017                                 spa_config_exit(spa, SCL_L2ARC, vd);
3018                         }
3019                 } else {
3020                         if (vd != NULL)
3021                                 spa_config_exit(spa, SCL_L2ARC, vd);
3022                         if (l2arc_ndev != 0) {
3023                                 DTRACE_PROBE1(l2arc__miss,
3024                                     arc_buf_hdr_t *, hdr);
3025                                 ARCSTAT_BUMP(arcstat_l2_misses);
3026                         }
3027                 }
3028 
3029                 rzio = zio_read(pio, spa, bp, buf->b_data, size,
3030                     arc_read_done, buf, priority, zio_flags, zb);
3031 
3032                 if (*arc_flags & ARC_WAIT)
3033                         return (zio_wait(rzio));
3034 
3035                 ASSERT(*arc_flags & ARC_NOWAIT);
3036                 zio_nowait(rzio);
3037         }
3038         return (0);
3039 }
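
/*
 * Illustrative sketch (not compiled) of the caller pattern described above:
 * a blocking read plus the "done" callback that arc_read() / arc_read_done()
 * invoke.  The example_* names and the tag argument are hypothetical; the
 * flags, priority, and types are the ones arc_read() itself uses.
 */
#if 0
static void
example_read_done(zio_t *zio, arc_buf_t *buf, void *private)
{
        /* zio is NULL on a cache hit; on a miss, check zio->io_error */
        if (zio == NULL || zio->io_error == 0) {
                /* ... consume buf->b_data (buf->b_hdr->b_size bytes) ... */
        }
        (void) arc_buf_remove_ref(buf, private);
}

static int
example_sync_read(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb,
    void *tag)
{
        uint32_t flags = ARC_WAIT;      /* ARC_NOWAIT for an async read */

        /* 'tag' doubles as the callback argument and the reference tag */
        return (arc_read(NULL, spa, bp, example_read_done, tag,
            ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb));
}
#endif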
3040 
3041 void
3042 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3043 {
3044         ASSERT(buf->b_hdr != NULL);
3045         ASSERT(buf->b_hdr->b_state != arc_anon);
3046         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3047         ASSERT(buf->b_efunc == NULL);
3048         ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3049 
3050         buf->b_efunc = func;
3051         buf->b_private = private;
3052 }
3053 
3054 /*
3055  * This is used by the DMU to let the ARC know that a buffer is
3056  * being evicted, so the ARC should clean up.  If this arc buf
3057  * is not yet in the evicted state, it will be put there.
3058  */
3059 int
3060 arc_buf_evict(arc_buf_t *buf)
3061 {
3062         arc_buf_hdr_t *hdr;
3063         kmutex_t *hash_lock;
3064         arc_buf_t **bufp;
3065 
3066         mutex_enter(&buf->b_evict_lock);
3067         hdr = buf->b_hdr;
3068         if (hdr == NULL) {
3069                 /*
3070                  * We are in arc_do_user_evicts().
3071                  */
3072                 ASSERT(buf->b_data == NULL);
3073                 mutex_exit(&buf->b_evict_lock);
3074                 return (0);
3075         } else if (buf->b_data == NULL) {
3076                 arc_buf_t copy = *buf; /* structure assignment */
3077                 /*
3078                  * We are on the eviction list; process this buffer now
3079                  * but let arc_do_user_evicts() do the reaping.
3080                  */
3081                 buf->b_efunc = NULL;
3082                 mutex_exit(&buf->b_evict_lock);
3083                 VERIFY(copy.b_efunc(&copy) == 0);
3084                 return (1);
3085         }
3086         hash_lock = HDR_LOCK(hdr);
3087         mutex_enter(hash_lock);
3088         hdr = buf->b_hdr;
3089         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3090 
3091         ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3092         ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3093 
3094         /*
3095          * Pull this buffer off of the hdr
3096          */
3097         bufp = &hdr->b_buf;
3098         while (*bufp != buf)
3099                 bufp = &(*bufp)->b_next;
3100         *bufp = buf->b_next;
3101 
3102         ASSERT(buf->b_data != NULL);
3103         arc_buf_destroy(buf, FALSE, FALSE);
3104 
3105         if (hdr->b_datacnt == 0) {
3106                 arc_state_t *old_state = hdr->b_state;
3107                 arc_state_t *evicted_state;
3108 
3109                 ASSERT(hdr->b_buf == NULL);
3110                 ASSERT(refcount_is_zero(&hdr->b_refcnt));
3111 
3112                 evicted_state =
3113                     (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3114 
3115                 mutex_enter(&old_state->arcs_mtx);
3116                 mutex_enter(&evicted_state->arcs_mtx);
3117 
3118                 arc_change_state(evicted_state, hdr, hash_lock);
3119                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3120                 hdr->b_flags |= ARC_IN_HASH_TABLE;
3121                 hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3122 
3123                 mutex_exit(&evicted_state->arcs_mtx);
3124                 mutex_exit(&old_state->arcs_mtx);
3125         }
3126         mutex_exit(hash_lock);
3127         mutex_exit(&buf->b_evict_lock);
3128 
3129         VERIFY(buf->b_efunc(buf) == 0);
3130         buf->b_efunc = NULL;
3131         buf->b_private = NULL;
3132         buf->b_hdr = NULL;
3133         buf->b_next = NULL;
3134         kmem_cache_free(buf_cache, buf);
3135         return (1);
3136 }
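
/*
 * Illustrative sketch (not compiled) of the eviction-callback contract used
 * by arc_set_callback() and arc_buf_evict() above.  The example_* names are
 * hypothetical; the exact arc_evict_func_t typedef lives in sys/arc.h.  The
 * sketch assumes the callback is handed the arc_buf_t itself, which is how
 * b_efunc is invoked in this file, and that it must return 0 (the callers
 * VERIFY this).
 */
#if 0
static int
example_evict_cb(void *arg)
{
        arc_buf_t *buf = arg;
        void *private = buf->b_private;         /* set by arc_set_callback() */

        /* ... forget any cached pointers to buf->b_data keyed by private ... */
        return (0);
}

static void
example_register(arc_buf_t *buf, void *private)
{
        /* legal only while the buffer is referenced and still cached */
        arc_set_callback(buf, example_evict_cb, private);
}
#endif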
3137 
3138 /*
3139  * Release this buffer from the cache.  This must be done
3140  * after a read and prior to modifying the buffer contents.
3141  * If the buffer has more than one reference, we must make
3142  * a new hdr for the buffer.
3143  */
3144 void
3145 arc_release(arc_buf_t *buf, void *tag)
3146 {
3147         arc_buf_hdr_t *hdr;
3148         kmutex_t *hash_lock = NULL;
3149         l2arc_buf_hdr_t *l2hdr;
3150         uint64_t buf_size;
3151 
3152         /*
3153          * It would be nice to assert that if it's DMU metadata (level >
3154          * 0 || it's the dnode file), then it must be syncing context.
3155          * But we don't know that information at this level.
3156          */
3157 
3158         mutex_enter(&buf->b_evict_lock);
3159         hdr = buf->b_hdr;
3160 
3161         /* this buffer is not on any list */
3162         ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3163 
3164         if (hdr->b_state == arc_anon) {
3165                 /* this buffer is already released */
3166                 ASSERT(buf->b_efunc == NULL);
3167         } else {
3168                 hash_lock = HDR_LOCK(hdr);
3169                 mutex_enter(hash_lock);
3170                 hdr = buf->b_hdr;
3171                 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3172         }
3173 
3174         l2hdr = hdr->b_l2hdr;
3175         if (l2hdr) {
3176                 mutex_enter(&l2arc_buflist_mtx);
3177                 hdr->b_l2hdr = NULL;
3178         }
3179         buf_size = hdr->b_size;
3180 
3181         /*
3182          * Do we have more than one buf?
3183          */
3184         if (hdr->b_datacnt > 1) {
3185                 arc_buf_hdr_t *nhdr;
3186                 arc_buf_t **bufp;
3187                 uint64_t blksz = hdr->b_size;
3188                 uint64_t spa = hdr->b_spa;
3189                 arc_buf_contents_t type = hdr->b_type;
3190                 uint32_t flags = hdr->b_flags;
3191 
3192                 ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3193                 /*
3194                  * Pull the data off of this hdr and attach it to
3195                  * a new anonymous hdr.
3196                  */
3197                 (void) remove_reference(hdr, hash_lock, tag);
3198                 bufp = &hdr->b_buf;
3199                 while (*bufp != buf)
3200                         bufp = &(*bufp)->b_next;
3201                 *bufp = buf->b_next;
3202                 buf->b_next = NULL;
3203 
3204                 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3205                 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3206                 if (refcount_is_zero(&hdr->b_refcnt)) {
3207                         uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3208                         ASSERT3U(*size, >=, hdr->b_size);
3209                         atomic_add_64(size, -hdr->b_size);
3210                 }
3211 
3212                 /*
3213                  * We're releasing a duplicate user data buffer, so update
3214                  * our statistics accordingly.
3215                  */
3216                 if (hdr->b_type == ARC_BUFC_DATA) {
3217                         ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3218                         ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3219                             -hdr->b_size);
3220                 }
3221                 hdr->b_datacnt -= 1;
3222                 arc_cksum_verify(buf);
3223                 arc_buf_unwatch(buf);
3224 
3225                 mutex_exit(hash_lock);
3226 
3227                 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3228                 nhdr->b_size = blksz;
3229                 nhdr->b_spa = spa;
3230                 nhdr->b_type = type;
3231                 nhdr->b_buf = buf;
3232                 nhdr->b_state = arc_anon;
3233                 nhdr->b_arc_access = 0;
3234                 nhdr->b_flags = flags & ARC_L2_WRITING;
3235                 nhdr->b_l2hdr = NULL;
3236                 nhdr->b_datacnt = 1;
3237                 nhdr->b_freeze_cksum = NULL;
3238                 (void) refcount_add(&nhdr->b_refcnt, tag);
3239                 buf->b_hdr = nhdr;
3240                 mutex_exit(&buf->b_evict_lock);
3241                 atomic_add_64(&arc_anon->arcs_size, blksz);
3242         } else {
3243                 mutex_exit(&buf->b_evict_lock);
3244                 ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3245                 ASSERT(!list_link_active(&hdr->b_arc_node));
3246                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3247                 if (hdr->b_state != arc_anon)
3248                         arc_change_state(arc_anon, hdr, hash_lock);
3249                 hdr->b_arc_access = 0;
3250                 if (hash_lock)
3251                         mutex_exit(hash_lock);
3252 
3253                 buf_discard_identity(hdr);
3254                 arc_buf_thaw(buf);
3255         }
3256         buf->b_efunc = NULL;
3257         buf->b_private = NULL;
3258 
3259         if (l2hdr) {
3260                 list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3261                 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3262                 ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3263                 mutex_exit(&l2arc_buflist_mtx);
3264         }
3265 }
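
/*
 * Illustrative sketch (not compiled) of the release-before-modify rule
 * described above: a buffer handed out by arc_read() must be made anonymous
 * before its contents are changed, and arc_released() reports whether that
 * has happened.  example_modify() and its tag are hypothetical.
 */
#if 0
static void
example_modify(arc_buf_t *buf, void *tag)
{
        /*
         * Detach buf from the shared header (splitting off a new anonymous
         * hdr if other references exist) so in-place writes are safe.
         * 'tag' must be the tag that holds the reference on buf.
         */
        if (!arc_released(buf))
                arc_release(buf, tag);
        ASSERT(arc_released(buf));

        /* ... modify buf->b_data ... */
}
#endif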
3266 
3267 int
3268 arc_released(arc_buf_t *buf)
3269 {
3270         int released;
3271 
3272         mutex_enter(&buf->b_evict_lock);
3273         released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3274         mutex_exit(&buf->b_evict_lock);
3275         return (released);
3276 }
3277 
3278 int
3279 arc_has_callback(arc_buf_t *buf)
3280 {
3281         int callback;
3282 
3283         mutex_enter(&buf->b_evict_lock);
3284         callback = (buf->b_efunc != NULL);
3285         mutex_exit(&buf->b_evict_lock);
3286         return (callback);
3287 }
3288 
3289 #ifdef ZFS_DEBUG
3290 int
3291 arc_referenced(arc_buf_t *buf)
3292 {
3293         int referenced;
3294 
3295         mutex_enter(&buf->b_evict_lock);
3296         referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3297         mutex_exit(&buf->b_evict_lock);
3298         return (referenced);
3299 }
3300 #endif
3301 
3302 static void
3303 arc_write_ready(zio_t *zio)
3304 {
3305         arc_write_callback_t *callback = zio->io_private;
3306         arc_buf_t *buf = callback->awcb_buf;
3307         arc_buf_hdr_t *hdr = buf->b_hdr;
3308 
3309         ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3310         callback->awcb_ready(zio, buf, callback->awcb_private);
3311 
3312         /*
3313          * If the IO is already in progress, then this is a re-write
3314          * attempt, so we need to thaw and re-compute the cksum.
3315          * It is the responsibility of the callback to handle the
3316          * accounting for any re-write attempt.
3317          */
3318         if (HDR_IO_IN_PROGRESS(hdr)) {
3319                 mutex_enter(&hdr->b_freeze_lock);
3320                 if (hdr->b_freeze_cksum != NULL) {
3321                         kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3322                         hdr->b_freeze_cksum = NULL;
3323                 }
3324                 mutex_exit(&hdr->b_freeze_lock);
3325         }
3326         arc_cksum_compute(buf, B_FALSE);
3327         hdr->b_flags |= ARC_IO_IN_PROGRESS;
3328 }
3329 
3330 static void
3331 arc_write_done(zio_t *zio)
3332 {
3333         arc_write_callback_t *callback = zio->io_private;
3334         arc_buf_t *buf = callback->awcb_buf;
3335         arc_buf_hdr_t *hdr = buf->b_hdr;
3336 
3337         ASSERT(hdr->b_acb == NULL);
3338 
3339         if (zio->io_error == 0) {
3340                 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3341                 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3342                 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3343         } else {
3344                 ASSERT(BUF_EMPTY(hdr));
3345         }
3346 
3347         /*
3348          * If the block to be written was all-zero, we may have
3349          * compressed it away.  In this case no write was performed
3350          * so there will be no dva/birth/checksum.  The buffer must
3351          * therefore remain anonymous (and uncached).
3352          */
3353         if (!BUF_EMPTY(hdr)) {
3354                 arc_buf_hdr_t *exists;
3355                 kmutex_t *hash_lock;
3356 
3357                 ASSERT(zio->io_error == 0);
3358 
3359                 arc_cksum_verify(buf);
3360 
3361                 exists = buf_hash_insert(hdr, &hash_lock);
3362                 if (exists) {
3363                         /*
3364                          * This can only happen if we overwrite for
3365                          * sync-to-convergence, because we remove
3366                          * buffers from the hash table when we arc_free().
3367                          */
3368                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3369                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3370                                         panic("bad overwrite, hdr=%p exists=%p",
3371                                             (void *)hdr, (void *)exists);
3372                                 ASSERT(refcount_is_zero(&exists->b_refcnt));
3373                                 arc_change_state(arc_anon, exists, hash_lock);
3374                                 mutex_exit(hash_lock);
3375                                 arc_hdr_destroy(exists);
3376                                 exists = buf_hash_insert(hdr, &hash_lock);
3377                                 ASSERT3P(exists, ==, NULL);
3378                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3379                                 /* nopwrite */
3380                                 ASSERT(zio->io_prop.zp_nopwrite);
3381                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3382                                         panic("bad nopwrite, hdr=%p exists=%p",
3383                                             (void *)hdr, (void *)exists);
3384                         } else {
3385                                 /* Dedup */
3386                                 ASSERT(hdr->b_datacnt == 1);
3387                                 ASSERT(hdr->b_state == arc_anon);
3388                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
3389                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3390                         }
3391                 }
3392                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3393                 /* if it's not anon, we are doing a scrub */
3394                 if (!exists && hdr->b_state == arc_anon)
3395                         arc_access(hdr, hash_lock);
3396                 mutex_exit(hash_lock);
3397         } else {
3398                 hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3399         }
3400 
3401         ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3402         callback->awcb_done(zio, buf, callback->awcb_private);
3403 
3404         kmem_free(callback, sizeof (arc_write_callback_t));
3405 }
3406 
3407 zio_t *
3408 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3409     blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3410     arc_done_func_t *ready, arc_done_func_t *done, void *private,
3411     int priority, int zio_flags, const zbookmark_t *zb)
3412 {
3413         arc_buf_hdr_t *hdr = buf->b_hdr;
3414         arc_write_callback_t *callback;
3415         zio_t *zio;
3416 
3417         ASSERT(ready != NULL);
3418         ASSERT(done != NULL);
3419         ASSERT(!HDR_IO_ERROR(hdr));
3420         ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3421         ASSERT(hdr->b_acb == NULL);
3422         if (l2arc)
3423                 hdr->b_flags |= ARC_L2CACHE;
3424         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3425         callback->awcb_ready = ready;
3426         callback->awcb_done = done;
3427         callback->awcb_private = private;
3428         callback->awcb_buf = buf;
3429 
3430         zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3431             arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3432 
3433         return (zio);
3434 }
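
/*
 * Illustrative sketch (not compiled) of issuing an ARC write with the
 * function above.  The example_* callbacks are hypothetical; their
 * signatures follow arc_done_func_t, and the zio_prop_t is assumed to have
 * been filled in by the caller (the DMU normally does this).
 */
#if 0
static void
example_write_ready(zio_t *zio, arc_buf_t *buf, void *private)
{
        /* last chance to fill in buf->b_data before it is checksummed */
}

static void
example_write_done(zio_t *zio, arc_buf_t *buf, void *private)
{
        /* zio->io_bp now says where (or whether) the block was written */
}

static void
example_async_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, const zio_prop_t *zp, const zbookmark_t *zb)
{
        /* B_TRUE marks the buffer as eligible for the L2ARC */
        zio_nowait(arc_write(pio, spa, txg, bp, buf, B_TRUE, zp,
            example_write_ready, example_write_done, NULL,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
}
#endif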
3435 
3436 static int
3437 arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3438 {
3439 #ifdef _KERNEL
3440         uint64_t available_memory = ptob(freemem);
3441         static uint64_t page_load = 0;
3442         static uint64_t last_txg = 0;
3443 
3444 #if defined(__i386)
3445         available_memory =
3446             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3447 #endif
3448         if (available_memory >= zfs_write_limit_max)
3449                 return (0);
3450 
3451         if (txg > last_txg) {
3452                 last_txg = txg;
3453                 page_load = 0;
3454         }
3455         /*
3456          * If we are in pageout, we know that memory is already tight and
3457          * the ARC is already going to be evicting, so we just want to
3458          * let page writes continue to occur as quickly as possible.
3459          */
3460         if (curproc == proc_pageout) {
3461                 if (page_load > MAX(ptob(minfree), available_memory) / 4)
3462                         return (SET_ERROR(ERESTART));
3463                 /* Note: reserve is inflated, so we deflate */
3464                 page_load += reserve / 8;
3465                 return (0);
3466         } else if (page_load > 0 && arc_reclaim_needed()) {
3467                 /* memory is low, delay before restarting */
3468                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3469                 return (SET_ERROR(EAGAIN));
3470         }
3471         page_load = 0;
3472 
3473         if (arc_size > arc_c_min) {
3474                 uint64_t evictable_memory =
3475                     arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3476                     arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3477                     arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3478                     arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3479                 available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3480         }
3481 
3482         if (inflight_data > available_memory / 4) {
3483                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3484                 return (SET_ERROR(ERESTART));
3485         }
3486 #endif
3487         return (0);
3488 }
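
/*
 * Worked example of the throttle above (numbers are illustrative).  Suppose
 * 1 GB of memory is free, which is below zfs_write_limit_max, and the ARC
 * has nothing evictable (arc_size <= arc_c_min).  A caller outside pageout
 * with 300 MB of in-flight dirty data is told to retry (ERESTART), since
 * 300 MB > available_memory / 4 = 256 MB; with only 200 MB in flight the
 * reservation is allowed.
 */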
3489 
3490 void
3491 arc_tempreserve_clear(uint64_t reserve)
3492 {
3493         atomic_add_64(&arc_tempreserve, -reserve);
3494         ASSERT((int64_t)arc_tempreserve >= 0);
3495 }
3496 
3497 int
3498 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3499 {
3500         int error;
3501         uint64_t anon_size;
3502 
3503 #ifdef ZFS_DEBUG
3504         /*
3505          * Once in a while, fail for no reason.  Everything should cope.
3506          */
3507         if (spa_get_random(10000) == 0) {
3508                 dprintf("forcing random failure\n");
3509                 return (SET_ERROR(ERESTART));
3510         }
3511 #endif
3512         if (reserve > arc_c/4 && !arc_no_grow)
3513                 arc_c = MIN(arc_c_max, reserve * 4);
3514         if (reserve > arc_c)
3515                 return (SET_ERROR(ENOMEM));
3516 
3517         /*
3518          * Don't count loaned bufs as in flight dirty data to prevent long
3519          * network delays from blocking transactions that are ready to be
3520          * assigned to a txg.
3521          */
3522         anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3523 
3524         /*
3525          * Writes will, almost always, require additional memory allocations
3526          * in order to compress/encrypt/etc. the data.  We therefore need to
3527          * make sure that there is sufficient available memory for this.
3528          */
3529         if (error = arc_memory_throttle(reserve, anon_size, txg))
3530                 return (error);
3531 
3532         /*
3533          * Throttle writes when the amount of dirty data in the cache
3534          * gets too large.  We try to keep the cache less than half full
3535          * of dirty blocks so that our sync times don't grow too large.
3536          * Note: if two requests come in concurrently, we might let them
3537          * both succeed, when one of them should fail.  Not a huge deal.
3538          */
3539 
3540         if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3541             anon_size > arc_c / 4) {
3542                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3543                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3544                     arc_tempreserve>>10,
3545                     arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3546                     arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3547                     reserve>>10, arc_c>>10);
3548                 return (SET_ERROR(ERESTART));
3549         }
3550         atomic_add_64(&arc_tempreserve, reserve);
3551         return (0);
3552 }
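
/*
 * Worked example of the dirty-data throttle above: with arc_c = 4 GB and
 * arc_tempreserve negligible, a reservation is refused (ERESTART) once
 * reserve + anon_size exceeds arc_c / 2 = 2 GB *and* anon_size alone
 * exceeds arc_c / 4 = 1 GB.  So a 600 MB reserve on top of 1.5 GB of
 * anonymous dirty data throttles, while the same reserve on top of
 * 800 MB does not.
 */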
3553 
3554 void
3555 arc_init(void)
3556 {
3557         mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3558         cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3559 
3560         /* Convert seconds to clock ticks */
3561         arc_min_prefetch_lifespan = 1 * hz;
3562 
3563         /* Start out with 1/8 of all memory */
3564         arc_c = physmem * PAGESIZE / 8;
3565 
3566 #ifdef _KERNEL
3567         /*
3568          * On architectures where the physical memory can be larger
3569          * than the addressable space (Intel in 32-bit mode), we may
3570          * need to limit the cache to 1/8 of VM size.
3571          */
3572         arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3573 #endif
3574 
3575         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
3576         arc_c_min = MAX(arc_c / 4, 64<<20);
3577         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
3578         if (arc_c * 8 >= 1<<30)
3579                 arc_c_max = (arc_c * 8) - (1<<30);
3580         else
3581                 arc_c_max = arc_c_min;
3582         arc_c_max = MAX(arc_c * 6, arc_c_max);
3583 
3584         /*
3585          * Allow the tunables to override our calculations if they are
3586          * reasonable (i.e. over 64MB).
3587          */
3588         if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
3589                 arc_c_max = zfs_arc_max;
3590         if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
3591                 arc_c_min = zfs_arc_min;
3592 
3593         arc_c = arc_c_max;
3594         arc_p = (arc_c >> 1);
3595 
3596         /* limit meta-data to 1/4 of the arc capacity */
3597         arc_meta_limit = arc_c_max / 4;
3598 
3599         /* Allow the tunable to override if it is reasonable */
3600         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3601                 arc_meta_limit = zfs_arc_meta_limit;
3602 
3603         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3604                 arc_c_min = arc_meta_limit / 2;
3605 
3606         if (zfs_arc_grow_retry > 0)
3607                 arc_grow_retry = zfs_arc_grow_retry;
3608 
3609         if (zfs_arc_shrink_shift > 0)
3610                 arc_shrink_shift = zfs_arc_shrink_shift;
3611 
3612         if (zfs_arc_p_min_shift > 0)
3613                 arc_p_min_shift = zfs_arc_p_min_shift;
3614 
3615         /* if kmem_flags are set, let's try to use less memory */
3616         if (kmem_debugging())
3617                 arc_c = arc_c / 2;
3618         if (arc_c < arc_c_min)
3619                 arc_c = arc_c_min;
3620 
3621         arc_anon = &ARC_anon;
3622         arc_mru = &ARC_mru;
3623         arc_mru_ghost = &ARC_mru_ghost;
3624         arc_mfu = &ARC_mfu;
3625         arc_mfu_ghost = &ARC_mfu_ghost;
3626         arc_l2c_only = &ARC_l2c_only;
3627         arc_size = 0;
3628 
3629         mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3630         mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3631         mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3632         mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3633         mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3634         mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3635 
3636         list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3637             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3638         list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3639             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3640         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3641             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3642         list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3643             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3644         list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3645             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3646         list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3647             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3648         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3649             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3650         list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3651             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3652         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3653             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3654         list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3655             sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3656 
3657         buf_init();
3658 
3659         arc_thread_exit = 0;
3660         arc_eviction_list = NULL;
3661         mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3662         bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3663 
3664         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3665             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3666 
3667         if (arc_ksp != NULL) {
3668                 arc_ksp->ks_data = &arc_stats;
3669                 kstat_install(arc_ksp);
3670         }
3671 
3672         (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3673             TS_RUN, minclsyspri);
3674 
3675         arc_dead = FALSE;
3676         arc_warm = B_FALSE;
3677 
3678         if (zfs_write_limit_max == 0)
3679                 zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3680         else
3681                 zfs_write_limit_shift = 0;
3682         mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3683 }
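
/*
 * Worked example of the default sizing above for a hypothetical 16 GB
 * system with no tunables set and kmem debugging off:
 *
 *      arc_c (initial) = 16 GB / 8                     = 2 GB
 *      arc_c_min       = MAX(arc_c / 4, 64 MB)         = 512 MB
 *      arc_c_max       = MAX(arc_c * 6, 16 GB - 1 GB)  = 15 GB
 *      arc_meta_limit  = arc_c_max / 4                 = 3.75 GB
 *      arc_c_min       = arc_meta_limit / 2 (raised)   = 1.875 GB
 *      arc_c           = arc_c_max                     = 15 GB
 *      arc_p           = arc_c / 2                     = 7.5 GB
 */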
3684 
3685 void
3686 arc_fini(void)
3687 {
3688         mutex_enter(&arc_reclaim_thr_lock);
3689         arc_thread_exit = 1;
3690         while (arc_thread_exit != 0)
3691                 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3692         mutex_exit(&arc_reclaim_thr_lock);
3693 
3694         arc_flush(NULL);
3695 
3696         arc_dead = TRUE;
3697 
3698         if (arc_ksp != NULL) {
3699                 kstat_delete(arc_ksp);
3700                 arc_ksp = NULL;
3701         }
3702 
3703         mutex_destroy(&arc_eviction_mtx);
3704         mutex_destroy(&arc_reclaim_thr_lock);
3705         cv_destroy(&arc_reclaim_thr_cv);
3706 
3707         list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3708         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3709         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3710         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3711         list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3712         list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3713         list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3714         list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
3715 
3716         mutex_destroy(&arc_anon->arcs_mtx);
3717         mutex_destroy(&arc_mru->arcs_mtx);
3718         mutex_destroy(&arc_mru_ghost->arcs_mtx);
3719         mutex_destroy(&arc_mfu->arcs_mtx);
3720         mutex_destroy(&arc_mfu_ghost->arcs_mtx);
3721         mutex_destroy(&arc_l2c_only->arcs_mtx);
3722 
3723         mutex_destroy(&zfs_write_limit_lock);
3724 
3725         buf_fini();
3726 
3727         ASSERT(arc_loaned_bytes == 0);
3728 }
3729 
3730 /*
3731  * Level 2 ARC
3732  *
3733  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3734  * It uses dedicated storage devices to hold cached data, which are populated
3735  * using large infrequent writes.  The main role of this cache is to boost
3736  * the performance of random read workloads.  The intended L2ARC devices
3737  * include short-stroked disks, solid state disks, and other media with
3738  * substantially faster read latency than disk.
3739  *
3740  *                 +-----------------------+
3741  *                 |         ARC           |
3742  *                 +-----------------------+
3743  *                    |         ^     ^
3744  *                    |         |     |
3745  *      l2arc_feed_thread()    arc_read()
3746  *                    |         |     |
3747  *                    |  l2arc read   |
3748  *                    V         |     |
3749  *               +---------------+    |
3750  *               |     L2ARC     |    |
3751  *               +---------------+    |
3752  *                   |    ^           |
3753  *          l2arc_write() |           |
3754  *                   |    |           |
3755  *                   V    |           |
3756  *                 +-------+      +-------+
3757  *                 | vdev  |      | vdev  |
3758  *                 | cache |      | cache |
3759  *                 +-------+      +-------+
3760  *                 +=========+     .-----.
3761  *                 :  L2ARC  :    |-_____-|
3762  *                 : devices :    | Disks |
3763  *                 +=========+    `-_____-'
3764  *
3765  * Read requests are satisfied from the following sources, in order:
3766  *
3767  *      1) ARC
3768  *      2) vdev cache of L2ARC devices
3769  *      3) L2ARC devices
3770  *      4) vdev cache of disks
3771  *      5) disks
3772  *
3773  * Some L2ARC device types exhibit extremely slow write performance.
3774  * To accommodate this, there are some significant differences between
3775  * the L2ARC and traditional cache design:
3776  *
3777  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3778  * the ARC behave as usual, freeing buffers and placing headers on ghost
3779  * lists.  The ARC does not send buffers to the L2ARC during eviction as
3780  * this would add inflated write latencies for all ARC memory pressure.
3781  *
3782  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3783  * It does this by periodically scanning buffers from the eviction-end of
3784  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3785  * not already there.  It scans until a headroom of buffers is satisfied,
3786  * which itself is a buffer for ARC eviction.  The thread that does this is
3787  * l2arc_feed_thread(), illustrated below; example sizes are included to
3788  * provide a better sense of ratio than this diagram:
3789  *
3790  *             head -->                        tail
3791  *              +---------------------+----------+
3792  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3793  *              +---------------------+----------+   |   o L2ARC eligible
3794  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3795  *              +---------------------+----------+   |
3796  *                   15.9 Gbytes      ^ 32 Mbytes    |
3797  *                                 headroom          |
3798  *                                            l2arc_feed_thread()
3799  *                                                   |
3800  *                       l2arc write hand <--[oooo]--'
3801  *                               |           8 Mbyte
3802  *                               |          write max
3803  *                               V
3804  *                +==============================+
3805  *      L2ARC dev |####|#|###|###|    |####| ... |
3806  *                +==============================+
3807  *                           32 Gbytes
3808  *
3809  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3810  * evicted, then the L2ARC has cached a buffer much sooner than it probably
3811  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3812  * safe to say that this is an uncommon case, since buffers at the end of
3813  * the ARC lists have moved there due to inactivity.
3814  *
3815  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3816  * then the L2ARC simply misses copying some buffers.  This serves as a
3817  * pressure valve to prevent heavy read workloads from both stalling the ARC
3818  * with waits and clogging the L2ARC with writes.  This also helps prevent
3819  * the potential for the L2ARC to churn if it attempts to cache content too
3820  * quickly, such as during backups of the entire pool.
3821  *
3822  * 5. After system boot and before the ARC has filled main memory, there are
3823  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3824  * lists can remain mostly static.  Instead of searching from the tail of
3825  * these lists as pictured, l2arc_feed_thread() will search from the list heads
3826  * for eligible buffers, greatly increasing its chance of finding them.
3827  *
3828  * The L2ARC device write speed is also boosted during this time so that
3829  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3830  * there are no L2ARC reads, and no fear of degrading read performance
3831  * through increased writes.
3832  *
3833  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3834  * the vdev queue can aggregate them into larger and fewer writes.  Each
3835  * device is written to in a rotor fashion, sweeping writes through
3836  * available space then repeating.
3837  *
3838  * 7. The L2ARC does not store dirty content.  It never needs to flush
3839  * write buffers back to disk based storage.
3840  *
3841  * 8. If an ARC buffer is written (and dirtied) which also exists in the
3842  * L2ARC, the now stale L2ARC buffer is immediately dropped.
3843  *
3844  * The performance of the L2ARC can be tweaked by a number of tunables, which
3845  * may be necessary for different workloads:
3846  *
3847  *      l2arc_write_max         max write bytes per interval
3848  *      l2arc_write_boost       extra write bytes during device warmup
3849  *      l2arc_noprefetch        skip caching prefetched buffers
3850  *      l2arc_headroom          number of max device writes to precache
3851  *      l2arc_feed_secs         seconds between L2ARC writing
3852  *
3853  * Tunables may be removed or added as future performance improvements are
3854  * integrated, and also may become zpool properties.
3855  *
3856  * There are three key functions that control how the L2ARC warms up:
3857  *
3858  *      l2arc_write_eligible()  check if a buffer is eligible to cache
3859  *      l2arc_write_size()      calculate how much to write
3860  *      l2arc_write_interval()  calculate sleep delay between writes
3861  *
3862  * These three functions determine what to write, how much, and how quickly
3863  * to send writes.
3864  */
3865 
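/*
 * Simplified sketch (not compiled, and not the actual implementation in
 * l2arc_feed_thread() further below) that only ties together the three key
 * functions named above; example_feed_cycle() is a hypothetical name.
 */
#if 0
static clock_t
example_feed_cycle(clock_t began)
{
        l2arc_dev_t *dev;
        uint64_t target, wrote = 0;

        /* pick the next cache device; holds SCL_L2ARC on success */
        if ((dev = l2arc_dev_get_next()) == NULL)
                return (began + hz * l2arc_feed_secs);

        /* l2ad_write, plus l2ad_boost while the ARC is still warming up */
        target = l2arc_write_size(dev);

        /*
         * Scan the MRU/MFU tails, copying headers that pass
         * l2arc_write_eligible() until roughly 'target' bytes have been
         * issued; 'wrote' is what actually went out.
         */

        spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
        return (l2arc_write_interval(began, target, wrote));
}
#endif
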
3866 static boolean_t
3867 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
3868 {
3869         /*
3870          * A buffer is *not* eligible for the L2ARC if it:
3871          * 1. belongs to a different spa.
3872          * 2. is already cached on the L2ARC.
3873          * 3. has an I/O in progress (it may be an incomplete read).
3874          * 4. is flagged not eligible (zfs property).
3875          */
3876         if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL ||
3877             HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab))
3878                 return (B_FALSE);
3879 
3880         return (B_TRUE);
3881 }
3882 
3883 static uint64_t
3884 l2arc_write_size(l2arc_dev_t *dev)
3885 {
3886         uint64_t size;
3887 
3888         size = dev->l2ad_write;
3889 
3890         if (arc_warm == B_FALSE)
3891                 size += dev->l2ad_boost;
3892 
3893         return (size);
3895 }
3896 
3897 static clock_t
3898 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
3899 {
3900         clock_t interval, next, now;
3901 
3902         /*
3903          * If the ARC lists are busy, increase our write rate; if the
3904          * lists are stale, idle back.  This is achieved by checking
3905  * how much we previously wrote; if it was more than half of
3906          * what we wanted, schedule the next write much sooner.
3907          */
3908         if (l2arc_feed_again && wrote > (wanted / 2))
3909                 interval = (hz * l2arc_feed_min_ms) / 1000;
3910         else
3911                 interval = hz * l2arc_feed_secs;
3912 
3913         now = ddi_get_lbolt();
3914         next = MAX(now, MIN(now + interval, began + interval));
3915 
3916         return (next);
3917 }
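
/*
 * Worked example, assuming the shipped defaults of l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200: if the previous pass wrote more than half of
 * what it wanted, the next write is scheduled 200 ms after the previous one
 * began; otherwise a full second after.  The MAX(now, ...) clamp keeps us
 * from scheduling into the past when the write itself took longer than the
 * interval.
 */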
3918 
3919 static void
3920 l2arc_hdr_stat_add(void)
3921 {
3922         ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3923         ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3924 }
3925 
3926 static void
3927 l2arc_hdr_stat_remove(void)
3928 {
3929         ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3930         ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3931 }
3932 
3933 /*
3934  * Cycle through L2ARC devices.  This is how L2ARC load balances.
3935  * If a device is returned, this also returns holding the spa config lock.
3936  */
3937 static l2arc_dev_t *
3938 l2arc_dev_get_next(void)
3939 {
3940         l2arc_dev_t *first, *next = NULL;
3941 
3942         /*
3943          * Lock out the removal of spas (spa_namespace_lock), then removal
3944          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
3945          * both locks will be dropped and a spa config lock held instead.
3946          */
3947         mutex_enter(&spa_namespace_lock);
3948         mutex_enter(&l2arc_dev_mtx);
3949 
3950         /* if there are no vdevs, there is nothing to do */
3951         if (l2arc_ndev == 0)
3952                 goto out;
3953 
3954         first = NULL;
3955         next = l2arc_dev_last;
3956         do {
3957                 /* loop around the list looking for a non-faulted vdev */
3958                 if (next == NULL) {
3959                         next = list_head(l2arc_dev_list);
3960                 } else {
3961                         next = list_next(l2arc_dev_list, next);
3962                         if (next == NULL)
3963                                 next = list_head(l2arc_dev_list);
3964                 }
3965 
3966                 /* if we have come back to the start, bail out */
3967                 if (first == NULL)
3968                         first = next;
3969                 else if (next == first)
3970                         break;
3971 
3972         } while (vdev_is_dead(next->l2ad_vdev));
3973 
3974         /* if we were unable to find any usable vdevs, return NULL */
3975         if (vdev_is_dead(next->l2ad_vdev))
3976                 next = NULL;
3977 
3978         l2arc_dev_last = next;
3979 
3980 out:
3981         mutex_exit(&l2arc_dev_mtx);
3982 
3983         /*
3984          * Grab the config lock to prevent the 'next' device from being
3985          * removed while we are writing to it.
3986          */
3987         if (next != NULL)
3988                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3989         mutex_exit(&spa_namespace_lock);
3990 
3991         return (next);
3992 }
3993 
3994 /*
3995  * Free buffers that were tagged for destruction.
3996  */
3997 static void
3998 l2arc_do_free_on_write()
3999 {
4000         list_t *buflist;
4001         l2arc_data_free_t *df, *df_prev;
4002 
4003         mutex_enter(&l2arc_free_on_write_mtx);
4004         buflist = l2arc_free_on_write;
4005 
4006         for (df = list_tail(buflist); df; df = df_prev) {
4007                 df_prev = list_prev(buflist, df);
4008                 ASSERT(df->l2df_data != NULL);
4009                 ASSERT(df->l2df_func != NULL);
4010                 df->l2df_func(df->l2df_data, df->l2df_size);
4011                 list_remove(buflist, df);
4012                 kmem_free(df, sizeof (l2arc_data_free_t));
4013         }
4014 
4015         mutex_exit(&l2arc_free_on_write_mtx);
4016 }
4017 
4018 /*
4019  * A write to a cache device has completed.  Update all headers to allow
4020  * reads from these buffers to begin.
4021  */
4022 static void
4023 l2arc_write_done(zio_t *zio)
4024 {
4025         l2arc_write_callback_t *cb;
4026         l2arc_dev_t *dev;
4027         list_t *buflist;
4028         arc_buf_hdr_t *head, *ab, *ab_prev;
4029         l2arc_buf_hdr_t *abl2;
4030         kmutex_t *hash_lock;
4031 
4032         cb = zio->io_private;
4033         ASSERT(cb != NULL);
4034         dev = cb->l2wcb_dev;
4035         ASSERT(dev != NULL);
4036         head = cb->l2wcb_head;
4037         ASSERT(head != NULL);
4038         buflist = dev->l2ad_buflist;
4039         ASSERT(buflist != NULL);
4040         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4041             l2arc_write_callback_t *, cb);
4042 
4043         if (zio->io_error != 0)
4044                 ARCSTAT_BUMP(arcstat_l2_writes_error);
4045 
4046         mutex_enter(&l2arc_buflist_mtx);
4047 
4048         /*
4049          * All writes completed, or an error was hit.
4050          */
4051         for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4052                 ab_prev = list_prev(buflist, ab);
4053 
4054                 hash_lock = HDR_LOCK(ab);
4055                 if (!mutex_tryenter(hash_lock)) {
4056                         /*
4057                          * This buffer misses out.  It may be in the process of
4058                          * being evicted.  Its ARC_L2_WRITING flag will be
4059                          * left set, denying reads to this buffer.
4060                          */
4061                         ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4062                         continue;
4063                 }
4064 
4065                 if (zio->io_error != 0) {
4066                         /*
4067                          * Error - drop L2ARC entry.
4068                          */
4069                         list_remove(buflist, ab);
4070                         abl2 = ab->b_l2hdr;
4071                         ab->b_l2hdr = NULL;
4072                         kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4073                         ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4074                 }
4075 
4076                 /*
4077                  * Allow ARC to begin reads to this L2ARC entry.
4078                  */
4079                 ab->b_flags &= ~ARC_L2_WRITING;
4080 
4081                 mutex_exit(hash_lock);
4082         }
4083 
4084         atomic_inc_64(&l2arc_writes_done);
4085         list_remove(buflist, head);
4086         kmem_cache_free(hdr_cache, head);
4087         mutex_exit(&l2arc_buflist_mtx);
4088 
4089         l2arc_do_free_on_write();
4090 
4091         kmem_free(cb, sizeof (l2arc_write_callback_t));
4092 }
4093 
4094 /*
4095  * A read to a cache device has completed.  Validate buffer contents before
4096  * handing over to the regular ARC routines.
4097  */
4098 static void
4099 l2arc_read_done(zio_t *zio)
4100 {
4101         l2arc_read_callback_t *cb;
4102         arc_buf_hdr_t *hdr;
4103         arc_buf_t *buf;
4104         kmutex_t *hash_lock;
4105         int equal;
4106 
4107         ASSERT(zio->io_vd != NULL);
4108         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4109 
4110         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4111 
4112         cb = zio->io_private;
4113         ASSERT(cb != NULL);
4114         buf = cb->l2rcb_buf;
4115         ASSERT(buf != NULL);
4116 
4117         hash_lock = HDR_LOCK(buf->b_hdr);
4118         mutex_enter(hash_lock);
4119         hdr = buf->b_hdr;
4120         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4121 
4122         /*
4123          * Check whether this buffer survived the L2ARC journey.
4124          */
4125         equal = arc_cksum_equal(buf);
4126         if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4127                 mutex_exit(hash_lock);
4128                 zio->io_private = buf;
4129                 zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
4130                 zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
4131                 arc_read_done(zio);
4132         } else {
4133                 mutex_exit(hash_lock);
4134                 /*
4135                  * Buffer didn't survive caching.  Increment stats and
4136                  * reissue to the original storage device.
4137                  */
4138                 if (zio->io_error != 0) {
4139                         ARCSTAT_BUMP(arcstat_l2_io_error);
4140                 } else {
4141                         zio->io_error = SET_ERROR(EIO);
4142                 }
4143                 if (!equal)
4144                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4145 
4146                 /*
4147                  * If there's no waiter, issue an async i/o to the primary
4148                  * storage now.  If there *is* a waiter, the caller must
4149                  * issue the i/o in a context where it's OK to block.
4150                  */
4151                 if (zio->io_waiter == NULL) {
4152                         zio_t *pio = zio_unique_parent(zio);
4153 
4154                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4155 
4156                         zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4157                             buf->b_data, zio->io_size, arc_read_done, buf,
4158                             zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4159                 }
4160         }
4161 
4162         kmem_free(cb, sizeof (l2arc_read_callback_t));
4163 }
4164 
4165 /*
4166  * This is the list priority from which the L2ARC will search for buffers
4167  * to cache.  The index (0..3) is used within loops to cycle through the
4168  * lists in the desired order.  This order can have a significant effect
4169  * on cache performance.
4170  *
4171  * Currently the metadata lists are hit first, MFU then MRU, followed by
4172  * the data lists.  This function returns the chosen list with its lock
4173  * held, and also passes the lock pointer back through 'lock'.
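 *
 * The index mapping is: 0 = MFU metadata, 1 = MRU metadata, 2 = MFU data,
 * 3 = MRU data.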
4174  */
4175 static list_t *
4176 l2arc_list_locked(int list_num, kmutex_t **lock)
4177 {
4178         list_t *list = NULL;
4179 
4180         ASSERT(list_num >= 0 && list_num <= 3);
4181 
4182         switch (list_num) {
4183         case 0:
4184                 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4185                 *lock = &arc_mfu->arcs_mtx;
4186                 break;
4187         case 1:
4188                 list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4189                 *lock = &arc_mru->arcs_mtx;
4190                 break;
4191         case 2:
4192                 list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4193                 *lock = &arc_mfu->arcs_mtx;
4194                 break;
4195         case 3:
4196                 list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4197                 *lock = &arc_mru->arcs_mtx;
4198                 break;
4199         }
4200 
4201         ASSERT(!(MUTEX_HELD(*lock)));
4202         mutex_enter(*lock);
4203         return (list);
4204 }
4205 
4206 /*
4207  * Evict buffers from the device write hand to the distance specified in
4208  * bytes.  This distance may span populated buffers, or it may span nothing.
4209  * This clears a region on the L2ARC device, making it ready for writing.
4210  * If the 'all' boolean is set, every buffer is evicted.
4211  */
4212 static void
4213 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4214 {
4215         list_t *buflist;
4216         l2arc_buf_hdr_t *abl2;
4217         arc_buf_hdr_t *ab, *ab_prev;
4218         kmutex_t *hash_lock;
4219         uint64_t taddr;
4220 
4221         buflist = dev->l2ad_buflist;
4222 
4223         if (buflist == NULL)
4224                 return;
4225 
4226         if (!all && dev->l2ad_first) {
4227                 /*
4228                  * This is the first sweep through the device.  There is
4229                  * nothing to evict.
4230                  */
4231                 return;
4232         }
4233 
4234         if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4235                 /*
4236                  * When nearing the end of the device, evict to the end
4237                  * before the device write hand jumps to the start.
4238                  */
4239                 taddr = dev->l2ad_end;
4240         } else {
4241                 taddr = dev->l2ad_hand + distance;
4242         }
4243         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4244             uint64_t, taddr, boolean_t, all);
4245 
4246 top:
4247         mutex_enter(&l2arc_buflist_mtx);
4248         for (ab = list_tail(buflist); ab; ab = ab_prev) {
4249                 ab_prev = list_prev(buflist, ab);
4250 
4251                 hash_lock = HDR_LOCK(ab);
4252                 if (!mutex_tryenter(hash_lock)) {
4253                         /*
4254                          * Missed the hash lock.  Retry.
4255                          */
4256                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4257                         mutex_exit(&l2arc_buflist_mtx);
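                        /*
                         * Block until the current holder drops the hash
                         * lock, then restart the scan from the tail of
                         * the buflist.
                         */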
4258                         mutex_enter(hash_lock);
4259                         mutex_exit(hash_lock);
4260                         goto top;
4261                 }
4262 
4263                 if (HDR_L2_WRITE_HEAD(ab)) {
4264                         /*
4265                          * We hit a write head node.  Leave it for
4266                          * l2arc_write_done().
4267                          */
4268                         list_remove(buflist, ab);
4269                         mutex_exit(hash_lock);
4270                         continue;
4271                 }
4272 
4273                 if (!all && ab->b_l2hdr != NULL &&
4274                     (ab->b_l2hdr->b_daddr > taddr ||
4275                     ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4276                         /*
4277                          * We've evicted to the target address,
4278                          * or the end of the device.
4279                          */
4280                         mutex_exit(hash_lock);
4281                         break;
4282                 }
4283 
4284                 if (HDR_FREE_IN_PROGRESS(ab)) {
4285                         /*
4286                          * Already on the path to destruction.
4287                          */
4288                         mutex_exit(hash_lock);
4289                         continue;
4290                 }
4291 
4292                 if (ab->b_state == arc_l2c_only) {
4293                         ASSERT(!HDR_L2_READING(ab));
4294                         /*
4295                          * This doesn't exist in the ARC.  Destroy.
4296                          * arc_hdr_destroy() will call list_remove()
4297                          * and decrement arcstat_l2_size.
4298                          */
4299                         arc_change_state(arc_anon, ab, hash_lock);
4300                         arc_hdr_destroy(ab);
4301                 } else {
4302                         /*
4303                          * Invalidate issued or about to be issued
4304                          * reads, since we may be about to write
4305                          * over this location.
4306                          */
4307                         if (HDR_L2_READING(ab)) {
4308                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
4309                                 ab->b_flags |= ARC_L2_EVICTED;
4310                         }
4311 
4312                         /*
4313                          * Tell ARC this no longer exists in L2ARC.
4314                          */
4315                         if (ab->b_l2hdr != NULL) {
4316                                 abl2 = ab->b_l2hdr;
4317                                 ab->b_l2hdr = NULL;
4318                                 kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4319                                 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4320                         }
4321                         list_remove(buflist, ab);
4322 
4323                         /*
4324                          * This may have been left over after a
4325                          * failed write.
4326                          */
4327                         ab->b_flags &= ~ARC_L2_WRITING;
4328                 }
4329                 mutex_exit(hash_lock);
4330         }
4331         mutex_exit(&l2arc_buflist_mtx);
4332 
4333         vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4334         dev->l2ad_evict = taddr;
4335 }
4336 
4337 /*
4338  * Find and write ARC buffers to the L2ARC device.
4339  *
4340  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4341  * for reading until they have completed writing.
4342  */
4343 static uint64_t
4344 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4345 {
4346         arc_buf_hdr_t *ab, *ab_prev, *head;
4347         l2arc_buf_hdr_t *hdrl2;
4348         list_t *list;
4349         uint64_t passed_sz, write_sz, buf_sz, headroom;
4350         void *buf_data;
4351         kmutex_t *hash_lock, *list_lock;
4352         boolean_t have_lock, full;
4353         l2arc_write_callback_t *cb;
4354         zio_t *pio, *wzio;
4355         uint64_t guid = spa_load_guid(spa);
4356 
4357         ASSERT(dev->l2ad_vdev != NULL);
4358 
4359         pio = NULL;
4360         write_sz = 0;
4361         full = B_FALSE;
4362         head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4363         head->b_flags |= ARC_L2_WRITE_HEAD;
4364 
4365         /*
4366          * Copy buffers for L2ARC writing.
4367          */
4368         mutex_enter(&l2arc_buflist_mtx);
4369         for (int try = 0; try <= 3; try++) {
4370                 list = l2arc_list_locked(try, &list_lock);
4371                 passed_sz = 0;
4372 
4373                 /*
4374                  * L2ARC fast warmup.
4375                  *
4376                  * Until the ARC is warm and starts to evict, read from the
4377                  * head of the ARC lists rather than the tail.
4378                  */
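                /*
                 * Limit how far down the list we search: once passed_sz
                 * exceeds this headroom, the remaining buffers are left
                 * for a later feed cycle.
                 */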
4379                 headroom = target_sz * l2arc_headroom;
4380                 if (arc_warm == B_FALSE)
4381                         ab = list_head(list);
4382                 else
4383                         ab = list_tail(list);
4384 
4385                 for (; ab; ab = ab_prev) {
4386                         if (arc_warm == B_FALSE)
4387                                 ab_prev = list_next(list, ab);
4388                         else
4389                                 ab_prev = list_prev(list, ab);
4390 
4391                         hash_lock = HDR_LOCK(ab);
4392                         have_lock = MUTEX_HELD(hash_lock);
4393                         if (!have_lock && !mutex_tryenter(hash_lock)) {
4394                                 /*
4395                                  * Skip this buffer rather than waiting.
4396                                  */
4397                                 continue;
4398                         }
4399 
4400                         passed_sz += ab->b_size;
4401                         if (passed_sz > headroom) {
4402                                 /*
4403                                  * Searched too far.
4404                                  */
4405                                 mutex_exit(hash_lock);
4406                                 break;
4407                         }
4408 
4409                         if (!l2arc_write_eligible(guid, ab)) {
4410                                 mutex_exit(hash_lock);
4411                                 continue;
4412                         }
4413 
4414                         if ((write_sz + ab->b_size) > target_sz) {
4415                                 full = B_TRUE;
4416                                 mutex_exit(hash_lock);
4417                                 break;
4418                         }
4419 
4420                         if (pio == NULL) {
4421                                 /*
4422                                  * Insert a dummy header on the buflist so
4423                                  * l2arc_write_done() can find where the
4424                                  * write buffers begin without searching.
4425                                  */
4426                                 list_insert_head(dev->l2ad_buflist, head);
4427 
4428                                 cb = kmem_alloc(
4429                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
4430                                 cb->l2wcb_dev = dev;
4431                                 cb->l2wcb_head = head;
4432                                 pio = zio_root(spa, l2arc_write_done, cb,
4433                                     ZIO_FLAG_CANFAIL);
4434                         }
4435 
4436                         /*
4437                          * Create and add a new L2ARC header.
4438                          */
4439                         hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4440                         hdrl2->b_dev = dev;
4441                         hdrl2->b_daddr = dev->l2ad_hand;
4442 
4443                         ab->b_flags |= ARC_L2_WRITING;
4444                         ab->b_l2hdr = hdrl2;
4445                         list_insert_head(dev->l2ad_buflist, ab);
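                        /*
                         * Capture the data pointer and size while the hash
                         * lock is still held; the write zio below is issued
                         * after the lock has been dropped.
                         */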
4446                         buf_data = ab->b_buf->b_data;
4447                         buf_sz = ab->b_size;
4448 
4449                         /*
4450                          * Compute and store the buffer cksum before
4451                          * writing.  With debugging enabled, the cksum is verified first.
4452                          */
4453                         arc_cksum_verify(ab->b_buf);
4454                         arc_cksum_compute(ab->b_buf, B_TRUE);
4455 
4456                         mutex_exit(hash_lock);
4457 
4458                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
4459                             dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4460                             NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4461                             ZIO_FLAG_CANFAIL, B_FALSE);
4462 
4463                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4464                             zio_t *, wzio);
4465                         (void) zio_nowait(wzio);
4466 
4467                         /*
4468                          * Keep the clock hand suitably device-aligned.
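                         * vdev_psize_to_asize() rounds the size up to the
                         * device's allocation size, so both the hand and
                         * write_sz advance by whole allocation units.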
4469                          */
4470                         buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4471 
4472                         write_sz += buf_sz;
4473                         dev->l2ad_hand += buf_sz;
4474                 }
4475 
4476                 mutex_exit(list_lock);
4477 
4478                 if (full == B_TRUE)
4479                         break;
4480         }
4481         mutex_exit(&l2arc_buflist_mtx);
4482 
4483         if (pio == NULL) {
4484                 ASSERT0(write_sz);
4485                 kmem_cache_free(hdr_cache, head);
4486                 return (0);
4487         }
4488 
4489         ASSERT3U(write_sz, <=, target_sz);
4490         ARCSTAT_BUMP(arcstat_l2_writes_sent);
4491         ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4492         ARCSTAT_INCR(arcstat_l2_size, write_sz);
4493         vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4494 
4495         /*
4496          * Bump device hand to the device start if it is approaching the end.
4497          * l2arc_evict() will already have evicted ahead for this case.
4498          */
4499         if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4500                 vdev_space_update(dev->l2ad_vdev,
4501                     dev->l2ad_end - dev->l2ad_hand, 0, 0);
4502                 dev->l2ad_hand = dev->l2ad_start;
4503                 dev->l2ad_evict = dev->l2ad_start;
4504                 dev->l2ad_first = B_FALSE;
4505         }
4506 
4507         dev->l2ad_writing = B_TRUE;
4508         (void) zio_wait(pio);
4509         dev->l2ad_writing = B_FALSE;
4510 
4511         return (write_sz);
4512 }
4513 
4514 /*
4515  * This thread feeds the L2ARC at regular intervals.  This is the beating
4516  * heart of the L2ARC.
4517  */
4518 static void
4519 l2arc_feed_thread(void)
4520 {
4521         callb_cpr_t cpr;
4522         l2arc_dev_t *dev;
4523         spa_t *spa;
4524         uint64_t size, wrote;
4525         clock_t begin, next = ddi_get_lbolt();
4526 
4527         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4528 
4529         mutex_enter(&l2arc_feed_thr_lock);
4530 
4531         while (l2arc_thread_exit == 0) {
4532                 CALLB_CPR_SAFE_BEGIN(&cpr);
4533                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4534                     next);
4535                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
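                /*
                 * Default to re-checking one second from now; the cases
                 * below may push this further out (e.g. a read-only pool)
                 * or recompute it via l2arc_write_interval().
                 */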
4536                 next = ddi_get_lbolt() + hz;
4537 
4538                 /*
4539                  * Quick check for L2ARC devices.
4540                  */
4541                 mutex_enter(&l2arc_dev_mtx);
4542                 if (l2arc_ndev == 0) {
4543                         mutex_exit(&l2arc_dev_mtx);
4544                         continue;
4545                 }
4546                 mutex_exit(&l2arc_dev_mtx);
4547                 begin = ddi_get_lbolt();
4548 
4549                 /*
4550                  * This selects the next l2arc device to write to, and in
4551                  * doing so the next spa to feed from: dev->l2ad_spa.  This
4552                  * will return NULL if there are no longer any l2arc devices
4553                  * or if they are all faulted.
4554                  *
4555                  * If a device is returned, its spa's config lock is also
4556                  * held to prevent device removal.  l2arc_dev_get_next()
4557                  * will grab and release l2arc_dev_mtx.
4558                  */
4559                 if ((dev = l2arc_dev_get_next()) == NULL)
4560                         continue;
4561 
4562                 spa = dev->l2ad_spa;
4563                 ASSERT(spa != NULL);
4564 
4565                 /*
4566                  * If the pool is read-only then force the feed thread to
4567                  * sleep a little longer.
4568                  */
4569                 if (!spa_writeable(spa)) {
4570                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4571                         spa_config_exit(spa, SCL_L2ARC, dev);
4572                         continue;
4573                 }
4574 
4575                 /*
4576                  * Avoid contributing to memory pressure.
4577                  */
4578                 if (arc_reclaim_needed()) {
4579                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4580                         spa_config_exit(spa, SCL_L2ARC, dev);
4581                         continue;
4582                 }
4583 
4584                 ARCSTAT_BUMP(arcstat_l2_feeds);
4585 
4586                 size = l2arc_write_size(dev);
4587 
4588                 /*
4589                  * Evict L2ARC buffers that will be overwritten.
4590                  */
4591                 l2arc_evict(dev, size, B_FALSE);
4592 
4593                 /*
4594                  * Write ARC buffers.
4595                  */
4596                 wrote = l2arc_write_buffers(spa, dev, size);
4597 
4598                 /*
4599                  * Calculate interval between writes.
4600                  */
4601                 next = l2arc_write_interval(begin, size, wrote);
4602                 spa_config_exit(spa, SCL_L2ARC, dev);
4603         }
4604 
4605         l2arc_thread_exit = 0;
4606         cv_broadcast(&l2arc_feed_thr_cv);
4607         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
4608         thread_exit();
4609 }
4610 
4611 boolean_t
4612 l2arc_vdev_present(vdev_t *vd)
4613 {
4614         l2arc_dev_t *dev;
4615 
4616         mutex_enter(&l2arc_dev_mtx);
4617         for (dev = list_head(l2arc_dev_list); dev != NULL;
4618             dev = list_next(l2arc_dev_list, dev)) {
4619                 if (dev->l2ad_vdev == vd)
4620                         break;
4621         }
4622         mutex_exit(&l2arc_dev_mtx);
4623 
4624         return (dev != NULL);
4625 }
4626 
4627 /*
4628  * Add a vdev for use by the L2ARC.  By this point the spa has already
4629  * validated the vdev and opened it.
4630  */
4631 void
4632 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4633 {
4634         l2arc_dev_t *adddev;
4635 
4636         ASSERT(!l2arc_vdev_present(vd));
4637 
4638         /*
4639          * Create a new l2arc device entry.
4640          */
4641         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4642         adddev->l2ad_spa = spa;
4643         adddev->l2ad_vdev = vd;
4644         adddev->l2ad_write = l2arc_write_max;
4645         adddev->l2ad_boost = l2arc_write_boost;
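        /*
         * The usable region starts past the space reserved for the front
         * vdev labels and spans the device's minimum asize; the write
         * hand and evict pointer both begin at the start of that region.
         */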
4646         adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4647         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4648         adddev->l2ad_hand = adddev->l2ad_start;
4649         adddev->l2ad_evict = adddev->l2ad_start;
4650         adddev->l2ad_first = B_TRUE;
4651         adddev->l2ad_writing = B_FALSE;
4652         ASSERT3U(adddev->l2ad_write, >, 0);
4653 
4654         /*
4655          * This is a list of all ARC buffers that are still valid on the
4656          * device.
4657          */
4658         adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4659         list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4660             offsetof(arc_buf_hdr_t, b_l2node));
4661 
4662         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4663 
4664         /*
4665          * Add device to global list
4666          */
4667         mutex_enter(&l2arc_dev_mtx);
4668         list_insert_head(l2arc_dev_list, adddev);
4669         atomic_inc_64(&l2arc_ndev);
4670         mutex_exit(&l2arc_dev_mtx);
4671 }
4672 
4673 /*
4674  * Remove a vdev from the L2ARC.
4675  */
4676 void
4677 l2arc_remove_vdev(vdev_t *vd)
4678 {
4679         l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4680 
4681         /*
4682          * Find the device by vdev
4683          */
4684         mutex_enter(&l2arc_dev_mtx);
4685         for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4686                 nextdev = list_next(l2arc_dev_list, dev);
4687                 if (vd == dev->l2ad_vdev) {
4688                         remdev = dev;
4689                         break;
4690                 }
4691         }
4692         ASSERT(remdev != NULL);
4693 
4694         /*
4695          * Remove device from global list
4696          */
4697         list_remove(l2arc_dev_list, remdev);
4698         l2arc_dev_last = NULL;          /* may have been invalidated */
4699         atomic_dec_64(&l2arc_ndev);
4700         mutex_exit(&l2arc_dev_mtx);
4701 
4702         /*
4703          * Clear all buflists and ARC references.  This flushes the L2ARC device.
4704          */
4705         l2arc_evict(remdev, 0, B_TRUE);
4706         list_destroy(remdev->l2ad_buflist);
4707         kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4708         kmem_free(remdev, sizeof (l2arc_dev_t));
4709 }
4710 
4711 void
4712 l2arc_init(void)
4713 {
4714         l2arc_thread_exit = 0;
4715         l2arc_ndev = 0;
4716         l2arc_writes_sent = 0;
4717         l2arc_writes_done = 0;
4718 
4719         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4720         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4721         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4722         mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4723         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4724 
4725         l2arc_dev_list = &L2ARC_dev_list;
4726         l2arc_free_on_write = &L2ARC_free_on_write;
4727         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4728             offsetof(l2arc_dev_t, l2ad_node));
4729         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4730             offsetof(l2arc_data_free_t, l2df_list_node));
4731 }
4732 
4733 void
4734 l2arc_fini(void)
4735 {
4736         /*
4737          * This is called from dmu_fini(), which is called from spa_fini().
4738          * Because of this, we can assume that all l2arc devices have
4739          * already been removed when the pools themselves were removed.
4740          */
4741 
4742         l2arc_do_free_on_write();
4743 
4744         mutex_destroy(&l2arc_feed_thr_lock);
4745         cv_destroy(&l2arc_feed_thr_cv);
4746         mutex_destroy(&l2arc_dev_mtx);
4747         mutex_destroy(&l2arc_buflist_mtx);
4748         mutex_destroy(&l2arc_free_on_write_mtx);
4749 
4750         list_destroy(l2arc_dev_list);
4751         list_destroy(l2arc_free_on_write);
4752 }
4753 
4754 void
4755 l2arc_start(void)
4756 {
4757         if (!(spa_mode_global & FWRITE))
4758                 return;
4759 
4760         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4761             TS_RUN, minclsyspri);
4762 }
4763 
4764 void
4765 l2arc_stop(void)
4766 {
4767         if (!(spa_mode_global & FWRITE))
4768                 return;
4769 
4770         mutex_enter(&l2arc_feed_thr_lock);
4771         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
4772         l2arc_thread_exit = 1;
4773         while (l2arc_thread_exit != 0)
4774                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4775         mutex_exit(&l2arc_feed_thr_lock);
4776 }