8005-backout New usr/src/uts/common/fs/zfs/arc.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2019, Joyent, Inc.
  24  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
  26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  27  * Copyright (c) 2011, 2019, Delphix. All rights reserved.
  28  * Copyright (c) 2020, George Amanakis. All rights reserved.
  29  * Copyright (c) 2020, The FreeBSD Foundation [1]
  30  *
  31  * [1] Portions of this software were developed by Allan Jude
  32  *     under sponsorship from the FreeBSD Foundation.
  33  */
  34 
  35 /*
  36  * DVA-based Adjustable Replacement Cache
  37  *
  38  * While much of the theory of operation used here is
  39  * based on the self-tuning, low overhead replacement cache
  40  * presented by Megiddo and Modha at FAST 2003, there are some
  41  * significant differences:
  42  *
  43  * 1. The Megiddo and Modha model assumes any page is evictable.
  44  * Pages in its cache cannot be "locked" into memory.  This makes
  45  * the eviction algorithm simple: evict the last page in the list.
  46  * This also makes the performance characteristics easy to reason
  47  * about.  Our cache is not so simple.  At any given moment, some
  48  * subset of the blocks in the cache are un-evictable because we
  49  * have handed out a reference to them.  Blocks are only evictable
  50  * when there are no external references active.  This makes
  51  * eviction far more problematic:  we choose to evict the evictable
  52  * blocks that are the "lowest" in the list.
  53  *
  54  * There are times when it is not possible to evict the requested
  55  * space.  In these circumstances we are unable to adjust the cache
  56  * size.  To prevent the cache growing unbounded at these times we
  57  * implement a "cache throttle" that slows the flow of new data
  58  * into the cache until we can make space available.
  59  *
  60  * 2. The Megiddo and Modha model assumes a fixed cache size.
  61  * Pages are evicted when the cache is full and there is a cache
  62  * miss.  Our model has a variable sized cache.  It grows with
  63  * high use, but also tries to react to memory pressure from the
  64  * operating system: decreasing its size when system memory is
  65  * tight.
  66  *
  67  * 3. The Megiddo and Modha model assumes a fixed page size. All
  68  * elements of the cache are therefore exactly the same size.  So
  69  * when adjusting the cache size following a cache miss, it's simply
  70  * a matter of choosing a single page to evict.  In our model, we
  71  * have variable sized cache blocks (ranging from 512 bytes to
  72  * 128K bytes).  We therefore choose a set of blocks to evict to make
  73  * space for a cache miss that approximates as closely as possible
  74  * the space used by the new block.
  75  *
  76  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  77  * by N. Megiddo & D. Modha, FAST 2003
  78  */
  79 
  80 /*
  81  * The locking model:
  82  *
  83  * A new reference to a cache buffer can be obtained in two
  84  * ways: 1) via a hash table lookup using the DVA as a key,
  85  * or 2) via one of the ARC lists.  The arc_read() interface
  86  * uses method 1, while the internal ARC algorithms for
  87  * adjusting the cache use method 2.  We therefore provide two
  88  * types of locks: 1) the hash table lock array, and 2) the
  89  * ARC list locks.
  90  *
  91  * Buffers do not have their own mutexes, rather they rely on the
  92  * hash table mutexes for the bulk of their protection (i.e. most
  93  * fields in the arc_buf_hdr_t are protected by these mutexes).
  94  *
  95  * buf_hash_find() returns the appropriate mutex (held) when it
  96  * locates the requested buffer in the hash table.  It returns
  97  * NULL for the mutex if the buffer was not in the table.
  98  *
  99  * buf_hash_remove() expects the appropriate hash mutex to be
 100  * already held before it is invoked.
 101  *
 102  * Each ARC state also has a mutex which is used to protect the
 103  * buffer list associated with the state.  When attempting to
 104  * obtain a hash table lock while holding an ARC list lock you
 105  * must use: mutex_tryenter() to avoid deadlock.  Also note that
 106  * the active state mutex must be held before the ghost state mutex.
 107  *
 108  * Note that the majority of the performance stats are manipulated
 109  * with atomic operations.
 110  *
 111  * The L2ARC uses the l2ad_mtx on each vdev for the following:
 112  *
 113  *      - L2ARC buflist creation
 114  *      - L2ARC buflist eviction
 115  *      - L2ARC write completion, which walks L2ARC buflists
 116  *      - ARC header destruction, as it removes from L2ARC buflists
 117  *      - ARC header release, as it removes from L2ARC buflists
 118  */
 119 
 120 /*
 121  * ARC operation:
 122  *
 123  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 124  * This structure can point either to a block that is still in the cache or to
 125  * one that is only accessible in an L2 ARC device, or it can provide
 126  * information about a block that was recently evicted. If a block is
 127  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 128  * information to retrieve it from the L2ARC device. This information is
 129  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
 130  * that is in this state cannot access the data directly.
 131  *
 132  * Blocks that are actively being referenced or have not been evicted
 133  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
 134  * the arc_buf_hdr_t that will point to the data block in memory. A block can
 135  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
 136  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 137  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 138  *
 139  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
 140  * ability to store the physical data (b_pabd) associated with the DVA of the
 141  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
 142  * it will match its on-disk compression characteristics. This behavior can be
 143  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
 144  * compressed ARC functionality is disabled, the b_pabd will point to an
 145  * uncompressed version of the on-disk data.
 146  *
 147  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
 148  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 149  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 150  * consumer. The ARC will provide references to this data and will keep it
 151  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
 152  * data block and will evict any arc_buf_t that is no longer referenced. The
 153  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
 154  * "overhead_size" kstat.
 155  *
 156  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 157  * compressed form. The typical case is that consumers will want uncompressed
 158  * data, and when that happens a new data buffer is allocated where the data is
 159  * decompressed for them to use. Currently the only consumer who wants
 160  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 161  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
 162  * with the arc_buf_hdr_t.
 163  *
 164  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
 165  * first one is owned by a compressed send consumer (and therefore references
 166  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
 167  * used by any other consumer (and has its own uncompressed copy of the data
 168  * buffer).
 169  *
 170  *   arc_buf_hdr_t
 171  *   +-----------+
 172  *   | fields    |
 173  *   | common to |
 174  *   | L1- and   |
 175  *   | L2ARC     |
 176  *   +-----------+
 177  *   | l2arc_buf_hdr_t
 178  *   |           |
 179  *   +-----------+
 180  *   | l1arc_buf_hdr_t
 181  *   |           |              arc_buf_t
 182  *   | b_buf     +------------>+-----------+      arc_buf_t
 183  *   | b_pabd    +-+           |b_next     +---->+-----------+
 184  *   +-----------+ |           |-----------|     |b_next     +-->NULL
 185  *                 |           |b_comp = T |     +-----------+
 186  *                 |           |b_data     +-+   |b_comp = F |
 187  *                 |           +-----------+ |   |b_data     +-+
 188  *                 +->+------+               |   +-----------+ |
 189  *        compressed  |      |               |                 |
 190  *           data     |      |<--------------+                 | uncompressed
 191  *                    +------+          compressed,            |     data
 192  *                                        shared               +-->+------+
 193  *                                         data                    |      |
 194  *                                                                 |      |
 195  *                                                                 +------+
 196  *
 197  * When a consumer reads a block, the ARC must first look to see if the
 198  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
 199  * arc_buf_t and either copies uncompressed data into a new data buffer from an
 200  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 201  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 202  * hdr is compressed and the desired compression characteristics of the
 203  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
 204  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 205  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 206  * be anywhere in the hdr's list.
 207  *
 208  * The diagram below shows an example of an uncompressed ARC hdr that is
 209  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 210  * the last element in the buf list):
 211  *
 212  *                arc_buf_hdr_t
 213  *                +-----------+
 214  *                |           |
 215  *                |           |
 216  *                |           |
 217  *                +-----------+
 218  * l2arc_buf_hdr_t|           |
 219  *                |           |
 220  *                +-----------+
 221  * l1arc_buf_hdr_t|           |
 222  *                |           |                 arc_buf_t    (shared)
 223  *                |    b_buf  +------------>+---------+      arc_buf_t
 224  *                |           |             |b_next   +---->+---------+
 225  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
 226  *                +-----------+ |           |         |     +---------+
 227  *                              |           |b_data   +-+   |         |
 228  *                              |           +---------+ |   |b_data   +-+
 229  *                              +->+------+             |   +---------+ |
 230  *                                 |      |             |               |
 231  *                   uncompressed  |      |             |               |
 232  *                        data     +------+             |               |
 233  *                                    ^                 +->+------+     |
 234  *                                    |       uncompressed |      |     |
 235  *                                    |           data     |      |     |
 236  *                                    |                    +------+     |
 237  *                                    +---------------------------------+
 238  *
 239  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 240  * since the physical block is about to be rewritten. The new data contents
 241  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 242  * it may compress the data before writing it to disk. The ARC will be called
 243  * with the transformed data and will bcopy the transformed on-disk block into
 244  * a newly allocated b_pabd. Writes are always done into buffers which have
 245  * either been loaned (and hence are new and don't have other readers) or
 246  * buffers which have been released (and hence have their own hdr, if there
 247  * were originally other readers of the buf's original hdr). This ensures that
 248  * the ARC only needs to update a single buf and its hdr after a write occurs.
 249  *
 250  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
 251  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
 252  * that when compressed ARC is enabled, the L2ARC blocks are identical
 253  * to the on-disk block in the main data pool. This provides a significant
 254  * advantage since the ARC can leverage the bp's checksum when reading from the
 255  * L2ARC to determine if the contents are valid. However, if the compressed
 256  * ARC is disabled, then the L2ARC's block must be transformed to look
 257  * like the physical block in the main data pool before comparing the
 258  * checksum and determining its validity.
 259  *
 260  * The L1ARC has a slightly different system for storing encrypted data.
 261  * Raw (encrypted + possibly compressed) data has a few subtle differences from
 262  * data that is just compressed. The biggest difference is that it is not
 263  * possible to decrypt encrypted data (or vice versa) if the keys aren't loaded.
 264  * The other difference is that encryption cannot be treated as a suggestion.
 265  * If a caller would prefer compressed data, but they actually wind up with
 266  * uncompressed data the worst thing that could happen is there might be a
 267  * performance hit. If the caller requests encrypted data, however, we must be
 268  * sure they actually get it or else secret information could be leaked. Raw
 269  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
 270  * may have both an encrypted version and a decrypted version of its data at
 271  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
 272  * copied out of this header. To avoid complications with b_pabd, raw buffers
 273  * cannot be shared.
 274  */
 275 
 276 #include <sys/spa.h>
 277 #include <sys/zio.h>
 278 #include <sys/spa_impl.h>
 279 #include <sys/zio_compress.h>
 280 #include <sys/zio_checksum.h>
 281 #include <sys/zfs_context.h>
 282 #include <sys/arc.h>
 283 #include <sys/refcount.h>
 284 #include <sys/vdev.h>
 285 #include <sys/vdev_impl.h>
 286 #include <sys/dsl_pool.h>
 287 #include <sys/zfs_zone.h>
 288 #include <sys/zio_checksum.h>
 289 #include <sys/multilist.h>
 290 #include <sys/abd.h>
 291 #include <sys/zil.h>
 292 #include <sys/fm/fs/zfs.h>
 293 #ifdef _KERNEL
 294 #include <sys/vmsystm.h>
 295 #include <vm/anon.h>
 296 #include <sys/fs/swapnode.h>
 297 #include <sys/dnlc.h>
 298 #endif
 299 #include <sys/callb.h>
 300 #include <sys/kstat.h>
 301 #include <sys/zthr.h>
 302 #include <zfs_fletcher.h>
 303 #include <sys/arc_impl.h>
 304 #include <sys/aggsum.h>
 305 #include <sys/cityhash.h>
 306 #include <sys/param.h>
 307 
 308 #ifndef _KERNEL
 309 /* non-kernel only; set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 310 boolean_t arc_watch = B_FALSE;
 311 int arc_procfd; /* NOTE(review): presumably the fd used to set watchpoints; confirm */
 312 #endif
 313 
 314 /*
 315  * This thread's job is to keep enough free memory in the system, by
 316  * calling arc_kmem_reap_now() plus arc_shrink(), which improves
 317  * arc_available_memory().
 318  */
 319 static zthr_t           *arc_reap_zthr;
 320 
 321 /*
 322  * This thread's job is to keep arc_size under arc_c, by calling
 323  * arc_adjust(), which improves arc_is_overflowing().
 324  */
 325 static zthr_t           *arc_adjust_zthr;
 326 /* NOTE(review): presumably the wake-up state for arc_adjust_zthr; confirm */
 327 static kmutex_t         arc_adjust_lock;
 328 static kcondvar_t       arc_adjust_waiters_cv;
 329 static boolean_t        arc_adjust_needed = B_FALSE;
 330 /* NOTE(review): presumably the percent of the DNLC trimmed per reap; confirm */
 331 uint_t arc_reduce_dnlc_percent = 3;
 332 
 333 /*
 334  * The number of headers to evict in arc_evict_state_impl() before
 335  * dropping the sublist lock and evicting from another sublist. A lower
 336  * value means we're more likely to evict the "correct" header (i.e. the
 337  * oldest header in the arc state), but comes with higher overhead
 338  * (i.e. more invocations of arc_evict_state_impl()).
 339  */
 340 int zfs_arc_evict_batch_limit = 10;
 341 
 342 /* number of seconds before growing cache again */
 343 int arc_grow_retry = 60;
 344 
 345 /*
 346  * Minimum time between calls to arc_kmem_reap_soon().  Note that this will
 347  * be converted to ticks, so with the default hz=100, a setting of 15 ms
 348  * will actually wait 2 ticks, or 20ms.
 349  */
 350 int arc_kmem_cache_reap_retry_ms = 1000;
 351 
 352 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 353 int zfs_arc_overflow_shift = 3;
 354 
 355 /* shift of arc_c for calculating both min and max arc_p */
 356 int arc_p_min_shift = 4;
 357 
 358 /* log2(fraction of arc to reclaim) */
 359 int arc_shrink_shift = 7;
 360 
 361 /*
 362  * log2(fraction of ARC which must be free to allow growing).
 363  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 364  * when reading a new block into the ARC, we will evict an equal-sized block
 365  * from the ARC.
 366  *
 367  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 368  * we will still not allow it to grow.
 369  */
 370 int                     arc_no_grow_shift = 5;
 371 
 372 
 373 /*
 374  * Minimum lifespan of a prefetch / prescient prefetch block.  NOTE(review):
 375  * named *_ms, though previously described as clock ticks; confirm the unit.
 376  */
 377 static int              zfs_arc_min_prefetch_ms = 1;
 378 static int              zfs_arc_min_prescient_prefetch_ms = 6;
 379 
 380 /*
 381  * If this percent of memory is free, don't throttle.
 382  */
 383 int arc_lotsfree_percent = 10;
 384 
 385 static boolean_t arc_initialized;
 386 
 387 /*
 388  * The arc has filled available memory and has now warmed up.
 389  */
 390 static boolean_t arc_warm;
 391 
 392 /*
 393  * log2 fraction of the zio arena to keep free.
 394  */
 395 int arc_zio_arena_free_shift = 2;
 396 
 397 /*
 398  * These tunables are for performance analysis.
 399  */
 400 uint64_t zfs_arc_max;
 401 uint64_t zfs_arc_min;
 402 uint64_t zfs_arc_meta_limit = 0;
 403 uint64_t zfs_arc_meta_min = 0;
 404 int zfs_arc_grow_retry = 0;
 405 int zfs_arc_shrink_shift = 0;
 406 int zfs_arc_p_min_shift = 0;
 407 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 408 
 409 /*
 410  * ARC dirty data constraints for arc_tempreserve_space() throttle
 411  */
 412 uint_t zfs_arc_dirty_limit_percent = 50;        /* total dirty data limit */
 413 uint_t zfs_arc_anon_limit_percent = 25;         /* anon block dirty limit */
 414 uint_t zfs_arc_pool_dirty_percent = 20;         /* each pool's anon allowance */
 415 /* B_FALSE makes b_pabd hold uncompressed data (see "ARC operation" above). */
 416 boolean_t zfs_compressed_arc_enabled = B_TRUE;
 417 
 418 /* The 6 states (anon, MRU, MRU ghost, MFU, MFU ghost, L2-cache-only): */
 419 static arc_state_t ARC_anon;
 420 static arc_state_t ARC_mru;
 421 static arc_state_t ARC_mru_ghost;
 422 static arc_state_t ARC_mfu;
 423 static arc_state_t ARC_mfu_ghost;
 424 static arc_state_t ARC_l2c_only;
 425 
 426 arc_stats_t arc_stats = {      /* one { kstat name, data type } per statistic */
 427         { "hits",                       KSTAT_DATA_UINT64 },
 428         { "misses",                     KSTAT_DATA_UINT64 },
 429         { "demand_data_hits",           KSTAT_DATA_UINT64 },
 430         { "demand_data_misses",         KSTAT_DATA_UINT64 },
 431         { "demand_metadata_hits",       KSTAT_DATA_UINT64 },
 432         { "demand_metadata_misses",     KSTAT_DATA_UINT64 },
 433         { "prefetch_data_hits",         KSTAT_DATA_UINT64 },
 434         { "prefetch_data_misses",       KSTAT_DATA_UINT64 },
 435         { "prefetch_metadata_hits",     KSTAT_DATA_UINT64 },
 436         { "prefetch_metadata_misses",   KSTAT_DATA_UINT64 },
 437         { "mru_hits",                   KSTAT_DATA_UINT64 },
 438         { "mru_ghost_hits",             KSTAT_DATA_UINT64 },
 439         { "mfu_hits",                   KSTAT_DATA_UINT64 },
 440         { "mfu_ghost_hits",             KSTAT_DATA_UINT64 },
 441         { "deleted",                    KSTAT_DATA_UINT64 },
 442         { "mutex_miss",                 KSTAT_DATA_UINT64 },
 443         { "access_skip",                KSTAT_DATA_UINT64 },
 444         { "evict_skip",                 KSTAT_DATA_UINT64 },
 445         { "evict_not_enough",           KSTAT_DATA_UINT64 },
 446         { "evict_l2_cached",            KSTAT_DATA_UINT64 },
 447         { "evict_l2_eligible",          KSTAT_DATA_UINT64 },
 448         { "evict_l2_eligible_mfu",      KSTAT_DATA_UINT64 },
 449         { "evict_l2_eligible_mru",      KSTAT_DATA_UINT64 },
 450         { "evict_l2_ineligible",        KSTAT_DATA_UINT64 },
 451         { "evict_l2_skip",              KSTAT_DATA_UINT64 },
 452         { "hash_elements",              KSTAT_DATA_UINT64 },
 453         { "hash_elements_max",          KSTAT_DATA_UINT64 },
 454         { "hash_collisions",            KSTAT_DATA_UINT64 },
 455         { "hash_chains",                KSTAT_DATA_UINT64 },
 456         { "hash_chain_max",             KSTAT_DATA_UINT64 },
 457         { "p",                          KSTAT_DATA_UINT64 },
 458         { "c",                          KSTAT_DATA_UINT64 },
 459         { "c_min",                      KSTAT_DATA_UINT64 },
 460         { "c_max",                      KSTAT_DATA_UINT64 },
 461         { "size",                       KSTAT_DATA_UINT64 },
 462         { "compressed_size",            KSTAT_DATA_UINT64 },
 463         { "uncompressed_size",          KSTAT_DATA_UINT64 },
 464         { "overhead_size",              KSTAT_DATA_UINT64 },
 465         { "hdr_size",                   KSTAT_DATA_UINT64 },
 466         { "data_size",                  KSTAT_DATA_UINT64 },
 467         { "metadata_size",              KSTAT_DATA_UINT64 },
 468         { "other_size",                 KSTAT_DATA_UINT64 },
 469         { "anon_size",                  KSTAT_DATA_UINT64 },
 470         { "anon_evictable_data",        KSTAT_DATA_UINT64 },
 471         { "anon_evictable_metadata",    KSTAT_DATA_UINT64 },
 472         { "mru_size",                   KSTAT_DATA_UINT64 },
 473         { "mru_evictable_data",         KSTAT_DATA_UINT64 },
 474         { "mru_evictable_metadata",     KSTAT_DATA_UINT64 },
 475         { "mru_ghost_size",             KSTAT_DATA_UINT64 },
 476         { "mru_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 477         { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 478         { "mfu_size",                   KSTAT_DATA_UINT64 },
 479         { "mfu_evictable_data",         KSTAT_DATA_UINT64 },
 480         { "mfu_evictable_metadata",     KSTAT_DATA_UINT64 },
 481         { "mfu_ghost_size",             KSTAT_DATA_UINT64 },
 482         { "mfu_ghost_evictable_data",   KSTAT_DATA_UINT64 },
 483         { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 484         { "l2_hits",                    KSTAT_DATA_UINT64 },
 485         { "l2_misses",                  KSTAT_DATA_UINT64 },
 486         { "l2_prefetch_asize",          KSTAT_DATA_UINT64 },
 487         { "l2_mru_asize",               KSTAT_DATA_UINT64 },
 488         { "l2_mfu_asize",               KSTAT_DATA_UINT64 },
 489         { "l2_bufc_data_asize",         KSTAT_DATA_UINT64 },
 490         { "l2_bufc_metadata_asize",     KSTAT_DATA_UINT64 },
 491         { "l2_feeds",                   KSTAT_DATA_UINT64 },
 492         { "l2_rw_clash",                KSTAT_DATA_UINT64 },
 493         { "l2_read_bytes",              KSTAT_DATA_UINT64 },
 494         { "l2_write_bytes",             KSTAT_DATA_UINT64 },
 495         { "l2_writes_sent",             KSTAT_DATA_UINT64 },
 496         { "l2_writes_done",             KSTAT_DATA_UINT64 },
 497         { "l2_writes_error",            KSTAT_DATA_UINT64 },
 498         { "l2_writes_lock_retry",       KSTAT_DATA_UINT64 },
 499         { "l2_evict_lock_retry",        KSTAT_DATA_UINT64 },
 500         { "l2_evict_reading",           KSTAT_DATA_UINT64 },
 501         { "l2_evict_l1cached",          KSTAT_DATA_UINT64 },
 502         { "l2_free_on_write",           KSTAT_DATA_UINT64 },
 503         { "l2_abort_lowmem",            KSTAT_DATA_UINT64 },
 504         { "l2_cksum_bad",               KSTAT_DATA_UINT64 },
 505         { "l2_io_error",                KSTAT_DATA_UINT64 },
 506         { "l2_size",                    KSTAT_DATA_UINT64 },
 507         { "l2_asize",                   KSTAT_DATA_UINT64 },
 508         { "l2_hdr_size",                KSTAT_DATA_UINT64 },
 509         { "l2_log_blk_writes",          KSTAT_DATA_UINT64 },
 510         { "l2_log_blk_avg_asize",       KSTAT_DATA_UINT64 },
 511         { "l2_log_blk_asize",           KSTAT_DATA_UINT64 },
 512         { "l2_log_blk_count",           KSTAT_DATA_UINT64 },
 513         { "l2_data_to_meta_ratio",      KSTAT_DATA_UINT64 },
 514         { "l2_rebuild_success",         KSTAT_DATA_UINT64 },
 515         { "l2_rebuild_unsupported",     KSTAT_DATA_UINT64 },
 516         { "l2_rebuild_io_errors",       KSTAT_DATA_UINT64 },
 517         { "l2_rebuild_dh_errors",       KSTAT_DATA_UINT64 },
 518         { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
 519         { "l2_rebuild_lowmem",          KSTAT_DATA_UINT64 },
 520         { "l2_rebuild_size",            KSTAT_DATA_UINT64 },
 521         { "l2_rebuild_asize",           KSTAT_DATA_UINT64 },
 522         { "l2_rebuild_bufs",            KSTAT_DATA_UINT64 },
 523         { "l2_rebuild_bufs_precached",  KSTAT_DATA_UINT64 },
 524         { "l2_rebuild_log_blks",        KSTAT_DATA_UINT64 },
 525         { "memory_throttle_count",      KSTAT_DATA_UINT64 },
 526         { "arc_meta_used",              KSTAT_DATA_UINT64 },
 527         { "arc_meta_limit",             KSTAT_DATA_UINT64 },
 528         { "arc_meta_max",               KSTAT_DATA_UINT64 },
 529         { "arc_meta_min",               KSTAT_DATA_UINT64 },
 530         { "async_upgrade_sync",         KSTAT_DATA_UINT64 },
 531         { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 532         { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 533 };     /* NOTE(review): positional init; order presumably mirrors arc_stats_t */
 534 
 535 #define ARCSTAT_MAX(stat, val) {                                        \
 536         uint64_t m;                                                     \
 537         while ((val) > (m = arc_stats.stat.value.ui64) &&            \
 538             (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))     \
 539                 continue;                                               \
 540 }
 541 
 542 #define ARCSTAT_MAXSTAT(stat) \
 543         ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
 544 
 545 /*
 546  * We define a macro to allow ARC hits/misses to be easily broken down by
 547  * two separate conditions, giving a total of four different subtypes for
 548  * each of hits and misses (so eight statistics total).
 549  */
 550 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 551         if (cond1) {                                                    \
 552                 if (cond2) {                                            \
 553                         ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 554                 } else {                                                \
 555                         ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 556                 }                                                       \
 557         } else {                                                        \
 558                 if (cond2) {                                            \
 559                         ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 560                 } else {                                                \
 561                         ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 562                 }                                                       \
 563         }
 564 
 565 /*
 566  * This macro allows us to use kstats as floating averages. Each time we
 567  * update this kstat, we first factor it and the update value by
 568  * ARCSTAT_F_AVG_FACTOR to shrink the new value's contribution to the overall
 569  * average. This macro assumes that integer loads and stores are atomic, but
 570  * is not safe for multiple writers updating the kstat in parallel (only the
 571  * last writer's update will remain).
 572  */
 573 #define ARCSTAT_F_AVG_FACTOR    3
 574 #define ARCSTAT_F_AVG(stat, value) \
 575         do { \
 576                 uint64_t x = ARCSTAT(stat); \
 577                 x = x - x / ARCSTAT_F_AVG_FACTOR + \
 578                     (value) / ARCSTAT_F_AVG_FACTOR; \
 579                 ARCSTAT(stat) = x; \
 580                 _NOTE(CONSTCOND) \
 581         } while (0)
 582 
 583 kstat_t                 *arc_ksp;
 584 static arc_state_t      *arc_anon;      /* pointers to the six ARC states */
 585 static arc_state_t      *arc_mru;
 586 static arc_state_t      *arc_mru_ghost; /* the ghost pointers and arc_l2c_only */
 587 static arc_state_t      *arc_mfu;       /* are what GHOST_STATE() tests for */
 588 static arc_state_t      *arc_mfu_ghost;
 589 static arc_state_t      *arc_l2c_only;
 590 
 591 /*
 592  * There are also some ARC variables that we want to export, but that are
 593  * updated so often that having the canonical representation be the statistic
 594  * variable causes a performance bottleneck. We want to use aggsum_t's for these
 595  * instead, but still be able to export the kstat in the same way as before.
 596  * The solution is to always use the aggsum version, except in the kstat update
 597  * callback.
 598  */
 599 aggsum_t arc_size;
 600 aggsum_t arc_meta_used;
 601 aggsum_t astat_data_size;
 602 aggsum_t astat_metadata_size;
 603 aggsum_t astat_hdr_size;
 604 aggsum_t astat_other_size;
 605 aggsum_t astat_l2_hdr_size;
 606 
 607 static int              arc_no_grow;    /* Don't try to grow cache size */
 608 static hrtime_t         arc_growtime;   /* NOTE(review): presumably when growth may resume; confirm */
 609 static uint64_t         arc_tempreserve;
 610 static uint64_t         arc_loaned_bytes; /* NOTE(review): presumably bytes held by loaned bufs; confirm */
 611 
 612 #define GHOST_STATE(state)      /* true for either ghost state or l2c_only */ \
 613         ((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||        \
 614         (state) == arc_l2c_only)
 615 /* Single-flag tests yield the masked b_flags bit (nonzero if set), not 0/1. */
 616 #define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 617 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 618 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 619 #define HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
 620 #define HDR_PRESCIENT_PREFETCH(hdr)     \
 621         ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 622 #define HDR_COMPRESSION_ENABLED(hdr)    \
 623         ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 624 /* L2ARC flags; L2_READING means I/O is in progress on a hdr that has an L2 hdr. */
 625 #define HDR_L2CACHE(hdr)        ((hdr)->b_flags & ARC_FLAG_L2CACHE)
 626 #define HDR_L2_READING(hdr)     \
 627         (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&   \
 628         ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 629 #define HDR_L2_WRITING(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 630 #define HDR_L2_EVICTED(hdr)     ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 631 #define HDR_L2_WRITE_HEAD(hdr)  ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 632 #define HDR_PROTECTED(hdr)      ((hdr)->b_flags & ARC_FLAG_PROTECTED)
 633 #define HDR_NOAUTH(hdr)         ((hdr)->b_flags & ARC_FLAG_NOAUTH)
 634 #define HDR_SHARED_DATA(hdr)    ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 635 /* Buffer content type: any hdr not flagged as metadata counts as data. */
 636 #define HDR_ISTYPE_METADATA(hdr)        \
 637         ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 638 #define HDR_ISTYPE_DATA(hdr)    (!HDR_ISTYPE_METADATA(hdr))
 639 /* HAS_RABD, ENCRYPTED and AUTHENTICATED all require HDR_PROTECTED. */
 640 #define HDR_HAS_L1HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 641 #define HDR_HAS_L2HDR(hdr)      ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 642 #define HDR_HAS_RABD(hdr)       \
 643         (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&    \
 644         (hdr)->b_crypt_hdr.b_rabd != NULL)
 645 #define HDR_ENCRYPTED(hdr)      \
 646         (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 647 #define HDR_AUTHENTICATED(hdr)  \
 648         (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 649 
 650 /* For storing compression mode in b_flags */
 651 #define HDR_COMPRESS_OFFSET     (highbit64(ARC_FLAG_COMPRESS_0) - 1)
 652 
 653 #define HDR_GET_COMPRESS(hdr)   ((enum zio_compress)BF32_GET((hdr)->b_flags, \
 654         HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
 655 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
 656         HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
 657 
 658 #define ARC_BUF_LAST(buf)       ((buf)->b_next == NULL)
 659 #define ARC_BUF_SHARED(buf)     ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
 660 #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
 661 #define ARC_BUF_ENCRYPTED(buf)  ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
 662 
 663 /*
 664  * Other sizes
 665  */
 666 
 667 #define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 668 #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
 669 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 670 
 671 /*
 672  * Hash table routines
 673  */
 674 
/* Size, in bytes, that each hash lock entry is padded out to in the kernel. */
#define HT_LOCK_PAD     64

/*
 * One hash table lock. In kernel builds the mutex is followed by padding so
 * that the whole structure occupies HT_LOCK_PAD bytes; userland builds carry
 * only the mutex.
 * NOTE(review): the padding presumably keeps neighbouring locks on separate
 * cache lines -- confirm the intent.
 */
struct ht_lock {
        kmutex_t        ht_lock;
#ifdef _KERNEL
        unsigned char   pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};
 683 
/*
 * Number of locks shared by all hash chains. A chain's lock is selected by
 * masking its index with (BUF_LOCKS - 1), so this must be a power of two.
 */
#define BUF_LOCKS 256
/*
 * The ARC header hash table.
 *   ht_mask  - table size minus one; ANDed with the hash to pick a chain
 *   ht_table - array of chain heads; headers on a chain are linked through
 *              b_hash_next
 *   ht_locks - the locks protecting the chains
 */
typedef struct buf_hash_table {
        uint64_t ht_mask;
        arc_buf_hdr_t **ht_table;
        struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;
 690 
 691 static buf_hash_table_t buf_hash_table;
 692 
 693 #define BUF_HASH_INDEX(spa, dva, birth) \
 694         (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 695 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
 696 #define BUF_HASH_LOCK(idx)      (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
 697 #define HDR_LOCK(hdr) \
 698         (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
 699 
 700 uint64_t zfs_crc64_table[256];
 701 
 702 /*
 703  * Level 2 ARC
 704  */
 705 
 706 #define L2ARC_WRITE_SIZE        (8 * 1024 * 1024)       /* initial write max */
 707 #define L2ARC_HEADROOM          2                       /* num of writes */
 708 /*
 709  * If we discover during ARC scan any buffers to be compressed, we boost
 710  * our headroom for the next scanning cycle by this percentage multiple.
 711  */
 712 #define L2ARC_HEADROOM_BOOST    200
 713 #define L2ARC_FEED_SECS         1               /* caching interval secs */
 714 #define L2ARC_FEED_MIN_MS       200             /* min caching interval ms */
 715 
/*
 * We can feed L2ARC from two states of ARC buffers, mru and mfu,
 * and each state has two types: data and metadata.
 */
 720 #define L2ARC_FEED_TYPES        4
 721 
 722 
 723 #define l2arc_writes_sent       ARCSTAT(arcstat_l2_writes_sent)
 724 #define l2arc_writes_done       ARCSTAT(arcstat_l2_writes_done)
 725 
 726 /* L2ARC Performance Tunables */
 727 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;    /* default max write size */
 728 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;  /* extra write during warmup */
 729 uint64_t l2arc_headroom = L2ARC_HEADROOM;       /* number of dev writes */
 730 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 731 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;     /* interval seconds */
 732 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
 733 boolean_t l2arc_noprefetch = B_TRUE;            /* don't cache prefetch bufs */
 734 boolean_t l2arc_feed_again = B_TRUE;            /* turbo warmup */
 735 boolean_t l2arc_norw = B_TRUE;                  /* no reads during writes */
 736 int l2arc_meta_percent = 33;                    /* limit on headers size */
 737 
 738 /*
 739  * L2ARC Internals
 740  */
 741 static list_t L2ARC_dev_list;                   /* device list */
 742 static list_t *l2arc_dev_list;                  /* device list pointer */
 743 static kmutex_t l2arc_dev_mtx;                  /* device list mutex */
 744 static l2arc_dev_t *l2arc_dev_last;             /* last device used */
 745 static list_t L2ARC_free_on_write;              /* free after write buf list */
 746 static list_t *l2arc_free_on_write;             /* free after write list ptr */
 747 static kmutex_t l2arc_free_on_write_mtx;        /* mutex for list */
 748 static uint64_t l2arc_ndev;                     /* number of devices */
 749 
 750 typedef struct l2arc_read_callback {
 751         arc_buf_hdr_t           *l2rcb_hdr;             /* read header */
 752         blkptr_t                l2rcb_bp;               /* original blkptr */
 753         zbookmark_phys_t        l2rcb_zb;               /* original bookmark */
 754         int                     l2rcb_flags;            /* original flags */
 755         abd_t                   *l2rcb_abd;             /* temporary buffer */
 756 } l2arc_read_callback_t;
 757 
 758 typedef struct l2arc_data_free {
 759         /* protected by l2arc_free_on_write_mtx */
 760         abd_t           *l2df_abd;
 761         size_t          l2df_size;
 762         arc_buf_contents_t l2df_type;
 763         list_node_t     l2df_list_node;
 764 } l2arc_data_free_t;
 765 
 766 static kmutex_t l2arc_feed_thr_lock;
 767 static kcondvar_t l2arc_feed_thr_cv;
 768 static uint8_t l2arc_thread_exit;
 769 
 770 static kmutex_t l2arc_rebuild_thr_lock;
 771 static kcondvar_t l2arc_rebuild_thr_cv;
 772 
/*
 * Option bits for header buffer allocation. Each value is a distinct bit, so
 * the flags can be combined with bitwise OR.
 * NOTE(review): presumably these are what the int argument of
 * arc_hdr_alloc_pabd() carries -- confirm against its callers.
 */
enum arc_hdr_alloc_flags {
        /* NOTE(review): name suggests "allocate the raw data buffer" */
        ARC_HDR_ALLOC_RDATA = 0x1,
        /* NOTE(review): name suggests "adapt the ARC while allocating" */
        ARC_HDR_DO_ADAPT = 0x2,
};
 777 
 778 
 779 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
 780 typedef enum arc_fill_flags {
 781         ARC_FILL_LOCKED         = 1 << 0, /* hdr lock is held */
 782         ARC_FILL_COMPRESSED     = 1 << 1, /* fill with compressed data */
 783         ARC_FILL_ENCRYPTED      = 1 << 2, /* fill with encrypted data */
 784         ARC_FILL_NOAUTH         = 1 << 3, /* don't attempt to authenticate */
 785         ARC_FILL_IN_PLACE       = 1 << 4  /* fill in place (special case) */
 786 } arc_fill_flags_t;
 787 
 788 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
 789 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
 790 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
 791 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
 792 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
 793 static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t);
 794 static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, int);
 795 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 796 static boolean_t arc_is_overflowing();
 797 static void arc_buf_watch(arc_buf_t *);
 798 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
 799 
 800 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 801 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 802 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 803 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 804 
 805 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 806 static void l2arc_read_done(zio_t *);
 807 static void l2arc_do_free_on_write(void);
 808 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
 809     boolean_t state_only);
 810 
 811 #define l2arc_hdr_arcstats_increment(hdr) \
 812         l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
 813 #define l2arc_hdr_arcstats_decrement(hdr) \
 814         l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
 815 #define l2arc_hdr_arcstats_increment_state(hdr) \
 816         l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
 817 #define l2arc_hdr_arcstats_decrement_state(hdr) \
 818         l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
 819 
 820 /*
 821  * The arc_all_memory function is a ZoL enhancement that lives in their OSL
 822  * code. In user-space code, which is used primarily for testing, we return
 823  * half of all memory.
 824  */
 825 uint64_t
 826 arc_all_memory(void)
 827 {
 828 #ifdef _KERNEL
 829         return (ptob(physmem));
 830 #else
 831         return ((sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES)) / 2);
 832 #endif
 833 }
 834 
 835 /*
 836  * We use Cityhash for this. It's fast, and has good hash properties without
 837  * requiring any large static buffers.
 838  */
 839 static uint64_t
 840 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 841 {
 842         return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
 843 }
 844 
 845 #define HDR_EMPTY(hdr)                                          \
 846         ((hdr)->b_dva.dva_word[0] == 0 &&                    \
 847         (hdr)->b_dva.dva_word[1] == 0)
 848 
 849 #define HDR_EMPTY_OR_LOCKED(hdr)                                \
 850         (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
 851 
 852 #define HDR_EQUAL(spa, dva, birth, hdr)                         \
 853         ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&       \
 854         ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&       \
 855         ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
 856 
 857 static void
 858 buf_discard_identity(arc_buf_hdr_t *hdr)
 859 {
 860         hdr->b_dva.dva_word[0] = 0;
 861         hdr->b_dva.dva_word[1] = 0;
 862         hdr->b_birth = 0;
 863 }
 864 
 865 static arc_buf_hdr_t *
 866 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 867 {
 868         const dva_t *dva = BP_IDENTITY(bp);
 869         uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 870         uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 871         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 872         arc_buf_hdr_t *hdr;
 873 
 874         mutex_enter(hash_lock);
 875         for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 876             hdr = hdr->b_hash_next) {
 877                 if (HDR_EQUAL(spa, dva, birth, hdr)) {
 878                         *lockp = hash_lock;
 879                         return (hdr);
 880                 }
 881         }
 882         mutex_exit(hash_lock);
 883         *lockp = NULL;
 884         return (NULL);
 885 }
 886 
 887 /*
 888  * Insert an entry into the hash table.  If there is already an element
 889  * equal to elem in the hash table, then the already existing element
 890  * will be returned and the new element will not be inserted.
 891  * Otherwise returns NULL.
 892  * If lockp == NULL, the caller is assumed to already hold the hash lock.
 893  */
 894 static arc_buf_hdr_t *
 895 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 896 {
 897         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 898         kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 899         arc_buf_hdr_t *fhdr;
 900         uint32_t i;
 901 
 902         ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 903         ASSERT(hdr->b_birth != 0);
 904         ASSERT(!HDR_IN_HASH_TABLE(hdr));
 905 
 906         if (lockp != NULL) {
 907                 *lockp = hash_lock;
 908                 mutex_enter(hash_lock);
 909         } else {
 910                 ASSERT(MUTEX_HELD(hash_lock));
 911         }
 912 
 913         for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 914             fhdr = fhdr->b_hash_next, i++) {
 915                 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 916                         return (fhdr);
 917         }
 918 
 919         hdr->b_hash_next = buf_hash_table.ht_table[idx];
 920         buf_hash_table.ht_table[idx] = hdr;
 921         arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 922 
 923         /* collect some hash table performance data */
 924         if (i > 0) {
 925                 ARCSTAT_BUMP(arcstat_hash_collisions);
 926                 if (i == 1)
 927                         ARCSTAT_BUMP(arcstat_hash_chains);
 928 
 929                 ARCSTAT_MAX(arcstat_hash_chain_max, i);
 930         }
 931 
 932         ARCSTAT_BUMP(arcstat_hash_elements);
 933         ARCSTAT_MAXSTAT(arcstat_hash_elements);
 934 
 935         return (NULL);
 936 }
 937 
 938 static void
 939 buf_hash_remove(arc_buf_hdr_t *hdr)
 940 {
 941         arc_buf_hdr_t *fhdr, **hdrp;
 942         uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 943 
 944         ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 945         ASSERT(HDR_IN_HASH_TABLE(hdr));
 946 
 947         hdrp = &buf_hash_table.ht_table[idx];
 948         while ((fhdr = *hdrp) != hdr) {
 949                 ASSERT3P(fhdr, !=, NULL);
 950                 hdrp = &fhdr->b_hash_next;
 951         }
 952         *hdrp = hdr->b_hash_next;
 953         hdr->b_hash_next = NULL;
 954         arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 955 
 956         /* collect some hash table performance data */
 957         ARCSTAT_BUMPDOWN(arcstat_hash_elements);
 958 
 959         if (buf_hash_table.ht_table[idx] &&
 960             buf_hash_table.ht_table[idx]->b_hash_next == NULL)
 961                 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 962 }
 963 
 964 /*
 965  * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
 966  *              metadata and data are cached from ARC into L2ARC.
 967  */
 968 int l2arc_mfuonly = 0;
 969 
 970 /*
 971  * Global data structures and functions for the buf kmem cache.
 972  */
 973 
 974 static kmem_cache_t *hdr_full_cache;
 975 static kmem_cache_t *hdr_full_crypt_cache;
 976 static kmem_cache_t *hdr_l2only_cache;
 977 static kmem_cache_t *buf_cache;
 978 
 979 static void
 980 buf_fini(void)
 981 {
 982         int i;
 983 
 984         kmem_free(buf_hash_table.ht_table,
 985             (buf_hash_table.ht_mask + 1) * sizeof (void *));
 986         for (i = 0; i < BUF_LOCKS; i++)
 987                 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 988         kmem_cache_destroy(hdr_full_cache);
 989         kmem_cache_destroy(hdr_full_crypt_cache);
 990         kmem_cache_destroy(hdr_l2only_cache);
 991         kmem_cache_destroy(buf_cache);
 992 }
 993 
 994 /*
 995  * Constructor callback - called when the cache is empty
 996  * and a new buf is requested.
 997  */
 998 /* ARGSUSED */
 999 static int
1000 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1001 {
1002         arc_buf_hdr_t *hdr = vbuf;
1003 
1004         bzero(hdr, HDR_FULL_SIZE);
1005         hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
1006         cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1007         zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
1008         mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1009         multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1010         arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1011 
1012         return (0);
1013 }
1014 
1015 /* ARGSUSED */
1016 static int
1017 hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
1018 {
1019         arc_buf_hdr_t *hdr = vbuf;
1020 
1021         (void) hdr_full_cons(vbuf, unused, kmflag);
1022         bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
1023         arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1024 
1025         return (0);
1026 }
1027 
1028 /* ARGSUSED */
1029 static int
1030 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1031 {
1032         arc_buf_hdr_t *hdr = vbuf;
1033 
1034         bzero(hdr, HDR_L2ONLY_SIZE);
1035         arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1036 
1037         return (0);
1038 }
1039 
1040 /* ARGSUSED */
1041 static int
1042 buf_cons(void *vbuf, void *unused, int kmflag)
1043 {
1044         arc_buf_t *buf = vbuf;
1045 
1046         bzero(buf, sizeof (arc_buf_t));
1047         mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1048         arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1049 
1050         return (0);
1051 }
1052 
1053 /*
1054  * Destructor callback - called when a cached buf is
1055  * no longer required.
1056  */
1057 /* ARGSUSED */
1058 static void
1059 hdr_full_dest(void *vbuf, void *unused)
1060 {
1061         arc_buf_hdr_t *hdr = vbuf;
1062 
1063         ASSERT(HDR_EMPTY(hdr));
1064         cv_destroy(&hdr->b_l1hdr.b_cv);
1065         zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1066         mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1067         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1068         arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1069 }
1070 
1071 /* ARGSUSED */
1072 static void
1073 hdr_full_crypt_dest(void *vbuf, void *unused)
1074 {
1075         arc_buf_hdr_t *hdr = vbuf;
1076 
1077         hdr_full_dest(hdr, unused);
1078         arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1079 }
1080 
1081 /* ARGSUSED */
1082 static void
1083 hdr_l2only_dest(void *vbuf, void *unused)
1084 {
1085         arc_buf_hdr_t *hdr = vbuf;
1086 
1087         ASSERT(HDR_EMPTY(hdr));
1088         arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1089 }
1090 
1091 /* ARGSUSED */
1092 static void
1093 buf_dest(void *vbuf, void *unused)
1094 {
1095         arc_buf_t *buf = vbuf;
1096 
1097         mutex_destroy(&buf->b_evict_lock);
1098         arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1099 }
1100 
1101 /*
1102  * Reclaim callback -- invoked when memory is low.
1103  */
1104 /* ARGSUSED */
1105 static void
1106 hdr_recl(void *unused)
1107 {
1108         dprintf("hdr_recl called\n");
1109         /*
1110          * umem calls the reclaim func when we destroy the buf cache,
1111          * which is after we do arc_fini().
1112          */
1113         if (arc_initialized)
1114                 zthr_wakeup(arc_reap_zthr);
1115 }
1116 
1117 static void
1118 buf_init(void)
1119 {
1120         uint64_t *ct;
1121         uint64_t hsize = 1ULL << 12;
1122         int i, j;
1123 
1124         /*
1125          * The hash table is big enough to fill all of physical memory
1126          * with an average block size of zfs_arc_average_blocksize (default 8K).
1127          * By default, the table will take up
1128          * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1129          */
1130         while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1131                 hsize <<= 1;
1132 retry:
1133         buf_hash_table.ht_mask = hsize - 1;
1134         buf_hash_table.ht_table =
1135             kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1136         if (buf_hash_table.ht_table == NULL) {
1137                 ASSERT(hsize > (1ULL << 8));
1138                 hsize >>= 1;
1139                 goto retry;
1140         }
1141 
1142         hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1143             0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1144         hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
1145             HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
1146             hdr_recl, NULL, NULL, 0);
1147         hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1148             HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1149             NULL, NULL, 0);
1150         buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1151             0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1152 
1153         for (i = 0; i < 256; i++)
1154                 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1155                         *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1156 
1157         for (i = 0; i < BUF_LOCKS; i++) {
1158                 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1159                     NULL, MUTEX_DEFAULT, NULL);
1160         }
1161 }
1162 
1163 /*
1164  * This is the size that the buf occupies in memory. If the buf is compressed,
1165  * it will correspond to the compressed size. You should use this method of
1166  * getting the buf size unless you explicitly need the logical size.
1167  */
1168 int32_t
1169 arc_buf_size(arc_buf_t *buf)
1170 {
1171         return (ARC_BUF_COMPRESSED(buf) ?
1172             HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1173 }
1174 
1175 int32_t
1176 arc_buf_lsize(arc_buf_t *buf)
1177 {
1178         return (HDR_GET_LSIZE(buf->b_hdr));
1179 }
1180 
1181 /*
1182  * This function will return B_TRUE if the buffer is encrypted in memory.
1183  * This buffer can be decrypted by calling arc_untransform().
1184  */
1185 boolean_t
1186 arc_is_encrypted(arc_buf_t *buf)
1187 {
1188         return (ARC_BUF_ENCRYPTED(buf) != 0);
1189 }
1190 
1191 /*
1192  * Returns B_TRUE if the buffer represents data that has not had its MAC
1193  * verified yet.
1194  */
1195 boolean_t
1196 arc_is_unauthenticated(arc_buf_t *buf)
1197 {
1198         return (HDR_NOAUTH(buf->b_hdr) != 0);
1199 }
1200 
1201 void
1202 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
1203     uint8_t *iv, uint8_t *mac)
1204 {
1205         arc_buf_hdr_t *hdr = buf->b_hdr;
1206 
1207         ASSERT(HDR_PROTECTED(hdr));
1208 
1209         bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
1210         bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
1211         bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
1212         *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
1213             /* CONSTCOND */
1214             ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
1215 }
1216 
1217 /*
1218  * Indicates how this buffer is compressed in memory. If it is not compressed
1219  * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
1220  * arc_untransform() as long as it is also unencrypted.
1221  */
1222 enum zio_compress
1223 arc_get_compression(arc_buf_t *buf)
1224 {
1225         return (ARC_BUF_COMPRESSED(buf) ?
1226             HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1227 }
1228 
1229 #define ARC_MINTIME     (hz>>4) /* 62 ms */
1230 
1231 /*
1232  * Return the compression algorithm used to store this data in the ARC. If ARC
1233  * compression is enabled or this is an encrypted block, this will be the same
1234  * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
1235  */
1236 static inline enum zio_compress
1237 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
1238 {
1239         return (HDR_COMPRESSION_ENABLED(hdr) ?
1240             HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
1241 }
1242 
1243 static inline boolean_t
1244 arc_buf_is_shared(arc_buf_t *buf)
1245 {
1246         boolean_t shared = (buf->b_data != NULL &&
1247             buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1248             abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1249             buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1250         IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1251         IMPLY(shared, ARC_BUF_SHARED(buf));
1252         IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1253 
1254         /*
1255          * It would be nice to assert arc_can_share() too, but the "hdr isn't
1256          * already being shared" requirement prevents us from doing that.
1257          */
1258 
1259         return (shared);
1260 }
1261 
1262 /*
1263  * Free the checksum associated with this header. If there is no checksum, this
1264  * is a no-op.
1265  */
1266 static inline void
1267 arc_cksum_free(arc_buf_hdr_t *hdr)
1268 {
1269         ASSERT(HDR_HAS_L1HDR(hdr));
1270 
1271         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1272         if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1273                 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1274                 hdr->b_l1hdr.b_freeze_cksum = NULL;
1275         }
1276         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1277 }
1278 
1279 /*
1280  * Return true iff at least one of the bufs on hdr is not compressed.
1281  * Encrypted buffers count as compressed.
1282  */
1283 static boolean_t
1284 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1285 {
1286         ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
1287 
1288         for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1289                 if (!ARC_BUF_COMPRESSED(b)) {
1290                         return (B_TRUE);
1291                 }
1292         }
1293         return (B_FALSE);
1294 }
1295 
1296 /*
1297  * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1298  * matches the checksum that is stored in the hdr. If there is no checksum,
1299  * or if the buf is compressed, this is a no-op.
1300  */
1301 static void
1302 arc_cksum_verify(arc_buf_t *buf)
1303 {
1304         arc_buf_hdr_t *hdr = buf->b_hdr;
1305         zio_cksum_t zc;
1306 
1307         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1308                 return;
1309 
1310         if (ARC_BUF_COMPRESSED(buf))
1311                 return;
1312 
1313         ASSERT(HDR_HAS_L1HDR(hdr));
1314 
1315         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1316 
1317         if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1318                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1319                 return;
1320         }
1321 
1322         fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1323         if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1324                 panic("buffer modified while frozen!");
1325         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1326 }
1327 
1328 /*
1329  * This function makes the assumption that data stored in the L2ARC
1330  * will be transformed exactly as it is in the main pool. Because of
1331  * this we can verify the checksum against the reading process's bp.
1332  */
1333 static boolean_t
1334 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
1335 {
1336         enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
1337         boolean_t valid_cksum;
1338 
1339         ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1340         VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1341 
1342         /*
1343          * We rely on the blkptr's checksum to determine if the block
1344          * is valid or not. When compressed arc is enabled, the l2arc
1345          * writes the block to the l2arc just as it appears in the pool.
1346          * This allows us to use the blkptr's checksum to validate the
1347          * data that we just read off of the l2arc without having to store
1348          * a separate checksum in the arc_buf_hdr_t. However, if compressed
1349          * arc is disabled, then the data written to the l2arc is always
1350          * uncompressed and won't match the block as it exists in the main
1351          * pool. When this is the case, we must first compress it if it is
1352          * compressed on the main pool before we can validate the checksum.
1353          */
1354         if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
1355                 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1356                 uint64_t lsize = HDR_GET_LSIZE(hdr);
1357                 uint64_t csize;
1358 
1359                 abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
1360                 csize = zio_compress_data(compress, zio->io_abd,
1361                     abd_to_buf(cdata), lsize);
1362 
1363                 ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
1364                 if (csize < HDR_GET_PSIZE(hdr)) {
1365                         /*
1366                          * Compressed blocks are always a multiple of the
1367                          * smallest ashift in the pool. Ideally, we would
1368                          * like to round up the csize to the next
1369                          * spa_min_ashift but that value may have changed
1370                          * since the block was last written. Instead,
1371                          * we rely on the fact that the hdr's psize
1372                          * was set to the psize of the block when it was
1373                          * last written. We set the csize to that value
1374                          * and zero out any part that should not contain
1375                          * data.
1376                          */
1377                         abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
1378                         csize = HDR_GET_PSIZE(hdr);
1379                 }
1380                 zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
1381         }
1382 
1383         /*
1384          * Block pointers always store the checksum for the logical data.
1385          * If the block pointer has the gang bit set, then the checksum
1386          * it represents is for the reconstituted data and not for an
1387          * individual gang member. The zio pipeline, however, must be able to
1388          * determine the checksum of each of the gang constituents so it
1389          * treats the checksum comparison differently than what we need
1390          * for l2arc blocks. This prevents us from using the
1391          * zio_checksum_error() interface directly. Instead we must call the
1392          * zio_checksum_error_impl() so that we can ensure the checksum is
1393          * generated using the correct checksum algorithm and accounts for the
1394          * logical I/O size and not just a gang fragment.
1395          */
1396         valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
1397             BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
1398             zio->io_offset, NULL) == 0);
1399         zio_pop_transforms(zio);
1400         return (valid_cksum);
1401 }
1402 
1403 /*
1404  * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1405  * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1406  * isn't modified later on. If buf is compressed or there is already a checksum
1407  * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1408  */
1409 static void
1410 arc_cksum_compute(arc_buf_t *buf)
1411 {
1412         arc_buf_hdr_t *hdr = buf->b_hdr;
1413 
1414         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1415                 return;
1416 
1417         ASSERT(HDR_HAS_L1HDR(hdr));
1418 
1419         mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1420         if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
1421                 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1422                 return;
1423         }
1424 
1425         ASSERT(!ARC_BUF_ENCRYPTED(buf));
1426         ASSERT(!ARC_BUF_COMPRESSED(buf));
1427         hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1428             KM_SLEEP);
1429         fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
1430             hdr->b_l1hdr.b_freeze_cksum);
1431         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1432         arc_buf_watch(buf);
1433 }
1434 
1435 #ifndef _KERNEL
     /*
      * Userland-only control message: a command word followed by a prwatch_t.
      * arc_buf_watch() and arc_buf_unwatch() fill one of these in with
      * cmd = PCWATCH and write it to arc_procfd.
      */
1436 typedef struct procctl {
1437         long cmd;
1438         prwatch_t prwatch;
1439 } procctl_t;
1440 #endif
1441 
1442 /* ARGSUSED */
1443 static void
1444 arc_buf_unwatch(arc_buf_t *buf)
1445 {
1446 #ifndef _KERNEL
1447         if (arc_watch) {
1448                 int result;
1449                 procctl_t ctl;
1450                 ctl.cmd = PCWATCH;
1451                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1452                 ctl.prwatch.pr_size = 0;
1453                 ctl.prwatch.pr_wflags = 0;
1454                 result = write(arc_procfd, &ctl, sizeof (ctl));
1455                 ASSERT3U(result, ==, sizeof (ctl));
1456         }
1457 #endif
1458 }
1459 
1460 /* ARGSUSED */
1461 static void
1462 arc_buf_watch(arc_buf_t *buf)
1463 {
1464 #ifndef _KERNEL
1465         if (arc_watch) {
1466                 int result;
1467                 procctl_t ctl;
1468                 ctl.cmd = PCWATCH;
1469                 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1470                 ctl.prwatch.pr_size = arc_buf_size(buf);
1471                 ctl.prwatch.pr_wflags = WA_WRITE;
1472                 result = write(arc_procfd, &ctl, sizeof (ctl));
1473                 ASSERT3U(result, ==, sizeof (ctl));
1474         }
1475 #endif
1476 }
1477 
1478 static arc_buf_contents_t
1479 arc_buf_type(arc_buf_hdr_t *hdr)
1480 {
1481         arc_buf_contents_t type;
1482         if (HDR_ISTYPE_METADATA(hdr)) {
1483                 type = ARC_BUFC_METADATA;
1484         } else {
1485                 type = ARC_BUFC_DATA;
1486         }
1487         VERIFY3U(hdr->b_type, ==, type);
1488         return (type);
1489 }
1490 
1491 boolean_t
1492 arc_is_metadata(arc_buf_t *buf)
1493 {
1494         return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
1495 }
1496 
1497 static uint32_t
1498 arc_bufc_to_flags(arc_buf_contents_t type)
1499 {
1500         switch (type) {
1501         case ARC_BUFC_DATA:
1502                 /* metadata field is 0 if buffer contains normal data */
1503                 return (0);
1504         case ARC_BUFC_METADATA:
1505                 return (ARC_FLAG_BUFC_METADATA);
1506         default:
1507                 break;
1508         }
1509         panic("undefined ARC buffer type!");
1510         return ((uint32_t)-1);
1511 }
1512 
1513 void
1514 arc_buf_thaw(arc_buf_t *buf)
1515 {
1516         arc_buf_hdr_t *hdr = buf->b_hdr;
1517 
1518         ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
1519         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1520 
1521         arc_cksum_verify(buf);
1522 
1523         /*
1524          * Compressed buffers do not manipulate the b_freeze_cksum.
1525          */
1526         if (ARC_BUF_COMPRESSED(buf))
1527                 return;
1528 
1529         ASSERT(HDR_HAS_L1HDR(hdr));
1530         arc_cksum_free(hdr);
1531 
1532         mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1533 #ifdef ZFS_DEBUG
1534         if (zfs_flags & ZFS_DEBUG_MODIFY) {
1535                 if (hdr->b_l1hdr.b_thawed != NULL)
1536                         kmem_free(hdr->b_l1hdr.b_thawed, 1);
1537                 hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1538         }
1539 #endif
1540 
1541         mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1542 
1543         arc_buf_unwatch(buf);
1544 }
1545 
1546 void
1547 arc_buf_freeze(arc_buf_t *buf)
1548 {
1549         if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1550                 return;
1551 
1552         if (ARC_BUF_COMPRESSED(buf))
1553                 return;
1554 
1555         ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
1556         arc_cksum_compute(buf);
1557 }
1558 
1559 /*
1560  * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1561  * the following functions should be used to ensure that the flags are
1562  * updated in a thread-safe way. When manipulating the flags either
1563  * the hash_lock must be held or the hdr must be undiscoverable. This
1564  * ensures that we're not racing with any other threads when updating
1565  * the flags.
1566  */
1567 static inline void
1568 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1569 {
1570         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1571         hdr->b_flags |= flags;
1572 }
1573 
1574 static inline void
1575 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1576 {
1577         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1578         hdr->b_flags &= ~flags;
1579 }
1580 
1581 /*
1582  * Setting the compression bits in the arc_buf_hdr_t's b_flags is
1583  * done in a special way since we have to clear and set bits
1584  * at the same time. Consumers that wish to set the compression bits
1585  * must use this function to ensure that the flags are updated in
1586  * thread-safe manner.
1587  */
1588 static void
1589 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
1590 {
1591         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1592 
1593         /*
1594          * Holes and embedded blocks will always have a psize = 0 so
1595          * we ignore the compression of the blkptr and set the
1596          * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
1597          * Holes and embedded blocks remain anonymous so we don't
1598          * want to uncompress them. Mark them as uncompressed.
1599          */
1600         if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
1601                 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1602                 ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
1603         } else {
1604                 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
1605                 ASSERT(HDR_COMPRESSION_ENABLED(hdr));
1606         }
1607 
1608         HDR_SET_COMPRESS(hdr, cmp);
1609         ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
1610 }
1611 
1612 /*
1613  * Looks for another buf on the same hdr which has the data decompressed, copies
1614  * from it, and returns true. If no such buf exists, returns false.
1615  */
1616 static boolean_t
1617 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
1618 {
1619         arc_buf_hdr_t *hdr = buf->b_hdr;
1620         boolean_t copied = B_FALSE;
1621 
1622         ASSERT(HDR_HAS_L1HDR(hdr));
1623         ASSERT3P(buf->b_data, !=, NULL);
1624         ASSERT(!ARC_BUF_COMPRESSED(buf));
1625 
1626         for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
1627             from = from->b_next) {
1628                 /* can't use our own data buffer */
1629                 if (from == buf) {
1630                         continue;
1631                 }
1632 
1633                 if (!ARC_BUF_COMPRESSED(from)) {
1634                         bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
1635                         copied = B_TRUE;
1636                         break;
1637                 }
1638         }
1639 
1640         /*
1641          * Note: With encryption support, the following assertion is no longer
1642          * necessarily valid. If we receive two back to back raw snapshots
1643          * (send -w), the second receive can use a hdr with a cksum already
1644          * calculated. This happens via:
1645          *    dmu_recv_stream() -> receive_read_record() -> arc_loan_raw_buf()
1646          * The rsend/send_mixed_raw test case exercises this code path.
1647          *
1648          * There were no decompressed bufs, so there should not be a
1649          * checksum on the hdr either.
1650          * EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
1651          */
1652 
1653         return (copied);
1654 }
1655 
1656 /*
1657  * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
1658  */
1659 static uint64_t
1660 arc_hdr_size(arc_buf_hdr_t *hdr)
1661 {
1662         uint64_t size;
1663 
1664         if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
1665             HDR_GET_PSIZE(hdr) > 0) {
1666                 size = HDR_GET_PSIZE(hdr);
1667         } else {
1668                 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
1669                 size = HDR_GET_LSIZE(hdr);
1670         }
1671         return (size);
1672 }
1673 
1674 static int
1675 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
1676 {
1677         int ret;
1678         uint64_t csize;
1679         uint64_t lsize = HDR_GET_LSIZE(hdr);
1680         uint64_t psize = HDR_GET_PSIZE(hdr);
1681         void *tmpbuf = NULL;
1682         abd_t *abd = hdr->b_l1hdr.b_pabd;
1683 
1684         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1685         ASSERT(HDR_AUTHENTICATED(hdr));
1686         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1687 
1688         /*
1689          * The MAC is calculated on the compressed data that is stored on disk.
1690          * However, if compressed arc is disabled we will only have the
1691          * decompressed data available to us now. Compress it into a temporary
1692          * abd so we can verify the MAC. The performance overhead of this will
1693          * be relatively low, since most objects in an encrypted objset will
1694          * be encrypted (instead of authenticated) anyway.
1695          */
1696         if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1697             !HDR_COMPRESSION_ENABLED(hdr)) {
1698                 tmpbuf = zio_buf_alloc(lsize);
1699                 abd = abd_get_from_buf(tmpbuf, lsize);
1700                 abd_take_ownership_of_buf(abd, B_TRUE);
1701 
1702                 csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
1703                     hdr->b_l1hdr.b_pabd, tmpbuf, lsize);
1704                 ASSERT3U(csize, <=, psize);
1705                 abd_zero_off(abd, csize, psize - csize);
1706         }
1707 
1708         /*
1709          * Authentication is best effort. We authenticate whenever the key is
1710          * available. If we succeed we clear ARC_FLAG_NOAUTH.
1711          */
1712         if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
1713                 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1714                 ASSERT3U(lsize, ==, psize);
1715                 ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
1716                     psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1717         } else {
1718                 ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
1719                     hdr->b_crypt_hdr.b_mac);
1720         }
1721 
1722         if (ret == 0)
1723                 arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
1724         else if (ret != ENOENT)
1725                 goto error;
1726 
1727         if (tmpbuf != NULL)
1728                 abd_free(abd);
1729 
1730         return (0);
1731 
1732 error:
1733         if (tmpbuf != NULL)
1734                 abd_free(abd);
1735 
1736         return (ret);
1737 }
1738 
1739 /*
1740  * This function will take a header that only has raw encrypted data in
1741  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
1742  * b_l1hdr.b_pabd. If designated in the header flags, this function will
1743  * also decompress the data.
1744  */
1745 static int
1746 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
1747 {
1748         int ret;
1749         abd_t *cabd = NULL;
1750         void *tmp = NULL;
1751         boolean_t no_crypt = B_FALSE;
1752         boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
1753 
1754         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1755         ASSERT(HDR_ENCRYPTED(hdr));
1756 
1757         arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
1758 
1759         ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
1760             B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
1761             hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
1762             hdr->b_crypt_hdr.b_rabd, &no_crypt);
1763         if (ret != 0)
1764                 goto error;
1765 
1766         if (no_crypt) {
1767                 abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
1768                     HDR_GET_PSIZE(hdr));
1769         }
1770 
1771         /*
1772          * If this header has disabled arc compression but the b_pabd is
1773          * compressed after decrypting it, we need to decompress the newly
1774          * decrypted data.
1775          */
1776         if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
1777             !HDR_COMPRESSION_ENABLED(hdr)) {
1778                 /*
1779                  * We want to make sure that we are correctly honoring the
1780                  * zfs_abd_scatter_enabled setting, so we allocate an abd here
1781                  * and then loan a buffer from it, rather than allocating a
1782                  * linear buffer and wrapping it in an abd later.
1783                  */
1784                 cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
1785                 tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
1786 
1787                 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
1788                     hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
1789                     HDR_GET_LSIZE(hdr));
1790                 if (ret != 0) {
1791                         abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
1792                         goto error;
1793                 }
1794 
1795                 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
1796                 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
1797                     arc_hdr_size(hdr), hdr);
1798                 hdr->b_l1hdr.b_pabd = cabd;
1799         }
1800 
1801         return (0);
1802 
1803 error:
1804         arc_hdr_free_pabd(hdr, B_FALSE);
1805         if (cabd != NULL)
1806                 arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
1807 
1808         return (ret);
1809 }
1810 
1811 /*
1812  * This function is called during arc_buf_fill() to prepare the header's
1813  * abd plaintext pointer for use. This involves authenticated protected
1814  * data and decrypting encrypted data into the plaintext abd.
1815  */
1816 static int
1817 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
1818     const zbookmark_phys_t *zb, boolean_t noauth)
1819 {
1820         int ret;
1821 
1822         ASSERT(HDR_PROTECTED(hdr));
1823 
1824         if (hash_lock != NULL)
1825                 mutex_enter(hash_lock);
1826 
1827         if (HDR_NOAUTH(hdr) && !noauth) {
1828                 /*
1829                  * The caller requested authenticated data but our data has
1830                  * not been authenticated yet. Verify the MAC now if we can.
1831                  */
1832                 ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
1833                 if (ret != 0)
1834                         goto error;
1835         } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
1836                 /*
1837                  * If we only have the encrypted version of the data, but the
1838                  * unencrypted version was requested we take this opportunity
1839                  * to store the decrypted version in the header for future use.
1840                  */
1841                 ret = arc_hdr_decrypt(hdr, spa, zb);
1842                 if (ret != 0)
1843                         goto error;
1844         }
1845 
1846         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1847 
1848         if (hash_lock != NULL)
1849                 mutex_exit(hash_lock);
1850 
1851         return (0);
1852 
1853 error:
1854         if (hash_lock != NULL)
1855                 mutex_exit(hash_lock);
1856 
1857         return (ret);
1858 }
1859 
1860 /*
1861  * This function is used by the dbuf code to decrypt bonus buffers in place.
1862  * The dbuf code itself doesn't have any locking for decrypting a shared dnode
1863  * block, so we use the hash lock here to protect against concurrent calls to
1864  * arc_buf_fill().
1865  */
1866 /* ARGSUSED */
1867 static void
1868 arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
1869 {
1870         arc_buf_hdr_t *hdr = buf->b_hdr;
1871 
1872         ASSERT(HDR_ENCRYPTED(hdr));
1873         ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
1874         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
1875         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
1876 
1877         zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
1878             arc_buf_size(buf));
1879         buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
1880         buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
1881         hdr->b_crypt_hdr.b_ebufcnt -= 1;
1882 }
1883 
1884 /*
1885  * Given a buf that has a data buffer attached to it, this function will
1886  * efficiently fill the buf with data of the specified compression setting from
1887  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
1888  * are already sharing a data buf, no copy is performed.
1889  *
1890  * If the buf is marked as compressed but uncompressed data was requested, this
1891  * will allocate a new data buffer for the buf, remove that flag, and fill the
1892  * buf with uncompressed data. You can't request a compressed buf on a hdr with
1893  * uncompressed data, and (since we haven't added support for it yet) if you
1894  * want compressed data your buf must already be marked as compressed and have
1895  * the correct-sized data buffer.
      *
      * Flags, as interpreted below:
      *   ARC_FILL_LOCKED     - no hash lock is taken by this function
      *                         (hash_lock is NULL); otherwise HDR_LOCK(hdr) is
      *                         taken around the individual hdr updates.
      *   ARC_FILL_COMPRESSED - the caller wants compressed data.
      *   ARC_FILL_ENCRYPTED  - the caller wants the raw data from b_rabd.
      *   ARC_FILL_NOAUTH     - passed on to arc_fill_hdr_crypt() as 'noauth'.
      *   ARC_FILL_IN_PLACE   - dnode bonus buffers are decrypted in place.
      *
      * Returns 0 on success, the error from arc_fill_hdr_crypt() if preparing a
      * protected hdr fails, or EIO if decompression fails. Except for EACCES
      * with ARC_FILL_IN_PLACE, failures also set ARC_FLAG_IO_ERROR on the hdr.
1896  */
1897 static int
1898 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
1899     arc_fill_flags_t flags)
1900 {
1901         int error = 0;
1902         arc_buf_hdr_t *hdr = buf->b_hdr;
1903         boolean_t hdr_compressed =
1904             (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
1905         boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
1906         boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
1907         dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
1908         kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
1909 
1910         ASSERT3P(buf->b_data, !=, NULL);
1911         IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
1912         IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
1913         IMPLY(encrypted, HDR_ENCRYPTED(hdr));
1914         IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
1915         IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
1916         IMPLY(encrypted, !ARC_BUF_SHARED(buf));
1917 
1918         /*
1919          * If the caller wanted encrypted data we just need to copy it from
1920          * b_rabd and potentially byteswap it. We won't be able to do any
1921          * further transforms on it.
1922          */
1923         if (encrypted) {
1924                 ASSERT(HDR_HAS_RABD(hdr));
1925                 abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
1926                     HDR_GET_PSIZE(hdr));
1927                 goto byteswap;
1928         }
1929 
1930         /*
1931          * Adjust encrypted and authenticated headers to accommodate
1932          * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
1933          * allowed to fail decryption due to keys not being loaded
1934          * without being marked as an IO error.
1935          */
1936         if (HDR_PROTECTED(hdr)) {
1937                 error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
1938                     zb, !!(flags & ARC_FILL_NOAUTH));
1939                 if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
1940                         return (error);
1941                 } else if (error != 0) {
1942                         if (hash_lock != NULL)
1943                                 mutex_enter(hash_lock);
1944                         arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
1945                         if (hash_lock != NULL)
1946                                 mutex_exit(hash_lock);
1947                         return (error);
1948                 }
1949         }
1950 
1951         /*
1952          * There is a special case here for dnode blocks which are
1953          * decrypting their bonus buffers. These blocks may request to
1954          * be decrypted in-place. This is necessary because there may
1955          * be many dnodes pointing into this buffer and there is
1956          * currently no method to synchronize replacing the backing
1957          * b_data buffer and updating all of the pointers. Here we use
1958          * the hash lock to ensure there are no races. If the need
1959          * arises for other types to be decrypted in-place, they must
1960          * add handling here as well.
1961          */
1962         if ((flags & ARC_FILL_IN_PLACE) != 0) {
1963                 ASSERT(!hdr_compressed);
1964                 ASSERT(!compressed);
1965                 ASSERT(!encrypted);
1966 
1967                 if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
1968                         ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
1969 
1970                         if (hash_lock != NULL)
1971                                 mutex_enter(hash_lock);
1972                         arc_buf_untransform_in_place(buf, hash_lock);
1973                         if (hash_lock != NULL)
1974                                 mutex_exit(hash_lock);
1975 
1976                         /* Compute the hdr's checksum if necessary */
1977                         arc_cksum_compute(buf);
1978                 }
1979 
                      /* In-place requests never reach the copy/byteswap code. */
1980                 return (0);
1981         }
1982 
              /*
               * The hdr already holds the data in the requested form: copy it
               * unless the buf shares the hdr's data buffer.
               */
1983         if (hdr_compressed == compressed) {
1984                 if (!arc_buf_is_shared(buf)) {
1985                         abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
1986                             arc_buf_size(buf));
1987                 }
1988         } else {
1989                 ASSERT(hdr_compressed);
1990                 ASSERT(!compressed);
1991                 ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
1992 
1993                 /*
1994                  * If the buf is sharing its data with the hdr, unlink it and
1995                  * allocate a new data buffer for the buf.
1996                  */
1997                 if (arc_buf_is_shared(buf)) {
1998                         ASSERT(ARC_BUF_COMPRESSED(buf));
1999 
2000                         /* We need to give the buf its own b_data */
2001                         buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2002                         buf->b_data =
2003                             arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2004                         arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2005 
2006                         /* Previously overhead was 0; just add new overhead */
2007                         ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2008                 } else if (ARC_BUF_COMPRESSED(buf)) {
2009                         /* We need to reallocate the buf's b_data */
2010                         arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2011                             buf);
2012                         buf->b_data =
2013                             arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2014 
2015                         /* We increased the size of b_data; update overhead */
2016                         ARCSTAT_INCR(arcstat_overhead_size,
2017                             HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2018                 }
2019 
2020                 /*
2021                  * Regardless of the buf's previous compression settings, it
2022                  * should not be compressed at the end of this function.
2023                  */
2024                 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2025 
2026                 /*
2027                  * Try copying the data from another buf which already has a
2028                  * decompressed version. If that's not possible, it's time to
2029                  * bite the bullet and decompress the data from the hdr.
2030                  */
2031                 if (arc_buf_try_copy_decompressed_data(buf)) {
2032                         /* Skip byteswapping and checksumming (already done) */
                              /*
                               * NOTE(review): arc_cksum_compute() only installs
                               * b_freeze_cksum when ZFS_DEBUG_MODIFY is set, so
                               * this assertion appears to rely on that flag
                               * being enabled - confirm.
                               */
2033                         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
2034                         return (0);
2035                 } else {
2036                         error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2037                             hdr->b_l1hdr.b_pabd, buf->b_data,
2038                             HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2039 
2040                         /*
2041                          * Absent hardware errors or software bugs, this should
2042                          * be impossible, but log it anyway so we can debug it.
2043                          */
2044                         if (error != 0) {
2045                                 zfs_dbgmsg(
2046                                     "hdr %p, compress %d, psize %d, lsize %d",
2047                                     hdr, arc_hdr_get_compress(hdr),
2048                                     HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2049                                 if (hash_lock != NULL)
2050                                         mutex_enter(hash_lock);
2051                                 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
2052                                 if (hash_lock != NULL)
2053                                         mutex_exit(hash_lock);
2054                                 return (SET_ERROR(EIO));
2055                         }
2056                 }
2057         }
2058 
2059 byteswap:
2060         /* Byteswap the buf's data if necessary */
2061         if (bswap != DMU_BSWAP_NUMFUNCS) {
2062                 ASSERT(!HDR_SHARED_DATA(hdr));
2063                 ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2064                 dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2065         }
2066 
2067         /* Compute the hdr's checksum if necessary */
2068         arc_cksum_compute(buf);
2069 
2070         return (0);
2071 }
2072 
2073 /*
2074  * If this function is being called to decrypt an encrypted buffer or verify an
2075  * authenticated one, the key must be loaded and a mapping must be made
2076  * available in the keystore via spa_keystore_create_mapping() or one of its
2077  * callers.
2078  */
2079 int
2080 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2081     boolean_t in_place)
2082 {
2083         int ret;
2084         arc_fill_flags_t flags = 0;
2085 
2086         if (in_place)
2087                 flags |= ARC_FILL_IN_PLACE;
2088 
2089         ret = arc_buf_fill(buf, spa, zb, flags);
2090         if (ret == ECKSUM) {
2091                 /*
2092                  * Convert authentication and decryption errors to EIO
2093                  * (and generate an ereport) before leaving the ARC.
2094                  */
2095                 ret = SET_ERROR(EIO);
2096                 spa_log_error(spa, zb);
2097                 (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2098                     spa, NULL, zb, NULL, 0, 0);
2099         }
2100 
2101         return (ret);
2102 }
2103 
2104 /*
2105  * Increment the amount of evictable space in the arc_state_t's refcount.
2106  * We account for the space used by the hdr and the arc buf individually
2107  * so that we can add and remove them from the refcount individually.
2108  */
2109 static void
2110 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2111 {
2112         arc_buf_contents_t type = arc_buf_type(hdr);
2113 
2114         ASSERT(HDR_HAS_L1HDR(hdr));
2115 
2116         if (GHOST_STATE(state)) {
2117                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2118                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2119                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2120                 ASSERT(!HDR_HAS_RABD(hdr));
2121                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2122                     HDR_GET_LSIZE(hdr), hdr);
2123                 return;
2124         }
2125 
2126         ASSERT(!GHOST_STATE(state));
2127         if (hdr->b_l1hdr.b_pabd != NULL) {
2128                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2129                     arc_hdr_size(hdr), hdr);
2130         }
2131         if (HDR_HAS_RABD(hdr)) {
2132                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2133                     HDR_GET_PSIZE(hdr), hdr);
2134         }
2135         for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2136             buf = buf->b_next) {
2137                 if (arc_buf_is_shared(buf))
2138                         continue;
2139                 (void) zfs_refcount_add_many(&state->arcs_esize[type],
2140                     arc_buf_size(buf), buf);
2141         }
2142 }
2143 
2144 /*
2145  * Decrement the amount of evictable space in the arc_state_t's refcount.
2146  * We account for the space used by the hdr and the arc buf individually
2147  * so that we can add and remove them from the refcount individually.
2148  */
2149 static void
2150 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2151 {
2152         arc_buf_contents_t type = arc_buf_type(hdr);
2153 
2154         ASSERT(HDR_HAS_L1HDR(hdr));
2155 
2156         if (GHOST_STATE(state)) {
2157                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
2158                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2159                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2160                 ASSERT(!HDR_HAS_RABD(hdr));
2161                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2162                     HDR_GET_LSIZE(hdr), hdr);
2163                 return;
2164         }
2165 
2166         ASSERT(!GHOST_STATE(state));
2167         if (hdr->b_l1hdr.b_pabd != NULL) {
2168                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2169                     arc_hdr_size(hdr), hdr);
2170         }
2171         if (HDR_HAS_RABD(hdr)) {
2172                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2173                     HDR_GET_PSIZE(hdr), hdr);
2174         }
2175         for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2176             buf = buf->b_next) {
2177                 if (arc_buf_is_shared(buf))
2178                         continue;
2179                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2180                     arc_buf_size(buf), buf);
2181         }
2182 }
2183 
2184 /*
2185  * Add a reference to this hdr indicating that someone is actively
2186  * referencing that memory. When the refcount transitions from 0 to 1,
2187  * we remove it from the respective arc_state_t list to indicate that
2188  * it is not evictable.
2189  */
2190 static void
2191 add_reference(arc_buf_hdr_t *hdr, void *tag)
2192 {
2193         ASSERT(HDR_HAS_L1HDR(hdr));
2194         if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
2195                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2196                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2197                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2198         }
2199 
2200         arc_state_t *state = hdr->b_l1hdr.b_state;
2201 
2202         if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2203             (state != arc_anon)) {
2204                 /* We don't use the L2-only state list. */
2205                 if (state != arc_l2c_only) {
2206                         multilist_remove(state->arcs_list[arc_buf_type(hdr)],
2207                             hdr);
2208                         arc_evictable_space_decrement(hdr, state);
2209                 }
2210                 /* remove the prefetch flag if we get a reference */
2211                 if (HDR_HAS_L2HDR(hdr))
2212                         l2arc_hdr_arcstats_decrement_state(hdr);
2213                 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2214                 if (HDR_HAS_L2HDR(hdr))
2215                         l2arc_hdr_arcstats_increment_state(hdr);
2216         }
2217 }
2218 
2219 /*
2220  * Remove a reference from this hdr. When the reference transitions from
2221  * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2222  * list making it eligible for eviction.
2223  */
2224 static int
2225 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
2226 {
2227         int cnt;
2228         arc_state_t *state = hdr->b_l1hdr.b_state;
2229 
2230         ASSERT(HDR_HAS_L1HDR(hdr));
2231         ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2232         ASSERT(!GHOST_STATE(state));
2233 
2234         /*
2235          * arc_l2c_only counts as a ghost state so we don't need to explicitly
2236          * check to prevent usage of the arc_l2c_only list.
2237          */
2238         if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
2239             (state != arc_anon)) {
2240                 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
2241                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2242                 arc_evictable_space_increment(hdr, state);
2243         }
2244         return (cnt);
2245 }
2246 
/*
 * Move the supplied buffer to the indicated state. The hash lock
 * for the buffer must be held by the caller.
 *
 * new_state must differ from the hdr's current state. The function:
 *  - moves an unreferenced hdr from the old state's evictable list to the
 *    new state's evictable list (anonymous and L2-only states have no list
 *    in use),
 *  - removes the hdr from the hash table when it becomes anonymous,
 *  - moves the size accounting (arcs_size) from the old state to the new,
 *  - records the new state in the L1 header and, when an L2 header is
 *    present, in the L2 header as well.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
    kmutex_t *hash_lock)
{
        arc_state_t *old_state;
        int64_t refcnt;
        uint32_t bufcnt;
        boolean_t update_old, update_new;
        arc_buf_contents_t buftype = arc_buf_type(hdr);

        /*
         * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
         * in arc_read() when bringing a buffer out of the L2ARC.  However, the
         * L1 hdr doesn't always exist when we change state to arc_anon before
         * destroying a header, in which case reallocating to add the L1 hdr is
         * pointless.
         */
        if (HDR_HAS_L1HDR(hdr)) {
                old_state = hdr->b_l1hdr.b_state;
                refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
                bufcnt = hdr->b_l1hdr.b_bufcnt;

                /* Size accounting only matters if the hdr holds any data. */
                update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
                    HDR_HAS_RABD(hdr));
        } else {
                /* Without an L1 hdr the header is treated as L2-only. */
                old_state = arc_l2c_only;
                refcnt = 0;
                bufcnt = 0;
                update_old = B_FALSE;
        }
        update_new = update_old;

        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT3P(new_state, !=, old_state);
        ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
        ASSERT(old_state != arc_anon || bufcnt <= 1);

        /*
         * If this buffer is evictable, transfer it from the
         * old state list to the new state list.
         */
        if (refcnt == 0) {
                if (old_state != arc_anon && old_state != arc_l2c_only) {
                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_remove(old_state->arcs_list[buftype], hdr);

                        /*
                         * Ghost hdrs carry no data, but their size is still
                         * accounted for, so force the old-state update.
                         */
                        if (GHOST_STATE(old_state)) {
                                ASSERT0(bufcnt);
                                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
                                update_old = B_TRUE;
                        }
                        arc_evictable_space_decrement(hdr, old_state);
                }
                if (new_state != arc_anon && new_state != arc_l2c_only) {

                        /*
                         * An L1 header always exists here, since if we're
                         * moving to some L1-cached state (i.e. not l2c_only or
                         * anonymous), we realloc the header to add an L1hdr
                         * beforehand.
                         */
                        ASSERT(HDR_HAS_L1HDR(hdr));
                        multilist_insert(new_state->arcs_list[buftype], hdr);

                        /* Likewise, force the new-state update for ghosts. */
                        if (GHOST_STATE(new_state)) {
                                ASSERT0(bufcnt);
                                ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
                                update_new = B_TRUE;
                        }
                        arc_evictable_space_increment(hdr, new_state);
                }
        }

        ASSERT(!HDR_EMPTY(hdr));
        if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
                buf_hash_remove(hdr);

        /* adjust state sizes (ignore arc_l2c_only) */

        if (update_new && new_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(new_state)) {
                        ASSERT0(bufcnt);

                        /*
                         * When moving a header to a ghost state, we first
                         * remove all arc buffers. Thus, we'll have a
                         * bufcnt of zero, and no arc buffer to use for
                         * the reference. As a result, we use the arc
                         * header pointer for the reference.
                         */
                        (void) zfs_refcount_add_many(&new_state->arcs_size,
                            HDR_GET_LSIZE(hdr), hdr);
                        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
                        ASSERT(!HDR_HAS_RABD(hdr));
                } else {
                        uint32_t buffers = 0;

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must add each of these references one
                         * at a time.
                         */
                        for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                ASSERT3U(bufcnt, !=, 0);
                                buffers++;

                                /*
                                 * When the arc_buf_t is sharing the data
                                 * block with the hdr, the owner of the
                                 * reference belongs to the hdr. Only
                                 * add to the refcount if the arc_buf_t is
                                 * not shared.
                                 */
                                if (arc_buf_is_shared(buf))
                                        continue;

                                (void) zfs_refcount_add_many(
                                    &new_state->arcs_size,
                                    arc_buf_size(buf), buf);
                        }
                        ASSERT3U(bufcnt, ==, buffers);

                        /* The hdr's own data is tagged with the hdr. */
                        if (hdr->b_l1hdr.b_pabd != NULL) {
                                (void) zfs_refcount_add_many(
                                    &new_state->arcs_size,
                                    arc_hdr_size(hdr), hdr);
                        }

                        /* As is the raw data, when present. */
                        if (HDR_HAS_RABD(hdr)) {
                                (void) zfs_refcount_add_many(
                                    &new_state->arcs_size,
                                    HDR_GET_PSIZE(hdr), hdr);
                        }
                }
        }

        if (update_old && old_state != arc_l2c_only) {
                ASSERT(HDR_HAS_L1HDR(hdr));
                if (GHOST_STATE(old_state)) {
                        ASSERT0(bufcnt);
                        ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
                        ASSERT(!HDR_HAS_RABD(hdr));

                        /*
                         * When moving a header off of a ghost state,
                         * the header will not contain any arc buffers.
                         * We use the arc header pointer for the reference
                         * which is exactly what we did when we put the
                         * header on the ghost state.
                         */

                        (void) zfs_refcount_remove_many(&old_state->arcs_size,
                            HDR_GET_LSIZE(hdr), hdr);
                } else {
                        uint32_t buffers = 0;

                        /*
                         * Each individual buffer holds a unique reference,
                         * thus we must remove each of these references one
                         * at a time.
                         */
                        for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
                            buf = buf->b_next) {
                                ASSERT3U(bufcnt, !=, 0);
                                buffers++;

                                /*
                                 * When the arc_buf_t is sharing the data
                                 * block with the hdr, the owner of the
                                 * reference belongs to the hdr. Only
                                 * remove from the refcount if the arc_buf_t
                                 * is not shared.
                                 */
                                if (arc_buf_is_shared(buf))
                                        continue;

                                (void) zfs_refcount_remove_many(
                                    &old_state->arcs_size, arc_buf_size(buf),
                                    buf);
                        }
                        ASSERT3U(bufcnt, ==, buffers);
                        ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
                            HDR_HAS_RABD(hdr));

                        if (hdr->b_l1hdr.b_pabd != NULL) {
                                (void) zfs_refcount_remove_many(
                                    &old_state->arcs_size, arc_hdr_size(hdr),
                                    hdr);
                        }

                        if (HDR_HAS_RABD(hdr)) {
                                (void) zfs_refcount_remove_many(
                                    &old_state->arcs_size, HDR_GET_PSIZE(hdr),
                                    hdr);
                        }
                }
        }

        if (HDR_HAS_L1HDR(hdr)) {
                hdr->b_l1hdr.b_state = new_state;

                /*
                 * Keep the state recorded in the L2 header, and the L2ARC
                 * statistics derived from it, in step with the new state.
                 */
                if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
                        l2arc_hdr_arcstats_decrement_state(hdr);
                        hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
                        l2arc_hdr_arcstats_increment_state(hdr);
                }
        }

        /*
         * L2 headers should never be on the L2 state list since they don't
         * have L1 headers allocated.
         */
        ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
            multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
2468 
2469 void
2470 arc_space_consume(uint64_t space, arc_space_type_t type)
2471 {
2472         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2473 
2474         switch (type) {
2475         case ARC_SPACE_DATA:
2476                 aggsum_add(&astat_data_size, space);
2477                 break;
2478         case ARC_SPACE_META:
2479                 aggsum_add(&astat_metadata_size, space);
2480                 break;
2481         case ARC_SPACE_OTHER:
2482                 aggsum_add(&astat_other_size, space);
2483                 break;
2484         case ARC_SPACE_HDRS:
2485                 aggsum_add(&astat_hdr_size, space);
2486                 break;
2487         case ARC_SPACE_L2HDRS:
2488                 aggsum_add(&astat_l2_hdr_size, space);
2489                 break;
2490         }
2491 
2492         if (type != ARC_SPACE_DATA)
2493                 aggsum_add(&arc_meta_used, space);
2494 
2495         aggsum_add(&arc_size, space);
2496 }
2497 
2498 void
2499 arc_space_return(uint64_t space, arc_space_type_t type)
2500 {
2501         ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2502 
2503         switch (type) {
2504         case ARC_SPACE_DATA:
2505                 aggsum_add(&astat_data_size, -space);
2506                 break;
2507         case ARC_SPACE_META:
2508                 aggsum_add(&astat_metadata_size, -space);
2509                 break;
2510         case ARC_SPACE_OTHER:
2511                 aggsum_add(&astat_other_size, -space);
2512                 break;
2513         case ARC_SPACE_HDRS:
2514                 aggsum_add(&astat_hdr_size, -space);
2515                 break;
2516         case ARC_SPACE_L2HDRS:
2517                 aggsum_add(&astat_l2_hdr_size, -space);
2518                 break;
2519         }
2520 
2521         if (type != ARC_SPACE_DATA) {
2522                 ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
2523                 /*
2524                  * We use the upper bound here rather than the precise value
2525                  * because the arc_meta_max value doesn't need to be
2526                  * precise. It's only consumed by humans via arcstats.
2527                  */
2528                 if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
2529                         arc_meta_max = aggsum_upper_bound(&arc_meta_used);
2530                 aggsum_add(&arc_meta_used, -space);
2531         }
2532 
2533         ASSERT(aggsum_compare(&arc_size, space) >= 0);
2534         aggsum_add(&arc_size, -space);
2535 }
2536 
2537 /*
2538  * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2539  * with the hdr's b_pabd.
2540  */
2541 static boolean_t
2542 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2543 {
2544         /*
2545          * The criteria for sharing a hdr's data are:
2546          * 1. the buffer is not encrypted
2547          * 2. the hdr's compression matches the buf's compression
2548          * 3. the hdr doesn't need to be byteswapped
2549          * 4. the hdr isn't already being shared
2550          * 5. the buf is either compressed or it is the last buf in the hdr list
2551          *
2552          * Criterion #5 maintains the invariant that shared uncompressed
2553          * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2554          * might ask, "if a compressed buf is allocated first, won't that be the
2555          * last thing in the list?", but in that case it's impossible to create
2556          * a shared uncompressed buf anyway (because the hdr must be compressed
2557          * to have the compressed buf). You might also think that #3 is
2558          * sufficient to make this guarantee, however it's possible
2559          * (specifically in the rare L2ARC write race mentioned in
2560          * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2561          * is sharable, but wasn't at the time of its allocation. Rather than
2562          * allow a new shared uncompressed buf to be created and then shuffle
2563          * the list around to make it the last element, this simply disallows
2564          * sharing if the new buf isn't the first to be added.
2565          */
2566         ASSERT3P(buf->b_hdr, ==, hdr);
2567         boolean_t hdr_compressed = arc_hdr_get_compress(hdr) !=
2568             ZIO_COMPRESS_OFF;
2569         boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2570         return (!ARC_BUF_ENCRYPTED(buf) &&
2571             buf_compressed == hdr_compressed &&
2572             hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2573             !HDR_SHARED_DATA(hdr) &&
2574             (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2575 }
2576 
2577 /*
2578  * Allocate a buf for this hdr. If you care about the data that's in the hdr,
2579  * or if you want a compressed buffer, pass those flags in. Returns 0 if the
2580  * copy was made successfully, or an error code otherwise.
2581  */
2582 static int
2583 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
2584     void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
2585     boolean_t fill, arc_buf_t **ret)
2586 {
2587         arc_buf_t *buf;
2588         arc_fill_flags_t flags = ARC_FILL_LOCKED;
2589 
2590         ASSERT(HDR_HAS_L1HDR(hdr));
2591         ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
2592         VERIFY(hdr->b_type == ARC_BUFC_DATA ||
2593             hdr->b_type == ARC_BUFC_METADATA);
2594         ASSERT3P(ret, !=, NULL);
2595         ASSERT3P(*ret, ==, NULL);
2596         IMPLY(encrypted, compressed);
2597 
2598         buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2599         buf->b_hdr = hdr;
2600         buf->b_data = NULL;
2601         buf->b_next = hdr->b_l1hdr.b_buf;
2602         buf->b_flags = 0;
2603 
2604         add_reference(hdr, tag);
2605 
2606         /*
2607          * We're about to change the hdr's b_flags. We must either
2608          * hold the hash_lock or be undiscoverable.
2609          */
2610         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2611 
2612         /*
2613          * Only honor requests for compressed bufs if the hdr is actually
2614          * compressed. This must be overriden if the buffer is encrypted since
2615          * encrypted buffers cannot be decompressed.
2616          */
2617         if (encrypted) {
2618                 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2619                 buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
2620                 flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
2621         } else if (compressed &&
2622             arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
2623                 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
2624                 flags |= ARC_FILL_COMPRESSED;
2625         }
2626 
2627         if (noauth) {
2628                 ASSERT0(encrypted);
2629                 flags |= ARC_FILL_NOAUTH;
2630         }
2631 
2632         /*
2633          * If the hdr's data can be shared then we share the data buffer and
2634          * set the appropriate bit in the hdr's b_flags to indicate the hdr is
2635          * allocate a new buffer to store the buf's data.
2636          *
2637          * There are two additional restrictions here because we're sharing
2638          * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
2639          * actively involved in an L2ARC write, because if this buf is used by
2640          * an arc_write() then the hdr's data buffer will be released when the
2641          * write completes, even though the L2ARC write might still be using it.
2642          * Second, the hdr's ABD must be linear so that the buf's user doesn't
2643          * need to be ABD-aware.
2644          */
2645         boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
2646             hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
2647 
2648         /* Set up b_data and sharing */
2649         if (can_share) {
2650                 buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
2651                 buf->b_flags |= ARC_BUF_FLAG_SHARED;
2652                 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2653         } else {
2654                 buf->b_data =
2655                     arc_get_data_buf(hdr, arc_buf_size(buf), buf);
2656                 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
2657         }
2658         VERIFY3P(buf->b_data, !=, NULL);
2659 
2660         hdr->b_l1hdr.b_buf = buf;
2661         hdr->b_l1hdr.b_bufcnt += 1;
2662         if (encrypted)
2663                 hdr->b_crypt_hdr.b_ebufcnt += 1;
2664 
2665         /*
2666          * If the user wants the data from the hdr, we need to either copy or
2667          * decompress the data.
2668          */
2669         if (fill) {
2670                 ASSERT3P(zb, !=, NULL);
2671                 return (arc_buf_fill(buf, spa, zb, flags));
2672         }
2673 
2674         return (0);
2675 }
2676 
/* Refcount tag that holds a hdr while its buffer is out on loan. */
static char *arc_onloan_tag = "onloan";
2678 
/*
 * Adjust the global arc_loaned_bytes counter by delta (negative when a
 * loan is returned) and assert that the result has not gone negative.
 */
static inline void
arc_loaned_bytes_update(int64_t delta)
{
        atomic_add_64(&arc_loaned_bytes, delta);

        /* assert that it did not wrap around */
        ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
2687 
2688 /*
2689  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2690  * flight data by arc_tempreserve_space() until they are "returned". Loaned
2691  * buffers must be returned to the arc before they can be used by the DMU or
2692  * freed.
2693  */
2694 arc_buf_t *
2695 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
2696 {
2697         arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
2698             is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
2699 
2700         arc_loaned_bytes_update(arc_buf_size(buf));
2701 
2702         return (buf);
2703 }
2704 
2705 arc_buf_t *
2706 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
2707     enum zio_compress compression_type)
2708 {
2709         arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
2710             psize, lsize, compression_type);
2711 
2712         arc_loaned_bytes_update(arc_buf_size(buf));
2713 
2714         return (buf);
2715 }
2716 
2717 arc_buf_t *
2718 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
2719     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
2720     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
2721     enum zio_compress compression_type)
2722 {
2723         arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
2724             byteorder, salt, iv, mac, ot, psize, lsize, compression_type);
2725 
2726         atomic_add_64(&arc_loaned_bytes, psize);
2727         return (buf);
2728 }
2729 
/*
 * Performance tuning of L2ARC persistence:
 *
 * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
 *              an L2ARC device (either at pool import or later) will attempt
 *              to rebuild L2ARC buffer contents.
 * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
 *              whether log blocks are written to the L2ARC device. If the L2ARC
 *              device is less than 1GB (the default value of this parameter),
 *              the amount of data l2arc_evict() evicts is significant compared
 *              to the amount of restored L2ARC data. In this case do not write
 *              log blocks in L2ARC in order not to waste space.
 */
int l2arc_rebuild_enabled = B_TRUE;
unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;

/* L2ARC persistence rebuild control routines. */
void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
static int l2arc_rebuild(l2arc_dev_t *dev);

/* L2ARC persistence read I/O routines. */
static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
static int l2arc_log_blk_read(l2arc_dev_t *dev,
    const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
    l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
    zio_t *this_io, zio_t **next_io);
static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
    const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
static void l2arc_log_blk_fetch_abort(zio_t *zio);

/* L2ARC persistence block restoration routines. */
static void l2arc_log_blk_restore(l2arc_dev_t *dev,
    const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
    l2arc_dev_t *dev);

/* L2ARC persistence write I/O routines. */
static void l2arc_dev_hdr_update(l2arc_dev_t *dev);
static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
    l2arc_write_callback_t *cb);

/* L2ARC persistence auxiliary routines. */
boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
    const l2arc_log_blkptr_t *lbp);
static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
    const arc_buf_hdr_t *ab);
boolean_t l2arc_range_check_overlap(uint64_t bottom,
    uint64_t top, uint64_t check);
static void l2arc_blk_fetch_done(zio_t *zio);
static inline uint64_t
    l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
2782 
2783 /*
2784  * Return a loaned arc buffer to the arc.
2785  */
2786 void
2787 arc_return_buf(arc_buf_t *buf, void *tag)
2788 {
2789         arc_buf_hdr_t *hdr = buf->b_hdr;
2790 
2791         ASSERT3P(buf->b_data, !=, NULL);
2792         ASSERT(HDR_HAS_L1HDR(hdr));
2793         (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2794         (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2795 
2796         arc_loaned_bytes_update(-arc_buf_size(buf));
2797 }
2798 
2799 /* Detach an arc_buf from a dbuf (tag) */
2800 void
2801 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2802 {
2803         arc_buf_hdr_t *hdr = buf->b_hdr;
2804 
2805         ASSERT3P(buf->b_data, !=, NULL);
2806         ASSERT(HDR_HAS_L1HDR(hdr));
2807         (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2808         (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2809 
2810         arc_loaned_bytes_update(arc_buf_size(buf));
2811 }
2812 
2813 static void
2814 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
2815 {
2816         l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
2817 
2818         df->l2df_abd = abd;
2819         df->l2df_size = size;
2820         df->l2df_type = type;
2821         mutex_enter(&l2arc_free_on_write_mtx);
2822         list_insert_head(l2arc_free_on_write, df);
2823         mutex_exit(&l2arc_free_on_write_mtx);
2824 }
2825 
2826 static void
2827 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
2828 {
2829         arc_state_t *state = hdr->b_l1hdr.b_state;
2830         arc_buf_contents_t type = arc_buf_type(hdr);
2831         uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
2832 
2833         /* protected by hash lock, if in the hash table */
2834         if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
2835                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2836                 ASSERT(state != arc_anon && state != arc_l2c_only);
2837 
2838                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
2839                     size, hdr);
2840         }
2841         (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
2842         if (type == ARC_BUFC_METADATA) {
2843                 arc_space_return(size, ARC_SPACE_META);
2844         } else {
2845                 ASSERT(type == ARC_BUFC_DATA);
2846                 arc_space_return(size, ARC_SPACE_DATA);
2847         }
2848 
2849         if (free_rdata) {
2850                 l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
2851         } else {
2852                 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
2853         }
2854 }
2855 
2856 /*
2857  * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
2858  * data buffer, we transfer the refcount ownership to the hdr and update
2859  * the appropriate kstats.
2860  */
2861 static void
2862 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2863 {
2864         /* LINTED */
2865         arc_state_t *state = hdr->b_l1hdr.b_state;
2866 
2867         ASSERT(arc_can_share(hdr, buf));
2868         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2869         ASSERT(!ARC_BUF_ENCRYPTED(buf));
2870         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2871 
2872         /*
2873          * Start sharing the data buffer. We transfer the
2874          * refcount ownership to the hdr since it always owns
2875          * the refcount whenever an arc_buf_t is shared.
2876          */
2877         zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
2878             arc_hdr_size(hdr), buf, hdr);
2879         hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
2880         abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
2881             HDR_ISTYPE_METADATA(hdr));
2882         arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
2883         buf->b_flags |= ARC_BUF_FLAG_SHARED;
2884 
2885         /*
2886          * Since we've transferred ownership to the hdr we need
2887          * to increment its compressed and uncompressed kstats and
2888          * decrement the overhead size.
2889          */
2890         ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
2891         ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
2892         ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
2893 }
2894 
2895 static void
2896 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2897 {
2898         /* LINTED */
2899         arc_state_t *state = hdr->b_l1hdr.b_state;
2900 
2901         ASSERT(arc_buf_is_shared(buf));
2902         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2903         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2904 
2905         /*
2906          * We are no longer sharing this buffer so we need
2907          * to transfer its ownership to the rightful owner.
2908          */
2909         zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
2910             arc_hdr_size(hdr), hdr, buf);
2911         arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2912         abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
2913         abd_put(hdr->b_l1hdr.b_pabd);
2914         hdr->b_l1hdr.b_pabd = NULL;
2915         buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2916 
2917         /*
2918          * Since the buffer is no longer shared between
2919          * the arc buf and the hdr, count it as overhead.
2920          */
2921         ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
2922         ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
2923         ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
2924 }
2925 
2926 /*
2927  * Remove an arc_buf_t from the hdr's buf list and return the last
2928  * arc_buf_t on the list. If no buffers remain on the list then return
2929  * NULL.
2930  */
2931 static arc_buf_t *
2932 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2933 {
2934         arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
2935         arc_buf_t *lastbuf = NULL;
2936 
2937         ASSERT(HDR_HAS_L1HDR(hdr));
2938         ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2939 
2940         /*
2941          * Remove the buf from the hdr list and locate the last
2942          * remaining buffer on the list.
2943          */
2944         while (*bufp != NULL) {
2945                 if (*bufp == buf)
2946                         *bufp = buf->b_next;
2947 
2948                 /*
2949                  * If we've removed a buffer in the middle of
2950                  * the list then update the lastbuf and update
2951                  * bufp.
2952                  */
2953                 if (*bufp != NULL) {
2954                         lastbuf = *bufp;
2955                         bufp = &(*bufp)->b_next;
2956                 }
2957         }
2958         buf->b_next = NULL;
2959         ASSERT3P(lastbuf, !=, buf);
2960         IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
2961         IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
2962         IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
2963 
2964         return (lastbuf);
2965 }
2966 
2967 /*
2968  * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's
2969  * list and free it.
2970  */
2971 static void
2972 arc_buf_destroy_impl(arc_buf_t *buf)
2973 {
2974         arc_buf_hdr_t *hdr = buf->b_hdr;
2975 
2976         /*
2977          * Free up the data associated with the buf but only if we're not
2978          * sharing this with the hdr. If we are sharing it with the hdr, the
2979          * hdr is responsible for doing the free.
2980          */
2981         if (buf->b_data != NULL) {
2982                 /*
2983                  * We're about to change the hdr's b_flags. We must either
2984                  * hold the hash_lock or be undiscoverable.
2985                  */
2986                 ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
2987 
2988                 arc_cksum_verify(buf);
2989                 arc_buf_unwatch(buf);
2990 
2991                 if (arc_buf_is_shared(buf)) {
2992                         arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2993                 } else {
2994                         uint64_t size = arc_buf_size(buf);
2995                         arc_free_data_buf(hdr, buf->b_data, size, buf);
2996                         ARCSTAT_INCR(arcstat_overhead_size, -size);
2997                 }
2998                 buf->b_data = NULL;
2999 
3000                 ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3001                 hdr->b_l1hdr.b_bufcnt -= 1;
3002 
3003                 if (ARC_BUF_ENCRYPTED(buf)) {
3004                         hdr->b_crypt_hdr.b_ebufcnt -= 1;
3005 
3006                         /*
3007                          * If we have no more encrypted buffers and we've
3008                          * already gotten a copy of the decrypted data we can
3009                          * free b_rabd to save some space.
3010                          */
3011                         if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
3012                             HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
3013                             !HDR_IO_IN_PROGRESS(hdr)) {
3014                                 arc_hdr_free_pabd(hdr, B_TRUE);
3015                         }
3016                 }
3017         }
3018 
3019         arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
3020 
3021         if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
3022                 /*
3023                  * If the current arc_buf_t is sharing its data buffer with the
3024                  * hdr, then reassign the hdr's b_pabd to share it with the new
3025                  * buffer at the end of the list. The shared buffer is always
3026                  * the last one on the hdr's buffer list.
3027                  *
3028                  * There is an equivalent case for compressed bufs, but since
3029                  * they aren't guaranteed to be the last buf in the list and
3030                  * that is an exceedingly rare case, we just allow that space be
3031                  * wasted temporarily. We must also be careful not to share
3032                  * encrypted buffers, since they cannot be shared.
3033                  */
3034                 if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
3035                         /* Only one buf can be shared at once */
3036                         VERIFY(!arc_buf_is_shared(lastbuf));
3037                         /* hdr is uncompressed so can't have compressed buf */
3038                         VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
3039 
3040                         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3041                         arc_hdr_free_pabd(hdr, B_FALSE);
3042 
3043                         /*
3044                          * We must setup a new shared block between the
3045                          * last buffer and the hdr. The data would have
3046                          * been allocated by the arc buf so we need to transfer
3047                          * ownership to the hdr since it's now being shared.
3048                          */
3049                         arc_share_buf(hdr, lastbuf);
3050                 }
3051         } else if (HDR_SHARED_DATA(hdr)) {
3052                 /*
3053                  * Uncompressed shared buffers are always at the end
3054                  * of the list. Compressed buffers don't have the
3055                  * same requirements. This makes it hard to
3056                  * simply assert that the lastbuf is shared so
3057                  * we rely on the hdr's compression flags to determine
3058                  * if we have a compressed, shared buffer.
3059                  */
3060                 ASSERT3P(lastbuf, !=, NULL);
3061                 ASSERT(arc_buf_is_shared(lastbuf) ||
3062                     arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
3063         }
3064 
3065         /*
3066          * Free the checksum if we're removing the last uncompressed buf from
3067          * this hdr.
3068          */
3069         if (!arc_hdr_has_uncompressed_buf(hdr)) {
3070                 arc_cksum_free(hdr);
3071         }
3072 
3073         /* clean up the buf */
3074         buf->b_hdr = NULL;
3075         kmem_cache_free(buf_cache, buf);
3076 }
3077 
3078 static void
3079 arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, int alloc_flags)
3080 {
3081         uint64_t size;
3082         boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
3083         boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
3084 
3085         ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3086         ASSERT(HDR_HAS_L1HDR(hdr));
3087         ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
3088         IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
3089 
3090         if (alloc_rdata) {
3091                 size = HDR_GET_PSIZE(hdr);
3092                 ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
3093                 hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
3094                     do_adapt);
3095                 ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
3096         } else {
3097                 size = arc_hdr_size(hdr);
3098                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3099                 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
3100                     do_adapt);
3101                 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3102         }
3103 
3104         ARCSTAT_INCR(arcstat_compressed_size, size);
3105         ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3106 }
3107 
3108 static void
3109 arc_hdr_free_pabd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
3110 {
3111         uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3112 
3113         ASSERT(HDR_HAS_L1HDR(hdr));
3114         ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
3115         IMPLY(free_rdata, HDR_HAS_RABD(hdr));
3116 
3117 
3118         /*
3119          * If the hdr is currently being written to the l2arc then
3120          * we defer freeing the data by adding it to the l2arc_free_on_write
3121          * list. The l2arc will free the data once it's finished
3122          * writing it to the l2arc device.
3123          */
3124         if (HDR_L2_WRITING(hdr)) {
3125                 arc_hdr_free_on_write(hdr, free_rdata);
3126                 ARCSTAT_BUMP(arcstat_l2_free_on_write);
3127         } else if (free_rdata) {
3128                 arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
3129         } else {
3130                 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
3131                     size, hdr);
3132         }
3133 
3134         if (free_rdata) {
3135                 hdr->b_crypt_hdr.b_rabd = NULL;
3136         } else {
3137                 hdr->b_l1hdr.b_pabd = NULL;
3138         }
3139 
3140         if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
3141                 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3142 
3143         ARCSTAT_INCR(arcstat_compressed_size, -size);
3144         ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3145 }
3146 
3147 static arc_buf_hdr_t *
3148 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3149     boolean_t protected, enum zio_compress compression_type,
3150     arc_buf_contents_t type, boolean_t alloc_rdata)
3151 {
3152         arc_buf_hdr_t *hdr;
3153         int flags = ARC_HDR_DO_ADAPT;
3154 
3155         VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3156         if (protected) {
3157                 hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
3158         } else {
3159                 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3160         }
3161         flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0;
3162         ASSERT(HDR_EMPTY(hdr));
3163         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3164         ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
3165         HDR_SET_PSIZE(hdr, psize);
3166         HDR_SET_LSIZE(hdr, lsize);
3167         hdr->b_spa = spa;
3168         hdr->b_type = type;
3169         hdr->b_flags = 0;
3170         arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3171         arc_hdr_set_compress(hdr, compression_type);
3172         if (protected)
3173                 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3174 
3175         hdr->b_l1hdr.b_state = arc_anon;
3176         hdr->b_l1hdr.b_arc_access = 0;
3177         hdr->b_l1hdr.b_bufcnt = 0;
3178         hdr->b_l1hdr.b_buf = NULL;
3179 
3180         /*
3181          * Allocate the hdr's buffer. This will contain either
3182          * the compressed or uncompressed data depending on the block
3183          * it references and compressed arc enablement.
3184          */
3185         arc_hdr_alloc_pabd(hdr, flags);
3186         ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3187 
3188         return (hdr);
3189 }
3190 
3191 /*
3192  * Transition between the two allocation states for the arc_buf_hdr struct.
3193  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3194  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3195  * version is used when a cache buffer is only in the L2ARC in order to reduce
3196  * memory usage.
3197  */
3198 static arc_buf_hdr_t *
3199 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
3200 {
3201         ASSERT(HDR_HAS_L2HDR(hdr));
3202 
3203         arc_buf_hdr_t *nhdr;
3204         l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3205 
3206         ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
3207             (old == hdr_l2only_cache && new == hdr_full_cache));
3208 
3209         /*
3210          * if the caller wanted a new full header and the header is to be
3211          * encrypted we will actually allocate the header from the full crypt
3212          * cache instead. The same applies to freeing from the old cache.
3213          */
3214         if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
3215                 new = hdr_full_crypt_cache;
3216         if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
3217                 old = hdr_full_crypt_cache;
3218 
3219         nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
3220 
3221         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3222         buf_hash_remove(hdr);
3223 
3224         bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
3225 
3226         if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
3227                 arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3228                 /*
3229                  * arc_access and arc_change_state need to be aware that a
3230                  * header has just come out of L2ARC, so we set its state to
3231                  * l2c_only even though it's about to change.
3232                  */
3233                 nhdr->b_l1hdr.b_state = arc_l2c_only;
3234 
3235                 /* Verify previous threads set to NULL before freeing */
3236                 ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
3237                 ASSERT(!HDR_HAS_RABD(hdr));
3238         } else {
3239                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3240                 ASSERT0(hdr->b_l1hdr.b_bufcnt);
3241                 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3242 
3243                 /*
3244                  * If we've reached here, We must have been called from
3245                  * arc_evict_hdr(), as such we should have already been
3246                  * removed from any ghost list we were previously on
3247                  * (which protects us from racing with arc_evict_state),
3248                  * thus no locking is needed during this check.
3249                  */
3250                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3251 
3252                 /*
3253                  * A buffer must not be moved into the arc_l2c_only
3254                  * state if it's not finished being written out to the
3255                  * l2arc device. Otherwise, the b_l1hdr.b_pabd field
3256                  * might try to be accessed, even though it was removed.
3257                  */
3258                 VERIFY(!HDR_L2_WRITING(hdr));
3259                 VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3260                 ASSERT(!HDR_HAS_RABD(hdr));
3261 
3262 #ifdef ZFS_DEBUG
3263                 if (hdr->b_l1hdr.b_thawed != NULL) {
3264                         kmem_free(hdr->b_l1hdr.b_thawed, 1);
3265                         hdr->b_l1hdr.b_thawed = NULL;
3266                 }
3267 #endif
3268 
3269                 arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3270         }
3271         /*
3272          * The header has been reallocated so we need to re-insert it into any
3273          * lists it was on.
3274          */
3275         (void) buf_hash_insert(nhdr, NULL);
3276 
3277         ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
3278 
3279         mutex_enter(&dev->l2ad_mtx);
3280 
3281         /*
3282          * We must place the realloc'ed header back into the list at
3283          * the same spot. Otherwise, if it's placed earlier in the list,
3284          * l2arc_write_buffers() could find it during the function's
3285          * write phase, and try to write it out to the l2arc.
3286          */
3287         list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
3288         list_remove(&dev->l2ad_buflist, hdr);
3289 
3290         mutex_exit(&dev->l2ad_mtx);
3291 
3292         /*
3293          * Since we're using the pointer address as the tag when
3294          * incrementing and decrementing the l2ad_alloc refcount, we
3295          * must remove the old pointer (that we're about to destroy) and
3296          * add the new pointer to the refcount. Otherwise we'd remove
3297          * the wrong pointer address when calling arc_hdr_destroy() later.
3298          */
3299 
3300         (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3301             hdr);
3302         (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr),
3303             nhdr);
3304 
3305         buf_discard_identity(hdr);
3306         kmem_cache_free(old, hdr);
3307 
3308         return (nhdr);
3309 }
3310 
3311 /*
3312  * This function allows an L1 header to be reallocated as a crypt
3313  * header and vice versa. If we are going to a crypt header, the
3314  * new fields will be zeroed out.
3315  */
3316 static arc_buf_hdr_t *
3317 arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
3318 {
3319         arc_buf_hdr_t *nhdr;
3320         arc_buf_t *buf;
3321         kmem_cache_t *ncache, *ocache;
3322 
3323         ASSERT(HDR_HAS_L1HDR(hdr));
3324         ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
3325         ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3326         ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3327         ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
3328         ASSERT3P(hdr->b_hash_next, ==, NULL);
3329 
3330         if (need_crypt) {
3331                 ncache = hdr_full_crypt_cache;
3332                 ocache = hdr_full_cache;
3333         } else {
3334                 ncache = hdr_full_cache;
3335                 ocache = hdr_full_crypt_cache;
3336         }
3337 
3338         nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
3339 
3340         /*
3341          * Copy all members that aren't locks or condvars to the new header.
3342          * No lists are pointing to us (as we asserted above), so we don't
3343          * need to worry about the list nodes.
3344          */
3345         nhdr->b_dva = hdr->b_dva;
3346         nhdr->b_birth = hdr->b_birth;
3347         nhdr->b_type = hdr->b_type;
3348         nhdr->b_flags = hdr->b_flags;
3349         nhdr->b_psize = hdr->b_psize;
3350         nhdr->b_lsize = hdr->b_lsize;
3351         nhdr->b_spa = hdr->b_spa;
3352         nhdr->b_l2hdr.b_dev = hdr->b_l2hdr.b_dev;
3353         nhdr->b_l2hdr.b_daddr = hdr->b_l2hdr.b_daddr;
3354         nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
3355         nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
3356         nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
3357         nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
3358         nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
3359         nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
3360         nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
3361 #ifdef ZFS_DEBUG
3362         if (hdr->b_l1hdr.b_thawed != NULL) {
3363                 nhdr->b_l1hdr.b_thawed = hdr->b_l1hdr.b_thawed;
3364                 hdr->b_l1hdr.b_thawed = NULL;
3365         }
3366 #endif
3367 
3368         /*
3369          * This refcount_add() exists only to ensure that the individual
3370          * arc buffers always point to a header that is referenced, avoiding
3371          * a small race condition that could trigger ASSERTs.
3372          */
3373         (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
3374         nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
3375         for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
3376                 mutex_enter(&buf->b_evict_lock);
3377                 buf->b_hdr = nhdr;
3378                 mutex_exit(&buf->b_evict_lock);
3379         }
3380         zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
3381         (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
3382         ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
3383 
3384         if (need_crypt) {
3385                 arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
3386         } else {
3387                 arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
3388         }
3389 
3390         /* unset all members of the original hdr */
3391         bzero(&hdr->b_dva, sizeof (dva_t));
3392         hdr->b_birth = 0;
3393         hdr->b_type = ARC_BUFC_INVALID;
3394         hdr->b_flags = 0;
3395         hdr->b_psize = 0;
3396         hdr->b_lsize = 0;
3397         hdr->b_spa = 0;
3398         hdr->b_l2hdr.b_dev = NULL;
3399         hdr->b_l2hdr.b_daddr = 0;
3400         hdr->b_l1hdr.b_freeze_cksum = NULL;
3401         hdr->b_l1hdr.b_buf = NULL;
3402         hdr->b_l1hdr.b_bufcnt = 0;
3403         hdr->b_l1hdr.b_byteswap = 0;
3404         hdr->b_l1hdr.b_state = NULL;
3405         hdr->b_l1hdr.b_arc_access = 0;
3406         hdr->b_l1hdr.b_acb = NULL;
3407         hdr->b_l1hdr.b_pabd = NULL;
3408 
3409         if (ocache == hdr_full_crypt_cache) {
3410                 ASSERT(!HDR_HAS_RABD(hdr));
3411                 hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
3412                 hdr->b_crypt_hdr.b_ebufcnt = 0;
3413                 hdr->b_crypt_hdr.b_dsobj = 0;
3414                 bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3415                 bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3416                 bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3417         }
3418 
3419         buf_discard_identity(hdr);
3420         kmem_cache_free(ocache, hdr);
3421 
3422         return (nhdr);
3423 }
3424 
3425 /*
3426  * This function is used by the send / receive code to convert a newly
3427  * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
3428  * is also used to allow the root objset block to be uupdated without altering
3429  * its embedded MACs. Both block types will always be uncompressed so we do not
3430  * have to worry about compression type or psize.
3431  */
3432 void
3433 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
3434     dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
3435     const uint8_t *mac)
3436 {
3437         arc_buf_hdr_t *hdr = buf->b_hdr;
3438 
3439         ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
3440         ASSERT(HDR_HAS_L1HDR(hdr));
3441         ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3442 
3443         buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
3444         if (!HDR_PROTECTED(hdr))
3445                 hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
3446         hdr->b_crypt_hdr.b_dsobj = dsobj;
3447         hdr->b_crypt_hdr.b_ot = ot;
3448         hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3449             DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3450         if (!arc_hdr_has_uncompressed_buf(hdr))
3451                 arc_cksum_free(hdr);
3452 
3453         if (salt != NULL)
3454                 bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3455         if (iv != NULL)
3456                 bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3457         if (mac != NULL)
3458                 bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3459 }
3460 
3461 /*
3462  * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3463  * The buf is returned thawed since we expect the consumer to modify it.
3464  */
3465 arc_buf_t *
3466 arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
3467 {
3468         arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3469             B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE);
3470 
3471         arc_buf_t *buf = NULL;
3472         VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
3473             B_FALSE, B_FALSE, &buf));
3474         arc_buf_thaw(buf);
3475 
3476         return (buf);
3477 }
3478 
3479 /*
3480  * Allocates an ARC buf header that's in an evicted & L2-cached state.
3481  * This is used during l2arc reconstruction to make empty ARC buffers
3482  * which circumvent the regular disk->arc->l2arc path and instead come
3483  * into being in the reverse order, i.e. l2arc->arc.
3484  */
3485 arc_buf_hdr_t *
3486 arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
3487     dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
3488     enum zio_compress compress, boolean_t protected,
3489     boolean_t prefetch, arc_state_type_t arcs_state)
3490 {
3491         arc_buf_hdr_t   *hdr;
3492 
3493         ASSERT(size != 0);
3494         hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
3495         hdr->b_birth = birth;
3496         hdr->b_type = type;
3497         hdr->b_flags = 0;
3498         arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
3499         HDR_SET_LSIZE(hdr, size);
3500         HDR_SET_PSIZE(hdr, psize);
3501         arc_hdr_set_compress(hdr, compress);
3502         if (protected)
3503                 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3504         if (prefetch)
3505                 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
3506         hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
3507 
3508         hdr->b_dva = dva;
3509 
3510         hdr->b_l2hdr.b_dev = dev;
3511         hdr->b_l2hdr.b_daddr = daddr;
3512         hdr->b_l2hdr.b_arcs_state = arcs_state;
3513 
3514         return (hdr);
3515 }
3516 
3517 /*
3518  * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3519  * for bufs containing metadata.
3520  */
3521 arc_buf_t *
3522 arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
3523     enum zio_compress compression_type)
3524 {
3525         ASSERT3U(lsize, >, 0);
3526         ASSERT3U(lsize, >=, psize);
3527         ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
3528         ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3529 
3530         arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
3531             B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE);
3532 
3533         arc_buf_t *buf = NULL;
3534         VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
3535             B_TRUE, B_FALSE, B_FALSE, &buf));
3536         arc_buf_thaw(buf);
3537         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3538 
3539         if (!arc_buf_is_shared(buf)) {
3540                 /*
3541                  * To ensure that the hdr has the correct data in it if we call
3542                  * arc_untransform() on this buf before it's been written to
3543                  * disk, it's easiest if we just set up sharing between the
3544                  * buf and the hdr.
3545                  */
3546                 ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
3547                 arc_hdr_free_pabd(hdr, B_FALSE);
3548                 arc_share_buf(hdr, buf);
3549         }
3550 
3551         return (buf);
3552 }
3553 
3554 arc_buf_t *
3555 arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
3556     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
3557     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
3558     enum zio_compress compression_type)
3559 {
3560         arc_buf_hdr_t *hdr;
3561         arc_buf_t *buf;
3562         arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
3563             ARC_BUFC_METADATA : ARC_BUFC_DATA;
3564 
3565         ASSERT3U(lsize, >, 0);
3566         ASSERT3U(lsize, >=, psize);
3567         ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
3568         ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3569 
3570         hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
3571             compression_type, type, B_TRUE);
3572 
3573         hdr->b_crypt_hdr.b_dsobj = dsobj;
3574         hdr->b_crypt_hdr.b_ot = ot;
3575         hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3576             DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3577         bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3578         bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3579         bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3580 
3581         /*
3582          * This buffer will be considered encrypted even if the ot is not an
3583          * encrypted type. It will become authenticated instead in
3584          * arc_write_ready().
3585          */
3586         buf = NULL;
3587         VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
3588             B_FALSE, B_FALSE, &buf));
3589         arc_buf_thaw(buf);
3590         ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3591 
3592         return (buf);
3593 }
3594 
3595 static void
3596 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
3597     boolean_t state_only)
3598 {
3599         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3600         l2arc_dev_t *dev = l2hdr->b_dev;
3601         uint64_t lsize = HDR_GET_LSIZE(hdr);
3602         uint64_t psize = HDR_GET_PSIZE(hdr);
3603         uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
3604         arc_buf_contents_t type = hdr->b_type;
3605         int64_t lsize_s;
3606         int64_t psize_s;
3607         int64_t asize_s;
3608 
3609         if (incr) {
3610                 lsize_s = lsize;
3611                 psize_s = psize;
3612                 asize_s = asize;
3613         } else {
3614                 lsize_s = -lsize;
3615                 psize_s = -psize;
3616                 asize_s = -asize;
3617         }
3618 
3619         /* If the buffer is a prefetch, count it as such. */
3620         if (HDR_PREFETCH(hdr)) {
3621                 ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
3622         } else {
3623                 /*
3624                  * We use the value stored in the L2 header upon initial
3625                  * caching in L2ARC. This value will be updated in case
3626                  * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
3627                  * metadata (log entry) cannot currently be updated. Having
3628                  * the ARC state in the L2 header solves the problem of a
3629                  * possibly absent L1 header (apparent in buffers restored
3630                  * from persistent L2ARC).
3631                  */
3632                 switch (hdr->b_l2hdr.b_arcs_state) {
3633                         case ARC_STATE_MRU_GHOST:
3634                         case ARC_STATE_MRU:
3635                                 ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
3636                                 break;
3637                         case ARC_STATE_MFU_GHOST:
3638                         case ARC_STATE_MFU:
3639                                 ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
3640                                 break;
3641                         default:
3642                                 break;
3643                 }
3644         }
3645 
3646         if (state_only)
3647                 return;
3648 
3649         ARCSTAT_INCR(arcstat_l2_psize, psize_s);
3650         ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
3651 
3652         switch (type) {
3653                 case ARC_BUFC_DATA:
3654                         ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
3655                         break;
3656                 case ARC_BUFC_METADATA:
3657                         ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
3658                         break;
3659                 default:
3660                         break;
3661         }
3662 }
3663 
3664 
3665 static void
3666 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3667 {
3668         l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3669         l2arc_dev_t *dev = l2hdr->b_dev;
3670         uint64_t psize = HDR_GET_PSIZE(hdr);
3671         uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
3672 
3673         ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3674         ASSERT(HDR_HAS_L2HDR(hdr));
3675 
3676         list_remove(&dev->l2ad_buflist, hdr);
3677 
3678         l2arc_hdr_arcstats_decrement(hdr);
3679         vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
3680 
3681         (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3682             hdr);
3683         arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
3684 }
3685 
3686 static void
3687 arc_hdr_destroy(arc_buf_hdr_t *hdr)
3688 {
3689         if (HDR_HAS_L1HDR(hdr)) {
3690                 ASSERT(hdr->b_l1hdr.b_buf == NULL ||
3691                     hdr->b_l1hdr.b_bufcnt > 0);
3692                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3693                 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3694         }
3695         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3696         ASSERT(!HDR_IN_HASH_TABLE(hdr));
3697 
3698         if (HDR_HAS_L2HDR(hdr)) {
3699                 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3700                 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
3701 
3702                 if (!buflist_held)
3703                         mutex_enter(&dev->l2ad_mtx);
3704 
3705                 /*
3706                  * Even though we checked this conditional above, we
3707                  * need to check this again now that we have the
3708                  * l2ad_mtx. This is because we could be racing with
3709                  * another thread calling l2arc_evict() which might have
3710                  * destroyed this header's L2 portion as we were waiting
3711                  * to acquire the l2ad_mtx. If that happens, we don't
3712                  * want to re-destroy the header's L2 portion.
3713                  */
3714                 if (HDR_HAS_L2HDR(hdr))
3715                         arc_hdr_l2hdr_destroy(hdr);
3716 
3717                 if (!buflist_held)
3718                         mutex_exit(&dev->l2ad_mtx);
3719         }
3720 
3721         /*
3722          * The header's identity can only be safely discarded once it is no
3723          * longer discoverable.  This requires removing it from the hash table
3724          * and the l2arc header list.  After this point the hash lock can not
3725          * be used to protect the header.
3726          */
3727         if (!HDR_EMPTY(hdr))
3728                 buf_discard_identity(hdr);
3729 
3730         if (HDR_HAS_L1HDR(hdr)) {
3731                 arc_cksum_free(hdr);
3732 
3733                 while (hdr->b_l1hdr.b_buf != NULL)
3734                         arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
3735 
3736 #ifdef ZFS_DEBUG
3737                 if (hdr->b_l1hdr.b_thawed != NULL) {
3738                         kmem_free(hdr->b_l1hdr.b_thawed, 1);
3739                         hdr->b_l1hdr.b_thawed = NULL;
3740                 }
3741 #endif
3742 
3743                 if (hdr->b_l1hdr.b_pabd != NULL)
3744                         arc_hdr_free_pabd(hdr, B_FALSE);
3745 
3746                 if (HDR_HAS_RABD(hdr))
3747                         arc_hdr_free_pabd(hdr, B_TRUE);
3748         }
3749 
3750         ASSERT3P(hdr->b_hash_next, ==, NULL);
3751         if (HDR_HAS_L1HDR(hdr)) {
3752                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3753                 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
3754 
3755                 if (!HDR_PROTECTED(hdr)) {
3756                         kmem_cache_free(hdr_full_cache, hdr);
3757                 } else {
3758                         kmem_cache_free(hdr_full_crypt_cache, hdr);
3759                 }
3760         } else {
3761                 kmem_cache_free(hdr_l2only_cache, hdr);
3762         }
3763 }
3764 
3765 void
3766 arc_buf_destroy(arc_buf_t *buf, void* tag)
3767 {
3768         arc_buf_hdr_t *hdr = buf->b_hdr;
3769 
3770         if (hdr->b_l1hdr.b_state == arc_anon) {
3771                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
3772                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3773                 VERIFY0(remove_reference(hdr, NULL, tag));
3774                 arc_hdr_destroy(hdr);
3775                 return;
3776         }
3777 
3778         kmutex_t *hash_lock = HDR_LOCK(hdr);
3779         mutex_enter(hash_lock);
3780 
3781         ASSERT3P(hdr, ==, buf->b_hdr);
3782         ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
3783         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3784         ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
3785         ASSERT3P(buf->b_data, !=, NULL);
3786 
3787         (void) remove_reference(hdr, hash_lock, tag);
3788         arc_buf_destroy_impl(buf);
3789         mutex_exit(hash_lock);
3790 }
3791 
3792 /*
3793  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
3794  * state of the header is dependent on its state prior to entering this
3795  * function. The following transitions are possible:
3796  *
3797  *    - arc_mru -> arc_mru_ghost
3798  *    - arc_mfu -> arc_mfu_ghost
3799  *    - arc_mru_ghost -> arc_l2c_only
3800  *    - arc_mru_ghost -> deleted
3801  *    - arc_mfu_ghost -> arc_l2c_only
3802  *    - arc_mfu_ghost -> deleted
3803  */
3804 static int64_t
3805 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3806 {
3807         arc_state_t *evicted_state, *state;
3808         int64_t bytes_evicted = 0;
3809         int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
3810             zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
3811 
3812         ASSERT(MUTEX_HELD(hash_lock));
3813         ASSERT(HDR_HAS_L1HDR(hdr));
3814 
3815         state = hdr->b_l1hdr.b_state;
3816         if (GHOST_STATE(state)) {
3817                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3818                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3819 
3820                 /*
3821                  * l2arc_write_buffers() relies on a header's L1 portion
3822                  * (i.e. its b_pabd field) during its write phase.
3823                  * Thus, we cannot push a header onto the arc_l2c_only
3824                  * state (removing its L1 piece) until the header is
3825                  * done being written to the l2arc.
3826                  */
3827                 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
3828                         ARCSTAT_BUMP(arcstat_evict_l2_skip);
3829                         return (bytes_evicted);
3830                 }
3831 
3832                 ARCSTAT_BUMP(arcstat_deleted);
3833                 bytes_evicted += HDR_GET_LSIZE(hdr);
3834 
3835                 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
3836 
3837                 if (HDR_HAS_L2HDR(hdr)) {
3838                         ASSERT(hdr->b_l1hdr.b_pabd == NULL);
3839                         ASSERT(!HDR_HAS_RABD(hdr));
3840                         /*
3841                          * This buffer is cached on the 2nd Level ARC;
3842                          * don't destroy the header.
3843                          */
3844                         arc_change_state(arc_l2c_only, hdr, hash_lock);
3845                         /*
3846                          * dropping from L1+L2 cached to L2-only,
3847                          * realloc to remove the L1 header.
3848                          */
3849                         hdr = arc_hdr_realloc(hdr, hdr_full_cache,
3850                             hdr_l2only_cache);
3851                 } else {
3852                         arc_change_state(arc_anon, hdr, hash_lock);
3853                         arc_hdr_destroy(hdr);
3854                 }
3855                 return (bytes_evicted);
3856         }
3857 
3858         ASSERT(state == arc_mru || state == arc_mfu);
3859         evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3860 
3861         /* prefetch buffers have a minimum lifespan */
3862         if (HDR_IO_IN_PROGRESS(hdr) ||
3863             ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
3864             ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
3865                 ARCSTAT_BUMP(arcstat_evict_skip);
3866                 return (bytes_evicted);
3867         }
3868 
3869         ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
3870         while (hdr->b_l1hdr.b_buf) {
3871                 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
3872                 if (!mutex_tryenter(&buf->b_evict_lock)) {
3873                         ARCSTAT_BUMP(arcstat_mutex_miss);
3874                         break;
3875                 }
3876                 if (buf->b_data != NULL)
3877                         bytes_evicted += HDR_GET_LSIZE(hdr);
3878                 mutex_exit(&buf->b_evict_lock);
3879                 arc_buf_destroy_impl(buf);
3880         }
3881 
3882         if (HDR_HAS_L2HDR(hdr)) {
3883                 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
3884         } else {
3885                 if (l2arc_write_eligible(hdr->b_spa, hdr)) {
3886                         ARCSTAT_INCR(arcstat_evict_l2_eligible,
3887                             HDR_GET_LSIZE(hdr));
3888 
3889                         switch (state->arcs_state) {
3890                                 case ARC_STATE_MRU:
3891                                         ARCSTAT_INCR(
3892                                             arcstat_evict_l2_eligible_mru,
3893                                             HDR_GET_LSIZE(hdr));
3894                                         break;
3895                                 case ARC_STATE_MFU:
3896                                         ARCSTAT_INCR(
3897                                             arcstat_evict_l2_eligible_mfu,
3898                                             HDR_GET_LSIZE(hdr));
3899                                         break;
3900                                 default:
3901                                         break;
3902                         }
3903                 } else {
3904                         ARCSTAT_INCR(arcstat_evict_l2_ineligible,
3905                             HDR_GET_LSIZE(hdr));
3906                 }
3907         }
3908 
3909         if (hdr->b_l1hdr.b_bufcnt == 0) {
3910                 arc_cksum_free(hdr);
3911 
3912                 bytes_evicted += arc_hdr_size(hdr);
3913 
3914                 /*
3915                  * If this hdr is being evicted and has a compressed
3916                  * buffer then we discard it here before we change states.
3917                  * This ensures that the accounting is updated correctly
3918                  * in arc_free_data_impl().
3919                  */
3920                 if (hdr->b_l1hdr.b_pabd != NULL)
3921                         arc_hdr_free_pabd(hdr, B_FALSE);
3922 
3923                 if (HDR_HAS_RABD(hdr))
3924                         arc_hdr_free_pabd(hdr, B_TRUE);
3925 
3926                 arc_change_state(evicted_state, hdr, hash_lock);
3927                 ASSERT(HDR_IN_HASH_TABLE(hdr));
3928                 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
3929                 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
3930         }
3931 
3932         return (bytes_evicted);
3933 }
3934 
3935 static uint64_t
3936 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
3937     uint64_t spa, int64_t bytes)
3938 {
3939         multilist_sublist_t *mls;
3940         uint64_t bytes_evicted = 0;
3941         arc_buf_hdr_t *hdr;
3942         kmutex_t *hash_lock;
3943         int evict_count = 0;
3944 
3945         ASSERT3P(marker, !=, NULL);
3946         IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
3947 
3948         mls = multilist_sublist_lock(ml, idx);
3949 
3950         for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
3951             hdr = multilist_sublist_prev(mls, marker)) {
3952                 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
3953                     (evict_count >= zfs_arc_evict_batch_limit))
3954                         break;
3955 
3956                 /*
3957                  * To keep our iteration location, move the marker
3958                  * forward. Since we're not holding hdr's hash lock, we
3959                  * must be very careful and not remove 'hdr' from the
3960                  * sublist. Otherwise, other consumers might mistake the
3961                  * 'hdr' as not being on a sublist when they call the
3962                  * multilist_link_active() function (they all rely on
3963                  * the hash lock protecting concurrent insertions and
3964                  * removals). multilist_sublist_move_forward() was
3965                  * specifically implemented to ensure this is the case
3966                  * (only 'marker' will be removed and re-inserted).
3967                  */
3968                 multilist_sublist_move_forward(mls, marker);
3969 
3970                 /*
3971                  * The only case where the b_spa field should ever be
3972                  * zero, is the marker headers inserted by
3973                  * arc_evict_state(). It's possible for multiple threads
3974                  * to be calling arc_evict_state() concurrently (e.g.
3975                  * dsl_pool_close() and zio_inject_fault()), so we must
3976                  * skip any markers we see from these other threads.
3977                  */
3978                 if (hdr->b_spa == 0)
3979                         continue;
3980 
3981                 /* we're only interested in evicting buffers of a certain spa */
3982                 if (spa != 0 && hdr->b_spa != spa) {
3983                         ARCSTAT_BUMP(arcstat_evict_skip);
3984                         continue;
3985                 }
3986 
3987                 hash_lock = HDR_LOCK(hdr);
3988 
3989                 /*
3990                  * We aren't calling this function from any code path
3991                  * that would already be holding a hash lock, so we're
3992                  * asserting on this assumption to be defensive in case
3993                  * this ever changes. Without this check, it would be
3994                  * possible to incorrectly increment arcstat_mutex_miss
3995                  * below (e.g. if the code changed such that we called
3996                  * this function with a hash lock held).
3997                  */
3998                 ASSERT(!MUTEX_HELD(hash_lock));
3999 
4000                 if (mutex_tryenter(hash_lock)) {
4001                         uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
4002                         mutex_exit(hash_lock);
4003 
4004                         bytes_evicted += evicted;
4005 
4006                         /*
4007                          * If evicted is zero, arc_evict_hdr() must have
4008                          * decided to skip this header, don't increment
4009                          * evict_count in this case.
4010                          */
4011                         if (evicted != 0)
4012                                 evict_count++;
4013 
4014                         /*
4015                          * If arc_size isn't overflowing, signal any
4016                          * threads that might happen to be waiting.
4017                          *
4018                          * For each header evicted, we wake up a single
4019                          * thread. If we used cv_broadcast, we could
4020                          * wake up "too many" threads causing arc_size
4021                          * to significantly overflow arc_c; since
4022                          * arc_get_data_impl() doesn't check for overflow
4023                          * when it's woken up (it doesn't because it's
4024                          * possible for the ARC to be overflowing while
4025                          * full of un-evictable buffers, and the
4026                          * function should proceed in this case).
4027                          *
4028                          * If threads are left sleeping, due to not
4029                          * using cv_broadcast here, they will be woken
4030                          * up via cv_broadcast in arc_adjust_cb() just
4031                          * before arc_adjust_zthr sleeps.
4032                          */
4033                         mutex_enter(&arc_adjust_lock);
4034                         if (!arc_is_overflowing())
4035                                 cv_signal(&arc_adjust_waiters_cv);
4036                         mutex_exit(&arc_adjust_lock);
4037                 } else {
4038                         ARCSTAT_BUMP(arcstat_mutex_miss);
4039                 }
4040         }
4041 
4042         multilist_sublist_unlock(mls);
4043 
4044         return (bytes_evicted);
4045 }
4046 
4047 /*
4048  * Evict buffers from the given arc state, until we've removed the
4049  * specified number of bytes. Move the removed buffers to the
4050  * appropriate evict state.
4051  *
4052  * This function makes a "best effort". It skips over any buffers
4053  * it can't get a hash_lock on, and so, may not catch all candidates.
4054  * It may also return without evicting as much space as requested.
4055  *
4056  * If bytes is specified using the special value ARC_EVICT_ALL, this
4057  * will evict all available (i.e. unlocked and evictable) buffers from
4058  * the given arc state; which is used by arc_flush().
4059  */
4060 static uint64_t
4061 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
4062     arc_buf_contents_t type)
4063 {
4064         uint64_t total_evicted = 0;
4065         multilist_t *ml = state->arcs_list[type];
4066         int num_sublists;
4067         arc_buf_hdr_t **markers;
4068 
4069         IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
4070 
4071         num_sublists = multilist_get_num_sublists(ml);
4072 
4073         /*
4074          * If we've tried to evict from each sublist, made some
4075          * progress, but still have not hit the target number of bytes
4076          * to evict, we want to keep trying. The markers allow us to
4077          * pick up where we left off for each individual sublist, rather
4078          * than starting from the tail each time.
4079          */
4080         markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
4081         for (int i = 0; i < num_sublists; i++) {
4082                 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4083 
4084                 /*
4085                  * A b_spa of 0 is used to indicate that this header is
4086                  * a marker. This fact is used in arc_adjust_type() and
4087                  * arc_evict_state_impl().
4088                  */
4089                 markers[i]->b_spa = 0;
4090 
4091                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4092                 multilist_sublist_insert_tail(mls, markers[i]);
4093                 multilist_sublist_unlock(mls);
4094         }
4095 
4096         /*
4097          * While we haven't hit our target number of bytes to evict, or
4098          * we're evicting all available buffers.
4099          */
4100         while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
4101                 /*
4102                  * Start eviction using a randomly selected sublist,
4103                  * this is to try and evenly balance eviction across all
4104                  * sublists. Always starting at the same sublist
4105                  * (e.g. index 0) would cause evictions to favor certain
4106                  * sublists over others.
4107                  */
4108                 int sublist_idx = multilist_get_random_index(ml);
4109                 uint64_t scan_evicted = 0;
4110 
4111                 for (int i = 0; i < num_sublists; i++) {
4112                         uint64_t bytes_remaining;
4113                         uint64_t bytes_evicted;
4114 
4115                         if (bytes == ARC_EVICT_ALL)
4116                                 bytes_remaining = ARC_EVICT_ALL;
4117                         else if (total_evicted < bytes)
4118                                 bytes_remaining = bytes - total_evicted;
4119                         else
4120                                 break;
4121 
4122                         bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
4123                             markers[sublist_idx], spa, bytes_remaining);
4124 
4125                         scan_evicted += bytes_evicted;
4126                         total_evicted += bytes_evicted;
4127 
4128                         /* we've reached the end, wrap to the beginning */
4129                         if (++sublist_idx >= num_sublists)
4130                                 sublist_idx = 0;
4131                 }
4132 
4133                 /*
4134                  * If we didn't evict anything during this scan, we have
4135                  * no reason to believe we'll evict more during another
4136                  * scan, so break the loop.
4137                  */
4138                 if (scan_evicted == 0) {
4139                         /* This isn't possible, let's make that obvious */
4140                         ASSERT3S(bytes, !=, 0);
4141 
4142                         /*
4143                          * When bytes is ARC_EVICT_ALL, the only way to
4144                          * break the loop is when scan_evicted is zero.
4145                          * In that case, we actually have evicted enough,
4146                          * so we don't want to increment the kstat.
4147                          */
4148                         if (bytes != ARC_EVICT_ALL) {
4149                                 ASSERT3S(total_evicted, <, bytes);
4150                                 ARCSTAT_BUMP(arcstat_evict_not_enough);
4151                         }
4152 
4153                         break;
4154                 }
4155         }
4156 
4157         for (int i = 0; i < num_sublists; i++) {
4158                 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4159                 multilist_sublist_remove(mls, markers[i]);
4160                 multilist_sublist_unlock(mls);
4161 
4162                 kmem_cache_free(hdr_full_cache, markers[i]);
4163         }
4164         kmem_free(markers, sizeof (*markers) * num_sublists);
4165 
4166         return (total_evicted);
4167 }
4168 
4169 /*
4170  * Flush all "evictable" data of the given type from the arc state
4171  * specified. This will not evict any "active" buffers (i.e. referenced).
4172  *
4173  * When 'retry' is set to B_FALSE, the function will make a single pass
4174  * over the state and evict any buffers that it can. Since it doesn't
4175  * continually retry the eviction, it might end up leaving some buffers
4176  * in the ARC due to lock misses.
4177  *
4178  * When 'retry' is set to B_TRUE, the function will continually retry the
4179  * eviction until *all* evictable buffers have been removed from the
4180  * state. As a result, if concurrent insertions into the state are
4181  * allowed (e.g. if the ARC isn't shutting down), this function might
4182  * wind up in an infinite loop, continually trying to evict buffers.
4183  */
4184 static uint64_t
4185 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
4186     boolean_t retry)
4187 {
4188         uint64_t evicted = 0;
4189 
4190         while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
4191                 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
4192 
4193                 if (!retry)
4194                         break;
4195         }
4196 
4197         return (evicted);
4198 }
4199 
4200 /*
4201  * Evict the specified number of bytes from the state specified,
4202  * restricting eviction to the spa and type given. This function
4203  * prevents us from trying to evict more from a state's list than
4204  * is "evictable", and to skip evicting altogether when passed a
4205  * negative value for "bytes". In contrast, arc_evict_state() will
4206  * evict everything it can, when passed a negative value for "bytes".
4207  */
4208 static uint64_t
4209 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
4210     arc_buf_contents_t type)
4211 {
4212         int64_t delta;
4213 
4214         if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
4215                 delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
4216                     bytes);
4217                 return (arc_evict_state(state, spa, delta, type));
4218         }
4219 
4220         return (0);
4221 }
4222 
4223 /*
4224  * Evict metadata buffers from the cache, such that arc_meta_used is
4225  * capped by the arc_meta_limit tunable.
4226  */
4227 static uint64_t
4228 arc_adjust_meta(uint64_t meta_used)
4229 {
4230         uint64_t total_evicted = 0;
4231         int64_t target;
4232 
4233         /*
4234          * If we're over the meta limit, we want to evict enough
4235          * metadata to get back under the meta limit. We don't want to
4236          * evict so much that we drop the MRU below arc_p, though. If
4237          * we're over the meta limit more than we're over arc_p, we
4238          * evict some from the MRU here, and some from the MFU below.
4239          */
4240         target = MIN((int64_t)(meta_used - arc_meta_limit),
4241             (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
4242             zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
4243 
4244         total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4245 
4246         /*
4247          * Similar to the above, we want to evict enough bytes to get us
4248          * below the meta limit, but not so much as to drop us below the
4249          * space allotted to the MFU (which is defined as arc_c - arc_p).
4250          */
4251         target = MIN((int64_t)(meta_used - arc_meta_limit),
4252             (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
4253             (arc_c - arc_p)));
4254 
4255         total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4256 
4257         return (total_evicted);
4258 }
4259 
4260 /*
4261  * Return the type of the oldest buffer in the given arc state
4262  *
4263  * This function will select a random sublist of type ARC_BUFC_DATA and
4264  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
4265  * is compared, and the type which contains the "older" buffer will be
4266  * returned.
4267  */
4268 static arc_buf_contents_t
4269 arc_adjust_type(arc_state_t *state)
4270 {
4271         multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
4272         multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
4273         int data_idx = multilist_get_random_index(data_ml);
4274         int meta_idx = multilist_get_random_index(meta_ml);
4275         multilist_sublist_t *data_mls;
4276         multilist_sublist_t *meta_mls;
4277         arc_buf_contents_t type;
4278         arc_buf_hdr_t *data_hdr;
4279         arc_buf_hdr_t *meta_hdr;
4280 
4281         /*
4282          * We keep the sublist lock until we're finished, to prevent
4283          * the headers from being destroyed via arc_evict_state().
4284          */
4285         data_mls = multilist_sublist_lock(data_ml, data_idx);
4286         meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
4287 
4288         /*
4289          * These two loops are to ensure we skip any markers that
4290          * might be at the tail of the lists due to arc_evict_state().
4291          */
4292 
4293         for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
4294             data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
4295                 if (data_hdr->b_spa != 0)
4296                         break;
4297         }
4298 
4299         for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
4300             meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
4301                 if (meta_hdr->b_spa != 0)
4302                         break;
4303         }
4304 
4305         if (data_hdr == NULL && meta_hdr == NULL) {
4306                 type = ARC_BUFC_DATA;
4307         } else if (data_hdr == NULL) {
4308                 ASSERT3P(meta_hdr, !=, NULL);
4309                 type = ARC_BUFC_METADATA;
4310         } else if (meta_hdr == NULL) {
4311                 ASSERT3P(data_hdr, !=, NULL);
4312                 type = ARC_BUFC_DATA;
4313         } else {
4314                 ASSERT3P(data_hdr, !=, NULL);
4315                 ASSERT3P(meta_hdr, !=, NULL);
4316 
4317                 /* The headers can't be on the sublist without an L1 header */
4318                 ASSERT(HDR_HAS_L1HDR(data_hdr));
4319                 ASSERT(HDR_HAS_L1HDR(meta_hdr));
4320 
4321                 if (data_hdr->b_l1hdr.b_arc_access <
4322                     meta_hdr->b_l1hdr.b_arc_access) {
4323                         type = ARC_BUFC_DATA;
4324                 } else {
4325                         type = ARC_BUFC_METADATA;
4326                 }
4327         }
4328 
4329         multilist_sublist_unlock(meta_mls);
4330         multilist_sublist_unlock(data_mls);
4331 
4332         return (type);
4333 }
4334 
4335 /*
4336  * Evict buffers from the cache, such that arc_size is capped by arc_c.
4337  */
4338 static uint64_t
4339 arc_adjust(void)
4340 {
4341         uint64_t total_evicted = 0;
4342         uint64_t bytes;
4343         int64_t target;
4344         uint64_t asize = aggsum_value(&arc_size);
4345         uint64_t ameta = aggsum_value(&arc_meta_used);
4346 
4347         /*
4348          * If we're over arc_meta_limit, we want to correct that before
4349          * potentially evicting data buffers below.
4350          */
4351         total_evicted += arc_adjust_meta(ameta);
4352 
4353         /*
4354          * Adjust MRU size
4355          *
4356          * If we're over the target cache size, we want to evict enough
4357          * from the list to get back to our target size. We don't want
4358          * to evict too much from the MRU, such that it drops below
4359          * arc_p. So, if we're over our target cache size more than
4360          * the MRU is over arc_p, we'll evict enough to get back to
4361          * arc_p here, and then evict more from the MFU below.
4362          */
4363         target = MIN((int64_t)(asize - arc_c),
4364             (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
4365             zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
4366 
4367         /*
4368          * If we're below arc_meta_min, always prefer to evict data.
4369          * Otherwise, try to satisfy the requested number of bytes to
4370          * evict from the type which contains older buffers; in an
4371          * effort to keep newer buffers in the cache regardless of their
4372          * type. If we cannot satisfy the number of bytes from this
4373          * type, spill over into the next type.
4374          */
4375         if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
4376             ameta > arc_meta_min) {
4377                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4378                 total_evicted += bytes;
4379 
4380                 /*
4381                  * If we couldn't evict our target number of bytes from
4382                  * metadata, we try to get the rest from data.
4383                  */
4384                 target -= bytes;
4385 
4386                 total_evicted +=
4387                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4388         } else {
4389                 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4390                 total_evicted += bytes;
4391 
4392                 /*
4393                  * If we couldn't evict our target number of bytes from
4394                  * data, we try to get the rest from metadata.
4395                  */
4396                 target -= bytes;
4397 
4398                 total_evicted +=
4399                     arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4400         }
4401 
4402         /*
4403          * Adjust MFU size
4404          *
4405          * Now that we've tried to evict enough from the MRU to get its
4406          * size back to arc_p, if we're still above the target cache
4407          * size, we evict the rest from the MFU.
4408          */
4409         target = asize - arc_c;
4410 
4411         if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
4412             ameta > arc_meta_min) {
4413                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4414                 total_evicted += bytes;
4415 
4416                 /*
4417                  * If we couldn't evict our target number of bytes from
4418                  * metadata, we try to get the rest from data.
4419                  */
4420                 target -= bytes;
4421 
4422                 total_evicted +=
4423                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4424         } else {
4425                 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4426                 total_evicted += bytes;
4427 
4428                 /*
4429                  * If we couldn't evict our target number of bytes from
4430                  * data, we try to get the rest from metadata.
4431                  */
4432                 target -= bytes;
4433 
4434                 total_evicted +=
4435                     arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4436         }
4437 
4438         /*
4439          * Adjust ghost lists
4440          *
4441          * In addition to the above, the ARC also defines target values
4442          * for the ghost lists. The sum of the mru list and mru ghost
4443          * list should never exceed the target size of the cache, and
4444          * the sum of the mru list, mfu list, mru ghost list, and mfu
4445          * ghost list should never exceed twice the target size of the
4446          * cache. The following logic enforces these limits on the ghost
4447          * caches, and evicts from them as needed.
4448          */
4449         target = zfs_refcount_count(&arc_mru->arcs_size) +
4450             zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
4451 
4452         bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
4453         total_evicted += bytes;
4454 
4455         target -= bytes;
4456 
4457         total_evicted +=
4458             arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
4459 
4460         /*
4461          * We assume the sum of the mru list and mfu list is less than
4462          * or equal to arc_c (we enforced this above), which means we
4463          * can use the simpler of the two equations below:
4464          *
4465          *      mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
4466          *                  mru ghost + mfu ghost <= arc_c
4467          */
4468         target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
4469             zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
4470 
4471         bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
4472         total_evicted += bytes;
4473 
4474         target -= bytes;
4475 
4476         total_evicted +=
4477             arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
4478 
4479         return (total_evicted);
4480 }
4481 
4482 void
4483 arc_flush(spa_t *spa, boolean_t retry)
4484 {
4485         uint64_t guid = 0;
4486 
4487         /*
4488          * If retry is B_TRUE, a spa must not be specified since we have
4489          * no good way to determine if all of a spa's buffers have been
4490          * evicted from an arc state.
4491          */
4492         ASSERT(!retry || spa == 0);
4493 
4494         if (spa != NULL)
4495                 guid = spa_load_guid(spa);
4496 
4497         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4498         (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4499 
4500         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4501         (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4502 
4503         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4504         (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
4505 
4506         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4507         (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
4508 }
4509 
4510 static void
4511 arc_reduce_target_size(int64_t to_free)
4512 {
4513         uint64_t asize = aggsum_value(&arc_size);
4514         if (arc_c > arc_c_min) {
4515 
4516                 if (arc_c > arc_c_min + to_free)
4517                         atomic_add_64(&arc_c, -to_free);
4518                 else
4519                         arc_c = arc_c_min;
4520 
4521                 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
4522                 if (asize < arc_c)
4523                         arc_c = MAX(asize, arc_c_min);
4524                 if (arc_p > arc_c)
4525                         arc_p = (arc_c >> 1);
4526                 ASSERT(arc_c >= arc_c_min);
4527                 ASSERT((int64_t)arc_p >= 0);
4528         }
4529 
4530         if (asize > arc_c) {
4531                 /* See comment in arc_adjust_cb_check() on why lock+flag */
4532                 mutex_enter(&arc_adjust_lock);
4533                 arc_adjust_needed = B_TRUE;
4534                 mutex_exit(&arc_adjust_lock);
4535                 zthr_wakeup(arc_adjust_zthr);
4536         }
4537 }
4538 
/*
 * Names the check in arc_available_memory() that produced the lowest
 * (most constraining) free-memory figure.
 */
typedef enum free_memory_reason_t {
        FMR_UNKNOWN = 0,        /* no check lowered the result */
        FMR_NEEDFREE,           /* needfree is positive */
        FMR_LOTSFREE,           /* freemem vs. lotsfree/needfree/desfree */
        FMR_SWAPFS_MINFREE,     /* availrmem vs. the swapfs reserves */
        FMR_PAGES_PP_MAXIMUM,   /* availrmem vs. pages_pp_maximum */
        FMR_HEAP_ARENA,         /* free space in heap_arena (i386 only) */
        FMR_ZIO_ARENA,          /* free space in zio_arena */
} free_memory_reason_t;

/*
 * Value returned by, and limiting reason found by, the most recent call
 * to arc_available_memory().
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Additional pages held back, on top of pages_pp_maximum, when
 * arc_available_memory() checks availrmem.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional pages held back for swapfs when arc_available_memory()
 * checks availrmem against the swapfs reserves.
 */
int64_t arc_swapfs_reserve = 64;
4561 
4562 /*
4563  * Return the amount of memory that can be consumed before reclaim will be
4564  * needed.  Positive if there is sufficient free memory, negative indicates
4565  * the amount of memory that needs to be freed up.
4566  */
4567 static int64_t
4568 arc_available_memory(void)
4569 {
4570         int64_t lowest = INT64_MAX;
4571         int64_t n;
4572         free_memory_reason_t r = FMR_UNKNOWN;
4573 
4574 #ifdef _KERNEL
4575         if (needfree > 0) {
4576                 n = PAGESIZE * (-needfree);
4577                 if (n < lowest) {
4578                         lowest = n;
4579                         r = FMR_NEEDFREE;
4580                 }
4581         }
4582 
4583         /*
4584          * check that we're out of range of the pageout scanner.  It starts to
4585          * schedule paging if freemem is less than lotsfree and needfree.
4586          * lotsfree is the high-water mark for pageout, and needfree is the
4587          * number of needed free pages.  We add extra pages here to make sure
4588          * the scanner doesn't start up while we're freeing memory.
4589          */
4590         n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
4591         if (n < lowest) {
4592                 lowest = n;
4593                 r = FMR_LOTSFREE;
4594         }
4595 
4596         /*
4597          * check to make sure that swapfs has enough space so that anon
4598          * reservations can still succeed. anon_resvmem() checks that the
4599          * availrmem is greater than swapfs_minfree, and the number of reserved
4600          * swap pages.  We also add a bit of extra here just to prevent
4601          * circumstances from getting really dire.
4602          */
4603         n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
4604             desfree - arc_swapfs_reserve);
4605         if (n < lowest) {
4606                 lowest = n;
4607                 r = FMR_SWAPFS_MINFREE;
4608         }
4609 
4610 
4611         /*
4612          * Check that we have enough availrmem that memory locking (e.g., via
4613          * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
4614          * stores the number of pages that cannot be locked; when availrmem
4615          * drops below pages_pp_maximum, page locking mechanisms such as
4616          * page_pp_lock() will fail.)
4617          */
4618         n = PAGESIZE * (availrmem - pages_pp_maximum -
4619             arc_pages_pp_reserve);
4620         if (n < lowest) {
4621                 lowest = n;
4622                 r = FMR_PAGES_PP_MAXIMUM;
4623         }
4624 
4625 #if defined(__i386)
4626         /*
4627          * If we're on an i386 platform, it's possible that we'll exhaust the
4628          * kernel heap space before we ever run out of available physical
4629          * memory.  Most checks of the size of the heap_area compare against
4630          * tune.t_minarmem, which is the minimum available real memory that we
4631          * can have in the system.  However, this is generally fixed at 25 pages
4632          * which is so low that it's useless.  In this comparison, we seek to
4633          * calculate the total heap-size, and reclaim if more than 3/4ths of the
4634          * heap is allocated.  (Or, in the calculation, if less than 1/4th is
4635          * free)
4636          */
4637         n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
4638             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
4639         if (n < lowest) {
4640                 lowest = n;
4641                 r = FMR_HEAP_ARENA;
4642         }
4643 #endif
4644 
4645         /*
4646          * If zio data pages are being allocated out of a separate heap segment,
4647          * then enforce that the size of available vmem for this arena remains
4648          * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
4649          *
4650          * Note that reducing the arc_zio_arena_free_shift keeps more virtual
4651          * memory (in the zio_arena) free, which can avoid memory
4652          * fragmentation issues.
4653          */
4654         if (zio_arena != NULL) {
4655                 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
4656                     (vmem_size(zio_arena, VMEM_ALLOC) >>
4657                     arc_zio_arena_free_shift);
4658                 if (n < lowest) {
4659                         lowest = n;
4660                         r = FMR_ZIO_ARENA;
4661                 }
4662         }
4663 #else
4664         /* Every 100 calls, free a small amount */
4665         if (spa_get_random(100) == 0)
4666                 lowest = -1024;
4667 #endif
4668 
4669         last_free_memory = lowest;
4670         last_free_reason = r;
4671 
4672         return (lowest);
4673 }
4674 
4675 
4676 /*
4677  * Determine if the system is under memory pressure and is asking
4678  * to reclaim memory. A return value of B_TRUE indicates that the system
4679  * is under memory pressure and that the arc should adjust accordingly.
4680  */
4681 static boolean_t
4682 arc_reclaim_needed(void)
4683 {
4684         return (arc_available_memory() < 0);
4685 }
4686 
4687 static void
4688 arc_kmem_reap_soon(void)
4689 {
4690         size_t                  i;
4691         kmem_cache_t            *prev_cache = NULL;
4692         kmem_cache_t            *prev_data_cache = NULL;
4693         extern kmem_cache_t     *zio_buf_cache[];
4694         extern kmem_cache_t     *zio_data_buf_cache[];
4695         extern kmem_cache_t     *zfs_btree_leaf_cache;
4696         extern kmem_cache_t     *abd_chunk_cache;
4697 
4698 #ifdef _KERNEL
4699         if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
4700                 /*
4701                  * We are exceeding our meta-data cache limit.
4702                  * Purge some DNLC entries to release holds on meta-data.
4703                  */
4704                 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4705         }
4706 #if defined(__i386)
4707         /*
4708          * Reclaim unused memory from all kmem caches.
4709          */
4710         kmem_reap();
4711 #endif
4712 #endif
4713 
4714         for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4715                 if (zio_buf_cache[i] != prev_cache) {
4716                         prev_cache = zio_buf_cache[i];
4717                         kmem_cache_reap_soon(zio_buf_cache[i]);
4718                 }
4719                 if (zio_data_buf_cache[i] != prev_data_cache) {
4720                         prev_data_cache = zio_data_buf_cache[i];
4721                         kmem_cache_reap_soon(zio_data_buf_cache[i]);
4722                 }
4723         }
4724         kmem_cache_reap_soon(abd_chunk_cache);
4725         kmem_cache_reap_soon(buf_cache);
4726         kmem_cache_reap_soon(hdr_full_cache);
4727         kmem_cache_reap_soon(hdr_l2only_cache);
4728         kmem_cache_reap_soon(zfs_btree_leaf_cache);
4729 
4730         if (zio_arena != NULL) {
4731                 /*
4732                  * Ask the vmem arena to reclaim unused memory from its
4733                  * quantum caches.
4734                  */
4735                 vmem_qcache_reap(zio_arena);
4736         }
4737 }
4738 
4739 /* ARGSUSED */
4740 static boolean_t
4741 arc_adjust_cb_check(void *arg, zthr_t *zthr)
4742 {
4743         /*
4744          * This is necessary in order for the mdb ::arc dcmd to
4745          * show up to date information. Since the ::arc command
4746          * does not call the kstat's update function, without
4747          * this call, the command may show stale stats for the
4748          * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
4749          * with this change, the data might be up to 1 second
4750          * out of date(the arc_adjust_zthr has a maximum sleep
4751          * time of 1 second); but that should suffice.  The
4752          * arc_state_t structures can be queried directly if more
4753          * accurate information is needed.
4754          */
4755         if (arc_ksp != NULL)
4756                 arc_ksp->ks_update(arc_ksp, KSTAT_READ);
4757 
4758         /*
4759          * We have to rely on arc_get_data_impl() to tell us when to adjust,
4760          * rather than checking if we are overflowing here, so that we are
4761          * sure to not leave arc_get_data_impl() waiting on
4762          * arc_adjust_waiters_cv.  If we have become "not overflowing" since
4763          * arc_get_data_impl() checked, we need to wake it up.  We could
4764          * broadcast the CV here, but arc_get_data_impl() may have not yet
4765          * gone to sleep.  We would need to use a mutex to ensure that this
4766          * function doesn't broadcast until arc_get_data_impl() has gone to
4767          * sleep (e.g. the arc_adjust_lock).  However, the lock ordering of
4768          * such a lock would necessarily be incorrect with respect to the
4769          * zthr_lock, which is held before this function is called, and is
4770          * held by arc_get_data_impl() when it calls zthr_wakeup().
4771          */
4772         return (arc_adjust_needed);
4773 }
4774 
4775 /*
4776  * Keep arc_size under arc_c by running arc_adjust which evicts data
4777  * from the ARC.
4778  */
4779 /* ARGSUSED */
4780 static void
4781 arc_adjust_cb(void *arg, zthr_t *zthr)
4782 {
4783         uint64_t evicted = 0;
4784 
4785         /* Evict from cache */
4786         evicted = arc_adjust();
4787 
4788         /*
4789          * If evicted is zero, we couldn't evict anything
4790          * via arc_adjust(). This could be due to hash lock
4791          * collisions, but more likely due to the majority of
4792          * arc buffers being unevictable. Therefore, even if
4793          * arc_size is above arc_c, another pass is unlikely to
4794          * be helpful and could potentially cause us to enter an
4795          * infinite loop.  Additionally, zthr_iscancelled() is
4796          * checked here so that if the arc is shutting down, the
4797          * broadcast will wake any remaining arc adjust waiters.
4798          */
4799         mutex_enter(&arc_adjust_lock);
4800         arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
4801             evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
4802         if (!arc_adjust_needed) {
4803                 /*
4804                  * We're either no longer overflowing, or we
4805                  * can't evict anything more, so we should wake
4806                  * up any waiters.
4807                  */
4808                 cv_broadcast(&arc_adjust_waiters_cv);
4809         }
4810         mutex_exit(&arc_adjust_lock);
4811 }
4812 
4813 /* ARGSUSED */
4814 static boolean_t
4815 arc_reap_cb_check(void *arg, zthr_t *zthr)
4816 {
4817         int64_t free_memory = arc_available_memory();
4818 
4819         /*
4820          * If a kmem reap is already active, don't schedule more.  We must
4821          * check for this because kmem_cache_reap_soon() won't actually
4822          * block on the cache being reaped (this is to prevent callers from
4823          * becoming implicitly blocked by a system-wide kmem reap -- which,
4824          * on a system with many, many full magazines, can take minutes).
4825          */
4826         if (!kmem_cache_reap_active() &&
4827             free_memory < 0) {
4828                 arc_no_grow = B_TRUE;
4829                 arc_warm = B_TRUE;
4830                 /*
4831                  * Wait at least zfs_grow_retry (default 60) seconds
4832                  * before considering growing.
4833                  */
4834                 arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
4835                 return (B_TRUE);
4836         } else if (free_memory < arc_c >> arc_no_grow_shift) {
4837                 arc_no_grow = B_TRUE;
4838         } else if (gethrtime() >= arc_growtime) {
4839                 arc_no_grow = B_FALSE;
4840         }
4841 
4842         return (B_FALSE);
4843 }
4844 
4845 /*
4846  * Keep enough free memory in the system by reaping the ARC's kmem
4847  * caches.  To cause more slabs to be reapable, we may reduce the
4848  * target size of the cache (arc_c), causing the arc_adjust_cb()
4849  * to free more buffers.
4850  */
4851 /* ARGSUSED */
4852 static void
4853 arc_reap_cb(void *arg, zthr_t *zthr)
4854 {
4855         int64_t free_memory;
4856 
4857         /*
4858          * Kick off asynchronous kmem_reap()'s of all our caches.
4859          */
4860         arc_kmem_reap_soon();
4861 
4862         /*
4863          * Wait at least arc_kmem_cache_reap_retry_ms between
4864          * arc_kmem_reap_soon() calls. Without this check it is possible to
4865          * end up in a situation where we spend lots of time reaping
4866          * caches, while we're near arc_c_min.  Waiting here also gives the
4867          * subsequent free memory check a chance of finding that the
4868          * asynchronous reap has already freed enough memory, and we don't
4869          * need to call arc_reduce_target_size().
4870          */
4871         delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
4872 
4873         /*
4874          * Reduce the target size as needed to maintain the amount of free
4875          * memory in the system at a fraction of the arc_size (1/128th by
4876          * default).  If oversubscribed (free_memory < 0) then reduce the
4877          * target arc_size by the deficit amount plus the fractional
4878          * amount.  If free memory is positive but less then the fractional
4879          * amount, reduce by what is needed to hit the fractional amount.
4880          */
4881         free_memory = arc_available_memory();
4882 
4883         int64_t to_free =
4884             (arc_c >> arc_shrink_shift) - free_memory;
4885         if (to_free > 0) {
4886 #ifdef _KERNEL
4887                 to_free = MAX(to_free, ptob(needfree));
4888 #endif
4889                 arc_reduce_target_size(to_free);
4890         }
4891 }
4892 
4893 /*
4894  * Adapt arc info given the number of bytes we are trying to add and
4895  * the state that we are coming from.  This function is only called
4896  * when we are adding new content to the cache.
4897  */
4898 static void
4899 arc_adapt(int bytes, arc_state_t *state)
4900 {
4901         int mult;
4902         uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
4903         int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
4904         int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
4905 
4906         ASSERT(bytes > 0);
4907         /*
4908          * Adapt the target size of the MRU list:
4909          *      - if we just hit in the MRU ghost list, then increase
4910          *        the target size of the MRU list.
4911          *      - if we just hit in the MFU ghost list, then increase
4912          *        the target size of the MFU list by decreasing the
4913          *        target size of the MRU list.
4914          */
4915         if (state == arc_mru_ghost) {
4916                 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
4917                 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
4918 
4919                 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
4920         } else if (state == arc_mfu_ghost) {
4921                 uint64_t delta;
4922 
4923                 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
4924                 mult = MIN(mult, 10);
4925 
4926                 delta = MIN(bytes * mult, arc_p);
4927                 arc_p = MAX(arc_p_min, arc_p - delta);
4928         }
4929         ASSERT((int64_t)arc_p >= 0);
4930 
4931         /*
4932          * Wake reap thread if we do not have any available memory
4933          */
4934         if (arc_reclaim_needed()) {
4935                 zthr_wakeup(arc_reap_zthr);
4936                 return;
4937         }
4938 
4939 
4940         if (arc_no_grow)
4941                 return;
4942 
4943         if (arc_c >= arc_c_max)
4944                 return;
4945 
4946         /*
4947          * If we're within (2 * maxblocksize) bytes of the target
4948          * cache size, increment the target cache size
4949          */
4950         if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
4951             0) {
4952                 atomic_add_64(&arc_c, (int64_t)bytes);
4953                 if (arc_c > arc_c_max)
4954                         arc_c = arc_c_max;
4955                 else if (state == arc_anon)
4956                         atomic_add_64(&arc_p, (int64_t)bytes);
4957                 if (arc_p > arc_c)
4958                         arc_p = arc_c;
4959         }
4960         ASSERT((int64_t)arc_p >= 0);
4961 }
4962 
4963 /*
4964  * Check if arc_size has grown past our upper threshold, determined by
4965  * zfs_arc_overflow_shift.
4966  */
4967 static boolean_t
4968 arc_is_overflowing(void)
4969 {
4970         /* Always allow at least one block of overflow */
4971         uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
4972             arc_c >> zfs_arc_overflow_shift);
4973 
4974         /*
4975          * We just compare the lower bound here for performance reasons. Our
4976          * primary goals are to make sure that the arc never grows without
4977          * bound, and that it can reach its maximum size. This check
4978          * accomplishes both goals. The maximum amount we could run over by is
4979          * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
4980          * in the ARC. In practice, that's in the tens of MB, which is low
4981          * enough to be safe.
4982          */
4983         return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
4984 }
4985 
4986 static abd_t *
4987 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
4988     boolean_t do_adapt)
4989 {
4990         arc_buf_contents_t type = arc_buf_type(hdr);
4991 
4992         arc_get_data_impl(hdr, size, tag, do_adapt);
4993         if (type == ARC_BUFC_METADATA) {
4994                 return (abd_alloc(size, B_TRUE));
4995         } else {
4996                 ASSERT(type == ARC_BUFC_DATA);
4997                 return (abd_alloc(size, B_FALSE));
4998         }
4999 }
5000 
5001 static void *
5002 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5003 {
5004         arc_buf_contents_t type = arc_buf_type(hdr);
5005 
5006         arc_get_data_impl(hdr, size, tag, B_TRUE);
5007         if (type == ARC_BUFC_METADATA) {
5008                 return (zio_buf_alloc(size));
5009         } else {
5010                 ASSERT(type == ARC_BUFC_DATA);
5011                 return (zio_data_buf_alloc(size));
5012         }
5013 }
5014 
5015 /*
5016  * Allocate a block and return it to the caller. If we are hitting the
5017  * hard limit for the cache size, we must sleep, waiting for the eviction
5018  * thread to catch up. If we're past the target size but below the hard
5019  * limit, we'll only signal the reclaim thread and continue on.
5020  */
5021 static void
5022 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
5023     boolean_t do_adapt)
5024 {
5025         arc_state_t *state = hdr->b_l1hdr.b_state;
5026         arc_buf_contents_t type = arc_buf_type(hdr);
5027 
5028         if (do_adapt)
5029                 arc_adapt(size, state);
5030 
5031         /*
5032          * If arc_size is currently overflowing, and has grown past our
5033          * upper limit, we must be adding data faster than the evict
5034          * thread can evict. Thus, to ensure we don't compound the
5035          * problem by adding more data and forcing arc_size to grow even
5036          * further past its target size, we halt and wait for the
5037          * eviction thread to catch up.
5038          *
5039          * It's also possible that the reclaim thread is unable to evict
5040          * enough buffers to get arc_size below the overflow limit (e.g.
5041          * due to buffers being un-evictable, or hash lock collisions).
5042          * In this case, we want to proceed regardless if we're
5043          * overflowing; thus we don't use a while loop here.
5044          */
5045         if (arc_is_overflowing()) {
5046                 mutex_enter(&arc_adjust_lock);
5047 
5048                 /*
5049                  * Now that we've acquired the lock, we may no longer be
5050                  * over the overflow limit, lets check.
5051                  *
5052                  * We're ignoring the case of spurious wake ups. If that
5053                  * were to happen, it'd let this thread consume an ARC
5054                  * buffer before it should have (i.e. before we're under
5055                  * the overflow limit and were signalled by the reclaim
5056                  * thread). As long as that is a rare occurrence, it
5057                  * shouldn't cause any harm.
5058                  */
5059                 if (arc_is_overflowing()) {
5060                         arc_adjust_needed = B_TRUE;
5061                         zthr_wakeup(arc_adjust_zthr);
5062                         (void) cv_wait(&arc_adjust_waiters_cv,
5063                             &arc_adjust_lock);
5064                 }
5065                 mutex_exit(&arc_adjust_lock);
5066         }
5067 
5068         VERIFY3U(hdr->b_type, ==, type);
5069         if (type == ARC_BUFC_METADATA) {
5070                 arc_space_consume(size, ARC_SPACE_META);
5071         } else {
5072                 arc_space_consume(size, ARC_SPACE_DATA);
5073         }
5074 
5075         /*
5076          * Update the state size.  Note that ghost states have a
5077          * "ghost size" and so don't need to be updated.
5078          */
5079         if (!GHOST_STATE(state)) {
5080 
5081                 (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
5082 
5083                 /*
5084                  * If this is reached via arc_read, the link is
5085                  * protected by the hash lock. If reached via
5086                  * arc_buf_alloc, the header should not be accessed by
5087                  * any other thread. And, if reached via arc_read_done,
5088                  * the hash lock will protect it if it's found in the
5089                  * hash table; otherwise no other thread should be
5090                  * trying to [add|remove]_reference it.
5091                  */
5092                 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5093                         ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5094                         (void) zfs_refcount_add_many(&state->arcs_esize[type],
5095                             size, tag);
5096                 }
5097 
5098                 /*
5099                  * If we are growing the cache, and we are adding anonymous
5100                  * data, and we have outgrown arc_p, update arc_p
5101                  */
5102                 if (aggsum_compare(&arc_size, arc_c) < 0 &&
5103                     hdr->b_l1hdr.b_state == arc_anon &&
5104                     (zfs_refcount_count(&arc_anon->arcs_size) +
5105                     zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
5106                         arc_p = MIN(arc_c, arc_p + size);
5107         }
5108 }
5109 
5110 static void
5111 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
5112 {
5113         arc_free_data_impl(hdr, size, tag);
5114         abd_free(abd);
5115 }
5116 
5117 static void
5118 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
5119 {
5120         arc_buf_contents_t type = arc_buf_type(hdr);
5121 
5122         arc_free_data_impl(hdr, size, tag);
5123         if (type == ARC_BUFC_METADATA) {
5124                 zio_buf_free(buf, size);
5125         } else {
5126                 ASSERT(type == ARC_BUFC_DATA);
5127                 zio_data_buf_free(buf, size);
5128         }
5129 }
5130 
5131 /*
5132  * Free the arc data buffer.
5133  */
5134 static void
5135 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5136 {
5137         arc_state_t *state = hdr->b_l1hdr.b_state;
5138         arc_buf_contents_t type = arc_buf_type(hdr);
5139 
5140         /* protected by hash lock, if in the hash table */
5141         if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5142                 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5143                 ASSERT(state != arc_anon && state != arc_l2c_only);
5144 
5145                 (void) zfs_refcount_remove_many(&state->arcs_esize[type],
5146                     size, tag);
5147         }
5148         (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
5149 
5150         VERIFY3U(hdr->b_type, ==, type);
5151         if (type == ARC_BUFC_METADATA) {
5152                 arc_space_return(size, ARC_SPACE_META);
5153         } else {
5154                 ASSERT(type == ARC_BUFC_DATA);
5155                 arc_space_return(size, ARC_SPACE_DATA);
5156         }
5157 }
5158 
5159 /*
5160  * This routine is called whenever a buffer is accessed.
5161  * NOTE: the hash lock is dropped in this function.
5162  */
5163 static void
5164 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
5165 {
5166         clock_t now;
5167 
5168         ASSERT(MUTEX_HELD(hash_lock));
5169         ASSERT(HDR_HAS_L1HDR(hdr));
5170 
5171         if (hdr->b_l1hdr.b_state == arc_anon) {
5172                 /*
5173                  * This buffer is not in the cache, and does not
5174                  * appear in our "ghost" list.  Add the new buffer
5175                  * to the MRU state.
5176                  */
5177 
5178                 ASSERT0(hdr->b_l1hdr.b_arc_access);
5179                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5180                 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5181                 arc_change_state(arc_mru, hdr, hash_lock);
5182 
5183         } else if (hdr->b_l1hdr.b_state == arc_mru) {
5184                 now = ddi_get_lbolt();
5185 
5186                 /*
5187                  * If this buffer is here because of a prefetch, then either:
5188                  * - clear the flag if this is a "referencing" read
5189                  *   (any subsequent access will bump this into the MFU state).
5190                  * or
5191                  * - move the buffer to the head of the list if this is
5192                  *   another prefetch (to make it less likely to be evicted).
5193                  */
5194                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5195                         if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
5196                                 /* link protected by hash lock */
5197                                 ASSERT(multilist_link_active(
5198                                     &hdr->b_l1hdr.b_arc_node));
5199                         } else {
5200                                 if (HDR_HAS_L2HDR(hdr))
5201                                         l2arc_hdr_arcstats_decrement_state(hdr);
5202                                 arc_hdr_clear_flags(hdr,
5203                                     ARC_FLAG_PREFETCH |
5204                                     ARC_FLAG_PRESCIENT_PREFETCH);
5205                                 ARCSTAT_BUMP(arcstat_mru_hits);
5206                                 if (HDR_HAS_L2HDR(hdr))
5207                                         l2arc_hdr_arcstats_increment_state(hdr);
5208                         }
5209                         hdr->b_l1hdr.b_arc_access = now;
5210                         return;
5211                 }
5212 
5213                 /*
5214                  * This buffer has been "accessed" only once so far,
5215                  * but it is still in the cache. Move it to the MFU
5216                  * state.
5217                  */
5218                 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
5219                         /*
5220                          * More than 125ms have passed since we
5221                          * instantiated this buffer.  Move it to the
5222                          * most frequently used state.
5223                          */
5224                         hdr->b_l1hdr.b_arc_access = now;
5225                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5226                         arc_change_state(arc_mfu, hdr, hash_lock);
5227                 }
5228                 ARCSTAT_BUMP(arcstat_mru_hits);
5229         } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
5230                 arc_state_t     *new_state;
5231                 /*
5232                  * This buffer has been "accessed" recently, but
5233                  * was evicted from the cache.  Move it to the
5234                  * MFU state.
5235                  */
5236                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5237                         new_state = arc_mru;
5238                         if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
5239                                 if (HDR_HAS_L2HDR(hdr))
5240                                         l2arc_hdr_arcstats_decrement_state(hdr);
5241                                 arc_hdr_clear_flags(hdr,
5242                                     ARC_FLAG_PREFETCH |
5243                                     ARC_FLAG_PRESCIENT_PREFETCH);
5244                                 if (HDR_HAS_L2HDR(hdr))
5245                                         l2arc_hdr_arcstats_increment_state(hdr);
5246                         }
5247                         DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
5248                 } else {
5249                         new_state = arc_mfu;
5250                         DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5251                 }
5252 
5253                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5254                 arc_change_state(new_state, hdr, hash_lock);
5255 
5256                 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
5257         } else if (hdr->b_l1hdr.b_state == arc_mfu) {
5258                 /*
5259                  * This buffer has been accessed more than once and is
5260                  * still in the cache.  Keep it in the MFU state.
5261                  *
5262                  * NOTE: an add_reference() that occurred when we did
5263                  * the arc_read() will have kicked this off the list.
5264                  * If it was a prefetch, we will explicitly move it to
5265                  * the head of the list now.
5266                  */
5267                 ARCSTAT_BUMP(arcstat_mfu_hits);
5268                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5269         } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
5270                 arc_state_t     *new_state = arc_mfu;
5271                 /*
5272                  * This buffer has been accessed more than once but has
5273                  * been evicted from the cache.  Move it back to the
5274                  * MFU state.
5275                  */
5276 
5277                 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
5278                         /*
5279                          * This is a prefetch access...
5280                          * move this block back to the MRU state.
5281                          */
5282                         new_state = arc_mru;
5283                 }
5284 
5285                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5286                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5287                 arc_change_state(new_state, hdr, hash_lock);
5288 
5289                 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
5290         } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
5291                 /*
5292                  * This buffer is on the 2nd Level ARC.
5293                  */
5294 
5295                 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
5296                 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
5297                 arc_change_state(arc_mfu, hdr, hash_lock);
5298         } else {
5299                 ASSERT(!"invalid arc state");
5300         }
5301 }
5302 
5303 /*
5304  * This routine is called by dbuf_hold() to update the arc_access() state
5305  * which otherwise would be skipped for entries in the dbuf cache.
5306  */
5307 void
5308 arc_buf_access(arc_buf_t *buf)
5309 {
5310         mutex_enter(&buf->b_evict_lock);
5311         arc_buf_hdr_t *hdr = buf->b_hdr;
5312 
5313         /*
5314          * Avoid taking the hash_lock when possible as an optimization.
5315          * The header must be checked again under the hash_lock in order
5316          * to handle the case where it is concurrently being released.
5317          */
5318         if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5319                 mutex_exit(&buf->b_evict_lock);
5320                 return;
5321         }
5322 
5323         kmutex_t *hash_lock = HDR_LOCK(hdr);
5324         mutex_enter(hash_lock);
5325 
5326         if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
5327                 mutex_exit(hash_lock);
5328                 mutex_exit(&buf->b_evict_lock);
5329                 ARCSTAT_BUMP(arcstat_access_skip);
5330                 return;
5331         }
5332 
5333         mutex_exit(&buf->b_evict_lock);
5334 
5335         ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5336             hdr->b_l1hdr.b_state == arc_mfu);
5337 
5338         DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5339         arc_access(hdr, hash_lock);
5340         mutex_exit(hash_lock);
5341 
5342         ARCSTAT_BUMP(arcstat_hits);
5343         ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5344             demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
5345 }
5346 
5347 /* a generic arc_read_done_func_t which you can use */
5348 /* ARGSUSED */
5349 void
5350 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5351     arc_buf_t *buf, void *arg)
5352 {
5353         if (buf == NULL)
5354                 return;
5355 
5356         bcopy(buf->b_data, arg, arc_buf_size(buf));
5357         arc_buf_destroy(buf, arg);
5358 }
5359 
5360 /* a generic arc_read_done_func_t */
5361 void
5362 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5363     arc_buf_t *buf, void *arg)
5364 {
5365         arc_buf_t **bufp = arg;
5366 
5367         if (buf == NULL) {
5368                 ASSERT(zio == NULL || zio->io_error != 0);
5369                 *bufp = NULL;
5370         } else {
5371                 ASSERT(zio == NULL || zio->io_error == 0);
5372                 *bufp = buf;
5373                 ASSERT(buf->b_data != NULL);
5374         }
5375 }
5376 
5377 static void
5378 arc_hdr_verify(arc_buf_hdr_t *hdr, const blkptr_t *bp)
5379 {
5380         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5381                 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
5382                 ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
5383         } else {
5384                 if (HDR_COMPRESSION_ENABLED(hdr)) {
5385                         ASSERT3U(arc_hdr_get_compress(hdr), ==,
5386                             BP_GET_COMPRESS(bp));
5387                 }
5388                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5389                 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
5390                 ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
5391         }
5392 }
5393 
5394 /*
5395  * XXX this should be changed to return an error, and callers
5396  * re-read from disk on failure (on nondebug bits).
5397  */
5398 static void
5399 arc_hdr_verify_checksum(spa_t *spa, arc_buf_hdr_t *hdr, const blkptr_t *bp)
5400 {
5401         arc_hdr_verify(hdr, bp);
5402         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
5403                 return;
5404         int err = 0;
5405         abd_t *abd = NULL;
5406         if (BP_IS_ENCRYPTED(bp)) {
5407                 if (HDR_HAS_RABD(hdr)) {
5408                         abd = hdr->b_crypt_hdr.b_rabd;
5409                 }
5410         } else if (HDR_COMPRESSION_ENABLED(hdr)) {
5411                 abd = hdr->b_l1hdr.b_pabd;
5412         }
5413         if (abd != NULL) {
5414                 /*
5415                  * The offset is only used for labels, which are not
5416                  * cached in the ARC, so it doesn't matter what we
5417                  * pass for the offset parameter.
5418                  */
5419                 int psize = HDR_GET_PSIZE(hdr);
5420                 err = zio_checksum_error_impl(spa, bp,
5421                     BP_GET_CHECKSUM(bp), abd, psize, 0, NULL);
5422                 if (err != 0) {
5423                         /*
5424                          * Use abd_copy_to_buf() rather than
5425                          * abd_borrow_buf_copy() so that we are sure to
5426                          * include the buf in crash dumps.
5427                          */
5428                         void *buf = kmem_alloc(psize, KM_SLEEP);
5429                         abd_copy_to_buf(buf, abd, psize);
5430                         panic("checksum of cached data doesn't match BP "
5431                             "err=%u hdr=%p bp=%p abd=%p buf=%p",
5432                             err, (void *)hdr, (void *)bp, (void *)abd, buf);
5433                 }
5434         }
5435 }
5436 
/*
 * Finish a read whose header is carried in zio->io_private.
 *
 * In order, this routine:
 *  - looks the hdr up in the hash table (taking its hash lock) if the hdr
 *    is still hashed;
 *  - records the crypt parameters and byteswap setting on the hdr;
 *  - calls arc_access() for a successfully read, still-anonymous hdr;
 *  - allocates an arc_buf_t for every queued callback that has an acb_done
 *    function, turning any allocation failure into zio->io_error;
 *  - detaches the callback list, clears ARC_FLAG_IO_IN_PROGRESS and, on
 *    error, flags the hdr with ARC_FLAG_IO_ERROR, moves it to arc_anon and
 *    removes it from the hash table;
 *  - broadcasts on the hdr's cv, drops the hash lock, then runs and frees
 *    each callback;
 *  - destroys the hdr if it was found freeable (no references remaining
 *    after an error, or after the hdr was found to be out of the hash
 *    table).
 */
static void
arc_read_done(zio_t *zio)
{
        blkptr_t        *bp = zio->io_bp;
        arc_buf_hdr_t   *hdr = zio->io_private;
        kmutex_t        *hash_lock = NULL;
        arc_callback_t  *callback_list;
        arc_callback_t  *acb;
        boolean_t       freeable = B_FALSE;

        /*
         * The hdr was inserted into hash-table and removed from lists
         * prior to starting I/O.  We should find this header, since
         * it's in the hash table, and it should be legit since it's
         * not possible to evict it during the I/O.  The only possible
         * reason for it not to be found is if we were freed during the
         * read.
         */
        if (HDR_IN_HASH_TABLE(hdr)) {
                ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
                ASSERT3U(hdr->b_dva.dva_word[0], ==,
                    BP_IDENTITY(zio->io_bp)->dva_word[0]);
                ASSERT3U(hdr->b_dva.dva_word[1], ==,
                    BP_IDENTITY(zio->io_bp)->dva_word[1]);

                arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
                    &hash_lock);

                ASSERT((found == hdr &&
                    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
                    (found == hdr && HDR_L2_READING(hdr)));
                ASSERT3P(hash_lock, !=, NULL);
        }

        /*
         * For protected blocks, record the object type, objset, salt, IV
         * and MAC in the crypt hdr.  Intent-log blocks take their MAC from
         * the first sizeof (zil_chain_t) bytes of the data that was read;
         * all other blocks take it from the bp.
         */
        if (BP_IS_PROTECTED(bp)) {
                hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
                hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
                zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
                    hdr->b_crypt_hdr.b_iv);

                if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
                        void *tmpbuf;

                        tmpbuf = abd_borrow_buf_copy(zio->io_abd,
                            sizeof (zil_chain_t));
                        zio_crypt_decode_mac_zil(tmpbuf,
                            hdr->b_crypt_hdr.b_mac);
                        abd_return_buf(zio->io_abd, tmpbuf,
                            sizeof (zil_chain_t));
                } else {
                        zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
                }
        }

        if (zio->io_error == 0) {
                /* byteswap if necessary */
                if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
                        if (BP_GET_LEVEL(zio->io_bp) > 0) {
                                hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
                        } else {
                                hdr->b_l1hdr.b_byteswap =
                                    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
                        }
                } else {
                        hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
                }
        }

        arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);

        callback_list = hdr->b_l1hdr.b_acb;
        ASSERT3P(callback_list, !=, NULL);

        if (hash_lock && zio->io_error == 0 &&
            hdr->b_l1hdr.b_state == arc_anon) {
                /*
                 * Only call arc_access on anonymous buffers.  This is because
                 * if we've issued an I/O for an evicted buffer, we've already
                 * called arc_access (to prevent any simultaneous readers from
                 * getting confused).
                 */
                arc_access(hdr, hash_lock);
        }

        /*
         * If a read request has a callback (i.e. acb_done is not NULL), then we
         * make a buf containing the data according to the parameters which were
         * passed in. The implementation of arc_buf_alloc_impl() ensures that we
         * aren't needlessly decompressing the data multiple times.
         */
        int callback_cnt = 0;
        for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
                if (!acb->acb_done)
                        continue;

                callback_cnt++;

                /*
                 * Once an error is recorded, the remaining callbacks are
                 * still counted but no further bufs are allocated.
                 */
                if (zio->io_error != 0)
                        continue;

                int error = arc_buf_alloc_impl(hdr, zio->io_spa,
                    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
                    acb->acb_compressed, acb->acb_noauth, B_TRUE,
                    &acb->acb_buf);

                /*
                 * Assert non-speculative zios didn't fail because an
                 * encryption key wasn't loaded
                 */
                ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
                    error != EACCES);

                /*
                 * If we failed to decrypt, report an error now (as the zio
                 * layer would have done if it had done the transforms).
                 */
                if (error == ECKSUM) {
                        ASSERT(BP_IS_PROTECTED(bp));
                        error = SET_ERROR(EIO);
                        if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
                                spa_log_error(zio->io_spa, &acb->acb_zb);
                                (void) zfs_ereport_post(
                                    FM_EREPORT_ZFS_AUTHENTICATION,
                                    zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0);
                        }
                }

                if (error != 0) {
                        /*
                         * Decompression failed.  Set io_error
                         * so that when we call acb_done (below),
                         * we will indicate that the read failed.
                         * Note that in the unusual case where one
                         * callback is compressed and another
                         * uncompressed, we will mark all of them
                         * as failed, even though the uncompressed
                         * one can't actually fail.  In this case,
                         * the hdr will not be anonymous, because
                         * if there are multiple callbacks, it's
                         * because multiple threads found the same
                         * arc buf in the hash table.
                         */
                        zio->io_error = error;
                }
        }

        /*
         * If there are multiple callbacks, we must have the hash lock,
         * because the only way for multiple threads to find this hdr is
         * in the hash table.  This ensures that if there are multiple
         * callbacks, the hdr is not anonymous.  If it were anonymous,
         * we couldn't use arc_buf_destroy() in the error case below.
         */
        ASSERT(callback_cnt < 2 || hash_lock != NULL);

        /*
         * Detach the callback list from the hdr; it remains reachable
         * through callback_list and is walked and freed below.
         */
        hdr->b_l1hdr.b_acb = NULL;
        arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
        if (callback_cnt == 0)
                ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));

        ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
            callback_list != NULL);

        if (zio->io_error == 0) {
                arc_hdr_verify(hdr, zio->io_bp);
        } else {
                /*
                 * The read failed: flag the hdr, make it anonymous and
                 * unhash it.  It is destroyed at the end of this function
                 * if nothing holds a reference to it.
                 */
                arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
                if (hdr->b_l1hdr.b_state != arc_anon)
                        arc_change_state(arc_anon, hdr, hash_lock);
                if (HDR_IN_HASH_TABLE(hdr))
                        buf_hash_remove(hdr);
                freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
        }

        /*
         * Broadcast before we drop the hash_lock to avoid the possibility
         * that the hdr (and hence the cv) might be freed before we get to
         * the cv_broadcast().
         */
        cv_broadcast(&hdr->b_l1hdr.b_cv);

        if (hash_lock != NULL) {
                mutex_exit(hash_lock);
        } else {
                /*
                 * This block was freed while we waited for the read to
                 * complete.  It has been removed from the hash table and
                 * moved to the anonymous state (so that it won't show up
                 * in the cache).
                 */
                ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
                freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
        }

        /* execute each callback and free its structure */
        while ((acb = callback_list) != NULL) {

                if (acb->acb_done != NULL) {
                        if (zio->io_error != 0 && acb->acb_buf != NULL) {
                                /*
                                 * If arc_buf_alloc_impl() fails during
                                 * decompression, the buf will still be
                                 * allocated, and needs to be freed here.
                                 */
                                arc_buf_destroy(acb->acb_buf, acb->acb_private);
                                acb->acb_buf = NULL;
                        }
                        acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
                            acb->acb_buf, acb->acb_private);
                }

                /*
                 * Propagate the final error to this callback's dummy zio,
                 * if it has one, and issue it.
                 */
                if (acb->acb_zio_dummy != NULL) {
                        acb->acb_zio_dummy->io_error = zio->io_error;
                        zio_nowait(acb->acb_zio_dummy);
                }

                /* Advance before freeing the entry we just processed. */
                callback_list = acb->acb_next;
                kmem_free(acb, sizeof (arc_callback_t));
        }

        if (freeable)
                arc_hdr_destroy(hdr);
}
5660 
5661 /*
5662  * "Read" the block at the specified DVA (in bp) via the
5663  * cache.  If the block is found in the cache, invoke the provided
5664  * callback immediately and return.  Note that the `zio' parameter
5665  * in the callback will be NULL in this case, since no IO was
5666  * required.  If the block is not in the cache pass the read request
5667  * on to the spa with a substitute callback function, so that the
5668  * requested block will be added to the cache.
5669  *
5670  * If a read request arrives for a block that has a read in-progress,
5671  * either wait for the in-progress read to complete (and return the
5672  * results); or, if this is a read with a "done" func, add a record
5673  * to the read to invoke the "done" func when the read completes,
5674  * and return; or just return.
5675  *
5676  * arc_read_done() will invoke all the requested "done" functions
5677  * for readers of this block.
5678  */
5679 int
5680 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
5681     void *private, zio_priority_t priority, int zio_flags,
5682     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
5683 {
5684         arc_buf_hdr_t *hdr = NULL;
5685         kmutex_t *hash_lock = NULL;
5686         zio_t *rzio;
5687         uint64_t guid = spa_load_guid(spa);
5688         boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
5689         boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
5690             (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5691         boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
5692             (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5693         int rc = 0;
5694 
5695         ASSERT(!BP_IS_EMBEDDED(bp) ||
5696             BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
5697 
5698 top:
5699         if (!BP_IS_EMBEDDED(bp)) {
5700                 /*
5701                  * Embedded BP's have no DVA and require no I/O to "read".
5702                  * Create an anonymous arc buf to back it.
5703                  */
5704                 hdr = buf_hash_find(guid, bp, &hash_lock);
5705         }
5706 
5707         /*
5708          * Determine if we have an L1 cache hit or a cache miss. For simplicity
5709          * we maintain encrypted data seperately from compressed / uncompressed
5710          * data. If the user is requesting raw encrypted data and we don't have
5711          * that in the header we will read from disk to guarantee that we can
5712          * get it even if the encryption keys aren't loaded.
5713          */
5714         if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
5715             (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
5716                 arc_buf_t *buf = NULL;
5717                 *arc_flags |= ARC_FLAG_CACHED;
5718 
5719                 if (HDR_IO_IN_PROGRESS(hdr)) {
5720                         zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
5721 
5722                         ASSERT3P(head_zio, !=, NULL);
5723                         if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
5724                             priority == ZIO_PRIORITY_SYNC_READ) {
5725                                 /*
5726                                  * This is a sync read that needs to wait for
5727                                  * an in-flight async read. Request that the
5728                                  * zio have its priority upgraded.
5729                                  */
5730                                 zio_change_priority(head_zio, priority);
5731                                 DTRACE_PROBE1(arc__async__upgrade__sync,
5732                                     arc_buf_hdr_t *, hdr);
5733                                 ARCSTAT_BUMP(arcstat_async_upgrade_sync);
5734                         }
5735                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
5736                                 arc_hdr_clear_flags(hdr,
5737                                     ARC_FLAG_PREDICTIVE_PREFETCH);
5738                         }
5739 
5740                         if (*arc_flags & ARC_FLAG_WAIT) {
5741                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
5742                                 mutex_exit(hash_lock);
5743                                 goto top;
5744                         }
5745                         ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
5746 
5747                         if (done) {
5748                                 arc_callback_t *acb = NULL;
5749 
5750                                 acb = kmem_zalloc(sizeof (arc_callback_t),
5751                                     KM_SLEEP);
5752                                 acb->acb_done = done;
5753                                 acb->acb_private = private;
5754                                 acb->acb_compressed = compressed_read;
5755                                 acb->acb_encrypted = encrypted_read;
5756                                 acb->acb_noauth = noauth_read;
5757                                 acb->acb_zb = *zb;
5758                                 if (pio != NULL)
5759                                         acb->acb_zio_dummy = zio_null(pio,
5760                                             spa, NULL, NULL, NULL, zio_flags);
5761 
5762                                 ASSERT3P(acb->acb_done, !=, NULL);
5763                                 acb->acb_zio_head = head_zio;
5764                                 acb->acb_next = hdr->b_l1hdr.b_acb;
5765                                 hdr->b_l1hdr.b_acb = acb;
5766                                 mutex_exit(hash_lock);
5767                                 return (0);
5768                         }
5769                         mutex_exit(hash_lock);
5770                         return (0);
5771                 }
5772 
5773                 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
5774                     hdr->b_l1hdr.b_state == arc_mfu);
5775 
5776                 if (done) {
5777                         if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
5778                                 /*
5779                                  * This is a demand read which does not have to
5780                                  * wait for i/o because we did a predictive
5781                                  * prefetch i/o for it, which has completed.
5782                                  */
5783                                 DTRACE_PROBE1(
5784                                     arc__demand__hit__predictive__prefetch,
5785                                     arc_buf_hdr_t *, hdr);
5786                                 ARCSTAT_BUMP(
5787                                     arcstat_demand_hit_predictive_prefetch);
5788                                 arc_hdr_clear_flags(hdr,
5789                                     ARC_FLAG_PREDICTIVE_PREFETCH);
5790                         }
5791 
5792                         if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
5793                                 ARCSTAT_BUMP(
5794                                     arcstat_demand_hit_prescient_prefetch);
5795                                 arc_hdr_clear_flags(hdr,
5796                                     ARC_FLAG_PRESCIENT_PREFETCH);
5797                         }
5798 
5799                         ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
5800 
5801                         arc_hdr_verify_checksum(spa, hdr, bp);
5802 
5803                         /* Get a buf with the desired data in it. */
5804                         rc = arc_buf_alloc_impl(hdr, spa, zb, private,
5805                             encrypted_read, compressed_read, noauth_read,
5806                             B_TRUE, &buf);
5807                         if (rc == ECKSUM) {
5808                                 /*
5809                                  * Convert authentication and decryption errors
5810                                  * to EIO (and generate an ereport if needed)
5811                                  * before leaving the ARC.
5812                                  */
5813                                 rc = SET_ERROR(EIO);
5814                                 if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
5815                                         spa_log_error(spa, zb);
5816                                         (void) zfs_ereport_post(
5817                                             FM_EREPORT_ZFS_AUTHENTICATION,
5818                                             spa, NULL, zb, NULL, 0, 0);
5819                                 }
5820                         }
5821                         if (rc != 0) {
5822                                 (void) remove_reference(hdr, hash_lock,
5823                                     private);
5824                                 arc_buf_destroy_impl(buf);
5825                                 buf = NULL;
5826                         }
5827                         /* assert any errors weren't due to unloaded keys */
5828                         ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
5829                             rc != EACCES);
5830                 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
5831                     zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
5832                         if (HDR_HAS_L2HDR(hdr))
5833                                 l2arc_hdr_arcstats_decrement_state(hdr);
5834                         arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5835                         if (HDR_HAS_L2HDR(hdr))
5836                                 l2arc_hdr_arcstats_increment_state(hdr);
5837                 }
5838                 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
5839                 arc_access(hdr, hash_lock);
5840                 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
5841                         arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5842                 if (*arc_flags & ARC_FLAG_L2CACHE)
5843                         arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5844                 mutex_exit(hash_lock);
5845                 ARCSTAT_BUMP(arcstat_hits);
5846                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5847                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
5848                     data, metadata, hits);
5849 
5850                 if (done)
5851                         done(NULL, zb, bp, buf, private);
5852         } else {
5853                 uint64_t lsize = BP_GET_LSIZE(bp);
5854                 uint64_t psize = BP_GET_PSIZE(bp);
5855                 arc_callback_t *acb;
5856                 vdev_t *vd = NULL;
5857                 uint64_t addr = 0;
5858                 boolean_t devw = B_FALSE;
5859                 uint64_t size;
5860                 abd_t *hdr_abd;
5861                 int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
5862 
5863                 if (hdr == NULL) {
5864                         /* this block is not in the cache */
5865                         arc_buf_hdr_t *exists = NULL;
5866                         arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
5867                         hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
5868                             BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type,
5869                             encrypted_read);
5870 
5871                         if (!BP_IS_EMBEDDED(bp)) {
5872                                 hdr->b_dva = *BP_IDENTITY(bp);
5873                                 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
5874                                 exists = buf_hash_insert(hdr, &hash_lock);
5875                         }
5876                         if (exists != NULL) {
5877                                 /* somebody beat us to the hash insert */
5878                                 mutex_exit(hash_lock);
5879                                 buf_discard_identity(hdr);
5880                                 arc_hdr_destroy(hdr);
5881                                 goto top; /* restart the IO request */
5882                         }
5883                 } else {
5884                         /*
5885                          * This block is in the ghost cache or encrypted data
5886                          * was requested and we didn't have it. If it was
5887                          * L2-only (and thus didn't have an L1 hdr),
5888                          * we realloc the header to add an L1 hdr.
5889                          */
5890                         if (!HDR_HAS_L1HDR(hdr)) {
5891                                 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
5892                                     hdr_full_cache);
5893                         }
5894 
5895                         if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
5896                                 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
5897                                 ASSERT(!HDR_HAS_RABD(hdr));
5898                                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5899                                 ASSERT0(zfs_refcount_count(
5900                                     &hdr->b_l1hdr.b_refcnt));
5901                                 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
5902                                 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
5903                         } else if (HDR_IO_IN_PROGRESS(hdr)) {
5904                                 /*
5905                                  * If this header already had an IO in progress
5906                                  * and we are performing another IO to fetch
5907                                  * encrypted data we must wait until the first
5908                                  * IO completes so as not to confuse
5909                                  * arc_read_done(). This should be very rare
5910                                  * and so the performance impact shouldn't
5911                                  * matter.
5912                                  */
5913                                 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
5914                                 mutex_exit(hash_lock);
5915                                 goto top;
5916                         }
5917 
5918                         /*
5919                          * This is a delicate dance that we play here.
5920                          * This hdr might be in the ghost list so we access
5921                          * it to move it out of the ghost list before we
5922                          * initiate the read. If it's a prefetch then
5923                          * it won't have a callback so we'll remove the
5924                          * reference that arc_buf_alloc_impl() created. We
5925                          * do this after we've called arc_access() to
5926                          * avoid hitting an assert in remove_reference().
5927                          */
5928                         arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
5929                         arc_access(hdr, hash_lock);
5930                         arc_hdr_alloc_pabd(hdr, alloc_flags);
5931                 }
5932 
5933                 if (encrypted_read) {
5934                         ASSERT(HDR_HAS_RABD(hdr));
5935                         size = HDR_GET_PSIZE(hdr);
5936                         hdr_abd = hdr->b_crypt_hdr.b_rabd;
5937                         zio_flags |= ZIO_FLAG_RAW;
5938                 } else {
5939                         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
5940                         size = arc_hdr_size(hdr);
5941                         hdr_abd = hdr->b_l1hdr.b_pabd;
5942 
5943                         if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
5944                                 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
5945                         }
5946 
5947                         /*
5948                          * For authenticated bp's, we do not ask the ZIO layer
5949                          * to authenticate them since this will cause the entire
5950                          * IO to fail if the key isn't loaded. Instead, we
5951                          * defer authentication until arc_buf_fill(), which will
5952                          * verify the data when the key is available.
5953                          */
5954                         if (BP_IS_AUTHENTICATED(bp))
5955                                 zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
5956                 }
5957 
5958                 if (*arc_flags & ARC_FLAG_PREFETCH &&
5959                     zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
5960                         if (HDR_HAS_L2HDR(hdr))
5961                                 l2arc_hdr_arcstats_decrement_state(hdr);
5962                         arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5963                         if (HDR_HAS_L2HDR(hdr))
5964                                 l2arc_hdr_arcstats_increment_state(hdr);
5965                 }
5966                 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
5967                         arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
5968 
5969                 if (*arc_flags & ARC_FLAG_L2CACHE)
5970                         arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5971                 if (BP_IS_AUTHENTICATED(bp))
5972                         arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
5973                 if (BP_GET_LEVEL(bp) > 0)
5974                         arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
5975                 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
5976                         arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
5977                 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
5978 
5979                 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
5980                 acb->acb_done = done;
5981                 acb->acb_private = private;
5982                 acb->acb_compressed = compressed_read;
5983                 acb->acb_encrypted = encrypted_read;
5984                 acb->acb_noauth = noauth_read;
5985                 acb->acb_zb = *zb;
5986 
5987                 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5988                 hdr->b_l1hdr.b_acb = acb;
5989                 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5990 
5991                 if (HDR_HAS_L2HDR(hdr) &&
5992                     (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
5993                         devw = hdr->b_l2hdr.b_dev->l2ad_writing;
5994                         addr = hdr->b_l2hdr.b_daddr;
5995                         /*
5996                          * Lock out L2ARC device removal.
5997                          */
5998                         if (vdev_is_dead(vd) ||
5999                             !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
6000                                 vd = NULL;
6001                 }
6002 
6003                 /*
6004                  * We count both async reads and scrub IOs as asynchronous so
6005                  * that both can be upgraded in the event of a cache hit while
6006                  * the read IO is still in-flight.
6007                  */
6008                 if (priority == ZIO_PRIORITY_ASYNC_READ ||
6009                     priority == ZIO_PRIORITY_SCRUB)
6010                         arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6011                 else
6012                         arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6013 
6014                 /*
6015                  * At this point, we have a level 1 cache miss.  Try again in
6016                  * L2ARC if possible.
6017                  */
6018                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
6019 
6020                 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
6021                     uint64_t, lsize, zbookmark_phys_t *, zb);
6022                 ARCSTAT_BUMP(arcstat_misses);
6023                 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
6024                     demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
6025                     data, metadata, misses);
6026 
6027                 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
6028                         /*
6029                          * Read from the L2ARC if the following are true:
6030                          * 1. The L2ARC vdev was previously cached.
6031                          * 2. This buffer still has L2ARC metadata.
6032                          * 3. This buffer isn't currently writing to the L2ARC.
6033                          * 4. The L2ARC entry wasn't evicted, which may
6034                          *    also have invalidated the vdev.
6035                          * 5. This isn't prefetch or l2arc_noprefetch is 0.
6036                          */
6037                         if (HDR_HAS_L2HDR(hdr) &&
6038                             !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
6039                             !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
6040                                 l2arc_read_callback_t *cb;
6041                                 abd_t *abd;
6042                                 uint64_t asize;
6043 
6044                                 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
6045                                 ARCSTAT_BUMP(arcstat_l2_hits);
6046 
6047                                 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
6048                                     KM_SLEEP);
6049                                 cb->l2rcb_hdr = hdr;
6050                                 cb->l2rcb_bp = *bp;
6051                                 cb->l2rcb_zb = *zb;
6052                                 cb->l2rcb_flags = zio_flags;
6053 
6054                                 /*
6055                                  * When Compressed ARC is disabled, but the
6056                                  * L2ARC block is compressed, arc_hdr_size()
6057                                  * will have returned LSIZE rather than PSIZE.
6058                                  */
6059                                 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
6060                                     !HDR_COMPRESSION_ENABLED(hdr) &&
6061                                     HDR_GET_PSIZE(hdr) != 0) {
6062                                         size = HDR_GET_PSIZE(hdr);
6063                                 }
6064 
6065                                 asize = vdev_psize_to_asize(vd, size);
6066                                 if (asize != size) {
6067                                         abd = abd_alloc_for_io(asize,
6068                                             HDR_ISTYPE_METADATA(hdr));
6069                                         cb->l2rcb_abd = abd;
6070                                 } else {
6071                                         abd = hdr_abd;
6072                                 }
6073 
6074                                 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
6075                                     addr + asize <= vd->vdev_psize -
6076                                     VDEV_LABEL_END_SIZE);
6077 
6078                                 /*
6079                                  * l2arc read.  The SCL_L2ARC lock will be
6080                                  * released by l2arc_read_done().
6081                                  * Issue a null zio if the underlying buffer
6082                                  * was squashed to zero size by compression.
6083                                  */
6084                                 ASSERT3U(arc_hdr_get_compress(hdr), !=,
6085                                     ZIO_COMPRESS_EMPTY);
6086                                 rzio = zio_read_phys(pio, vd, addr,
6087                                     asize, abd,
6088                                     ZIO_CHECKSUM_OFF,
6089                                     l2arc_read_done, cb, priority,
6090                                     zio_flags | ZIO_FLAG_DONT_CACHE |
6091                                     ZIO_FLAG_CANFAIL |
6092                                     ZIO_FLAG_DONT_PROPAGATE |
6093                                     ZIO_FLAG_DONT_RETRY, B_FALSE);
6094                                 acb->acb_zio_head = rzio;
6095 
6096                                 if (hash_lock != NULL)
6097                                         mutex_exit(hash_lock);
6098 
6099                                 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
6100                                     zio_t *, rzio);
6101                                 ARCSTAT_INCR(arcstat_l2_read_bytes,
6102                                     HDR_GET_PSIZE(hdr));
6103 
6104                                 if (*arc_flags & ARC_FLAG_NOWAIT) {
6105                                         zio_nowait(rzio);
6106                                         return (0);
6107                                 }
6108 
6109                                 ASSERT(*arc_flags & ARC_FLAG_WAIT);
6110                                 if (zio_wait(rzio) == 0)
6111                                         return (0);
6112 
6113                                 /* l2arc read error; goto zio_read() */
6114                                 if (hash_lock != NULL)
6115                                         mutex_enter(hash_lock);
6116                         } else {
6117                                 DTRACE_PROBE1(l2arc__miss,
6118                                     arc_buf_hdr_t *, hdr);
6119                                 ARCSTAT_BUMP(arcstat_l2_misses);
6120                                 if (HDR_L2_WRITING(hdr))
6121                                         ARCSTAT_BUMP(arcstat_l2_rw_clash);
6122                                 spa_config_exit(spa, SCL_L2ARC, vd);
6123                         }
6124                 } else {
6125                         if (vd != NULL)
6126                                 spa_config_exit(spa, SCL_L2ARC, vd);
6127                         if (l2arc_ndev != 0) {
6128                                 DTRACE_PROBE1(l2arc__miss,
6129                                     arc_buf_hdr_t *, hdr);
6130                                 ARCSTAT_BUMP(arcstat_l2_misses);
6131                         }
6132                 }
6133 
6134                 rzio = zio_read(pio, spa, bp, hdr_abd, size,
6135                     arc_read_done, hdr, priority, zio_flags, zb);
6136                 acb->acb_zio_head = rzio;
6137 
6138                 if (hash_lock != NULL)
6139                         mutex_exit(hash_lock);
6140 
6141                 /*
6142                  * At this point, this read I/O has already missed in the ARC
6143                  * and will be going through to the disk.  The I/O throttle
6144                  * should delay this I/O if this zone is using more than its I/O
6145                  * priority allows.
6146                  */
6147                 zfs_zone_io_throttle(ZFS_ZONE_IOP_READ);
6148 
6149                 if (*arc_flags & ARC_FLAG_WAIT)
6150                         return (zio_wait(rzio));
6151 
6152                 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
6153                 zio_nowait(rzio);
6154         }
6155         return (rc);
6156 }
6157 
6158 /*
6159  * Notify the arc that a block was freed, and thus will never be used again.
6160  *
6161  * If buf_hash_find() returns a hdr for the block and nothing is using that
6162  * hdr, the hdr is moved to arc_anon and destroyed.  A hdr that is still
6163  * busy, or a block with no hdr at all, is left alone.
6164  */
6165 void
6166 arc_freed(spa_t *spa, const blkptr_t *bp)
6167 {
6168         uint64_t guid = spa_load_guid(spa);
6169         kmutex_t *hash_lock;
6170         arc_buf_hdr_t *hdr;
6171         int busy;
6172 
6173         ASSERT(!BP_IS_EMBEDDED(bp));
6174 
6175         hdr = buf_hash_find(guid, bp, &hash_lock);
6176         if (hdr == NULL)
6177                 return;
6178 
6179         /*
6180          * The block being freed may still be in use, in which case the hdr
6181          * must survive.  A block that is being prefetched keeps
6182          * ARC_FLAG_IO_IN_PROGRESS set on the hdr until the I/O completes.
6183          * A reference can also linger after a dedup-ed, dmu_sync-ed write:
6184          * dmu_sync() wrote the new block to its final resting place without
6185          * the dedup flag set, leaving the hdr in the MRU state and
6186          * discoverable.  When the txg syncs, it detects that the block was
6187          * overridden in open context and issues an override I/O.  If that
6188          * I/O finds the block already in the DDT, it replaces io_bp with the
6189          * DDT's bp; the done callback, dbuf_write_override_done, then sees
6190          * that io_bp and io_bp_override differ and frees the override bp.
6191          * That is how we can arrive here with a reference still held.  A
6192          * hdr with no L1 portion has none of this state and is never busy.
6193          */
6194         busy = HDR_HAS_L1HDR(hdr) && (HDR_IO_IN_PROGRESS(hdr) ||
6195             !zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
6196 
6197         /* Only an idle hdr is torn down; a busy one is left as it is. */
6198         if (!busy) {
6199                 arc_change_state(arc_anon, hdr, hash_lock);
6200                 arc_hdr_destroy(hdr);
6201         }
6202 
6203         /* hash_lock is dropped whether or not the hdr was destroyed. */
6204         mutex_exit(hash_lock);
6205 }
6206 
6207 /*
6208  * Release buf from the cache, making it an anonymous buffer.  This must be
6209  * done after a read and prior to modifying the buffer contents.  If the hdr
6210  * has more than one buf, buf is moved onto a new anonymous hdr of its own
6211  * and the reference held under tag moves from the old hdr to the new one.
6212  */
6213 void
6214 arc_release(arc_buf_t *buf, void *tag)
6215 {
6216         arc_buf_hdr_t *hdr = buf->b_hdr;
6217 
6218         /*
6219          * It would be nice to assert that if it's DMU metadata (level >
6220          * 0 || it's the dnode file), then it must be syncing context.
6221          * But we don't know that information at this level.
6222          */
6223 
6224         mutex_enter(&buf->b_evict_lock);
6225 
6226         ASSERT(HDR_HAS_L1HDR(hdr));
6227 
6228         /*
6229          * We don't grab the hash lock prior to this check, because if
6230          * the buffer's header is in the arc_anon state, it won't be
6231          * linked into the hash table.
6232          */
6233         if (hdr->b_l1hdr.b_state == arc_anon) {
6234                 mutex_exit(&buf->b_evict_lock);
6235                 /*
6236                  * If we are called from dmu_convert_mdn_block_to_raw(),
6237                  * a write might be in progress.  This is OK because
6238                  * the caller won't change the content of this buffer,
6239                  * only the flags (via arc_convert_to_raw()).  That is
6240                  * why !HDR_IO_IN_PROGRESS(hdr) is not asserted here.
6241                  */
6242                 ASSERT(!HDR_IN_HASH_TABLE(hdr));
6243                 ASSERT(!HDR_HAS_L2HDR(hdr));
6244                 ASSERT(HDR_EMPTY(hdr));
6245 
6246                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6247                 ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
6248                 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
6249 
6250                 hdr->b_l1hdr.b_arc_access = 0;
6251 
6252                 /*
6253                  * If the buf is being overridden then it may already
6254                  * have a hdr that is not empty.
6255                  */
6256                 buf_discard_identity(hdr);
6257                 arc_buf_thaw(buf);
6258 
6259                 return;
6260         }
6261 
6262         kmutex_t *hash_lock = HDR_LOCK(hdr);
6263         mutex_enter(hash_lock);
6264 
6265         /*
6266          * This assignment is only valid as long as the hash_lock is
6267          * held; we must be careful not to reference state or the
6268          * b_state field after dropping the lock.
6269          */
6270         arc_state_t *state = hdr->b_l1hdr.b_state;
6271         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6272         ASSERT3P(state, !=, arc_anon);
6273 
6274         /* this buffer is not on any list */
6275         ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
6276 
6277         if (HDR_HAS_L2HDR(hdr)) {
6278                 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6279 
6280                 /*
6281                  * We have to recheck this conditional again now that
6282                  * we're holding the l2ad_mtx to prevent a race with
6283                  * another thread which might be concurrently calling
6284                  * l2arc_evict(). In that case, l2arc_evict() might have
6285                  * destroyed the header's L2 portion as we were waiting
6286                  * to acquire the l2ad_mtx.
6287                  */
6288                 if (HDR_HAS_L2HDR(hdr))
6289                         arc_hdr_l2hdr_destroy(hdr);
6290 
6291                 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6292         }
6293 
6294         /*
6295          * More than one buf on this hdr?  Then only buf is detached.
6296          */
6297         if (hdr->b_l1hdr.b_bufcnt > 1) {
6298                 arc_buf_hdr_t *nhdr;
6299                 uint64_t spa = hdr->b_spa;
6300                 uint64_t psize = HDR_GET_PSIZE(hdr);
6301                 uint64_t lsize = HDR_GET_LSIZE(hdr);
6302                 boolean_t protected = HDR_PROTECTED(hdr);
6303                 enum zio_compress compress = arc_hdr_get_compress(hdr);
6304                 arc_buf_contents_t type = arc_buf_type(hdr);
6305                 VERIFY3U(hdr->b_type, ==, type);
6306 
6307                 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
6308                 (void) remove_reference(hdr, hash_lock, tag);
6309 
6310                 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
6311                         ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6312                         ASSERT(ARC_BUF_LAST(buf));
6313                 }
6314 
6315                 /*
6316                  * Pull the data off of this hdr and attach it to
6317                  * a new anonymous hdr. Also find the last buffer
6318                  * in the hdr's buffer list.
6319                  */
6320                 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
6321                 ASSERT3P(lastbuf, !=, NULL);
6322 
6323                 /*
6324                  * If the current arc_buf_t and the hdr are sharing their data
6325                  * buffer, then we must stop sharing that block.
6326                  */
6327                 if (arc_buf_is_shared(buf)) {
6328                         ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6329                         VERIFY(!arc_buf_is_shared(lastbuf));
6330 
6331                         /*
6332                          * First, sever the block sharing relationship between
6333                          * buf and the arc_buf_hdr_t.
6334                          */
6335                         arc_unshare_buf(hdr, buf);
6336 
6337                         /*
6338                          * Now we need to recreate the hdr's b_pabd. Since we
6339                          * have lastbuf handy, we try to share with it, but if
6340                          * we can't then we allocate a new b_pabd and copy the
6341                          * data from buf into it.
6342                          */
6343                         if (arc_can_share(hdr, lastbuf)) {
6344                                 arc_share_buf(hdr, lastbuf);
6345                         } else {
6346                                 arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
6347                                 abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
6348                                     buf->b_data, psize);
6349                         }
6350                         VERIFY3P(lastbuf->b_data, !=, NULL);
6351                 } else if (HDR_SHARED_DATA(hdr)) {
6352                         /*
6353                          * Uncompressed shared buffers are always at the end
6354                          * of the list. Compressed buffers don't have the
6355                          * same requirements. This makes it hard to
6356                          * simply assert that the lastbuf is shared so
6357                          * we rely on the hdr's compression flags to determine
6358                          * if we have a compressed, shared buffer.
6359                          */
6360                         ASSERT(arc_buf_is_shared(lastbuf) ||
6361                             arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
6362                         ASSERT(!ARC_BUF_SHARED(buf));
6363                 }
6364                 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
6365                 ASSERT3P(state, !=, arc_l2c_only);
6366 
6367                 (void) zfs_refcount_remove_many(&state->arcs_size,
6368                     arc_buf_size(buf), buf);
6369 
6370                 if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6371                         ASSERT3P(state, !=, arc_l2c_only);
6372                         (void) zfs_refcount_remove_many(
6373                             &state->arcs_esize[type],
6374                             arc_buf_size(buf), buf);
6375                 }
6376 
6377                 hdr->b_l1hdr.b_bufcnt -= 1;
6378                 if (ARC_BUF_ENCRYPTED(buf))
6379                         hdr->b_crypt_hdr.b_ebufcnt -= 1;
6380 
6381                 arc_cksum_verify(buf);
6382                 arc_buf_unwatch(buf);
6383 
6384                 /* if that was the last uncompressed buf, free the checksum */
6385                 if (!arc_hdr_has_uncompressed_buf(hdr))
6386                         arc_cksum_free(hdr);
6387 
6388                 mutex_exit(hash_lock);
6389 
6390                 /*
6391                  * Allocate a new hdr. The new hdr will contain a b_pabd
6392                  * buffer which will be freed in arc_write().
6393                  */
6394                 nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
6395                     compress, type, HDR_HAS_RABD(hdr));
6396                 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
6397                 ASSERT0(nhdr->b_l1hdr.b_bufcnt);
6398                 ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
6399                 VERIFY3U(nhdr->b_type, ==, type);
6400                 ASSERT(!HDR_SHARED_DATA(nhdr));
6401 
6402                 nhdr->b_l1hdr.b_buf = buf;
6403                 nhdr->b_l1hdr.b_bufcnt = 1;
6404                 if (ARC_BUF_ENCRYPTED(buf))
6405                         nhdr->b_crypt_hdr.b_ebufcnt = 1;
6406                 (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
6407                 buf->b_hdr = nhdr;
6408 
6409                 mutex_exit(&buf->b_evict_lock);
6410                 (void) zfs_refcount_add_many(&arc_anon->arcs_size,
6411                     arc_buf_size(buf), buf);
6412         } else {
6413                 mutex_exit(&buf->b_evict_lock);
6414                 ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
6415                 /* protected by hash lock, or hdr is on arc_anon */
6416                 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
6417                 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6418                 arc_change_state(arc_anon, hdr, hash_lock);
6419                 hdr->b_l1hdr.b_arc_access = 0;
6420 
6421                 mutex_exit(hash_lock);
6422                 buf_discard_identity(hdr);
6423                 arc_buf_thaw(buf);
6424         }
6425 }
6426 
6427 int
6428 arc_released(arc_buf_t *buf)
6429 {
6430         int anon = 0;   /* becomes 1 iff buf has data and an arc_anon hdr */
6431 
6432         mutex_enter(&buf->b_evict_lock);
6433         if (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon)
6434                 anon = 1;
6435         mutex_exit(&buf->b_evict_lock);
6436         return (anon);
6437 }
6438 
6439 #ifdef ZFS_DEBUG
6440 int
6441 arc_referenced(arc_buf_t *buf)
6442 {
6443         int nrefs;      /* hdr reference count, sampled under b_evict_lock */
6444 
6445         mutex_enter(&buf->b_evict_lock);
6446         nrefs = zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt);
6447         mutex_exit(&buf->b_evict_lock);
6448         return (nrefs);
6449 }
6450 #endif  /* ZFS_DEBUG */
6451 
6452 static void
6453 arc_write_ready(zio_t *zio)
6454 {
6455         arc_write_callback_t *callback = zio->io_private;
6456         arc_buf_t *buf = callback->awcb_buf;
6457         arc_buf_hdr_t *hdr = buf->b_hdr;
6458         blkptr_t *bp = zio->io_bp;
6459         uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
6460 
6461         ASSERT(HDR_HAS_L1HDR(hdr));
6462         ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
6463         ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
6464 
6465         /*
6466          * If we're reexecuting this zio because the pool suspended, then
6467          * cleanup any state that was previously set the first time the
6468          * callback was invoked.
6469          */
6470         if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
6471                 arc_cksum_free(hdr);
6472                 arc_buf_unwatch(buf);
6473                 if (hdr->b_l1hdr.b_pabd != NULL) {
6474                         if (arc_buf_is_shared(buf)) {
6475                                 arc_unshare_buf(hdr, buf);
6476                         } else {
6477                                 arc_hdr_free_pabd(hdr, B_FALSE);
6478                         }
6479                 }
6480 
6481                 if (HDR_HAS_RABD(hdr))
6482                         arc_hdr_free_pabd(hdr, B_TRUE);
6483         }
6484         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6485         ASSERT(!HDR_HAS_RABD(hdr));
6486         ASSERT(!HDR_SHARED_DATA(hdr));
6487         ASSERT(!arc_buf_is_shared(buf));
6488 
6489         callback->awcb_ready(zio, buf, callback->awcb_private);
6490 
6491         if (HDR_IO_IN_PROGRESS(hdr))
6492                 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
6493 
6494         arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6495 
6496         if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
6497                 hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
6498 
6499         if (BP_IS_PROTECTED(bp)) {
6500                 /* ZIL blocks are written through zio_rewrite */
6501                 ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
6502                 ASSERT(HDR_PROTECTED(hdr));
6503 
6504                 if (BP_SHOULD_BYTESWAP(bp)) {
6505                         if (BP_GET_LEVEL(bp) > 0) {
6506                                 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
6507                         } else {
6508                                 hdr->b_l1hdr.b_byteswap =
6509                                     DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
6510                         }
6511                 } else {
6512                         hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
6513                 }
6514 
6515                 hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
6516                 hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
6517                 zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
6518                     hdr->b_crypt_hdr.b_iv);
6519                 zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
6520         }
6521 
6522         /*
6523          * If this block was written for raw encryption but the zio layer
6524          * ended up only authenticating it, adjust the buffer flags now.
6525          */
6526         if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
6527                 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6528                 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6529                 if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
6530                         buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
6531         } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
6532                 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6533                 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
6534         }
6535 
6536         /* this must be done after the buffer flags are adjusted */
6537         arc_cksum_compute(buf);
6538 
6539         enum zio_compress compress;
6540         if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
6541                 compress = ZIO_COMPRESS_OFF;
6542         } else {
6543                 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
6544                 compress = BP_GET_COMPRESS(bp);
6545         }
6546         HDR_SET_PSIZE(hdr, psize);
6547         arc_hdr_set_compress(hdr, compress);
6548 
6549         if (zio->io_error != 0 || psize == 0)
6550                 goto out;
6551 
6552         /*
6553          * Fill the hdr with data. If the buffer is encrypted we have no choice
6554          * but to copy the data into b_rabd. If the hdr is compressed, the data
6555          * we want is available from the zio, otherwise we can take it from
6556          * the buf.
6557          *
6558          * We might be able to share the buf's data with the hdr here. However,
6559          * doing so would cause the ARC to be full of linear ABDs if we write a
6560          * lot of shareable data. As a compromise, we check whether scattered
6561          * ABDs are allowed, and assume that if they are then the user wants
6562          * the ARC to be primarily filled with them regardless of the data being
6563          * written. Therefore, if they're allowed then we allocate one and copy
6564          * the data into it; otherwise, we share the data directly if we can.
6565          */
6566         if (ARC_BUF_ENCRYPTED(buf)) {
6567                 ASSERT3U(psize, >, 0);
6568                 ASSERT(ARC_BUF_COMPRESSED(buf));
6569                 arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
6570                 abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6571         } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
6572                 /*
6573                  * Ideally, we would always copy the io_abd into b_pabd, but the
6574                  * user may have disabled compressed ARC, thus we must check the
6575                  * hdr's compression setting rather than the io_bp's.
6576                  */
6577                 if (BP_IS_ENCRYPTED(bp)) {
6578                         ASSERT3U(psize, >, 0);
6579                         arc_hdr_alloc_pabd(hdr,
6580                             ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
6581                         abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6582                 } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
6583                     !ARC_BUF_COMPRESSED(buf)) {
6584                         ASSERT3U(psize, >, 0);
6585                         arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
6586                         abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
6587                 } else {
6588                         ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
6589                         arc_hdr_alloc_pabd(hdr, ARC_HDR_DO_ADAPT);
6590                         abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
6591                             arc_buf_size(buf));
6592                 }
6593         } else {
6594                 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
6595                 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
6596                 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6597                 arc_share_buf(hdr, buf);
6598         }
6599 
6600 out:
6601         arc_hdr_verify(hdr, bp);
6602 }
6603 
6604 static void
6605 arc_write_children_ready(zio_t *zio)
6606 {
             /*
              * zio "children ready" hook installed by arc_write().  Recovers
              * the arc_write_callback_t stored in io_private and forwards to
              * the caller-supplied awcb_children_ready.  No NULL check is
              * needed here: arc_write() only installs this hook when the
              * caller passed a non-NULL children_ready function.
              */
6607         arc_write_callback_t *callback = zio->io_private;
6608         arc_buf_t *buf = callback->awcb_buf;
6609 
6610         callback->awcb_children_ready(zio, buf, callback->awcb_private);
6611 }
6612 
6613 /*
6614  * The SPA calls this callback for each physical write that happens on behalf
6615  * of a logical write.  See the comment in dbuf_write_physdone() for details.
      *
      * Unlike the ready and done callbacks, which arc_write() asserts are
      * non-NULL, the caller's physdone function is optional, so it is checked
      * before being invoked.
6616  */
6617 static void
6618 arc_write_physdone(zio_t *zio)
6619 {
6620         arc_write_callback_t *cb = zio->io_private;
6621         if (cb->awcb_physdone != NULL)
6622                 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
6623 }
6624 
6625 static void
6626 arc_write_done(zio_t *zio)
6627 {
             /*
              * zio done hook installed by arc_write().  If the write
              * succeeded, record the block's identity (DVA and birth) in the
              * hdr, or discard the identity for a hole or embedded block.  A
              * hdr that ends up with an identity is inserted into the buf
              * hash table.  In every case ARC_FLAG_IO_IN_PROGRESS is cleared
              * and the caller's awcb_done is invoked, after which the
              * callback state allocated by arc_write() is freed.
              */
6628         arc_write_callback_t *callback = zio->io_private;
6629         arc_buf_t *buf = callback->awcb_buf;
6630         arc_buf_hdr_t *hdr = buf->b_hdr;
6631 
6632         ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6633 
6634         if (zio->io_error == 0) {
6635                 arc_hdr_verify(hdr, zio->io_bp);
6636 
6637                 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
6638                         buf_discard_identity(hdr);
6639                 } else {
6640                         hdr->b_dva = *BP_IDENTITY(zio->io_bp);
6641                         hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
6642                 }
6643         } else {
                     /* a failed write must not have given the hdr an identity */
6644                 ASSERT(HDR_EMPTY(hdr));
6645         }
6646 
6647         /*
6648          * If the block to be written was all-zero or compressed enough to be
6649          * embedded in the BP, no write was performed so there will be no
6650          * dva/birth/checksum.  The buffer must therefore remain anonymous
6651          * (and uncached).
6652          */
6653         if (!HDR_EMPTY(hdr)) {
6654                 arc_buf_hdr_t *exists;
6655                 kmutex_t *hash_lock;
6656 
6657                 ASSERT3U(zio->io_error, ==, 0);
6658 
6659                 arc_cksum_verify(buf);
6660 
                     /*
                      * NOTE(review): buf_hash_insert() appears to return with
                      * hash_lock held whether or not a collision was found;
                      * every path below releases it with mutex_exit().  A
                      * non-NULL return is a hdr already hashed under the
                      * same identity.
                      */
6661                 exists = buf_hash_insert(hdr, &hash_lock);
6662                 if (exists != NULL) {
6663                         /*
6664                          * This can only happen if we overwrite for
6665                          * sync-to-convergence, because we remove
6666                          * buffers from the hash table when we arc_free().
6667                          */
6668                         if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
6669                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
6670                                         panic("bad overwrite, hdr=%p exists=%p",
6671                                             (void *)hdr, (void *)exists);
6672                                 ASSERT(zfs_refcount_is_zero(
6673                                     &exists->b_l1hdr.b_refcnt));
                                     /*
                                      * Retire the old, unreferenced hdr and
                                      * insert ours in its place; the second
                                      * insert is expected not to collide.
                                      */
6674                                 arc_change_state(arc_anon, exists, hash_lock);
6675                                 arc_hdr_destroy(exists);
6676                                 mutex_exit(hash_lock);
6677                                 exists = buf_hash_insert(hdr, &hash_lock);
6678                                 ASSERT3P(exists, ==, NULL);
6679                         } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
6680                                 /* nopwrite */
6681                                 ASSERT(zio->io_prop.zp_nopwrite);
6682                                 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
6683                                         panic("bad nopwrite, hdr=%p exists=%p",
6684                                             (void *)hdr, (void *)exists);
6685                         } else {
6686                                 /* Dedup */
6687                                 ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
6688                                 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
6689                                 ASSERT(BP_GET_DEDUP(zio->io_bp));
6690                                 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
6691                         }
6692                 }
6693                 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6694                 /* if it's not anon, we are doing a scrub */
6695                 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
6696                         arc_access(hdr, hash_lock);
6697                 mutex_exit(hash_lock);
6698         } else {
6699                 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6700         }
6701 
6702         ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
6703         callback->awcb_done(zio, buf, callback->awcb_private);
6704 
             /*
              * NOTE(review): io_abd is presumably the ABD that arc_write()
              * created with abd_get_from_buf() -- confirm.  The callback
              * structure was allocated by arc_write() with kmem_zalloc().
              */
6705         abd_put(zio->io_abd);
6706         kmem_free(callback, sizeof (arc_write_callback_t));
6707 }
6708 
6709 zio_t *
6710 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
6711     boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
6712     arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
6713     arc_write_done_func_t *done, void *private, zio_priority_t priority,
6714     int zio_flags, const zbookmark_phys_t *zb)
6715 {
             /*
              * Build the zio that writes buf to bp and return it.
              *
              * ready and done are mandatory; children_ready and physdone may
              * be NULL.  All four, together with private and buf, are saved
              * in an arc_write_callback_t that arc_write_done() frees.  If
              * l2arc is set, the hdr is flagged ARC_FLAG_L2CACHE.
              *
              * zp is copied into localprop so that the write properties can
              * be adjusted for encrypted or compressed bufs without modifying
              * the caller's copy.
              *
              * The hdr must not have an I/O error or an I/O in progress, and
              * any data the hdr currently holds (b_pabd / b_rabd) is released
              * before the zio is created.
              */
6716         arc_buf_hdr_t *hdr = buf->b_hdr;
6717         arc_write_callback_t *callback;
6718         zio_t *zio;
6719         zio_prop_t localprop = *zp;
6720 
6721         ASSERT3P(ready, !=, NULL);
6722         ASSERT3P(done, !=, NULL);
6723         ASSERT(!HDR_IO_ERROR(hdr));
6724         ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6725         ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6726         ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
6727         if (l2arc)
6728                 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6729 
             /*
              * An encrypted buf is written with ZIO_FLAG_RAW: its compression,
              * byte order, salt, IV and MAC are taken from the hdr and passed
              * down in localprop.  A buf that is only compressed is written
              * with ZIO_FLAG_RAW_COMPRESS using the hdr's compression.
              */
6730         if (ARC_BUF_ENCRYPTED(buf)) {
6731                 ASSERT(ARC_BUF_COMPRESSED(buf));
6732                 localprop.zp_encrypt = B_TRUE;
6733                 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
                     /*
                      * arc_write_ready() stores DMU_BSWAP_NUMFUNCS in
                      * b_byteswap when the bp does not need byteswapping,
                      * so that value selects the host byte order here.
                      */
6734                 /* CONSTCOND */
6735                 localprop.zp_byteorder =
6736                     (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
6737                     ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
6738                 bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
6739                     ZIO_DATA_SALT_LEN);
6740                 bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
6741                     ZIO_DATA_IV_LEN);
6742                 bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
6743                     ZIO_DATA_MAC_LEN);
                     /*
                      * For encrypted object types, disable nopwrite and cap
                      * the number of copies at SPA_DVAS_PER_BP - 1.
                      */
6744                 if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
6745                         localprop.zp_nopwrite = B_FALSE;
6746                         localprop.zp_copies =
6747                             MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
6748                 }
6749                 zio_flags |= ZIO_FLAG_RAW;
6750         } else if (ARC_BUF_COMPRESSED(buf)) {
6751                 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
6752                 localprop.zp_compress = HDR_GET_COMPRESS(hdr);
6753                 zio_flags |= ZIO_FLAG_RAW_COMPRESS;
6754         }
6755 
             /* per-write state handed to the arc_write_*() zio hooks */
6756         callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
6757         callback->awcb_ready = ready;
6758         callback->awcb_children_ready = children_ready;
6759         callback->awcb_physdone = physdone;
6760         callback->awcb_done = done;
6761         callback->awcb_private = private;
6762         callback->awcb_buf = buf;
6763 
6764         /*
6765          * The hdr's b_pabd is now stale, free it now. A new data block
6766          * will be allocated when the zio pipeline calls arc_write_ready().
6767          */
6768         if (hdr->b_l1hdr.b_pabd != NULL) {
6769                 /*
6770                  * If the buf is currently sharing the data block with
6771                  * the hdr then we need to break that relationship here.
6772                  * The hdr will remain with a NULL data pointer and the
6773                  * buf will take sole ownership of the block.
6774                  */
6775                 if (arc_buf_is_shared(buf)) {
6776                         arc_unshare_buf(hdr, buf);
6777                 } else {
6778                         arc_hdr_free_pabd(hdr, B_FALSE);
6779                 }
6780                 VERIFY3P(buf->b_data, !=, NULL);
6781         }
6782 
             /* likewise drop any raw (b_rabd) copy held by the hdr */
6783         if (HDR_HAS_RABD(hdr))
6784                 arc_hdr_free_pabd(hdr, B_TRUE);
6785 
             /*
              * When no ZIO_FLAG_RAW bit is set in zio_flags, reset the hdr's
              * compression; arc_write_ready() sets it again from the bp.
              */
6786         if (!(zio_flags & ZIO_FLAG_RAW))
6787                 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
6788 
6789         ASSERT(!arc_buf_is_shared(buf));
6790         ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6791 
             /*
              * The children-ready hook is installed only when the caller
              * supplied one; the other three hooks are always installed.
              * NOTE(review): the ABD built from buf->b_data here is presumably
              * the one arc_write_done() releases with abd_put() -- confirm.
              */
6792         zio = zio_write(pio, spa, txg, bp,
6793             abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
6794             HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
6795             (children_ready != NULL) ? arc_write_children_ready : NULL,
6796             arc_write_physdone, arc_write_done, callback,
6797             priority, zio_flags, zb);
6798 
6799         return (zio);
6800 }
6801 
6802 static int
6803 arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
6804 {
             /*
              * Decide whether a reservation of `reserve` bytes for `txg` should
              * be held back because free memory is low.
              *
              * Returns 0 when the caller may proceed, ERESTART when the
              * pageout process has already accumulated too much page load on
              * this spa, and EAGAIN for any other caller when page load is
              * outstanding and arc_reclaim_needed() is true.  When _KERNEL is
              * not defined this always returns 0.
              */
6805 #ifdef _KERNEL
6806         uint64_t available_memory = ptob(freemem);
6807 
6808 #if defined(__i386)
             /* on i386, free heap arena space also bounds available memory */
6809         available_memory =
6810             MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
6811 #endif
6812 
             /* more than arc_lotsfree_percent of physmem is free: no throttle */
6813         if (freemem > physmem * arc_lotsfree_percent / 100)
6814                 return (0);
6815 
             /* the page load counter is tracked per txg; restart it on a new txg */
6816         if (txg > spa->spa_lowmem_last_txg) {
6817                 spa->spa_lowmem_last_txg = txg;
6818                 spa->spa_lowmem_page_load = 0;
6819         }
6820         /*
6821          * If we are in pageout, we know that memory is already tight,
6822          * the arc is already going to be evicting, so we just want to
6823          * continue to let page writes occur as quickly as possible.
6824          */
6825         if (curproc == proc_pageout) {
                     /*
                      * Fail once the accumulated load exceeds a quarter of
                      * the larger of ptob(minfree) and available_memory.
                      */
6826                 if (spa->spa_lowmem_page_load >
6827                     MAX(ptob(minfree), available_memory) / 4)
6828                         return (SET_ERROR(ERESTART));
6829                 /* Note: reserve is inflated, so we deflate */
6830                 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
6831                 return (0);
6832         } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
6833                 /* memory is low, delay before restarting */
6834                 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
6835                 return (SET_ERROR(EAGAIN));
6836         }
             /* not throttling: forget any page load accumulated so far */
6837         spa->spa_lowmem_page_load = 0;
6838 #endif /* _KERNEL */
6839         return (0);
6840 }
6841 
6842 void
6843 arc_tempreserve_clear(uint64_t reserve)
6844 {
             /*
              * Give back `reserve` bytes that arc_tempreserve_space() added to
              * arc_tempreserve.  Adding the negated value subtracts it; the
              * assertion catches the counter being driven below zero.
              */
6845         atomic_add_64(&arc_tempreserve, -reserve);
6846         ASSERT((int64_t)arc_tempreserve >= 0);
6847 }
6848 
6849 int
6850 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
6851 {
             /*
              * Try to reserve `reserve` bytes of ARC space for dirty data on
              * behalf of `spa` in `txg`.
              *
              * Returns 0 after adding reserve to arc_tempreserve, ENOMEM if
              * reserve is larger than arc_c, the error from
              * arc_memory_throttle() if it throttles, or ERESTART if the
              * dirty-data limits below are exceeded.  A successful
              * reservation is undone with arc_tempreserve_clear().
              */
6852         int error;
6853         uint64_t anon_size;
6854 
             /*
              * If the request is more than a quarter of arc_c and arc_no_grow
              * is not set, raise arc_c to four times the request, capped at
              * arc_c_max.
              */
6855         if (reserve > arc_c/4 && !arc_no_grow)
6856                 arc_c = MIN(arc_c_max, reserve * 4);
6857         if (reserve > arc_c)
6858                 return (SET_ERROR(ENOMEM));
6859 
6860         /*
6861          * Don't count loaned bufs as in flight dirty data to prevent long
6862          * network delays from blocking transactions that are ready to be
6863          * assigned to a txg.
6864          */
6865 
6866         /* assert that it has not wrapped around */
6867         ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
6868 
             /* anonymous size net of loaned bytes, clamped at zero */
6869         anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
6870             arc_loaned_bytes), 0);
6871 
6872         /*
6873          * Writes will, almost always, require additional memory allocations
6874          * in order to compress/encrypt/etc the data.  We therefore need to
6875          * make sure that there is sufficient available memory for this.
6876          */
6877         error = arc_memory_throttle(spa, reserve, txg);
6878         if (error != 0)
6879                 return (error);
6880 
6881         /*
6882          * Throttle writes when the amount of dirty data in the cache
6883          * gets too large.  We try to keep the cache less than half full
6884          * of dirty blocks so that our sync times don't grow too large.
6885          *
6886          * In the case of one pool being built on another pool, we want
6887          * to make sure we don't end up throttling the lower (backing)
6888          * pool when the upper pool is the majority contributor to dirty
6889          * data. To ensure we make forward progress during throttling, we
6890          * also check the current pool's net dirty data and only throttle
6891          * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
6892          * data in the cache.
6893          *
6894          * Note: if two requests come in concurrently, we might let them
6895          * both succeed, when one of them should fail.  Not a huge deal.
6896          */
6897         uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
6898         uint64_t spa_dirty_anon = spa_dirty_data(spa);
6899 
             /* all three limits must be exceeded before the request fails */
6900         if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
6901             anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
6902             spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
                     /* the evictable sizes are gathered only for the debug message */
6903                 uint64_t meta_esize =
6904                     zfs_refcount_count(
6905                     &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6906                 uint64_t data_esize =
6907                     zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6908                 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
6909                     "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
6910                     arc_tempreserve >> 10, meta_esize >> 10,
6911                     data_esize >> 10, reserve >> 10, arc_c >> 10);
6912                 return (SET_ERROR(ERESTART));
6913         }
6914         atomic_add_64(&arc_tempreserve, reserve);
6915         return (0);
6916 }
6917 
6918 static void
6919 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
6920     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
6921 {
             /*
              * Copy the current refcount totals of one ARC state into three
              * named kstats: arcs_size into `size`, and the data and metadata
              * entries of arcs_esize into `evict_data` and `evict_metadata`.
              */
6922         size->value.ui64 = zfs_refcount_count(&state->arcs_size);
6923         evict_data->value.ui64 =
6924             zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
6925         evict_metadata->value.ui64 =
6926             zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
6927 }
6928 
6929 static int
6930 arc_kstat_update(kstat_t *ksp, int rw)
6931 {
             /*
              * Refresh the ARC statistics held in ksp->ks_data.
              * NOTE(review): presumably installed as the kstat's update
              * callback -- the registration is not in this part of the file.
              *
              * Writes are refused with EACCES.  For any other rw value, the
              * size and evictable data/metadata stats of the anon, mru,
              * mru_ghost, mfu and mfu_ghost states are refreshed, the
              * aggsum-backed totals are read out with aggsum_value(), and 0
              * is returned.
              */
6932         arc_stats_t *as = ksp->ks_data;
6933 
6934         if (rw == KSTAT_WRITE) {
6935                 return (EACCES);
6936         } else {
6937                 arc_kstat_update_state(arc_anon,
6938                     &as->arcstat_anon_size,
6939                     &as->arcstat_anon_evictable_data,
6940                     &as->arcstat_anon_evictable_metadata);
6941                 arc_kstat_update_state(arc_mru,
6942                     &as->arcstat_mru_size,
6943                     &as->arcstat_mru_evictable_data,
6944                     &as->arcstat_mru_evictable_metadata);
6945                 arc_kstat_update_state(arc_mru_ghost,
6946                     &as->arcstat_mru_ghost_size,
6947                     &as->arcstat_mru_ghost_evictable_data,
6948                     &as->arcstat_mru_ghost_evictable_metadata);
6949                 arc_kstat_update_state(arc_mfu,
6950                     &as->arcstat_mfu_size,
6951                     &as->arcstat_mfu_evictable_data,
6952                     &as->arcstat_mfu_evictable_metadata);
6953                 arc_kstat_update_state(arc_mfu_ghost,
6954                     &as->arcstat_mfu_ghost_size,
6955                     &as->arcstat_mfu_ghost_evictable_data,
6956                     &as->arcstat_mfu_ghost_evictable_metadata);
6957 
6958                 ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
6959                 ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
6960                 ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
6961                 ARCSTAT(arcstat_metadata_size) =
6962                     aggsum_value(&astat_metadata_size);
6963                 ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
6964                 ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size);
6965                 ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
6966         }
6967 
6968         return (0);
6969 }
6970 
6971 /*
6972  * This function *must* return indices evenly distributed between all
6973  * sublists of the multilist. This is needed due to how the ARC eviction
6974  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
6975  * distributed between all sublists and uses this assumption when
6976  * deciding which sublist to evict from and how much to evict from it.
      *
      * ml is the multilist being indexed and obj is the arc_buf_hdr_t to be
      * placed; the result is the hdr's buf_hash() value modulo the number of
      * sublists in ml.  arc_state_init() passes this function to every
      * multilist_create() call it makes.
6977  */
6978 unsigned int
6979 arc_state_multilist_index_func(multilist_t *ml, void *obj)
6980 {
6981         arc_buf_hdr_t *hdr = obj;
6982 
6983         /*
6984          * We rely on b_dva to generate evenly distributed index
6985          * numbers using buf_hash below. So, as an added precaution,
6986          * let's make sure we never add empty buffers to the arc lists.
6987          */
6988         ASSERT(!HDR_EMPTY(hdr));
6989 
6990         /*
6991          * The assumption here, is the hash value for a given
6992          * arc_buf_hdr_t will remain constant throughout its lifetime
6993          * (i.e. its b_spa, b_dva, and b_birth fields don't change).
6994          * Thus, we don't need to store the header's sublist index
6995          * on insertion, as this index can be recalculated on removal.
6996          *
6997          * Also, the low order bits of the hash value are thought to be
6998          * distributed evenly. Otherwise, in the case that the multilist
6999          * has a power of two number of sublists, each sublist's usage
7000          * would not be evenly distributed.
7001          */
7002         return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
7003             multilist_get_num_sublists(ml));
7004 }
7005 
7006 static void
7007 arc_state_init(void)
7008 {
             /*
              * Initialize the six ARC states (anon, mru, mru_ghost, mfu,
              * mfu_ghost, l2c_only) and the global size counters.  Everything
              * created here is destroyed by arc_state_fini().
              */

             /* point the state globals at their backing ARC_* objects */
7009         arc_anon = &ARC_anon;
7010         arc_mru = &ARC_mru;
7011         arc_mru_ghost = &ARC_mru_ghost;
7012         arc_mfu = &ARC_mfu;
7013         arc_mfu_ghost = &ARC_mfu_ghost;
7014         arc_l2c_only = &ARC_l2c_only;
7015 
             /*
              * Create a metadata and a data multilist for every state except
              * arc_anon, which gets no lists here.  Each list links hdrs
              * through b_l1hdr.b_arc_node and picks sublists with
              * arc_state_multilist_index_func().
              */
7016         arc_mru->arcs_list[ARC_BUFC_METADATA] =
7017             multilist_create(sizeof (arc_buf_hdr_t),
7018             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7019             arc_state_multilist_index_func);
7020         arc_mru->arcs_list[ARC_BUFC_DATA] =
7021             multilist_create(sizeof (arc_buf_hdr_t),
7022             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7023             arc_state_multilist_index_func);
7024         arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
7025             multilist_create(sizeof (arc_buf_hdr_t),
7026             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7027             arc_state_multilist_index_func);
7028         arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
7029             multilist_create(sizeof (arc_buf_hdr_t),
7030             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7031             arc_state_multilist_index_func);
7032         arc_mfu->arcs_list[ARC_BUFC_METADATA] =
7033             multilist_create(sizeof (arc_buf_hdr_t),
7034             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7035             arc_state_multilist_index_func);
7036         arc_mfu->arcs_list[ARC_BUFC_DATA] =
7037             multilist_create(sizeof (arc_buf_hdr_t),
7038             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7039             arc_state_multilist_index_func);
7040         arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
7041             multilist_create(sizeof (arc_buf_hdr_t),
7042             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7043             arc_state_multilist_index_func);
7044         arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
7045             multilist_create(sizeof (arc_buf_hdr_t),
7046             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7047             arc_state_multilist_index_func);
7048         arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
7049             multilist_create(sizeof (arc_buf_hdr_t),
7050             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7051             arc_state_multilist_index_func);
7052         arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
7053             multilist_create(sizeof (arc_buf_hdr_t),
7054             offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7055             arc_state_multilist_index_func);
7056 
             /* per-state, per-type arcs_esize refcounts (all six states) */
7057         zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7058         zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7059         zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7060         zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7061         zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7062         zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7063         zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7064         zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7065         zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7066         zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7067         zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7068         zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7069 
             /* per-state arcs_size refcounts */
7070         zfs_refcount_create(&arc_anon->arcs_size);
7071         zfs_refcount_create(&arc_mru->arcs_size);
7072         zfs_refcount_create(&arc_mru_ghost->arcs_size);
7073         zfs_refcount_create(&arc_mfu->arcs_size);
7074         zfs_refcount_create(&arc_mfu_ghost->arcs_size);
7075         zfs_refcount_create(&arc_l2c_only->arcs_size);
7076 
             /* global aggsum counters, all starting at zero */
7077         aggsum_init(&arc_meta_used, 0);
7078         aggsum_init(&arc_size, 0);
7079         aggsum_init(&astat_data_size, 0);
7080         aggsum_init(&astat_metadata_size, 0);
7081         aggsum_init(&astat_hdr_size, 0);
7082         aggsum_init(&astat_other_size, 0);
7083         aggsum_init(&astat_l2_hdr_size, 0);
7084 
             /* tag each state with its ARC_STATE_* identifier */
7085         arc_anon->arcs_state = ARC_STATE_ANON;
7086         arc_mru->arcs_state = ARC_STATE_MRU;
7087         arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
7088         arc_mfu->arcs_state = ARC_STATE_MFU;
7089         arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
7090         arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
7091 }
7092 
7093 static void
7094 arc_state_fini(void)
7095 {
             /*
              * Tear down what arc_state_init() set up: the per-state
              * refcounts, the per-state multilists and the global aggsum
              * counters.
              */

             /* per-state, per-type arcs_esize refcounts */
7096         zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7097         zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7098         zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7099         zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7100         zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7101         zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7102         zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7103         zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7104         zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7105         zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7106         zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7107         zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7108 
             /* per-state arcs_size refcounts */
7109         zfs_refcount_destroy(&arc_anon->arcs_size);
7110         zfs_refcount_destroy(&arc_mru->arcs_size);
7111         zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
7112         zfs_refcount_destroy(&arc_mfu->arcs_size);
7113         zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
7114         zfs_refcount_destroy(&arc_l2c_only->arcs_size);
7115 
             /*
              * The same ten multilists that arc_state_init() created;
              * arc_anon has none.
              */
7116         multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
7117         multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
7118         multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
7119         multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
7120         multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
7121         multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
7122         multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
7123         multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
7124         multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
7125         multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
7126 
             /* global aggsum counters */
7127         aggsum_fini(&arc_meta_used);
7128         aggsum_fini(&arc_size);
7129         aggsum_fini(&astat_data_size);
7130         aggsum_fini(&astat_metadata_size);
7131         aggsum_fini(&astat_hdr_size);
7132         aggsum_fini(&astat_other_size);
7133         aggsum_fini(&astat_l2_hdr_size);
7134 
7135 }
7136 
7137 uint64_t
7138 arc_max_bytes(void)
7139 {
             /* Accessor for arc_c_max, the maximum ARC size set in arc_init(). */
7140         return (arc_c_max);
7141 }
7142 
7143 void
7144 arc_init(void)
7145 {
7146         /*
7147          * allmem is "all memory that we could possibly use".
7148          */
7149 #ifdef _KERNEL
7150         uint64_t allmem = ptob(physmem - swapfs_minfree);
7151 #else
7152         uint64_t allmem = (physmem * PAGESIZE) / 2;
7153 #endif
7154         mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
7155         cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
7156 
7157         /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
7158         arc_c_min = MAX(allmem / 32, 64 << 20);
7159         /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
7160         if (allmem >= 1 << 30)
7161                 arc_c_max = allmem - (1 << 30);
7162         else
7163                 arc_c_max = arc_c_min;
7164         arc_c_max = MAX(allmem * 3 / 4, arc_c_max);
7165 
7166         /*
7167          * In userland, there's only the memory pressure that we artificially
7168          * create (see arc_available_memory()).  Don't let arc_c get too
7169          * small, because it can cause transactions to be larger than
7170          * arc_c, causing arc_tempreserve_space() to fail.
7171          */
7172 #ifndef _KERNEL
7173         arc_c_min = arc_c_max / 2;
7174 #endif
7175 
7176         /*
7177          * Allow the tunables to override our calculations if they are
7178          * reasonable (ie. over 64MB)
7179          */
7180         if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem) {
7181                 arc_c_max = zfs_arc_max;
7182                 arc_c_min = MIN(arc_c_min, arc_c_max);
7183         }
7184         if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max)
7185                 arc_c_min = zfs_arc_min;
7186 
7187         arc_c = arc_c_max;
7188         arc_p = (arc_c >> 1);
7189 
7190         /* limit meta-data to 1/4 of the arc capacity */
7191         arc_meta_limit = arc_c_max / 4;
7192 
7193 #ifdef _KERNEL
7194         /*
7195          * Metadata is stored in the kernel's heap.  Don't let us
7196          * use more than half the heap for the ARC.
7197          */
7198         arc_meta_limit = MIN(arc_meta_limit,
7199             vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
7200 #endif
7201 
7202         /* Allow the tunable to override if it is reasonable */
7203         if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
7204                 arc_meta_limit = zfs_arc_meta_limit;
7205 
7206         if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
7207                 arc_c_min = arc_meta_limit / 2;
7208 
7209         /* On larger-memory machines, we clamp the minimum at 1GB */
7210         if (zfs_arc_min == 0)
7211                 arc_c_min = MIN(arc_c_min, (1 << 30));
7212 
7213         if (zfs_arc_meta_min > 0) {
7214                 arc_meta_min = zfs_arc_meta_min;
7215         } else {
7216                 arc_meta_min = arc_c_min / 2;
7217         }
7218 
7219         if (zfs_arc_grow_retry > 0)
7220                 arc_grow_retry = zfs_arc_grow_retry;
7221 
7222         if (zfs_arc_shrink_shift > 0)
7223                 arc_shrink_shift = zfs_arc_shrink_shift;
7224 
7225         /*
7226          * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
7227          */
7228         if (arc_no_grow_shift >= arc_shrink_shift)
7229                 arc_no_grow_shift = arc_shrink_shift - 1;
7230 
7231         if (zfs_arc_p_min_shift > 0)
7232                 arc_p_min_shift = zfs_arc_p_min_shift;
7233 
7234         /* if kmem_flags are set, lets try to use less memory */
7235         if (kmem_debugging())
7236                 arc_c = arc_c / 2;
7237         if (arc_c < arc_c_min)
7238                 arc_c = arc_c_min;
7239 
7240         arc_state_init();
7241 
7242         /*
7243          * The arc must be "uninitialized", so that hdr_recl() (which is
7244          * registered by buf_init()) will not access arc_reap_zthr before
7245          * it is created.
7246          */
7247         ASSERT(!arc_initialized);
7248         buf_init();
7249 
7250         arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
7251             sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
7252 
7253         if (arc_ksp != NULL) {
7254                 arc_ksp->ks_data = &arc_stats;
7255                 arc_ksp->ks_update = arc_kstat_update;
7256                 kstat_install(arc_ksp);
7257         }
7258 
7259         arc_adjust_zthr = zthr_create(arc_adjust_cb_check,
7260             arc_adjust_cb, NULL);
7261         arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
7262             arc_reap_cb, NULL, SEC2NSEC(1));
7263 
7264         arc_initialized = B_TRUE;
7265         arc_warm = B_FALSE;
7266 
7267         /*
7268          * Calculate maximum amount of dirty data per pool.
7269          *
7270          * If it has been set by /etc/system, take that.
7271          * Otherwise, use a percentage of physical memory defined by
7272          * zfs_dirty_data_max_percent (default 10%) with a cap at
7273          * zfs_dirty_data_max_max (default 4GB).
7274          */
7275         if (zfs_dirty_data_max == 0) {
7276                 zfs_dirty_data_max = physmem * PAGESIZE *
7277                     zfs_dirty_data_max_percent / 100;
7278                 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
7279                     zfs_dirty_data_max_max);
7280         }
7281 }
7282 
/*
 * Tear down the ARC.  Flushes every cached buffer, marks the ARC as
 * uninitialized, deletes the kstat held in arc_ksp (if one was created),
 * cancels and destroys the adjust and reap zthrs, destroys the adjust lock
 * and its condition variable, and finally calls buf_fini() followed by
 * arc_state_fini().  The order of these steps matters; see the comments
 * below.
 */
void
arc_fini(void)
{
        /* Use B_TRUE to ensure *all* buffers are evicted */
        arc_flush(NULL, B_TRUE);

        arc_initialized = B_FALSE;

        if (arc_ksp != NULL) {
                kstat_delete(arc_ksp);
                arc_ksp = NULL;
        }

        /* Each zthr is cancelled before it is destroyed. */
        (void) zthr_cancel(arc_adjust_zthr);
        zthr_destroy(arc_adjust_zthr);

        (void) zthr_cancel(arc_reap_zthr);
        zthr_destroy(arc_reap_zthr);

        mutex_destroy(&arc_adjust_lock);
        cv_destroy(&arc_adjust_waiters_cv);

        /*
         * buf_fini() must precede arc_state_fini() because buf_fini() may
         * trigger the release of kmem magazines, which can callback to
         * arc_space_return() which accesses aggsums freed in arc_state_fini().
         */
        buf_fini();
        arc_state_fini();

        /* Nothing may still be on loan once everything has been torn down. */
        ASSERT0(arc_loaned_bytes);
}
7315 
7316 /*
7317  * Level 2 ARC
7318  *
7319  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
7320  * It uses dedicated storage devices to hold cached data, which are populated
7321  * using large infrequent writes.  The main role of this cache is to boost
7322  * the performance of random read workloads.  The intended L2ARC devices
7323  * include short-stroked disks, solid state disks, and other media with
7324  * substantially faster read latency than disk.
7325  *
7326  *                 +-----------------------+
7327  *                 |         ARC           |
7328  *                 +-----------------------+
7329  *                    |         ^     ^
7330  *                    |         |     |
7331  *      l2arc_feed_thread()    arc_read()
7332  *                    |         |     |
7333  *                    |  l2arc read   |
7334  *                    V         |     |
7335  *               +---------------+    |
7336  *               |     L2ARC     |    |
7337  *               +---------------+    |
7338  *                   |    ^           |
7339  *          l2arc_write() |           |
7340  *                   |    |           |
7341  *                   V    |           |
7342  *                 +-------+      +-------+
7343  *                 | vdev  |      | vdev  |
7344  *                 | cache |      | cache |
7345  *                 +-------+      +-------+
7346  *                 +=========+     .-----.
7347  *                 :  L2ARC  :    |-_____-|
7348  *                 : devices :    | Disks |
7349  *                 +=========+    `-_____-'
7350  *
7351  * Read requests are satisfied from the following sources, in order:
7352  *
7353  *      1) ARC
7354  *      2) vdev cache of L2ARC devices
7355  *      3) L2ARC devices
7356  *      4) vdev cache of disks
7357  *      5) disks
7358  *
 * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
 * the L2ARC and traditional cache design:
7362  *
7363  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
7364  * the ARC behave as usual, freeing buffers and placing headers on ghost
7365  * lists.  The ARC does not send buffers to the L2ARC during eviction as
7366  * this would add inflated write latencies for all ARC memory pressure.
7367  *
7368  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
7369  * It does this by periodically scanning buffers from the eviction-end of
7370  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
7371  * not already there. It scans until a headroom of buffers is satisfied,
7372  * which itself is a buffer for ARC eviction. If a compressible buffer is
7373  * found during scanning and selected for writing to an L2ARC device, we
7374  * temporarily boost scanning headroom during the next scan cycle to make
7375  * sure we adapt to compression effects (which might significantly reduce
7376  * the data volume we write to L2ARC). The thread that does this is
7377  * l2arc_feed_thread(), illustrated below; example sizes are included to
7378  * provide a better sense of ratio than this diagram:
7379  *
7380  *             head -->                        tail
7381  *              +---------------------+----------+
7382  *      ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
7383  *              +---------------------+----------+   |   o L2ARC eligible
7384  *      ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
7385  *              +---------------------+----------+   |
7386  *                   15.9 Gbytes      ^ 32 Mbytes    |
7387  *                                 headroom          |
7388  *                                            l2arc_feed_thread()
7389  *                                                   |
7390  *                       l2arc write hand <--[oooo]--'
7391  *                               |           8 Mbyte
7392  *                               |          write max
7393  *                               V
7394  *                +==============================+
7395  *      L2ARC dev |####|#|###|###|    |####| ... |
7396  *                +==============================+
7397  *                           32 Gbytes
7398  *
7399  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
7400  * evicted, then the L2ARC has cached a buffer much sooner than it probably
7401  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
7402  * safe to say that this is an uncommon case, since buffers at the end of
7403  * the ARC lists have moved there due to inactivity.
7404  *
7405  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
7406  * then the L2ARC simply misses copying some buffers.  This serves as a
7407  * pressure valve to prevent heavy read workloads from both stalling the ARC
7408  * with waits and clogging the L2ARC with writes.  This also helps prevent
7409  * the potential for the L2ARC to churn if it attempts to cache content too
7410  * quickly, such as during backups of the entire pool.
7411  *
 * 5. After system boot and before the ARC has filled main memory, there are
 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
 * lists can remain mostly static.  Instead of searching from the tail of
 * these lists as pictured, the l2arc_feed_thread() will search from the list
 * heads for eligible buffers, greatly increasing its chance of finding them.
7417  *
7418  * The L2ARC device write speed is also boosted during this time so that
7419  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
7420  * there are no L2ARC reads, and no fear of degrading read performance
7421  * through increased writes.
7422  *
7423  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
7424  * the vdev queue can aggregate them into larger and fewer writes.  Each
7425  * device is written to in a rotor fashion, sweeping writes through
7426  * available space then repeating.
7427  *
7428  * 7. The L2ARC does not store dirty content.  It never needs to flush
7429  * write buffers back to disk based storage.
7430  *
7431  * 8. If an ARC buffer is written (and dirtied) which also exists in the
7432  * L2ARC, the now stale L2ARC buffer is immediately dropped.
7433  *
7434  * The performance of the L2ARC can be tweaked by a number of tunables, which
7435  * may be necessary for different workloads:
7436  *
7437  *      l2arc_write_max         max write bytes per interval
7438  *      l2arc_write_boost       extra write bytes during device warmup
7439  *      l2arc_noprefetch        skip caching prefetched buffers
7440  *      l2arc_headroom          number of max device writes to precache
7441  *      l2arc_headroom_boost    when we find compressed buffers during ARC
7442  *                              scanning, we multiply headroom by this
7443  *                              percentage factor for the next scan cycle,
7444  *                              since more compressed buffers are likely to
7445  *                              be present
7446  *      l2arc_feed_secs         seconds between L2ARC writing
7447  *
7448  * Tunables may be removed or added as future performance improvements are
7449  * integrated, and also may become zpool properties.
7450  *
7451  * There are three key functions that control how the L2ARC warms up:
7452  *
7453  *      l2arc_write_eligible()  check if a buffer is eligible to cache
7454  *      l2arc_write_size()      calculate how much to write
7455  *      l2arc_write_interval()  calculate sleep delay between writes
7456  *
7457  * These three functions determine what to write, how much, and how quickly
7458  * to send writes.
7459  *
7460  * L2ARC persistence:
7461  *
7462  * When writing buffers to L2ARC, we periodically add some metadata to
7463  * make sure we can pick them up after reboot, thus dramatically reducing
7464  * the impact that any downtime has on the performance of storage systems
7465  * with large caches.
7466  *
7467  * The implementation works fairly simply by integrating the following two
7468  * modifications:
7469  *
7470  * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
7471  *    which is an additional piece of metadata which describes what's been
7472  *    written. This allows us to rebuild the arc_buf_hdr_t structures of the
7473  *    main ARC buffers. There are 2 linked-lists of log blocks headed by
7474  *    dh_start_lbps[2]. We alternate which chain we append to, so they are
7475  *    time-wise and offset-wise interleaved, but that is an optimization rather
7476  *    than for correctness. The log block also includes a pointer to the
7477  *    previous block in its chain.
7478  *
7479  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
7480  *    for our header bookkeeping purposes. This contains a device header,
7481  *    which contains our top-level reference structures. We update it each
7482  *    time we write a new log block, so that we're able to locate it in the
7483  *    L2ARC device. If this write results in an inconsistent device header
7484  *    (e.g. due to power failure), we detect this by verifying the header's
7485  *    checksum and simply fail to reconstruct the L2ARC after reboot.
7486  *
7487  * Implementation diagram:
7488  *
7489  * +=== L2ARC device (not to scale) ======================================+
7490  * |       ___two newest log block pointers__.__________                  |
7491  * |      /                                   \dh_start_lbps[1]           |
7492  * |     /                                     \         \dh_start_lbps[0]|
7493  * |.___/__.                                    V         V               |
7494  * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
7495  * ||   hdr|      ^         /^       /^        /         /                |
7496  * |+------+  ...--\-------/  \-----/--\------/         /                 |
7497  * |                \--------------/    \--------------/                  |
7498  * +======================================================================+
7499  *
 * As can be seen on the diagram, rather than using a simple linked list,
 * we use a pair of linked lists with alternating elements. This is a
 * performance enhancement: we only find out the address of the next log
 * block once the current block has been completely read in. With a
 * single list this would hurt performance, because we'd be keeping the
 * device's I/O queue only 1 operation deep, thus incurring a large
 * amount of I/O round-trip latency. Having two lists allows us to fetch
 * two log blocks ahead of where we are currently rebuilding L2ARC
 * buffers.
7509  *
7510  * On-device data structures:
7511  *
7512  * L2ARC device header: l2arc_dev_hdr_phys_t
7513  * L2ARC log block:     l2arc_log_blk_phys_t
7514  *
7515  * L2ARC reconstruction:
7516  *
7517  * When writing data, we simply write in the standard rotary fashion,
7518  * evicting buffers as we go and simply writing new data over them (writing
7519  * a new log block every now and then). This obviously means that once we
7520  * loop around the end of the device, we will start cutting into an already
7521  * committed log block (and its referenced data buffers), like so:
7522  *
7523  *    current write head__       __old tail
7524  *                        \     /
7525  *                        V    V
7526  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
7527  *                         ^    ^^^^^^^^^___________________________________
7528  *                         |                                                \
7529  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
7530  *
7531  * When importing the pool, we detect this situation and use it to stop
7532  * our scanning process (see l2arc_rebuild).
7533  *
7534  * There is one significant caveat to consider when rebuilding ARC contents
7535  * from an L2ARC device: what about invalidated buffers? Given the above
7536  * construction, we cannot update blocks which we've already written to amend
7537  * them to remove buffers which were invalidated. Thus, during reconstruction,
7538  * we might be populating the cache with buffers for data that's not on the
7539  * main pool anymore, or may have been overwritten!
7540  *
7541  * As it turns out, this isn't a problem. Every arc_read request includes
7542  * both the DVA and, crucially, the birth TXG of the BP the caller is
7543  * looking for. So even if the cache were populated by completely rotten
7544  * blocks for data that had been long deleted and/or overwritten, we'll
7545  * never actually return bad data from the cache, since the DVA with the
7546  * birth TXG uniquely identify a block in space and time - once created,
7547  * a block is immutable on disk. The worst thing we have done is wasted
7548  * some time and memory at l2arc rebuild to reconstruct outdated ARC
7549  * entries that will get dropped from the l2arc as it is being updated
7550  * with new blocks.
7551  *
7552  * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
7553  * hand are not restored. This is done by saving the offset (in bytes)
7554  * l2arc_evict() has evicted to in the L2ARC device header and taking it
7555  * into account when restoring buffers.
7556  */
7557 
7558 static boolean_t
7559 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
7560 {
7561         /*
7562          * A buffer is *not* eligible for the L2ARC if it:
7563          * 1. belongs to a different spa.
7564          * 2. is already cached on the L2ARC.
7565          * 3. has an I/O in progress (it may be an incomplete read).
7566          * 4. is flagged not eligible (zfs property).
7567          * 5. is a prefetch and l2arc_noprefetch is set.
7568          */
7569         if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
7570             HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr) ||
7571             (l2arc_noprefetch && HDR_PREFETCH(hdr)))
7572                 return (B_FALSE);
7573 
7574         return (B_TRUE);
7575 }
7576 
7577 static uint64_t
7578 l2arc_write_size(l2arc_dev_t *dev)
7579 {
7580         uint64_t size, dev_size;
7581 
7582         /*
7583          * Make sure our globals have meaningful values in case the user
7584          * altered them.
7585          */
7586         size = l2arc_write_max;
7587         if (size == 0) {
7588                 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
7589                     "be greater than zero, resetting it to the default (%d)",
7590                     L2ARC_WRITE_SIZE);
7591                 size = l2arc_write_max = L2ARC_WRITE_SIZE;
7592         }
7593 
7594         if (arc_warm == B_FALSE)
7595                 size += l2arc_write_boost;
7596 
7597         /*
7598          * Make sure the write size does not exceed the size of the cache
7599          * device. This is important in l2arc_evict(), otherwise infinite
7600          * iteration can occur.
7601          */
7602         dev_size = dev->l2ad_end - dev->l2ad_start;
7603         if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) {
7604                 cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
7605                     "plus the overhead of log blocks (persistent L2ARC, "
7606                     "%" PRIu64 " bytes) exceeds the size of the cache device "
7607                     "(guid %" PRIu64 "), resetting them to the default (%d)",
7608                     l2arc_log_blk_overhead(size, dev),
7609                     dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
7610                 size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
7611 
7612                 if (arc_warm == B_FALSE)
7613                         size += l2arc_write_boost;
7614         }
7615 
7616         return (size);
7617 
7618 }
7619 
7620 static clock_t
7621 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
7622 {
7623         clock_t interval, next, now;
7624 
7625         /*
7626          * If the ARC lists are busy, increase our write rate; if the
7627          * lists are stale, idle back.  This is achieved by checking
7628          * how much we previously wrote - if it was more than half of
7629          * what we wanted, schedule the next write much sooner.
7630          */
7631         if (l2arc_feed_again && wrote > (wanted / 2))
7632                 interval = (hz * l2arc_feed_min_ms) / 1000;
7633         else
7634                 interval = hz * l2arc_feed_secs;
7635 
7636         now = ddi_get_lbolt();
7637         next = MAX(now, MIN(now + interval, began + interval));
7638 
7639         return (next);
7640 }
7641 
7642 /*
7643  * Cycle through L2ARC devices.  This is how L2ARC load balances.
7644  * If a device is returned, this also returns holding the spa config lock.
7645  */
7646 static l2arc_dev_t *
7647 l2arc_dev_get_next(void)
7648 {
7649         l2arc_dev_t *first, *next = NULL;
7650 
7651         /*
7652          * Lock out the removal of spas (spa_namespace_lock), then removal
7653          * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
7654          * both locks will be dropped and a spa config lock held instead.
7655          */
7656         mutex_enter(&spa_namespace_lock);
7657         mutex_enter(&l2arc_dev_mtx);
7658 
7659         /* if there are no vdevs, there is nothing to do */
7660         if (l2arc_ndev == 0)
7661                 goto out;
7662 
7663         first = NULL;
7664         next = l2arc_dev_last;
7665         do {
7666                 /* loop around the list looking for a non-faulted vdev */
7667                 if (next == NULL) {
7668                         next = list_head(l2arc_dev_list);
7669                 } else {
7670                         next = list_next(l2arc_dev_list, next);
7671                         if (next == NULL)
7672                                 next = list_head(l2arc_dev_list);
7673                 }
7674 
7675                 /* if we have come back to the start, bail out */
7676                 if (first == NULL)
7677                         first = next;
7678                 else if (next == first)
7679                         break;
7680 
7681         } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
7682 
7683         /* if we were unable to find any usable vdevs, return NULL */
7684         if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
7685                 next = NULL;
7686 
7687         l2arc_dev_last = next;
7688 
7689 out:
7690         mutex_exit(&l2arc_dev_mtx);
7691 
7692         /*
7693          * Grab the config lock to prevent the 'next' device from being
7694          * removed while we are writing to it.
7695          */
7696         if (next != NULL)
7697                 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
7698         mutex_exit(&spa_namespace_lock);
7699 
7700         return (next);
7701 }
7702 
7703 /*
7704  * Free buffers that were tagged for destruction.
7705  */
7706 static void
7707 l2arc_do_free_on_write()
7708 {
7709         list_t *buflist;
7710         l2arc_data_free_t *df, *df_prev;
7711 
7712         mutex_enter(&l2arc_free_on_write_mtx);
7713         buflist = l2arc_free_on_write;
7714 
7715         for (df = list_tail(buflist); df; df = df_prev) {
7716                 df_prev = list_prev(buflist, df);
7717                 ASSERT3P(df->l2df_abd, !=, NULL);
7718                 abd_free(df->l2df_abd);
7719                 list_remove(buflist, df);
7720                 kmem_free(df, sizeof (l2arc_data_free_t));
7721         }
7722 
7723         mutex_exit(&l2arc_free_on_write_mtx);
7724 }
7725 
7726 /*
7727  * A write to a cache device has completed.  Update all headers to allow
7728  * reads from these buffers to begin.
7729  */
7730 static void
7731 l2arc_write_done(zio_t *zio)
7732 {
7733         l2arc_write_callback_t  *cb;
7734         l2arc_lb_abd_buf_t      *abd_buf;
7735         l2arc_lb_ptr_buf_t      *lb_ptr_buf;
7736         l2arc_dev_t             *dev;
7737         l2arc_dev_hdr_phys_t    *l2dhdr;
7738         list_t                  *buflist;
7739         arc_buf_hdr_t           *head, *hdr, *hdr_prev;
7740         kmutex_t                *hash_lock;
7741         int64_t                 bytes_dropped = 0;
7742 
7743         cb = zio->io_private;
7744         ASSERT3P(cb, !=, NULL);
7745         dev = cb->l2wcb_dev;
7746         l2dhdr = dev->l2ad_dev_hdr;
7747         ASSERT3P(dev, !=, NULL);
7748         head = cb->l2wcb_head;
7749         ASSERT3P(head, !=, NULL);
7750         buflist = &dev->l2ad_buflist;
7751         ASSERT3P(buflist, !=, NULL);
7752         DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
7753             l2arc_write_callback_t *, cb);
7754 
7755         /*
7756          * All writes completed, or an error was hit.
7757          */
7758 top:
7759         mutex_enter(&dev->l2ad_mtx);
7760         for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
7761                 hdr_prev = list_prev(buflist, hdr);
7762 
7763                 hash_lock = HDR_LOCK(hdr);
7764 
7765                 /*
7766                  * We cannot use mutex_enter or else we can deadlock
7767                  * with l2arc_write_buffers (due to swapping the order
7768                  * the hash lock and l2ad_mtx are taken).
7769                  */
7770                 if (!mutex_tryenter(hash_lock)) {
7771                         /*
7772                          * Missed the hash lock. We must retry so we
7773                          * don't leave the ARC_FLAG_L2_WRITING bit set.
7774                          */
7775                         ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
7776 
7777                         /*
7778                          * We don't want to rescan the headers we've
7779                          * already marked as having been written out, so
7780                          * we reinsert the head node so we can pick up
7781                          * where we left off.
7782                          */
7783                         list_remove(buflist, head);
7784                         list_insert_after(buflist, hdr, head);
7785 
7786                         mutex_exit(&dev->l2ad_mtx);
7787 
7788                         /*
7789                          * We wait for the hash lock to become available
7790                          * to try and prevent busy waiting, and increase
7791                          * the chance we'll be able to acquire the lock
7792                          * the next time around.
7793                          */
7794                         mutex_enter(hash_lock);
7795                         mutex_exit(hash_lock);
7796                         goto top;
7797                 }
7798 
7799                 /*
7800                  * We could not have been moved into the arc_l2c_only
7801                  * state while in-flight due to our ARC_FLAG_L2_WRITING
7802                  * bit being set. Let's just ensure that's being enforced.
7803                  */
7804                 ASSERT(HDR_HAS_L1HDR(hdr));
7805 
7806                 if (zio->io_error != 0) {
7807                         /*
7808                          * Error - drop L2ARC entry.
7809                          */
7810                         list_remove(buflist, hdr);
7811                         arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
7812 
7813                         uint64_t psize = HDR_GET_PSIZE(hdr);
7814                         l2arc_hdr_arcstats_decrement(hdr);
7815 
7816                         bytes_dropped +=
7817                             vdev_psize_to_asize(dev->l2ad_vdev, psize);
7818                         (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
7819                             arc_hdr_size(hdr), hdr);
7820                 }
7821 
7822                 /*
7823                  * Allow ARC to begin reads and ghost list evictions to
7824                  * this L2ARC entry.
7825                  */
7826                 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
7827 
7828                 mutex_exit(hash_lock);
7829         }
7830 
7831         /*
7832          * Free the allocated abd buffers for writing the log blocks.
7833          * If the zio failed reclaim the allocated space and remove the
7834          * pointers to these log blocks from the log block pointer list
7835          * of the L2ARC device.
7836          */
7837         while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
7838                 abd_free(abd_buf->abd);
7839                 zio_buf_free(abd_buf, sizeof (*abd_buf));
7840                 if (zio->io_error != 0) {
7841                         lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
7842                         /*
7843                          * L2BLK_GET_PSIZE returns aligned size for log
7844                          * blocks.
7845                          */
7846                         uint64_t asize =
7847                             L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
7848                         bytes_dropped += asize;
7849                         ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
7850                         ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
7851                         zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
7852                             lb_ptr_buf);
7853                         zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
7854                         kmem_free(lb_ptr_buf->lb_ptr,
7855                             sizeof (l2arc_log_blkptr_t));
7856                         kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
7857                 }
7858         }
7859         list_destroy(&cb->l2wcb_abd_list);
7860 
7861         if (zio->io_error != 0) {
7862                 ARCSTAT_BUMP(arcstat_l2_writes_error);
7863 
7864                 /*
7865                  * Restore the lbps array in the header to its previous state.
7866                  * If the list of log block pointers is empty, zero out the
7867                  * log block pointers in the device header.
7868                  */
7869                 lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
7870                 for (int i = 0; i < 2; i++) {
7871                         if (lb_ptr_buf == NULL) {
7872                                 /*
7873                                  * If the list is empty zero out the device
7874                                  * header. Otherwise zero out the second log
7875                                  * block pointer in the header.
7876                                  */
7877                                 if (i == 0) {
7878                                         bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
7879                                 } else {
7880                                         bzero(&l2dhdr->dh_start_lbps[i],
7881                                             sizeof (l2arc_log_blkptr_t));
7882                                 }
7883                                 break;
7884                         }
7885                         bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
7886                             sizeof (l2arc_log_blkptr_t));
7887                         lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
7888                             lb_ptr_buf);
7889                 }
7890         }
7891 
7892         atomic_inc_64(&l2arc_writes_done);
7893         list_remove(buflist, head);
7894         ASSERT(!HDR_HAS_L1HDR(head));
7895         kmem_cache_free(hdr_l2only_cache, head);
7896         mutex_exit(&dev->l2ad_mtx);
7897 
7898         ASSERT(dev->l2ad_vdev != NULL);
7899         vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
7900 
7901         l2arc_do_free_on_write();
7902 
7903         kmem_free(cb, sizeof (l2arc_write_callback_t));
7904 }
7905 
7906 static int
7907 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
7908 {
7909         int ret;
7910         spa_t *spa = zio->io_spa;
7911         arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
7912         blkptr_t *bp = zio->io_bp;
7913         uint8_t salt[ZIO_DATA_SALT_LEN];
7914         uint8_t iv[ZIO_DATA_IV_LEN];
7915         uint8_t mac[ZIO_DATA_MAC_LEN];
7916         boolean_t no_crypt = B_FALSE;
7917 
7918         /*
7919          * ZIL data is never be written to the L2ARC, so we don't need
7920          * special handling for its unique MAC storage.
7921          */
7922         ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
7923         ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
7924         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
7925 
7926         /*
7927          * If the data was encrypted, decrypt it now. Note that
7928          * we must check the bp here and not the hdr, since the
7929          * hdr does not have its encryption parameters updated
7930          * until arc_read_done().
7931          */
7932         if (BP_IS_ENCRYPTED(bp)) {
7933                 abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
7934                     B_TRUE);
7935 
7936                 zio_crypt_decode_params_bp(bp, salt, iv);
7937                 zio_crypt_decode_mac_bp(bp, mac);
7938 
7939                 ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
7940                     BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
7941                     salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
7942                     hdr->b_l1hdr.b_pabd, &no_crypt);
7943                 if (ret != 0) {
7944                         arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
7945                         goto error;
7946                 }
7947 
7948                 /*
7949                  * If we actually performed decryption, replace b_pabd
7950                  * with the decrypted data. Otherwise we can just throw
7951                  * our decryption buffer away.
7952                  */
7953                 if (!no_crypt) {
7954                         arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
7955                             arc_hdr_size(hdr), hdr);
7956                         hdr->b_l1hdr.b_pabd = eabd;
7957                         zio->io_abd = eabd;
7958                 } else {
7959                         arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
7960                 }
7961         }
7962 
7963         /*
7964          * If the L2ARC block was compressed, but ARC compression
7965          * is disabled we decompress the data into a new buffer and
7966          * replace the existing data.
7967          */
7968         if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
7969             !HDR_COMPRESSION_ENABLED(hdr)) {
7970                 abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
7971                     B_TRUE);
7972                 void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
7973 
7974                 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
7975                     hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
7976                     HDR_GET_LSIZE(hdr));
7977                 if (ret != 0) {
7978                         abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
7979                         arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
7980                         goto error;
7981                 }
7982 
7983                 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
7984                 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
7985                     arc_hdr_size(hdr), hdr);
7986                 hdr->b_l1hdr.b_pabd = cabd;
7987                 zio->io_abd = cabd;
7988                 zio->io_size = HDR_GET_LSIZE(hdr);
7989         }
7990 
7991         return (0);
7992 
7993 error:
7994         return (ret);
7995 }
7996 
7997 
7998 /*
7999  * A read to a cache device completed.  Validate buffer contents before
8000  * handing over to the regular ARC routines.
8001  */
8002 static void
8003 l2arc_read_done(zio_t *zio)
8004 {
8005         int tfm_error = 0;
8006         l2arc_read_callback_t *cb = zio->io_private;
8007         arc_buf_hdr_t *hdr;
8008         kmutex_t *hash_lock;
8009         boolean_t valid_cksum;
8010         boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
8011             (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
8012 
8013         ASSERT3P(zio->io_vd, !=, NULL);
8014         ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
8015 
8016         spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
8017 
8018         ASSERT3P(cb, !=, NULL);
8019         hdr = cb->l2rcb_hdr;
8020         ASSERT3P(hdr, !=, NULL);
8021 
8022         hash_lock = HDR_LOCK(hdr);
8023         mutex_enter(hash_lock);
8024         ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
8025 
8026         /*
8027          * If the data was read into a temporary buffer,
8028          * move it and free the buffer.
8029          */
8030         if (cb->l2rcb_abd != NULL) {
8031                 ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
8032                 if (zio->io_error == 0) {
8033                         if (using_rdata) {
8034                                 abd_copy(hdr->b_crypt_hdr.b_rabd,
8035                                     cb->l2rcb_abd, arc_hdr_size(hdr));
8036                         } else {
8037                                 abd_copy(hdr->b_l1hdr.b_pabd,
8038                                     cb->l2rcb_abd, arc_hdr_size(hdr));
8039                         }
8040                 }
8041 
8042                 /*
8043                  * The following must be done regardless of whether
8044                  * there was an error:
8045                  * - free the temporary buffer
8046                  * - point zio to the real ARC buffer
8047                  * - set zio size accordingly
8048                  * These are required because zio is either re-used for
8049                  * an I/O of the block in the case of the error
8050                  * or the zio is passed to arc_read_done() and it
8051                  * needs real data.
8052                  */
8053                 abd_free(cb->l2rcb_abd);
8054                 zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
8055 
8056                 if (using_rdata) {
8057                         ASSERT(HDR_HAS_RABD(hdr));
8058                         zio->io_abd = zio->io_orig_abd =
8059                             hdr->b_crypt_hdr.b_rabd;
8060                 } else {
8061                         ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
8062                         zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
8063                 }
8064         }
8065 
8066         ASSERT3P(zio->io_abd, !=, NULL);
8067 
8068         /*
8069          * Check this survived the L2ARC journey.
8070          */
8071         ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
8072             (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
8073         zio->io_bp_copy = cb->l2rcb_bp;   /* XXX fix in L2ARC 2.0 */
8074         zio->io_bp = &zio->io_bp_copy;        /* XXX fix in L2ARC 2.0 */
8075 
8076         valid_cksum = arc_cksum_is_equal(hdr, zio);
8077 
8078         /*
8079          * b_rabd will always match the data as it exists on disk if it is
8080          * being used. Therefore if we are reading into b_rabd we do not
8081          * attempt to untransform the data.
8082          */
8083         if (valid_cksum && !using_rdata)
8084                 tfm_error = l2arc_untransform(zio, cb);
8085 
8086         if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
8087             !HDR_L2_EVICTED(hdr)) {
8088                 mutex_exit(hash_lock);
8089                 zio->io_private = hdr;
8090                 arc_read_done(zio);
8091         } else {
8092                 /*
8093                  * Buffer didn't survive caching.  Increment stats and
8094                  * reissue to the original storage device.
8095                  */
8096                 if (zio->io_error != 0) {
8097                         ARCSTAT_BUMP(arcstat_l2_io_error);
8098                 } else {
8099                         zio->io_error = SET_ERROR(EIO);
8100                 }
8101                 if (!valid_cksum || tfm_error != 0)
8102                         ARCSTAT_BUMP(arcstat_l2_cksum_bad);
8103 
8104                 /*
8105                  * If there's no waiter, issue an async i/o to the primary
8106                  * storage now.  If there *is* a waiter, the caller must
8107                  * issue the i/o in a context where it's OK to block.
8108                  */
8109                 if (zio->io_waiter == NULL) {
8110                         zio_t *pio = zio_unique_parent(zio);
8111                         void *abd = (using_rdata) ?
8112                             hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
8113 
8114                         ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
8115 
8116                         zio = zio_read(pio, zio->io_spa, zio->io_bp,
8117                             abd, zio->io_size, arc_read_done,
8118                             hdr, zio->io_priority, cb->l2rcb_flags,
8119                             &cb->l2rcb_zb);
8120 
8121                         /*
8122                          * Original ZIO will be freed, so we need to update
8123                          * ARC header with the new ZIO pointer to be used
8124                          * by zio_change_priority() in arc_read().
8125                          */
8126                         for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
8127                             acb != NULL; acb = acb->acb_next)
8128                                 acb->acb_zio_head = zio;
8129 
8130                         mutex_exit(hash_lock);
8131                         zio_nowait(zio);
8132                 } else {
8133                         mutex_exit(hash_lock);
8134                 }
8135         }
8136 
8137         kmem_free(cb, sizeof (l2arc_read_callback_t));
8138 }
8139 
8140 /*
8141  * This is the list priority from which the L2ARC will search for pages to
8142  * cache.  This is used within loops (0..3) to cycle through lists in the
8143  * desired order.  This order can have a significant effect on cache
8144  * performance.
8145  *
8146  * Currently the metadata lists are hit first, MFU then MRU, followed by
8147  * the data lists.  This function returns a locked list, and also returns
8148  * the lock pointer.
8149  */
8150 static multilist_sublist_t *
8151 l2arc_sublist_lock(int list_num)
8152 {
8153         multilist_t *ml = NULL;
8154         unsigned int idx;
8155 
8156         ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
8157 
8158         switch (list_num) {
8159         case 0:
8160                 ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
8161                 break;
8162         case 1:
8163                 ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
8164                 break;
8165         case 2:
8166                 ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
8167                 break;
8168         case 3:
8169                 ml = arc_mru->arcs_list[ARC_BUFC_DATA];
8170                 break;
8171         default:
8172                 return (NULL);
8173         }
8174 
8175         /*
8176          * Return a randomly-selected sublist. This is acceptable
8177          * because the caller feeds only a little bit of data for each
8178          * call (8MB). Subsequent calls will result in different
8179          * sublists being selected.
8180          */
8181         idx = multilist_get_random_index(ml);
8182         return (multilist_sublist_lock(ml, idx));
8183 }
8184 
8185 /*
8186  * Calculates the maximum overhead of L2ARC metadata log blocks for a given
8187  * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
8188  * overhead in processing to make sure there is enough headroom available
8189  * when writing buffers.
8190  */
8191 static inline uint64_t
8192 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
8193 {
8194         if (dev->l2ad_log_entries == 0) {
8195                 return (0);
8196         } else {
8197                 uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
8198 
8199                 uint64_t log_blocks = (log_entries +
8200                     dev->l2ad_log_entries - 1) /
8201                     dev->l2ad_log_entries;
8202 
8203                 return (vdev_psize_to_asize(dev->l2ad_vdev,
8204                     sizeof (l2arc_log_blk_phys_t)) * log_blocks);
8205         }
8206 }
8207 
8208 /*
8209  * Evict buffers from the device write hand to the distance specified in
8210  * bytes. This distance may span populated buffers, it may span nothing.
8211  * This is clearing a region on the L2ARC device ready for writing.
8212  * If the 'all' boolean is set, every buffer is evicted.
8213  */
8214 static void
8215 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
8216 {
8217         list_t *buflist;
8218         arc_buf_hdr_t *hdr, *hdr_prev;
8219         kmutex_t *hash_lock;
8220         uint64_t taddr;
8221         l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
8222         boolean_t rerun;
8223 
8224         buflist = &dev->l2ad_buflist;
8225 
8226         /*
8227          * We need to add in the worst case scenario of log block overhead.
8228          */
8229         distance += l2arc_log_blk_overhead(distance, dev);
8230 
8231 top:
8232         rerun = B_FALSE;
8233         if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
8234                 /*
8235                  * When there is no space to accommodate upcoming writes,
8236                  * evict to the end. Then bump the write and evict hands
8237                  * to the start and iterate. This iteration does not
8238                  * happen indefinitely as we make sure in
8239                  * l2arc_write_size() that when the write hand is reset,
8240                  * the write size does not exceed the end of the device.
8241                  */
8242                 rerun = B_TRUE;
8243                 taddr = dev->l2ad_end;
8244         } else {
8245                 taddr = dev->l2ad_hand + distance;
8246         }
8247         DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
8248             uint64_t, taddr, boolean_t, all);
8249 
8250         /*
8251          * This check has to be placed after deciding whether to iterate
8252          * (rerun).
8253          */
8254         if (!all && dev->l2ad_first) {
8255                 /*
8256                  * This is the first sweep through the device. There is
8257                  * nothing to evict.
8258                  */
8259                 goto out;
8260         }
8261 
8262         /*
8263          * When rebuilding L2ARC we retrieve the evict hand from the header of
8264          * the device. Of note, l2arc_evict() does not actually delete buffers
8265          * from the cache device, but keeping track of the evict hand will be
8266          * useful when TRIM is implemented.
8267          */
8268         dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
8269 
8270 retry:
8271         mutex_enter(&dev->l2ad_mtx);
8272         /*
8273          * We have to account for evicted log blocks. Run vdev_space_update()
8274          * on log blocks whose offset (in bytes) is before the evicted offset
8275          * (in bytes) by searching in the list of pointers to log blocks
8276          * present in the L2ARC device.
8277          */
8278         for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
8279             lb_ptr_buf = lb_ptr_buf_prev) {
8280 
8281                 lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
8282 
8283                 /* L2BLK_GET_PSIZE returns aligned size for log blocks */
8284                 uint64_t asize = L2BLK_GET_PSIZE(
8285                     (lb_ptr_buf->lb_ptr)->lbp_prop);
8286 
8287                 /*
8288                  * We don't worry about log blocks left behind (ie
8289                  * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
8290                  * will never write more than l2arc_evict() evicts.
8291                  */
8292                 if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
8293                         break;
8294                 } else {
8295                         vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
8296                         ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
8297                         ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
8298                         zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
8299                             lb_ptr_buf);
8300                         zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
8301                         list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
8302                         kmem_free(lb_ptr_buf->lb_ptr,
8303                             sizeof (l2arc_log_blkptr_t));
8304                         kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
8305                 }
8306         }
8307 
8308         for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
8309                 hdr_prev = list_prev(buflist, hdr);
8310 
8311                 ASSERT(!HDR_EMPTY(hdr));
8312                 hash_lock = HDR_LOCK(hdr);
8313 
8314                 /*
8315                  * We cannot use mutex_enter or else we can deadlock
8316                  * with l2arc_write_buffers (due to swapping the order
8317                  * the hash lock and l2ad_mtx are taken).
8318                  */
8319                 if (!mutex_tryenter(hash_lock)) {
8320                         /*
8321                          * Missed the hash lock.  Retry.
8322                          */
8323                         ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
8324                         mutex_exit(&dev->l2ad_mtx);
8325                         mutex_enter(hash_lock);
8326                         mutex_exit(hash_lock);
8327                         goto retry;
8328                 }
8329 
8330                 /*
8331                  * A header can't be on this list if it doesn't have L2 header.
8332                  */
8333                 ASSERT(HDR_HAS_L2HDR(hdr));
8334 
8335                 /* Ensure this header has finished being written. */
8336                 ASSERT(!HDR_L2_WRITING(hdr));
8337                 ASSERT(!HDR_L2_WRITE_HEAD(hdr));
8338 
8339                 if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
8340                     hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
8341                         /*
8342                          * We've evicted to the target address,
8343                          * or the end of the device.
8344                          */
8345                         mutex_exit(hash_lock);
8346                         break;
8347                 }
8348 
8349                 if (!HDR_HAS_L1HDR(hdr)) {
8350                         ASSERT(!HDR_L2_READING(hdr));
8351                         /*
8352                          * This doesn't exist in the ARC.  Destroy.
8353                          * arc_hdr_destroy() will call list_remove()
8354                          * and decrement arcstat_l2_lsize.
8355                          */
8356                         arc_change_state(arc_anon, hdr, hash_lock);
8357                         arc_hdr_destroy(hdr);
8358                 } else {
8359                         ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
8360                         ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
8361                         /*
8362                          * Invalidate issued or about to be issued
8363                          * reads, since we may be about to write
8364                          * over this location.
8365                          */
8366                         if (HDR_L2_READING(hdr)) {
8367                                 ARCSTAT_BUMP(arcstat_l2_evict_reading);
8368                                 arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
8369                         }
8370 
8371                         arc_hdr_l2hdr_destroy(hdr);
8372                 }
8373                 mutex_exit(hash_lock);
8374         }
8375         mutex_exit(&dev->l2ad_mtx);
8376 
8377 out:
8378         /*
8379          * We need to check if we evict all buffers, otherwise we may iterate
8380          * unnecessarily.
8381          */
8382         if (!all && rerun) {
8383                 /*
8384                  * Bump device hand to the device start if it is approaching the
8385                  * end. l2arc_evict() has already evicted ahead for this case.
8386                  */
8387                 dev->l2ad_hand = dev->l2ad_start;
8388                 dev->l2ad_evict = dev->l2ad_start;
8389                 dev->l2ad_first = B_FALSE;
8390                 goto top;
8391         }
8392 
8393         ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
8394         if (!dev->l2ad_first)
8395                 ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
8396 }
8397 
8398 /*
8399  * Handle any abd transforms that might be required for writing to the L2ARC.
8400  * If successful, this function will always return an abd with the data
8401  * transformed as it is on disk in a new abd of asize bytes.
8402  */
8403 static int
8404 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
8405     abd_t **abd_out)
8406 {
8407         int ret;
8408         void *tmp = NULL;
8409         abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
8410         enum zio_compress compress = HDR_GET_COMPRESS(hdr);
8411         uint64_t psize = HDR_GET_PSIZE(hdr);
8412         uint64_t size = arc_hdr_size(hdr);
8413         boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
8414         boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
8415         dsl_crypto_key_t *dck = NULL;
8416         uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
8417         boolean_t no_crypt = B_FALSE;
8418 
8419         ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
8420             !HDR_COMPRESSION_ENABLED(hdr)) ||
8421             HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
8422         ASSERT3U(psize, <=, asize);
8423 
8424         /*
8425          * If this data simply needs its own buffer, we simply allocate it
8426          * and copy the data. This may be done to eliminate a dependency on a
8427          * shared buffer or to reallocate the buffer to match asize.
8428          */
8429         if (HDR_HAS_RABD(hdr) && asize != psize) {
8430                 ASSERT3U(asize, >=, psize);
8431                 to_write = abd_alloc_for_io(asize, ismd);
8432                 abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
8433                 if (psize != asize)
8434                         abd_zero_off(to_write, psize, asize - psize);
8435                 goto out;
8436         }
8437 
8438         if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
8439             !HDR_ENCRYPTED(hdr)) {
8440                 ASSERT3U(size, ==, psize);
8441                 to_write = abd_alloc_for_io(asize, ismd);
8442                 abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
8443                 if (size != asize)
8444                         abd_zero_off(to_write, size, asize - size);
8445                 goto out;
8446         }
8447 
8448         if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
8449                 cabd = abd_alloc_for_io(asize, ismd);
8450                 tmp = abd_borrow_buf(cabd, asize);
8451 
8452                 psize = zio_compress_data(compress, to_write, tmp, size);
8453                 ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
8454                 if (psize < asize)
8455                         bzero((char *)tmp + psize, asize - psize);
8456                 psize = HDR_GET_PSIZE(hdr);
8457                 abd_return_buf_copy(cabd, tmp, asize);
8458                 to_write = cabd;
8459         }
8460 
8461         if (HDR_ENCRYPTED(hdr)) {
8462                 eabd = abd_alloc_for_io(asize, ismd);
8463 
8464                 /*
8465                  * If the dataset was disowned before the buffer
8466                  * made it to this point, the key to re-encrypt
8467                  * it won't be available. In this case we simply
8468                  * won't write the buffer to the L2ARC.
8469                  */
8470                 ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
8471                     FTAG, &dck);
8472                 if (ret != 0)
8473                         goto error;
8474 
8475                 ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
8476                     hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
8477                     hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
8478                     &no_crypt);
8479                 if (ret != 0)
8480                         goto error;
8481 
8482                 if (no_crypt)
8483                         abd_copy(eabd, to_write, psize);
8484 
8485                 if (psize != asize)
8486                         abd_zero_off(eabd, psize, asize - psize);
8487 
8488                 /* assert that the MAC we got here matches the one we saved */
8489                 ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
8490                 spa_keystore_dsl_key_rele(spa, dck, FTAG);
8491 
8492                 if (to_write == cabd)
8493                         abd_free(cabd);
8494 
8495                 to_write = eabd;
8496         }
8497 
8498 out:
8499         ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
8500         *abd_out = to_write;
8501         return (0);
8502 
8503 error:
8504         if (dck != NULL)
8505                 spa_keystore_dsl_key_rele(spa, dck, FTAG);
8506         if (cabd != NULL)
8507                 abd_free(cabd);
8508         if (eabd != NULL)
8509                 abd_free(eabd);
8510 
8511         *abd_out = NULL;
8512         return (ret);
8513 }
8514 
8515 static void
8516 l2arc_blk_fetch_done(zio_t *zio)
8517 {
8518         l2arc_read_callback_t *cb;
8519 
8520         cb = zio->io_private;
8521         if (cb->l2rcb_abd != NULL)
8522                 abd_put(cb->l2rcb_abd);
8523         kmem_free(cb, sizeof (l2arc_read_callback_t));
8524 }
8525 
8526 /*
8527  * Find and write ARC buffers to the L2ARC device.
8528  *
8529  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
8530  * for reading until they have completed writing.
8531  * The scan headroom is target_sz * l2arc_headroom, scaled by
8532  * l2arc_headroom_boost percent when compressed ARC is enabled.
8533  *
8534  * Returns the number of bytes actually written (which may be smaller than
8535  * the delta by which the device hand has changed due to alignment and the
8536  * writing of log blocks).
8537  */
8538 static uint64_t
8539 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
8540 {
8541         arc_buf_hdr_t           *hdr, *hdr_prev, *head;
8542         uint64_t                write_asize, write_psize, write_lsize, headroom;
8543         boolean_t               full;
8544         l2arc_write_callback_t  *cb = NULL;
8545         zio_t                   *pio, *wzio;
8546         uint64_t                guid = spa_load_guid(spa);
8547         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
8548 
8549         ASSERT3P(dev->l2ad_vdev, !=, NULL);
8550 
8551         pio = NULL;
8552         write_lsize = write_asize = write_psize = 0;
8553         full = B_FALSE;
8554         head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
8555         arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
8556 
8557         /*
8558          * Copy buffers for L2ARC writing.
8559          */
8560         for (int try = 0; try < L2ARC_FEED_TYPES; try++) {
8561                 /*
8562                  * If try == 1 or 3, we cache MRU metadata and data
8563                  * respectively.
8564                  */
8565                 if (l2arc_mfuonly) {
8566                         if (try == 1 || try == 3)
8567                                 continue;
8568                 }
8569 
8570                 multilist_sublist_t *mls = l2arc_sublist_lock(try);
8571                 uint64_t passed_sz = 0;
8572 
8573                 VERIFY3P(mls, !=, NULL);
8574 
8575                 /*
8576                  * L2ARC fast warmup.
8577                  *
8578                  * Until the ARC is warm and starts to evict, read from the
8579                  * head of the ARC lists rather than the tail.
8580                  */
8581                 if (arc_warm == B_FALSE)
8582                         hdr = multilist_sublist_head(mls);
8583                 else
8584                         hdr = multilist_sublist_tail(mls);
8585 
8586                 headroom = target_sz * l2arc_headroom;
8587                 if (zfs_compressed_arc_enabled)
8588                         headroom = (headroom * l2arc_headroom_boost) / 100;
8589 
8590                 for (; hdr; hdr = hdr_prev) {
8591                         kmutex_t *hash_lock;
8592                         abd_t *to_write = NULL;
8593 
8594                         if (arc_warm == B_FALSE)
8595                                 hdr_prev = multilist_sublist_next(mls, hdr);
8596                         else
8597                                 hdr_prev = multilist_sublist_prev(mls, hdr);
8598 
8599                         hash_lock = HDR_LOCK(hdr);
8600                         if (!mutex_tryenter(hash_lock)) {
8601                                 /*
8602                                  * Skip this buffer rather than waiting.
8603                                  */
8604                                 continue;
8605                         }
8606 
8607                         passed_sz += HDR_GET_LSIZE(hdr);
8608                         if (l2arc_headroom != 0 && passed_sz > headroom) {
8609                                 /*
8610                                  * Searched too far.
8611                                  */
8612                                 mutex_exit(hash_lock);
8613                                 break;
8614                         }
8615 
8616                         if (!l2arc_write_eligible(guid, hdr)) {
8617                                 mutex_exit(hash_lock);
8618                                 continue;
8619                         }
8620 
8621                         /*
8622                          * We rely on the L1 portion of the header below, so
8623                          * it's invalid for this header to have been evicted out
8624                          * of the ghost cache, prior to being written out. The
8625                          * ARC_FLAG_L2_WRITING bit ensures this won't happen.
8626                          */
8627                         ASSERT(HDR_HAS_L1HDR(hdr));
8628 
8629                         ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
8630                         ASSERT3U(arc_hdr_size(hdr), >, 0);
8631                         ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
8632                             HDR_HAS_RABD(hdr));
8633                         uint64_t psize = HDR_GET_PSIZE(hdr);
8634                         uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
8635                             psize);
8636 
8637                         if ((write_asize + asize) > target_sz) {
8638                                 full = B_TRUE;
8639                                 mutex_exit(hash_lock);
8640                                 break;
8641                         }
8642 
8643                         /*
8644                          * We rely on the L1 portion of the header below, so
8645                          * it's invalid for this header to have been evicted out
8646                          * of the ghost cache, prior to being written out. The
8647                          * ARC_FLAG_L2_WRITING bit ensures this won't happen.
8648                          */
8649                         arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
8650                         ASSERT(HDR_HAS_L1HDR(hdr));
8651 
8652                         ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
8653                         ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
8654                             HDR_HAS_RABD(hdr));
8655                         ASSERT3U(arc_hdr_size(hdr), >, 0);
8656 
8657                         /*
8658                          * If this header has b_rabd, we can use this since it
8659                          * must always match the data exactly as it exists on
8660                          * disk. Otherwise, the L2ARC can normally use the
8661                          * hdr's data, but if we're sharing data between the
8662                          * hdr and one of its bufs, L2ARC needs its own copy of
8663                          * the data so that the ZIO below can't race with the
8664                          * buf consumer. To ensure that this copy will be
8665                          * available for the lifetime of the ZIO and be cleaned
8666                          * up afterwards, we add it to the l2arc_free_on_write
8667                          * queue. If we need to apply any transforms to the
8668                          * data (compression, encryption) we will also need the
8669                          * extra buffer.
8670                          */
8671                         if (HDR_HAS_RABD(hdr) && psize == asize) {
8672                                 to_write = hdr->b_crypt_hdr.b_rabd;
8673                         } else if ((HDR_COMPRESSION_ENABLED(hdr) ||
8674                             HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
8675                             !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
8676                             psize == asize) {
8677                                 to_write = hdr->b_l1hdr.b_pabd;
8678                         } else {
8679                                 int ret;
8680                                 arc_buf_contents_t type = arc_buf_type(hdr);
8681 
8682                                 ret = l2arc_apply_transforms(spa, hdr, asize,
8683                                     &to_write);
8684                                 if (ret != 0) {
8685                                         arc_hdr_clear_flags(hdr,
8686                                             ARC_FLAG_L2_WRITING);
8687                                         mutex_exit(hash_lock);
8688                                         continue;
8689                                 }
8690 
8691                                 l2arc_free_abd_on_write(to_write, asize, type);
8692                         }
8693 
8694                         if (pio == NULL) {
8695                                 /*
8696                                  * Insert a dummy header on the buflist so
8697                                  * l2arc_write_done() can find where the
8698                                  * write buffers begin without searching.
8699                                  */
8700                                 mutex_enter(&dev->l2ad_mtx);
8701                                 list_insert_head(&dev->l2ad_buflist, head);
8702                                 mutex_exit(&dev->l2ad_mtx);
8703 
8704                                 cb = kmem_alloc(
8705                                     sizeof (l2arc_write_callback_t), KM_SLEEP);
8706                                 cb->l2wcb_dev = dev;
8707                                 cb->l2wcb_head = head;
8708                                 /*
8709                                  * Create a list to save allocated abd buffers
8710                                  * for l2arc_log_blk_commit().
8711                                  */
8712                                 list_create(&cb->l2wcb_abd_list,
8713                                     sizeof (l2arc_lb_abd_buf_t),
8714                                     offsetof(l2arc_lb_abd_buf_t, node));
8715                                 pio = zio_root(spa, l2arc_write_done, cb,
8716                                     ZIO_FLAG_CANFAIL);
8717                         }
8718 
8719                         hdr->b_l2hdr.b_dev = dev;
8720                         hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
8721                         hdr->b_l2hdr.b_arcs_state =
8722                             hdr->b_l1hdr.b_state->arcs_state;
8723                         arc_hdr_set_flags(hdr,
8724                             ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
8725 
8726                         mutex_enter(&dev->l2ad_mtx);
8727                         list_insert_head(&dev->l2ad_buflist, hdr);
8728                         mutex_exit(&dev->l2ad_mtx);
8729 
8730                         (void) zfs_refcount_add_many(&dev->l2ad_alloc,
8731                             arc_hdr_size(hdr), hdr);
8732 
8733                         wzio = zio_write_phys(pio, dev->l2ad_vdev,
8734                             hdr->b_l2hdr.b_daddr, asize, to_write,
8735                             ZIO_CHECKSUM_OFF, NULL, hdr,
8736                             ZIO_PRIORITY_ASYNC_WRITE,
8737                             ZIO_FLAG_CANFAIL, B_FALSE);
8738 
8739                         write_lsize += HDR_GET_LSIZE(hdr);
8740                         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
8741                             zio_t *, wzio);
8742 
8743                         write_psize += psize;
8744                         write_asize += asize;
8745                         dev->l2ad_hand += asize;
8746                         l2arc_hdr_arcstats_increment(hdr);
8747                         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
8748 
8749                         mutex_exit(hash_lock);
8750 
8751                         /*
8752                          * Append buf info to current log and commit if full.
8753                          * arcstat_l2_{size,asize} kstats are updated
8754                          * internally.
8755                          */
8756                         if (l2arc_log_blk_insert(dev, hdr))
8757                                 l2arc_log_blk_commit(dev, pio, cb);
8758 
8759                         (void) zio_nowait(wzio);
8760                 }
8761 
8762                 multilist_sublist_unlock(mls);
8763 
8764                 if (full == B_TRUE)
8765                         break;
8766         }
8767 
8768         /* No buffers selected for writing? */
8769         if (pio == NULL) {
8770                 ASSERT0(write_lsize);
8771                 ASSERT(!HDR_HAS_L1HDR(head));
8772                 kmem_cache_free(hdr_l2only_cache, head);
8773 
8774                 /*
8775                  * Although we did not write any buffers l2ad_evict may
8776                  * have advanced.
8777                  */
8778                 if (dev->l2ad_evict != l2dhdr->dh_evict)
8779                         l2arc_dev_hdr_update(dev);
8780 
8781                 return (0);
8782         }
8783 
8784         if (!dev->l2ad_first)
8785                 ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
8786 
8787         ASSERT3U(write_asize, <=, target_sz);
8788         ARCSTAT_BUMP(arcstat_l2_writes_sent);
8789         ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
8790 
8791         dev->l2ad_writing = B_TRUE;
8792         (void) zio_wait(pio);
8793         dev->l2ad_writing = B_FALSE;
8794 
8795         /*
8796          * Update the device header after the zio completes as
8797          * l2arc_write_done() may have updated the memory holding the log block
8798          * pointers in the device header.
8799          */
8800         l2arc_dev_hdr_update(dev);
8801 
8802         return (write_asize);
8803 }
8804 
8805 static boolean_t
8806 l2arc_hdr_limit_reached(void)
8807 {
8808         int64_t s = aggsum_upper_bound(&astat_l2_hdr_size);
8809 
8810         return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) ||
8811             (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
8812 }
8813 
8814 /*
8815  * This thread feeds the L2ARC at regular intervals.  This is the beating
8816  * heart of the L2ARC.
8817  */
8818 /* ARGSUSED */
8819 static void
8820 l2arc_feed_thread(void *unused)
8821 {
8822         callb_cpr_t cpr;
8823         l2arc_dev_t *dev;
8824         spa_t *spa;
8825         uint64_t size, wrote;
8826         clock_t begin, next = ddi_get_lbolt();
8827 
8828         CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
8829 
8830         mutex_enter(&l2arc_feed_thr_lock);
8831 
8832         while (l2arc_thread_exit == 0) {
8833                 CALLB_CPR_SAFE_BEGIN(&cpr);
8834                 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
8835                     next);
8836                 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
8837                 next = ddi_get_lbolt() + hz;
8838 
8839                 /*
8840                  * Quick check for L2ARC devices.
8841                  */
8842                 mutex_enter(&l2arc_dev_mtx);
8843                 if (l2arc_ndev == 0) {
8844                         mutex_exit(&l2arc_dev_mtx);
8845                         continue;
8846                 }
8847                 mutex_exit(&l2arc_dev_mtx);
8848                 begin = ddi_get_lbolt();
8849 
8850                 /*
8851                  * This selects the next l2arc device to write to, and in
8852                  * doing so the next spa to feed from: dev->l2ad_spa.   This
8853                  * will return NULL if there are now no l2arc devices or if
8854                  * they are all faulted.
8855                  *
8856                  * If a device is returned, its spa's config lock is also
8857                  * held to prevent device removal.  l2arc_dev_get_next()
8858                  * will grab and release l2arc_dev_mtx.
8859                  */
8860                 if ((dev = l2arc_dev_get_next()) == NULL)
8861                         continue;
8862 
8863                 spa = dev->l2ad_spa;
8864                 ASSERT3P(spa, !=, NULL);
8865 
8866                 /*
8867                  * If the pool is read-only then force the feed thread to
8868                  * sleep a little longer.
8869                  */
8870                 if (!spa_writeable(spa)) {
8871                         next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
8872                         spa_config_exit(spa, SCL_L2ARC, dev);
8873                         continue;
8874                 }
8875 
8876                 /*
8877                  * Avoid contributing to memory pressure.
8878                  */
8879                 if (l2arc_hdr_limit_reached()) {
8880                         ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
8881                         spa_config_exit(spa, SCL_L2ARC, dev);
8882                         continue;
8883                 }
8884 
8885                 ARCSTAT_BUMP(arcstat_l2_feeds);
8886 
8887                 size = l2arc_write_size(dev);
8888 
8889                 /*
8890                  * Evict L2ARC buffers that will be overwritten.
8891                  */
8892                 l2arc_evict(dev, size, B_FALSE);
8893 
8894                 /*
8895                  * Write ARC buffers.
8896                  */
8897                 wrote = l2arc_write_buffers(spa, dev, size);
8898 
8899                 /*
8900                  * Calculate interval between writes.
8901                  */
8902                 next = l2arc_write_interval(begin, size, wrote);
8903                 spa_config_exit(spa, SCL_L2ARC, dev);
8904         }
8905 
8906         l2arc_thread_exit = 0;
8907         cv_broadcast(&l2arc_feed_thr_cv);
8908         CALLB_CPR_EXIT(&cpr);               /* drops l2arc_feed_thr_lock */
8909         thread_exit();
8910 }
8911 
8912 boolean_t
8913 l2arc_vdev_present(vdev_t *vd)
8914 {
8915         return (l2arc_vdev_get(vd) != NULL);
8916 }
8917 
8918 /*
8919  * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
8920  * the vdev_t isn't an L2ARC device.
8921  */
8922 static l2arc_dev_t *
8923 l2arc_vdev_get(vdev_t *vd)
8924 {
8925         l2arc_dev_t     *dev;
8926 
8927         mutex_enter(&l2arc_dev_mtx);
8928         for (dev = list_head(l2arc_dev_list); dev != NULL;
8929             dev = list_next(l2arc_dev_list, dev)) {
8930                 if (dev->l2ad_vdev == vd)
8931                         break;
8932         }
8933         mutex_exit(&l2arc_dev_mtx);
8934 
8935         return (dev);
8936 }
8937 
8938 /*
8939  * Add a vdev for use by the L2ARC.  By this point the spa has already
8940  * validated the vdev and opened it.
8941  */
8942 void
8943 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
8944 {
8945         l2arc_dev_t             *adddev;
8946         uint64_t                l2dhdr_asize;
8947 
8948         ASSERT(!l2arc_vdev_present(vd));
8949 
8950         /*
8951          * Create a new l2arc device entry.
8952          */
8953         adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
8954         adddev->l2ad_spa = spa;
8955         adddev->l2ad_vdev = vd;
8956         /* leave extra size for an l2arc device header */
8957         l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
8958             MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
8959         adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
8960         adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
8961         ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
8962         adddev->l2ad_hand = adddev->l2ad_start;
8963         adddev->l2ad_evict = adddev->l2ad_start;
8964         adddev->l2ad_first = B_TRUE;
8965         adddev->l2ad_writing = B_FALSE;
8966         adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
8967 
8968         mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
8969         /*
8970          * This is a list of all ARC buffers that are still valid on the
8971          * device.
8972          */
8973         list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
8974             offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
8975 
8976         /*
8977          * This is a list of pointers to log blocks that are still present
8978          * on the device.
8979          */
8980         list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
8981             offsetof(l2arc_lb_ptr_buf_t, node));
8982 
8983         vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
8984         zfs_refcount_create(&adddev->l2ad_alloc);
8985         zfs_refcount_create(&adddev->l2ad_lb_asize);
8986         zfs_refcount_create(&adddev->l2ad_lb_count);
8987 
8988         /*
8989          * Add device to global list
8990          */
8991         mutex_enter(&l2arc_dev_mtx);
8992         list_insert_head(l2arc_dev_list, adddev);
8993         atomic_inc_64(&l2arc_ndev);
8994         mutex_exit(&l2arc_dev_mtx);
8995 
8996         /*
8997          * Decide if vdev is eligible for L2ARC rebuild
8998          */
8999         l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE);
9000 }
9001 
9002 void
9003 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
9004 {
9005         l2arc_dev_t             *dev = NULL;
9006         l2arc_dev_hdr_phys_t    *l2dhdr;
9007         uint64_t                l2dhdr_asize;
9008         spa_t                   *spa;
9009 
9010         dev = l2arc_vdev_get(vd);
9011         ASSERT3P(dev, !=, NULL);
9012         spa = dev->l2ad_spa;
9013         l2dhdr = dev->l2ad_dev_hdr;
9014         l2dhdr_asize = dev->l2ad_dev_hdr_asize;
9015 
9016         /*
9017          * The L2ARC has to hold at least the payload of one log block for
9018          * them to be restored (persistent L2ARC). The payload of a log block
9019          * depends on the amount of its log entries. We always write log blocks
9020          * with 1022 entries. How many of them are committed or restored depends
9021          * on the size of the L2ARC device. Thus the maximum payload of
9022          * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
9023          * is less than that, we reduce the amount of committed and restored
9024          * log entries per block so as to enable persistence.
9025          */
9026         if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
9027                 dev->l2ad_log_entries = 0;
9028         } else {
9029                 dev->l2ad_log_entries = MIN((dev->l2ad_end -
9030                     dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
9031                     L2ARC_LOG_BLK_MAX_ENTRIES);
9032         }
9033 
9034         /*
9035          * Read the device header, if an error is returned do not rebuild L2ARC.
9036          */
9037         if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
9038                 /*
9039                  * If we are onlining a cache device (vdev_reopen) that was
9040                  * still present (l2arc_vdev_present()) and rebuild is enabled,
9041                  * we should evict all ARC buffers and pointers to log blocks
9042                  * and reclaim their space before restoring its contents to
9043                  * L2ARC.
9044                  */
9045                 if (reopen) {
9046                         if (!l2arc_rebuild_enabled) {
9047                                 return;
9048                         } else {
9049                                 l2arc_evict(dev, 0, B_TRUE);
9050                                 /* start a new log block */
9051                                 dev->l2ad_log_ent_idx = 0;
9052                                 dev->l2ad_log_blk_payload_asize = 0;
9053                                 dev->l2ad_log_blk_payload_start = 0;
9054                         }
9055                 }
9056                 /*
9057                  * Just mark the device as pending for a rebuild. We won't
9058                  * be starting a rebuild in line here as it would block pool
9059                  * import. Instead spa_load_impl will hand that off to an
9060                  * async task which will call l2arc_spa_rebuild_start.
9061                  */
9062                 dev->l2ad_rebuild = B_TRUE;
9063         } else if (spa_writeable(spa)) {
9064                 /*
9065                  * In this case create a new header. We zero out the memory
9066                  * holding the header to reset dh_start_lbps.
9067                  */
9068                 bzero(l2dhdr, l2dhdr_asize);
9069                 l2arc_dev_hdr_update(dev);
9070         }
9071 }
9072 
9073 /*
9074  * Remove a vdev from the L2ARC.
9075  */
9076 void
9077 l2arc_remove_vdev(vdev_t *vd)
9078 {
9079         l2arc_dev_t *remdev = NULL;
9080 
9081         /*
9082          * Find the device by vdev
9083          */
9084         remdev = l2arc_vdev_get(vd);
9085         ASSERT3P(remdev, !=, NULL);
9086 
9087         /*
9088          * Cancel any ongoing or scheduled rebuild.
9089          */
9090         mutex_enter(&l2arc_rebuild_thr_lock);
9091         if (remdev->l2ad_rebuild_began == B_TRUE) {
9092                 remdev->l2ad_rebuild_cancel = B_TRUE;
9093                 while (remdev->l2ad_rebuild == B_TRUE)
9094                         cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
9095         }
9096         mutex_exit(&l2arc_rebuild_thr_lock);
9097 
9098         /*
9099          * Remove device from global list
9100          */
9101         mutex_enter(&l2arc_dev_mtx);
9102         list_remove(l2arc_dev_list, remdev);
9103         l2arc_dev_last = NULL;          /* may have been invalidated */
9104         atomic_dec_64(&l2arc_ndev);
9105         mutex_exit(&l2arc_dev_mtx);
9106 
9107         /*
9108          * Clear all buflists and ARC references.  L2ARC device flush.
9109          */
9110         l2arc_evict(remdev, 0, B_TRUE);
9111         list_destroy(&remdev->l2ad_buflist);
9112         ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
9113         list_destroy(&remdev->l2ad_lbptr_list);
9114         mutex_destroy(&remdev->l2ad_mtx);
9115         zfs_refcount_destroy(&remdev->l2ad_alloc);
9116         zfs_refcount_destroy(&remdev->l2ad_lb_asize);
9117         zfs_refcount_destroy(&remdev->l2ad_lb_count);
9118         kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
9119         kmem_free(remdev, sizeof (l2arc_dev_t));
9120 }
9121 
9122 void
9123 l2arc_init(void)
9124 {
9125         l2arc_thread_exit = 0;
9126         l2arc_ndev = 0;
9127         l2arc_writes_sent = 0;
9128         l2arc_writes_done = 0;
9129 
9130         mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
9131         cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
9132         mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
9133         cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
9134         mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
9135         mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
9136 
9137         l2arc_dev_list = &L2ARC_dev_list;
9138         l2arc_free_on_write = &L2ARC_free_on_write;
9139         list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
9140             offsetof(l2arc_dev_t, l2ad_node));
9141         list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
9142             offsetof(l2arc_data_free_t, l2df_list_node));
9143 }
9144 
9145 void
9146 l2arc_fini(void)
9147 {
9148         /*
9149          * This is called from dmu_fini(), which is called from spa_fini();
9150          * Because of this, we can assume that all l2arc devices have
9151          * already been removed when the pools themselves were removed.
9152          */
9153 
9154         l2arc_do_free_on_write();
9155 
9156         mutex_destroy(&l2arc_feed_thr_lock);
9157         cv_destroy(&l2arc_feed_thr_cv);
9158         mutex_destroy(&l2arc_rebuild_thr_lock);
9159         cv_destroy(&l2arc_rebuild_thr_cv);
9160         mutex_destroy(&l2arc_dev_mtx);
9161         mutex_destroy(&l2arc_free_on_write_mtx);
9162 
9163         list_destroy(l2arc_dev_list);
9164         list_destroy(l2arc_free_on_write);
9165 }
9166 
9167 void
9168 l2arc_start(void)
9169 {
9170         if (!(spa_mode_global & FWRITE))
9171                 return;
9172 
9173         (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
9174             TS_RUN, minclsyspri);
9175 }
9176 
9177 void
9178 l2arc_stop(void)
9179 {
9180         if (!(spa_mode_global & FWRITE))
9181                 return;
9182 
9183         mutex_enter(&l2arc_feed_thr_lock);
9184         cv_signal(&l2arc_feed_thr_cv);      /* kick thread out of startup */
9185         l2arc_thread_exit = 1;
9186         while (l2arc_thread_exit != 0)
9187                 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
9188         mutex_exit(&l2arc_feed_thr_lock);
9189 }
9190 
9191 /*
9192  * Punches out rebuild threads for the L2ARC devices in a spa. This should
9193  * be called after pool import from the spa async thread, since starting
9194  * these threads directly from spa_import() will make them part of the
9195  * "zpool import" context and delay process exit (and thus pool import).
9196  */
9197 void
9198 l2arc_spa_rebuild_start(spa_t *spa)
9199 {
9200         ASSERT(MUTEX_HELD(&spa_namespace_lock));
9201 
9202         /*
9203          * Locate the spa's l2arc devices and kick off rebuild threads.
9204          */
9205         for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
9206                 l2arc_dev_t *dev =
9207                     l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
9208                 if (dev == NULL) {
9209                         /* Don't attempt a rebuild if the vdev is UNAVAIL */
9210                         continue;
9211                 }
9212                 mutex_enter(&l2arc_rebuild_thr_lock);
9213                 if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
9214                         dev->l2ad_rebuild_began = B_TRUE;
9215                         (void) thread_create(NULL, 0,
9216                             (void (*)(void *))l2arc_dev_rebuild_start,
9217                             dev, 0, &p0, TS_RUN, minclsyspri);
9218                 }
9219                 mutex_exit(&l2arc_rebuild_thr_lock);
9220         }
9221 }
9222 
9223 /*
9224  * Main entry point for L2ARC rebuilding.
9225  */
9226 static void
9227 l2arc_dev_rebuild_start(l2arc_dev_t *dev)
9228 {
9229         VERIFY(!dev->l2ad_rebuild_cancel);
9230         VERIFY(dev->l2ad_rebuild);
9231         (void) l2arc_rebuild(dev);
9232         mutex_enter(&l2arc_rebuild_thr_lock);
9233         dev->l2ad_rebuild_began = B_FALSE;
9234         dev->l2ad_rebuild = B_FALSE;
9235         mutex_exit(&l2arc_rebuild_thr_lock);
9236 
9237         thread_exit();
9238 }
9239 
9240 /*
9241  * This function implements the actual L2ARC metadata rebuild. It:
9242  * starts reading the log block chain and restores each block's contents
9243  * to memory (reconstructing arc_buf_hdr_t's).
9244  *
9245  * Operation stops under any of the following conditions:
9246  *
9247  * 1) We reach the end of the log block chain.
9248  * 2) We encounter *any* error condition (cksum errors, io errors)
9249  */
9250 static int
9251 l2arc_rebuild(l2arc_dev_t *dev)
9252 {
9253         vdev_t                  *vd = dev->l2ad_vdev;
9254         spa_t                   *spa = vd->vdev_spa;
9255         int                     err = 0;
9256         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
9257         l2arc_log_blk_phys_t    *this_lb, *next_lb;
9258         zio_t                   *this_io = NULL, *next_io = NULL;
9259         l2arc_log_blkptr_t      lbps[2];
9260         l2arc_lb_ptr_buf_t      *lb_ptr_buf;
9261         boolean_t               lock_held;
9262 
9263         this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
9264         next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
9265 
9266         /*
9267          * We prevent device removal while issuing reads to the device,
9268          * then during the rebuilding phases we drop this lock again so
9269          * that a spa_unload or device remove can be initiated - this is
9270          * safe, because the spa will signal us to stop before removing
9271          * our device and wait for us to stop.
9272          */
9273         spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
9274         lock_held = B_TRUE;
9275 
9276         /*
9277          * Retrieve the persistent L2ARC device state.
9278          * L2BLK_GET_PSIZE returns aligned size for log blocks.
9279          */
9280         dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
9281         dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
9282             L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
9283             dev->l2ad_start);
9284         dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
9285 
9286         /*
9287          * In case the zfs module parameter l2arc_rebuild_enabled is false
9288          * we do not start the rebuild process.
9289          */
9290         if (!l2arc_rebuild_enabled)
9291                 goto out;
9292 
9293         /* Prepare the rebuild process */
9294         bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
9295 
9296         /* Start the rebuild process */
9297         for (;;) {
9298                 if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
9299                         break;
9300 
9301                 if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
9302                     this_lb, next_lb, this_io, &next_io)) != 0)
9303                         goto out;
9304 
9305                 /*
9306                  * Our memory pressure valve. If the system is running low
9307                  * on memory, rather than swamping memory with new ARC buf
9308                  * hdrs, we opt not to rebuild the L2ARC. At this point,
9309                  * however, we have already set up our L2ARC dev to chain in
9310                  * new metadata log blocks, so the user may choose to offline/
9311                  * online the L2ARC dev at a later time (or re-import the pool)
9312                  * to reconstruct it (when there's less memory pressure).
9313                  */
9314                 if (l2arc_hdr_limit_reached()) {
9315                         ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
9316                         cmn_err(CE_NOTE, "System running low on memory, "
9317                             "aborting L2ARC rebuild.");
9318                         err = SET_ERROR(ENOMEM);
9319                         goto out;
9320                 }
9321 
9322                 spa_config_exit(spa, SCL_L2ARC, vd);
9323                 lock_held = B_FALSE;
9324 
9325                 /*
9326                  * Now that we know that the next_lb checks out alright, we
9327                  * can start reconstruction from this log block.
9328                  * L2BLK_GET_PSIZE returns aligned size for log blocks.
9329                  */
9330                 uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
9331                 l2arc_log_blk_restore(dev, this_lb, asize);
9332 
9333                 /*
9334                  * log block restored, include its pointer in the list of
9335                  * pointers to log blocks present in the L2ARC device.
9336                  */
9337                 lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
9338                 lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
9339                     KM_SLEEP);
9340                 bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
9341                     sizeof (l2arc_log_blkptr_t));
9342                 mutex_enter(&dev->l2ad_mtx);
9343                 list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
9344                 ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
9345                 ARCSTAT_BUMP(arcstat_l2_log_blk_count);
9346                 zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
9347                 zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
9348                 mutex_exit(&dev->l2ad_mtx);
9349                 vdev_space_update(vd, asize, 0, 0);
9350 
9351                 /* BEGIN CSTYLED */
9352                 /*
9353                  * Protection against loops of log blocks:
9354                  *
9355                  *                                     l2ad_hand  l2ad_evict
9356                  *                                         V          V
9357                  * l2ad_start |=======================================| l2ad_end
9358                  *             -----|||----|||---|||----|||
9359                  *                  (3)    (2)   (1)    (0)
9360                  *             ---|||---|||----|||---|||
9361                  *                (7)   (6)    (5)   (4)
9362                  *
9363                  * In this situation the pointer of log block (4) passes
9364                  * l2arc_log_blkptr_valid() but the log block should not be
9365                  * restored as it is overwritten by the payload of log block
9366                  * (0). Only log blocks (0)-(3) should be restored. We check
9367                  * whether l2ad_evict lies in between the payload starting
9368                  * offset of the next log block (lbps[1].lbp_payload_start)
9369                  * and the payload starting offset of the present log block
9370                  * (lbps[0].lbp_payload_start). If true and this isn't the
9371                  * first pass, we are looping from the beginning and we should
9372                  * stop.
9373                  */
9374                 /* END CSTYLED */
9375                 if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
9376                     lbps[0].lbp_payload_start, dev->l2ad_evict) &&
9377                     !dev->l2ad_first)
9378                         goto out;
9379 
9380                 for (;;) {
9381                         mutex_enter(&l2arc_rebuild_thr_lock);
9382                         if (dev->l2ad_rebuild_cancel) {
9383                                 dev->l2ad_rebuild = B_FALSE;
9384                                 cv_signal(&l2arc_rebuild_thr_cv);
9385                                 mutex_exit(&l2arc_rebuild_thr_lock);
9386                                 err = SET_ERROR(ECANCELED);
9387                                 goto out;
9388                         }
9389                         mutex_exit(&l2arc_rebuild_thr_lock);
9390                         if (spa_config_tryenter(spa, SCL_L2ARC, vd,
9391                             RW_READER)) {
9392                                 lock_held = B_TRUE;
9393                                 break;
9394                         }
9395                         /*
9396                          * L2ARC config lock held by somebody in writer,
9397                          * possibly due to them trying to remove us. They'll
9398                          * likely to want us to shut down, so after a little
9399                          * delay, we check l2ad_rebuild_cancel and retry
9400                          * the lock again.
9401                          */
9402                         delay(1);
9403                 }
9404 
9405                 /*
9406                  * Continue with the next log block.
9407                  */
9408                 lbps[0] = lbps[1];
9409                 lbps[1] = this_lb->lb_prev_lbp;
9410                 PTR_SWAP(this_lb, next_lb);
9411                 this_io = next_io;
9412                 next_io = NULL;
9413         }
9414 
9415         if (this_io != NULL)
9416                 l2arc_log_blk_fetch_abort(this_io);
9417 out:
9418         if (next_io != NULL)
9419                 l2arc_log_blk_fetch_abort(next_io);
9420         kmem_free(this_lb, sizeof (*this_lb));
9421         kmem_free(next_lb, sizeof (*next_lb));
9422 
9423         if (!l2arc_rebuild_enabled) {
9424                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
9425                     "disabled");
9426         } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
9427                 ARCSTAT_BUMP(arcstat_l2_rebuild_success);
9428                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
9429                     "successful, restored %llu blocks",
9430                     (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
9431         } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
9432                 /*
9433                  * No error but also nothing restored, meaning the lbps array
9434                  * in the device header points to invalid/non-present log
9435                  * blocks. Reset the header.
9436                  */
9437                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
9438                     "no valid log blocks");
9439                 bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
9440                 l2arc_dev_hdr_update(dev);
9441         } else if (err == ECANCELED) {
9442                 /*
9443                  * In case the rebuild was canceled do not log to spa history
9444                  * log as the pool may be in the process of being removed.
9445                  */
9446                 zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
9447                     zfs_refcount_count(&dev->l2ad_lb_count));
9448         } else if (err != 0) {
9449                 spa_history_log_internal(spa, "L2ARC rebuild", NULL,
9450                     "aborted, restored %llu blocks",
9451                     (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
9452         }
9453 
9454         if (lock_held)
9455                 spa_config_exit(spa, SCL_L2ARC, vd);
9456 
9457         return (err);
9458 }
9459 
9460 /*
9461  * Attempts to read the device header on the provided L2ARC device and writes
9462  * it to `hdr'. On success, this function returns 0, otherwise the appropriate
9463  * error code is returned.
9464  */
9465 static int
9466 l2arc_dev_hdr_read(l2arc_dev_t *dev)
9467 {
9468         int                     err;
9469         uint64_t                guid;
9470         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
9471         const uint64_t          l2dhdr_asize = dev->l2ad_dev_hdr_asize;
9472         abd_t                   *abd;
9473 
9474         guid = spa_guid(dev->l2ad_vdev->vdev_spa);
9475 
9476         abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
9477 
9478         err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
9479             VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
9480             ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
9481             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
9482             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
9483             ZIO_FLAG_SPECULATIVE, B_FALSE));
9484 
9485         abd_put(abd);
9486 
9487         if (err != 0) {
9488                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
9489                 zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
9490                     "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
9491                 return (err);
9492         }
9493 
9494         if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
9495                 byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
9496 
9497         if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
9498             l2dhdr->dh_spa_guid != guid ||
9499             l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
9500             l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
9501             l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
9502             l2dhdr->dh_end != dev->l2ad_end ||
9503             !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
9504             l2dhdr->dh_evict)) {
9505                 /*
9506                  * Attempt to rebuild a device containing no actual dev hdr
9507                  * or containing a header from some other pool or from another
9508                  * version of persistent L2ARC.
9509                  */
9510                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
9511                 return (SET_ERROR(ENOTSUP));
9512         }
9513 
9514         return (0);
9515 }
9516 
9517 /*
9518  * Reads L2ARC log blocks from storage and validates their contents.
9519  *
9520  * This function implements a simple fetcher to make sure that while
9521  * we're processing one buffer the L2ARC is already fetching the next
9522  * one in the chain.
9523  *
9524  * The arguments this_lp and next_lp point to the current and next log block
9525  * address in the block chain. Similarly, this_lb and next_lb hold the
9526  * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
9527  *
9528  * The `this_io' and `next_io' arguments are used for block fetching.
9529  * When issuing the first blk IO during rebuild, you should pass NULL for
9530  * `this_io'. This function will then issue a sync IO to read the block and
9531  * also issue an async IO to fetch the next block in the block chain. The
9532  * fetched IO is returned in `next_io'. On subsequent calls to this
9533  * function, pass the value returned in `next_io' from the previous call
9534  * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
9535  * Prior to the call, you should initialize your `next_io' pointer to be
9536  * NULL. If no fetch IO was issued, the pointer is left set at NULL.
9537  *
9538  * On success, this function returns 0, otherwise it returns an appropriate
9539  * error code. On error the fetching IO is aborted and cleared before
9540  * returning from this function. Therefore, if we return `success', the
9541  * caller can assume that we have taken care of cleanup of fetch IOs.
9542  */
9543 static int
9544 l2arc_log_blk_read(l2arc_dev_t *dev,
9545     const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
9546     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
9547     zio_t *this_io, zio_t **next_io)
9548 {
9549         int             err = 0;
9550         zio_cksum_t     cksum;
9551         abd_t           *abd = NULL;
9552         uint64_t        asize;
9553 
9554         ASSERT(this_lbp != NULL && next_lbp != NULL);
9555         ASSERT(this_lb != NULL && next_lb != NULL);
9556         ASSERT(next_io != NULL && *next_io == NULL);
9557         ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
9558 
9559         /*
9560          * Check to see if we have issued the IO for this log block in a
9561          * previous run. If not, this is the first call, so issue it now.
9562          */
9563         if (this_io == NULL) {
9564                 this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
9565                     this_lb);
9566         }
9567 
9568         /*
9569          * Peek to see if we can start issuing the next IO immediately.
9570          */
9571         if (l2arc_log_blkptr_valid(dev, next_lbp)) {
9572                 /*
9573                  * Start issuing IO for the next log block early - this
9574                  * should help keep the L2ARC device busy while we
9575                  * decompress and restore this log block.
9576                  */
9577                 *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
9578                     next_lb);
9579         }
9580 
9581         /* Wait for the IO to read this log block to complete */
9582         if ((err = zio_wait(this_io)) != 0) {
9583                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
9584                 zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
9585                     "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr,
9586                     dev->l2ad_vdev->vdev_guid);
9587                 goto cleanup;
9588         }
9589 
9590         /*
9591          * Make sure the buffer checks out.
9592          * L2BLK_GET_PSIZE returns aligned size for log blocks.
9593          */
9594         asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
9595         fletcher_4_native(this_lb, asize, NULL, &cksum);
9596         if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
9597                 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
9598                 zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
9599                     "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
9600                     this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid,
9601                     dev->l2ad_hand, dev->l2ad_evict);
9602                 err = SET_ERROR(ECKSUM);
9603                 goto cleanup;
9604         }
9605 
9606         /* Now we can take our time decoding this buffer */
9607         switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
9608         case ZIO_COMPRESS_OFF:
9609                 break;
9610         case ZIO_COMPRESS_LZ4:
9611                 abd = abd_alloc_for_io(asize, B_TRUE);
9612                 abd_copy_from_buf_off(abd, this_lb, 0, asize);
9613                 if ((err = zio_decompress_data(
9614                     L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
9615                     abd, this_lb, asize, sizeof (*this_lb))) != 0) {
9616                         err = SET_ERROR(EINVAL);
9617                         goto cleanup;
9618                 }
9619                 break;
9620         default:
9621                 err = SET_ERROR(EINVAL);
9622                 goto cleanup;
9623         }
9624         if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
9625                 byteswap_uint64_array(this_lb, sizeof (*this_lb));
9626         if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
9627                 err = SET_ERROR(EINVAL);
9628                 goto cleanup;
9629         }
9630 cleanup:
9631         /* Abort an in-flight fetch I/O in case of error */
9632         if (err != 0 && *next_io != NULL) {
9633                 l2arc_log_blk_fetch_abort(*next_io);
9634                 *next_io = NULL;
9635         }
9636         if (abd != NULL)
9637                 abd_free(abd);
9638         return (err);
9639 }
9640 
9641 /*
9642  * Restores the payload of a log block to ARC. This creates empty ARC hdr
9643  * entries which only contain an l2arc hdr, essentially restoring the
9644  * buffers to their L2ARC evicted state. This function also updates space
9645  * usage on the L2ARC vdev to make sure it tracks restored buffers.
9646  */
9647 static void
9648 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
9649     uint64_t lb_asize)
9650 {
9651         uint64_t        size = 0, asize = 0;
9652         uint64_t        log_entries = dev->l2ad_log_entries;
9653 
9654         /*
9655          * Usually arc_adapt() is called only for data, not headers, but
9656          * since we may allocate significant amount of memory here, let ARC
9657          * grow its arc_c.
9658          */
9659         arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only);
9660 
9661         for (int i = log_entries - 1; i >= 0; i--) {
9662                 /*
9663                  * Restore goes in the reverse temporal direction to preserve
9664                  * correct temporal ordering of buffers in the l2ad_buflist.
9665                  * l2arc_hdr_restore also does a list_insert_tail instead of
9666                  * list_insert_head on the l2ad_buflist:
9667                  *
9668                  *              LIST    l2ad_buflist            LIST
9669                  *              HEAD  <------ (time) ------  TAIL
9670                  * direction    +-----+-----+-----+-----+-----+    direction
9671                  * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
9672                  * fill         +-----+-----+-----+-----+-----+
9673                  *              ^                               ^
9674                  *              |                               |
9675                  *              |                               |
9676                  *      l2arc_feed_thread               l2arc_rebuild
9677                  *      will place new bufs here        restores bufs here
9678                  *
9679                  * During l2arc_rebuild() the device is not used by
9680                  * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
9681                  */
9682                 size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
9683                 asize += vdev_psize_to_asize(dev->l2ad_vdev,
9684                     L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
9685                 l2arc_hdr_restore(&lb->lb_entries[i], dev);
9686         }
9687 
9688         /*
9689          * Record rebuild stats:
9690          *      size            Logical size of restored buffers in the L2ARC
9691          *      asize           Aligned size of restored buffers in the L2ARC
9692          */
9693         ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
9694         ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
9695         ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
9696         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
9697         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
9698         ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
9699 }
9700 
9701 /*
9702  * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
9703  * into a state indicating that it has been evicted to L2ARC.
9704  */
9705 static void
9706 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
9707 {
9708         arc_buf_hdr_t           *hdr, *exists;
9709         kmutex_t                *hash_lock;
9710         arc_buf_contents_t      type = L2BLK_GET_TYPE((le)->le_prop);
9711         uint64_t                asize;
9712 
9713         /*
9714          * Do all the allocation before grabbing any locks, this lets us
9715          * sleep if memory is full and we don't have to deal with failed
9716          * allocations.
9717          */
9718         hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
9719             dev, le->le_dva, le->le_daddr,
9720             L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
9721             L2BLK_GET_COMPRESS((le)->le_prop),
9722             L2BLK_GET_PROTECTED((le)->le_prop),
9723             L2BLK_GET_PREFETCH((le)->le_prop),
9724             L2BLK_GET_STATE((le)->le_prop));
9725         asize = vdev_psize_to_asize(dev->l2ad_vdev,
9726             L2BLK_GET_PSIZE((le)->le_prop));
9727 
9728         /*
9729          * vdev_space_update() has to be called before arc_hdr_destroy() to
9730          * avoid underflow since the latter also calls vdev_space_update().
9731          */
9732         l2arc_hdr_arcstats_increment(hdr);
9733         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
9734 
9735         mutex_enter(&dev->l2ad_mtx);
9736         list_insert_tail(&dev->l2ad_buflist, hdr);
9737         (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
9738         mutex_exit(&dev->l2ad_mtx);
9739 
9740         exists = buf_hash_insert(hdr, &hash_lock);
9741         if (exists) {
9742                 /* Buffer was already cached, no need to restore it. */
9743                 arc_hdr_destroy(hdr);
9744                 /*
9745                  * If the buffer is already cached, check whether it has
9746                  * L2ARC metadata. If not, enter them and update the flag.
9747                  * This is important is case of onlining a cache device, since
9748                  * we previously evicted all L2ARC metadata from ARC.
9749                  */
9750                 if (!HDR_HAS_L2HDR(exists)) {
9751                         arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
9752                         exists->b_l2hdr.b_dev = dev;
9753                         exists->b_l2hdr.b_daddr = le->le_daddr;
9754                         exists->b_l2hdr.b_arcs_state =
9755                             L2BLK_GET_STATE((le)->le_prop);
9756                         mutex_enter(&dev->l2ad_mtx);
9757                         list_insert_tail(&dev->l2ad_buflist, exists);
9758                         (void) zfs_refcount_add_many(&dev->l2ad_alloc,
9759                             arc_hdr_size(exists), exists);
9760                         mutex_exit(&dev->l2ad_mtx);
9761                         l2arc_hdr_arcstats_increment(exists);
9762                         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
9763                 }
9764                 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
9765         }
9766 
9767         mutex_exit(hash_lock);
9768 }
9769 
9770 /*
9771  * Starts an asynchronous read IO to read a log block. This is used in log
9772  * block reconstruction to start reading the next block before we are done
9773  * decoding and reconstructing the current block, to keep the l2arc device
9774  * nice and hot with read IO to process.
9775  * The returned zio will contain newly allocated memory buffers for the IO
9776  * data which should then be freed by the caller once the zio is no longer
9777  * needed (i.e. due to it having completed). If you wish to abort this
9778  * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
9779  * care of disposing of the allocated buffers correctly.
9780  */
9781 static zio_t *
9782 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
9783     l2arc_log_blk_phys_t *lb)
9784 {
9785         uint32_t                asize;
9786         zio_t                   *pio;
9787         l2arc_read_callback_t   *cb;
9788 
9789         /* L2BLK_GET_PSIZE returns aligned size for log blocks */
9790         asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
9791         ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
9792 
9793         cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
9794         cb->l2rcb_abd = abd_get_from_buf(lb, asize);
9795         pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
9796             ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
9797             ZIO_FLAG_DONT_RETRY);
9798         (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
9799             cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
9800             ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
9801             ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
9802 
9803         return (pio);
9804 }
9805 
/*
 * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
 * buffers allocated for it.
 *
 * The zio is not cancelled: this function blocks in zio_wait() until the
 * fetch has completed and then discards its result.
 * NOTE(review): the buffers are presumably released by the zio's done
 * callback (l2arc_blk_fetch_done, registered in l2arc_log_blk_fetch) as
 * part of zio_wait() -- confirm against that callback.
 */
static void
l2arc_log_blk_fetch_abort(zio_t *zio)
{
        /* The completion status is irrelevant for an abandoned fetch. */
        (void) zio_wait(zio);
}
9815 
9816 /*
9817  * Creates a zio to update the device header on an l2arc device.
9818  */
9819 static void
9820 l2arc_dev_hdr_update(l2arc_dev_t *dev)
9821 {
9822         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
9823         const uint64_t          l2dhdr_asize = dev->l2ad_dev_hdr_asize;
9824         abd_t                   *abd;
9825         int                     err;
9826 
9827         VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
9828 
9829         l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
9830         l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
9831         l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
9832         l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
9833         l2dhdr->dh_log_entries = dev->l2ad_log_entries;
9834         l2dhdr->dh_evict = dev->l2ad_evict;
9835         l2dhdr->dh_start = dev->l2ad_start;
9836         l2dhdr->dh_end = dev->l2ad_end;
9837         l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
9838         l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
9839         l2dhdr->dh_flags = 0;
9840         if (dev->l2ad_first)
9841                 l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
9842 
9843         abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
9844 
9845         err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
9846             VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
9847             NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
9848 
9849         abd_put(abd);
9850 
9851         if (err != 0) {
9852                 zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
9853                     "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
9854         }
9855 }
9856 
9857 /*
9858  * Commits a log block to the L2ARC device. This routine is invoked from
9859  * l2arc_write_buffers when the log block fills up.
9860  * This function allocates some memory to temporarily hold the serialized
9861  * buffer to be written. This is then released in l2arc_write_done.
9862  */
9863 static void
9864 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
9865 {
9866         l2arc_log_blk_phys_t    *lb = &dev->l2ad_log_blk;
9867         l2arc_dev_hdr_phys_t    *l2dhdr = dev->l2ad_dev_hdr;
9868         uint64_t                psize, asize;
9869         zio_t                   *wzio;
9870         l2arc_lb_abd_buf_t      *abd_buf;
9871         uint8_t                 *tmpbuf;
9872         l2arc_lb_ptr_buf_t      *lb_ptr_buf;
9873 
9874         VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
9875 
9876         tmpbuf = zio_buf_alloc(sizeof (*lb));
9877         abd_buf = zio_buf_alloc(sizeof (*abd_buf));
9878         abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
9879         lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
9880         lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
9881 
9882         /* link the buffer into the block chain */
9883         lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
9884         lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
9885 
9886         /*
9887          * l2arc_log_blk_commit() may be called multiple times during a single
9888          * l2arc_write_buffers() call. Save the allocated abd buffers in a list
9889          * so we can free them in l2arc_write_done() later on.
9890          */
9891         list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
9892 
9893         /* try to compress the buffer */
9894         psize = zio_compress_data(ZIO_COMPRESS_LZ4,
9895             abd_buf->abd, tmpbuf, sizeof (*lb));
9896 
9897         /* a log block is never entirely zero */
9898         ASSERT(psize != 0);
9899         asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
9900         ASSERT(asize <= sizeof (*lb));
9901 
9902         /*
9903          * Update the start log block pointer in the device header to point
9904          * to the log block we're about to write.
9905          */
9906         l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
9907         l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
9908         l2dhdr->dh_start_lbps[0].lbp_payload_asize =
9909             dev->l2ad_log_blk_payload_asize;
9910         l2dhdr->dh_start_lbps[0].lbp_payload_start =
9911             dev->l2ad_log_blk_payload_start;
9912         _NOTE(CONSTCOND)
9913         L2BLK_SET_LSIZE(
9914             (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
9915         L2BLK_SET_PSIZE(
9916             (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
9917         L2BLK_SET_CHECKSUM(
9918             (&l2dhdr->dh_start_lbps[0])->lbp_prop,
9919             ZIO_CHECKSUM_FLETCHER_4);
9920         if (asize < sizeof (*lb)) {
9921                 /* compression succeeded */
9922                 bzero(tmpbuf + psize, asize - psize);
9923                 L2BLK_SET_COMPRESS(
9924                     (&l2dhdr->dh_start_lbps[0])->lbp_prop,
9925                     ZIO_COMPRESS_LZ4);
9926         } else {
9927                 /* compression failed */
9928                 bcopy(lb, tmpbuf, sizeof (*lb));
9929                 L2BLK_SET_COMPRESS(
9930                     (&l2dhdr->dh_start_lbps[0])->lbp_prop,
9931                     ZIO_COMPRESS_OFF);
9932         }
9933 
9934         /* checksum what we're about to write */
9935         fletcher_4_native(tmpbuf, asize, NULL,
9936             &l2dhdr->dh_start_lbps[0].lbp_cksum);
9937 
9938         abd_put(abd_buf->abd);
9939 
9940         /* perform the write itself */
9941         abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
9942         abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
9943         wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
9944             asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
9945             ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
9946         DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
9947         (void) zio_nowait(wzio);
9948 
9949         dev->l2ad_hand += asize;
9950         /*
9951          * Include the committed log block's pointer  in the list of pointers
9952          * to log blocks present in the L2ARC device.
9953          */
9954         bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
9955             sizeof (l2arc_log_blkptr_t));
9956         mutex_enter(&dev->l2ad_mtx);
9957         list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
9958         ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
9959         ARCSTAT_BUMP(arcstat_l2_log_blk_count);
9960         zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
9961         zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
9962         mutex_exit(&dev->l2ad_mtx);
9963         vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
9964 
9965         /* bump the kstats */
9966         ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
9967         ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
9968         ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
9969         ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
9970             dev->l2ad_log_blk_payload_asize / asize);
9971 
9972         /* start a new log block */
9973         dev->l2ad_log_ent_idx = 0;
9974         dev->l2ad_log_blk_payload_asize = 0;
9975         dev->l2ad_log_blk_payload_start = 0;
9976 }
9977 
9978 /*
9979  * Validates an L2ARC log block address to make sure that it can be read
9980  * from the provided L2ARC device.
9981  */
9982 boolean_t
9983 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
9984 {
9985         /* L2BLK_GET_PSIZE returns aligned size for log blocks */
9986         uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
9987         uint64_t end = lbp->lbp_daddr + asize - 1;
9988         uint64_t start = lbp->lbp_payload_start;
9989         boolean_t evicted = B_FALSE;
9990 
9991         /* BEGIN CSTYLED */
9992         /*
9993          * A log block is valid if all of the following conditions are true:
9994          * - it fits entirely (including its payload) between l2ad_start and
9995          *   l2ad_end
9996          * - it has a valid size
9997          * - neither the log block itself nor part of its payload was evicted
9998          *   by l2arc_evict():
9999          *
10000          *              l2ad_hand          l2ad_evict
10001          *              |                        |      lbp_daddr
10002          *              |     start              |      |  end
10003          *              |     |                  |      |  |
10004          *              V     V                  V      V  V
10005          *   l2ad_start ============================================ l2ad_end
10006          *                    --------------------------||||
10007          *                              ^                ^
10008          *                              |               log block
10009          *                              payload
10010          */
10011         /* END CSTYLED */
10012         evicted =
10013             l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
10014             l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
10015             l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
10016             l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
10017 
10018         return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
10019             asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
10020             (!evicted || dev->l2ad_first));
10021 }
10022 
10023 /*
10024  * Inserts ARC buffer header `hdr' into the current L2ARC log block on
10025  * the device. The buffer being inserted must be present in L2ARC.
10026  * Returns B_TRUE if the L2ARC log block is full and needs to be committed
10027  * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
10028  */
10029 static boolean_t
10030 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
10031 {
10032         l2arc_log_blk_phys_t    *lb = &dev->l2ad_log_blk;
10033         l2arc_log_ent_phys_t    *le;
10034 
10035         if (dev->l2ad_log_entries == 0)
10036                 return (B_FALSE);
10037 
10038         int index = dev->l2ad_log_ent_idx++;
10039 
10040         ASSERT3S(index, <, dev->l2ad_log_entries);
10041         ASSERT(HDR_HAS_L2HDR(hdr));
10042 
10043         le = &lb->lb_entries[index];
10044         bzero(le, sizeof (*le));
10045         le->le_dva = hdr->b_dva;
10046         le->le_birth = hdr->b_birth;
10047         le->le_daddr = hdr->b_l2hdr.b_daddr;
10048         if (index == 0)
10049                 dev->l2ad_log_blk_payload_start = le->le_daddr;
10050         L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
10051         L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
10052         L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
10053         L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
10054         L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
10055         L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
10056         L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
10057 
10058         dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
10059             HDR_GET_PSIZE(hdr));
10060 
10061         return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
10062 }
10063 
10064 /*
10065  * Checks whether a given L2ARC device address sits in a time-sequential
10066  * range. The trick here is that the L2ARC is a rotary buffer, so we can't
10067  * just do a range comparison, we need to handle the situation in which the
10068  * range wraps around the end of the L2ARC device. Arguments:
10069  *      bottom -- Lower end of the range to check (written to earlier).
10070  *      top    -- Upper end of the range to check (written to later).
10071  *      check  -- The address for which we want to determine if it sits in
10072  *                between the top and bottom.
10073  *
10074  * The 3-way conditional below represents the following cases:
10075  *
10076  *      bottom < top : Sequentially ordered case:
10077  *        <check>--------+-------------------+
10078  *                       |  (overlap here?)  |
10079  *       L2ARC dev       V                   V
10080  *       |---------------<bottom>============<top>--------------|
10081  *
10082  *      bottom > top: Looped-around case:
10083  *                            <check>--------+------------------+
10084  *                                           |  (overlap here?) |
10085  *       L2ARC dev                           V                  V
10086  *       |===============<top>---------------<bottom>===========|
10087  *       ^               ^
10088  *       |  (or here?)   |
10089  *       +---------------+---------<check>
10090  *
10091  *      top == bottom : Just a single address comparison.
10092  */
10093 boolean_t
10094 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
10095 {
10096         if (bottom < top)
10097                 return (bottom <= check && check <= top);
10098         else if (bottom > top)
10099                 return (check <= top || bottom <= check);
10100         else
10101                 return (check == top);
10102 }