1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2017, Joyent, Inc.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  26  * Copyright 2018, Joyent, Inc.
  27  * Copyright 2020 Oxide Computer Company
  28  */
  29 
  30 /*
  31  * Kernel memory allocator, as described in the following two papers and a
  32  * statement about the consolidator:
  33  *
  34  * Jeff Bonwick,
  35  * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
  36  * Proceedings of the Summer 1994 Usenix Conference.
  37  * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
  38  *
  39  * Jeff Bonwick and Jonathan Adams,
  40  * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
  41  * Arbitrary Resources.
  42  * Proceedings of the 2001 Usenix Conference.
  43  * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
  44  *
  45  * kmem Slab Consolidator Big Theory Statement:
  46  *
  47  * 1. Motivation
  48  *
  49  * As stated in Bonwick94, slabs provide the following advantages over other
  50  * allocation structures in terms of memory fragmentation:
  51  *
  52  *  - Internal fragmentation (per-buffer wasted space) is minimal.
  53  *  - Severe external fragmentation (unused buffers on the free list) is
  54  *    unlikely.
  55  *
  56  * Segregating objects by size eliminates one source of external fragmentation,
  57  * and according to Bonwick:
  58  *
  59  *   The other reason that slabs reduce external fragmentation is that all
  60  *   objects in a slab are of the same type, so they have the same lifetime
  61  *   distribution. The resulting segregation of short-lived and long-lived
  62  *   objects at slab granularity reduces the likelihood of an entire page being
  63  *   held hostage due to a single long-lived allocation [Barrett93, Hanson90].
  64  *
  65  * While unlikely, severe external fragmentation remains possible. Clients that
  66  * allocate both short- and long-lived objects from the same cache cannot
  67  * anticipate the distribution of long-lived objects within the allocator's slab
  68  * implementation. Even a small percentage of long-lived objects distributed
  69  * randomly across many slabs can lead to a worst case scenario where the client
  70  * frees the majority of its objects and the system gets back almost none of the
  71  * slabs. Despite the client doing what it reasonably can to help the system
  72  * reclaim memory, the allocator cannot shake free enough slabs because of
  73  * lonely allocations stubbornly hanging on. Although the allocator is in a
  74  * position to diagnose the fragmentation, there is nothing that the allocator
  75  * by itself can do about it. It only takes a single allocated object to prevent
  76  * an entire slab from being reclaimed, and any object handed out by
  77  * kmem_cache_alloc() is by definition in the client's control. Conversely,
  78  * although the client is in a position to move a long-lived object, it has no
  79  * way of knowing if the object is causing fragmentation, and if so, where to
  80  * move it. A solution necessarily requires further cooperation between the
  81  * allocator and the client.
  82  *
  83  * 2. Move Callback
  84  *
  85  * The kmem slab consolidator therefore adds a move callback to the
  86  * allocator/client interface, improving worst-case external fragmentation in
  87  * kmem caches that supply a function to move objects from one memory location
 * to another. In a situation of low memory, kmem attempts to consolidate all of
  89  * a cache's slabs at once; otherwise it works slowly to bring external
  90  * fragmentation within the 1/8 limit guaranteed for internal fragmentation,
  91  * thereby helping to avoid a low memory situation in the future.
  92  *
  93  * The callback has the following signature:
  94  *
  95  *   kmem_cbrc_t move(void *old, void *new, size_t size, void *user_arg)
  96  *
  97  * It supplies the kmem client with two addresses: the allocated object that
  98  * kmem wants to move and a buffer selected by kmem for the client to use as the
  99  * copy destination. The callback is kmem's way of saying "Please get off of
 100  * this buffer and use this one instead." kmem knows where it wants to move the
 101  * object in order to best reduce fragmentation. All the client needs to know
 102  * about the second argument (void *new) is that it is an allocated, constructed
 103  * object ready to take the contents of the old object. When the move function
 104  * is called, the system is likely to be low on memory, and the new object
 105  * spares the client from having to worry about allocating memory for the
 106  * requested move. The third argument supplies the size of the object, in case a
 107  * single move function handles multiple caches whose objects differ only in
 108  * size (such as zio_buf_512, zio_buf_1024, etc). Finally, the same optional
 109  * user argument passed to the constructor, destructor, and reclaim functions is
 110  * also passed to the move callback.
 111  *
 112  * 2.1 Setting the Move Callback
 113  *
 114  * The client sets the move callback after creating the cache and before
 115  * allocating from it:
 116  *
 117  *      object_cache = kmem_cache_create(...);
 118  *      kmem_cache_set_move(object_cache, object_move);
 119  *
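 * For example, a hypothetical client caching object_t structures (using the
 * object_construct() and object_destruct() pair sketched in section 2.3.2 and
 * the object_move() callback shown later) might set up its cache as follows;
 * the names are illustrative, not part of the kmem interface:
 *
 *      object_cache = kmem_cache_create("object_cache", sizeof (object_t),
 *          0, object_construct, object_destruct, NULL, NULL, NULL, 0);
 *      kmem_cache_set_move(object_cache, object_move);
 *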
 120  * 2.2 Move Callback Return Values
 121  *
 * Only the client knows its own data and when it is a good time to move it.
 123  * The client is cooperating with kmem to return unused memory to the system,
 124  * and kmem respectfully accepts this help at the client's convenience. When
 125  * asked to move an object, the client can respond with any of the following:
 126  *
 127  *   typedef enum kmem_cbrc {
 128  *           KMEM_CBRC_YES,
 129  *           KMEM_CBRC_NO,
 130  *           KMEM_CBRC_LATER,
 131  *           KMEM_CBRC_DONT_NEED,
 132  *           KMEM_CBRC_DONT_KNOW
 133  *   } kmem_cbrc_t;
 134  *
 135  * The client must not explicitly kmem_cache_free() either of the objects passed
 136  * to the callback, since kmem wants to free them directly to the slab layer
 137  * (bypassing the per-CPU magazine layer). The response tells kmem which of the
 138  * objects to free:
 139  *
 140  *       YES: (Did it) The client moved the object, so kmem frees the old one.
 141  *        NO: (Never) The client refused, so kmem frees the new object (the
 142  *            unused copy destination). kmem also marks the slab of the old
 143  *            object so as not to bother the client with further callbacks for
 144  *            that object as long as the slab remains on the partial slab list.
 145  *            (The system won't be getting the slab back as long as the
 146  *            immovable object holds it hostage, so there's no point in moving
 147  *            any of its objects.)
 148  *     LATER: The client is using the object and cannot move it now, so kmem
 149  *            frees the new object (the unused copy destination). kmem still
 150  *            attempts to move other objects off the slab, since it expects to
 151  *            succeed in clearing the slab in a later callback. The client
 152  *            should use LATER instead of NO if the object is likely to become
 153  *            movable very soon.
 154  * DONT_NEED: The client no longer needs the object, so kmem frees the old along
 155  *            with the new object (the unused copy destination). This response
 156  *            is the client's opportunity to be a model citizen and give back as
 157  *            much as it can.
 158  * DONT_KNOW: The client does not know about the object because
 159  *            a) the client has just allocated the object and not yet put it
 *               wherever it expects to find known objects,
 161  *            b) the client has removed the object from wherever it expects to
 162  *               find known objects and is about to free it, or
 163  *            c) the client has freed the object.
 164  *            In all these cases (a, b, and c) kmem frees the new object (the
 165  *            unused copy destination).  In the first case, the object is in
 166  *            use and the correct action is that for LATER; in the latter two
 167  *            cases, we know that the object is either freed or about to be
 168  *            freed, in which case it is either already in a magazine or about
 169  *            to be in one.  In these cases, we know that the object will either
 170  *            be reallocated and reused, or it will end up in a full magazine
 171  *            that will be reaped (thereby liberating the slab).  Because it
 172  *            is prohibitively expensive to differentiate these cases, and
 173  *            because the defrag code is executed when we're low on memory
 174  *            (thereby biasing the system to reclaim full magazines) we treat
 175  *            all DONT_KNOW cases as LATER and rely on cache reaping to
 176  *            generally clean up full magazines.  While we take the same action
 177  *            for these cases, we maintain their semantic distinction:  if
 178  *            defragmentation is not occurring, it is useful to know if this
 179  *            is due to objects in use (LATER) or objects in an unknown state
 180  *            of transition (DONT_KNOW).
 181  *
 182  * 2.3 Object States
 183  *
 184  * Neither kmem nor the client can be assumed to know the object's whereabouts
 185  * at the time of the callback. An object belonging to a kmem cache may be in
 186  * any of the following states:
 187  *
 188  * 1. Uninitialized on the slab
 189  * 2. Allocated from the slab but not constructed (still uninitialized)
 190  * 3. Allocated from the slab, constructed, but not yet ready for business
 191  *    (not in a valid state for the move callback)
 192  * 4. In use (valid and known to the client)
 193  * 5. About to be freed (no longer in a valid state for the move callback)
 194  * 6. Freed to a magazine (still constructed)
 195  * 7. Allocated from a magazine, not yet ready for business (not in a valid
 196  *    state for the move callback), and about to return to state #4
 197  * 8. Deconstructed on a magazine that is about to be freed
 198  * 9. Freed to the slab
 199  *
 200  * Since the move callback may be called at any time while the object is in any
 201  * of the above states (except state #1), the client needs a safe way to
 202  * determine whether or not it knows about the object. Specifically, the client
 203  * needs to know whether or not the object is in state #4, the only state in
 204  * which a move is valid. If the object is in any other state, the client should
 205  * immediately return KMEM_CBRC_DONT_KNOW, since it is unsafe to access any of
 206  * the object's fields.
 207  *
 208  * Note that although an object may be in state #4 when kmem initiates the move
 209  * request, the object may no longer be in that state by the time kmem actually
 210  * calls the move function. Not only does the client free objects
 * asynchronously, kmem itself puts move requests on a queue where they are
 212  * pending until kmem processes them from another context. Also, objects freed
 213  * to a magazine appear allocated from the point of view of the slab layer, so
 214  * kmem may even initiate requests for objects in a state other than state #4.
 215  *
 216  * 2.3.1 Magazine Layer
 217  *
 218  * An important insight revealed by the states listed above is that the magazine
 219  * layer is populated only by kmem_cache_free(). Magazines of constructed
 220  * objects are never populated directly from the slab layer (which contains raw,
 221  * unconstructed objects). Whenever an allocation request cannot be satisfied
 222  * from the magazine layer, the magazines are bypassed and the request is
 223  * satisfied from the slab layer (creating a new slab if necessary). kmem calls
 224  * the object constructor only when allocating from the slab layer, and only in
 225  * response to kmem_cache_alloc() or to prepare the destination buffer passed in
 226  * the move callback. kmem does not preconstruct objects in anticipation of
 227  * kmem_cache_alloc().
 228  *
 229  * 2.3.2 Object Constructor and Destructor
 230  *
 231  * If the client supplies a destructor, it must be valid to call the destructor
 232  * on a newly created object (immediately after the constructor).
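 *
 * A minimal sketch of a constructor/destructor pair that satisfies this
 * requirement, for a hypothetical object_t with an embedded lock (the names
 * here are illustrative, not part of the kmem interface):
 *
 *      static int
 *      object_construct(void *buf, void *user_arg, int kmflags)
 *      {
 *              object_t *op = buf;
 *
 *              mutex_init(&op->o_lock, NULL, MUTEX_DEFAULT, NULL);
 *              // mark as not (yet) a known object; see section 2.4
 *              op->o_container = (void *)(uintptr_t)1;
 *              return (0);
 *      }
 *
 *      static void
 *      object_destruct(void *buf, void *user_arg)
 *      {
 *              object_t *op = buf;
 *
 *              // must be safe immediately after object_construct()
 *              mutex_destroy(&op->o_lock);
 *      }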
 233  *
 234  * 2.4 Recognizing Known Objects
 235  *
 236  * There is a simple test to determine safely whether or not the client knows
 237  * about a given object in the move callback. It relies on the fact that kmem
 238  * guarantees that the object of the move callback has only been touched by the
 239  * client itself or else by kmem. kmem does this by ensuring that none of the
 240  * cache's slabs are freed to the virtual memory (VM) subsystem while a move
 241  * callback is pending. When the last object on a slab is freed, if there is a
 242  * pending move, kmem puts the slab on a per-cache dead list and defers freeing
 243  * slabs on that list until all pending callbacks are completed. That way,
 244  * clients can be certain that the object of a move callback is in one of the
 245  * states listed above, making it possible to distinguish known objects (in
 246  * state #4) using the two low order bits of any pointer member (with the
 247  * exception of 'char *' or 'short *' which may not be 4-byte aligned on some
 248  * platforms).
 249  *
 250  * The test works as long as the client always transitions objects from state #4
 251  * (known, in use) to state #5 (about to be freed, invalid) by setting the low
 252  * order bit of the client-designated pointer member. Since kmem only writes
 253  * invalid memory patterns, such as 0xbaddcafe to uninitialized memory and
 254  * 0xdeadbeef to freed memory, any scribbling on the object done by kmem is
 255  * guaranteed to set at least one of the two low order bits. Therefore, given an
 256  * object with a back pointer to a 'container_t *o_container', the client can
 257  * test
 258  *
 259  *      container_t *container = object->o_container;
 260  *      if ((uintptr_t)container & 0x3) {
 261  *              return (KMEM_CBRC_DONT_KNOW);
 262  *      }
 263  *
 264  * Typically, an object will have a pointer to some structure with a list or
 265  * hash where objects from the cache are kept while in use. Assuming that the
 266  * client has some way of knowing that the container structure is valid and will
 267  * not go away during the move, and assuming that the structure includes a lock
 268  * to protect whatever collection is used, then the client would continue as
 269  * follows:
 270  *
 271  *      // Ensure that the container structure does not go away.
 272  *      if (container_hold(container) == 0) {
 273  *              return (KMEM_CBRC_DONT_KNOW);
 274  *      }
 275  *      mutex_enter(&container->c_objects_lock);
 276  *      if (container != object->o_container) {
 277  *              mutex_exit(&container->c_objects_lock);
 278  *              container_rele(container);
 279  *              return (KMEM_CBRC_DONT_KNOW);
 280  *      }
 281  *
 282  * At this point the client knows that the object cannot be freed as long as
 283  * c_objects_lock is held. Note that after acquiring the lock, the client must
 284  * recheck the o_container pointer in case the object was removed just before
 285  * acquiring the lock.
 286  *
 287  * When the client is about to free an object, it must first remove that object
 288  * from the list, hash, or other structure where it is kept. At that time, to
 289  * mark the object so it can be distinguished from the remaining, known objects,
 290  * the client sets the designated low order bit:
 291  *
 292  *      mutex_enter(&container->c_objects_lock);
 293  *      object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
 294  *      list_remove(&container->c_objects, object);
 295  *      mutex_exit(&container->c_objects_lock);
 296  *
 297  * In the common case, the object is freed to the magazine layer, where it may
 298  * be reused on a subsequent allocation without the overhead of calling the
 299  * constructor. While in the magazine it appears allocated from the point of
 300  * view of the slab layer, making it a candidate for the move callback. Most
 301  * objects unrecognized by the client in the move callback fall into this
 302  * category and are cheaply distinguished from known objects by the test
 303  * described earlier. Because searching magazines is prohibitively expensive
 304  * for kmem, clients that do not mark freed objects (and therefore return
 305  * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
 306  * efficacy reduced.
 307  *
 308  * Invalidating the designated pointer member before freeing the object marks
 309  * the object to be avoided in the callback, and conversely, assigning a valid
 310  * value to the designated pointer member after allocating the object makes the
 311  * object fair game for the callback:
 312  *
 313  *      ... allocate object ...
 314  *      ... set any initial state not set by the constructor ...
 315  *
 316  *      mutex_enter(&container->c_objects_lock);
 317  *      list_insert_tail(&container->c_objects, object);
 318  *      membar_producer();
 319  *      object->o_container = container;
 320  *      mutex_exit(&container->c_objects_lock);
 321  *
 * Note that everything else must be valid before o_container is set, since
 * setting o_container is what makes the object fair game for the move
 * callback. The membar_producer() call ensures
 324  * that all the object's state is written to memory before setting the pointer
 325  * that transitions the object from state #3 or #7 (allocated, constructed, not
 326  * yet in use) to state #4 (in use, valid). That's important because the move
 327  * function has to check the validity of the pointer before it can safely
 328  * acquire the lock protecting the collection where it expects to find known
 329  * objects.
 330  *
 331  * This method of distinguishing known objects observes the usual symmetry:
 332  * invalidating the designated pointer is the first thing the client does before
 333  * freeing the object, and setting the designated pointer is the last thing the
 334  * client does after allocating the object. Of course, the client is not
 335  * required to use this method. Fundamentally, how the client recognizes known
 336  * objects is completely up to the client, but this method is recommended as an
 337  * efficient and safe way to take advantage of the guarantees made by kmem. If
 338  * the entire object is arbitrary data without any markable bits from a suitable
 339  * pointer member, then the client must find some other method, such as
 340  * searching a hash table of known objects.
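 *
 * For instance, a client whose objects carry no markable pointer member might
 * keep every known object in a hash table and begin its move callback with a
 * lookup; known_objects, known_objects_lock, and known_object_lookup() below
 * are hypothetical names standing in for whatever lookup structure the client
 * already maintains:
 *
 *      mutex_enter(&known_objects_lock);
 *      if (!known_object_lookup(known_objects, object)) {
 *              // not (or no longer) an object the client knows about
 *              mutex_exit(&known_objects_lock);
 *              return (KMEM_CBRC_DONT_KNOW);
 *      }
 *      // while known_objects_lock is held, the object cannot be freed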
 341  *
 342  * 2.5 Preventing Objects From Moving
 343  *
 344  * Besides a way to distinguish known objects, the other thing that the client
 345  * needs is a strategy to ensure that an object will not move while the client
 346  * is actively using it. The details of satisfying this requirement tend to be
 347  * highly cache-specific. It might seem that the same rules that let a client
 348  * remove an object safely should also decide when an object can be moved
 349  * safely. However, any object state that makes a removal attempt invalid is
 350  * likely to be long-lasting for objects that the client does not expect to
 351  * remove. kmem knows nothing about the object state and is equally likely (from
 352  * the client's point of view) to request a move for any object in the cache,
 353  * whether prepared for removal or not. Even a low percentage of objects stuck
 354  * in place by unremovability will defeat the consolidator if the stuck objects
 355  * are the same long-lived allocations likely to hold slabs hostage.
 356  * Fundamentally, the consolidator is not aimed at common cases. Severe external
 357  * fragmentation is a worst case scenario manifested as sparsely allocated
 358  * slabs, by definition a low percentage of the cache's objects. When deciding
 359  * what makes an object movable, keep in mind the goal of the consolidator: to
 360  * bring worst-case external fragmentation within the limits guaranteed for
 361  * internal fragmentation. Removability is a poor criterion if it is likely to
 362  * exclude more than an insignificant percentage of objects for long periods of
 363  * time.
 364  *
 365  * A tricky general solution exists, and it has the advantage of letting you
 366  * move any object at almost any moment, practically eliminating the likelihood
 367  * that an object can hold a slab hostage. However, if there is a cache-specific
 368  * way to ensure that an object is not actively in use in the vast majority of
 369  * cases, a simpler solution that leverages this cache-specific knowledge is
 370  * preferred.
 371  *
 372  * 2.5.1 Cache-Specific Solution
 373  *
 374  * As an example of a cache-specific solution, the ZFS znode cache takes
 375  * advantage of the fact that the vast majority of znodes are only being
 376  * referenced from the DNLC. (A typical case might be a few hundred in active
 377  * use and a hundred thousand in the DNLC.) In the move callback, after the ZFS
 378  * client has established that it recognizes the znode and can access its fields
 379  * safely (using the method described earlier), it then tests whether the znode
 380  * is referenced by anything other than the DNLC. If so, it assumes that the
 381  * znode may be in active use and is unsafe to move, so it drops its locks and
 382  * returns KMEM_CBRC_LATER. The advantage of this strategy is that everywhere
 383  * else znodes are used, no change is needed to protect against the possibility
 384  * of the znode moving. The disadvantage is that it remains possible for an
 385  * application to hold a znode slab hostage with an open file descriptor.
 386  * However, this case ought to be rare and the consolidator has a way to deal
 387  * with it: If the client responds KMEM_CBRC_LATER repeatedly for the same
 388  * object, kmem eventually stops believing it and treats the slab as if the
 389  * client had responded KMEM_CBRC_NO. Having marked the hostage slab, kmem can
 390  * then focus on getting it off of the partial slab list by allocating rather
 391  * than freeing all of its objects. (Either way of getting a slab off the
 392  * free list reduces fragmentation.)
 393  *
 394  * 2.5.2 General Solution
 395  *
 396  * The general solution, on the other hand, requires an explicit hold everywhere
 397  * the object is used to prevent it from moving. To keep the client locking
 398  * strategy as uncomplicated as possible, kmem guarantees the simplifying
 399  * assumption that move callbacks are sequential, even across multiple caches.
 400  * Internally, a global queue processed by a single thread supports all caches
 401  * implementing the callback function. No matter how many caches supply a move
 402  * function, the consolidator never moves more than one object at a time, so the
 403  * client does not have to worry about tricky lock ordering involving several
 404  * related objects from different kmem caches.
 405  *
 406  * The general solution implements the explicit hold as a read-write lock, which
 407  * allows multiple readers to access an object from the cache simultaneously
 408  * while a single writer is excluded from moving it. A single rwlock for the
 409  * entire cache would lock out all threads from using any of the cache's objects
 410  * even though only a single object is being moved, so to reduce contention,
 411  * the client can fan out the single rwlock into an array of rwlocks hashed by
 412  * the object address, making it probable that moving one object will not
 413  * prevent other threads from using a different object. The rwlock cannot be a
 414  * member of the object itself, because the possibility of the object moving
 415  * makes it unsafe to access any of the object's fields until the lock is
 416  * acquired.
 417  *
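 * One way to set up such a fanned-out array, and the OBJECT_RWLOCK() macro
 * used in the examples below, is sketched here (the table size and the hash
 * shift are arbitrary illustrative choices):
 *
 *      #define OBJECT_RWLOCK_COUNT     64      // fixed power of two
 *      // each entry is rw_init()'d once at client initialization
 *      static krwlock_t object_rwlock_table[OBJECT_RWLOCK_COUNT];
 *
 *      #define OBJECT_RWLOCK(op)                                       \
 *              (&object_rwlock_table[((uintptr_t)(op) >> 3) &          \
 *              (OBJECT_RWLOCK_COUNT - 1)])
 *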
 418  * Assuming a small, fixed number of locks, it's possible that multiple objects
 419  * will hash to the same lock. A thread that needs to use multiple objects in
 420  * the same function may acquire the same lock multiple times. Since rwlocks are
 421  * reentrant for readers, and since there is never more than a single writer at
 422  * a time (assuming that the client acquires the lock as a writer only when
 423  * moving an object inside the callback), there would seem to be no problem.
 424  * However, a client locking multiple objects in the same function must handle
 425  * one case of potential deadlock: Assume that thread A needs to prevent both
 426  * object 1 and object 2 from moving, and thread B, the callback, meanwhile
 427  * tries to move object 3. It's possible, if objects 1, 2, and 3 all hash to the
 428  * same lock, that thread A will acquire the lock for object 1 as a reader
 429  * before thread B sets the lock's write-wanted bit, preventing thread A from
 430  * reacquiring the lock for object 2 as a reader. Unable to make forward
 431  * progress, thread A will never release the lock for object 1, resulting in
 432  * deadlock.
 433  *
 434  * There are two ways of avoiding the deadlock just described. The first is to
 435  * use rw_tryenter() rather than rw_enter() in the callback function when
 436  * attempting to acquire the lock as a writer. If tryenter discovers that the
 437  * same object (or another object hashed to the same lock) is already in use, it
 438  * aborts the callback and returns KMEM_CBRC_LATER. The second way is to use
 439  * rprwlock_t (declared in common/fs/zfs/sys/rprwlock.h) instead of rwlock_t,
 440  * since it allows a thread to acquire the lock as a reader in spite of a
 441  * waiting writer. This second approach insists on moving the object now, no
 442  * matter how many readers the move function must wait for in order to do so,
 443  * and could delay the completion of the callback indefinitely (blocking
 444  * callbacks to other clients). In practice, a less insistent callback using
 445  * rw_tryenter() returns KMEM_CBRC_LATER infrequently enough that there seems
 446  * little reason to use anything else.
 447  *
 448  * Avoiding deadlock is not the only problem that an implementation using an
 449  * explicit hold needs to solve. Locking the object in the first place (to
 450  * prevent it from moving) remains a problem, since the object could move
 451  * between the time you obtain a pointer to the object and the time you acquire
 452  * the rwlock hashed to that pointer value. Therefore the client needs to
 453  * recheck the value of the pointer after acquiring the lock, drop the lock if
 454  * the value has changed, and try again. This requires a level of indirection:
 455  * something that points to the object rather than the object itself, that the
 456  * client can access safely while attempting to acquire the lock. (The object
 457  * itself cannot be referenced safely because it can move at any time.)
 458  * The following lock-acquisition function takes whatever is safe to reference
 459  * (arg), follows its pointer to the object (using function f), and tries as
 460  * often as necessary to acquire the hashed lock and verify that the object
 461  * still has not moved:
 462  *
 463  *      object_t *
 464  *      object_hold(object_f f, void *arg)
 465  *      {
 466  *              object_t *op;
 467  *
 468  *              op = f(arg);
 469  *              if (op == NULL) {
 470  *                      return (NULL);
 471  *              }
 472  *
 473  *              rw_enter(OBJECT_RWLOCK(op), RW_READER);
 474  *              while (op != f(arg)) {
 475  *                      rw_exit(OBJECT_RWLOCK(op));
 476  *                      op = f(arg);
 477  *                      if (op == NULL) {
 478  *                              break;
 479  *                      }
 480  *                      rw_enter(OBJECT_RWLOCK(op), RW_READER);
 481  *              }
 482  *
 483  *              return (op);
 484  *      }
 485  *
 486  * The OBJECT_RWLOCK macro hashes the object address to obtain the rwlock. The
 487  * lock reacquisition loop, while necessary, almost never executes. The function
 488  * pointer f (used to obtain the object pointer from arg) has the following type
 489  * definition:
 490  *
 491  *      typedef object_t *(*object_f)(void *arg);
 492  *
 493  * An object_f implementation is likely to be as simple as accessing a structure
 494  * member:
 495  *
 496  *      object_t *
 497  *      s_object(void *arg)
 498  *      {
 499  *              something_t *sp = arg;
 500  *              return (sp->s_object);
 501  *      }
 502  *
 503  * The flexibility of a function pointer allows the path to the object to be
 504  * arbitrarily complex and also supports the notion that depending on where you
 505  * are using the object, you may need to get it from someplace different.
 506  *
 507  * The function that releases the explicit hold is simpler because it does not
 508  * have to worry about the object moving:
 509  *
 510  *      void
 511  *      object_rele(object_t *op)
 512  *      {
 513  *              rw_exit(OBJECT_RWLOCK(op));
 514  *      }
 515  *
 516  * The caller is spared these details so that obtaining and releasing an
 517  * explicit hold feels like a simple mutex_enter()/mutex_exit() pair. The caller
 518  * of object_hold() only needs to know that the returned object pointer is valid
 519  * if not NULL and that the object will not move until released.
 520  *
 521  * Although object_hold() prevents an object from moving, it does not prevent it
 522  * from being freed. The caller must take measures before calling object_hold()
 523  * (afterwards is too late) to ensure that the held object cannot be freed. The
 524  * caller must do so without accessing the unsafe object reference, so any lock
 525  * or reference count used to ensure the continued existence of the object must
 526  * live outside the object itself.
 527  *
 528  * Obtaining a new object is a special case where an explicit hold is impossible
 529  * for the caller. Any function that returns a newly allocated object (either as
 * a return value, or as an in-out parameter) must return it already held;
 * afterwards is too late, since the object cannot be safely accessed
 532  * without the level of indirection described earlier. The following
 533  * object_alloc() example uses the same code shown earlier to transition a new
 534  * object into the state of being recognized (by the client) as a known object.
 535  * The function must acquire the hold (rw_enter) before that state transition
 536  * makes the object movable:
 537  *
 538  *      static object_t *
 539  *      object_alloc(container_t *container)
 540  *      {
 541  *              object_t *object = kmem_cache_alloc(object_cache, 0);
 542  *              ... set any initial state not set by the constructor ...
 543  *              rw_enter(OBJECT_RWLOCK(object), RW_READER);
 544  *              mutex_enter(&container->c_objects_lock);
 545  *              list_insert_tail(&container->c_objects, object);
 546  *              membar_producer();
 547  *              object->o_container = container;
 548  *              mutex_exit(&container->c_objects_lock);
 549  *              return (object);
 550  *      }
 551  *
 552  * Functions that implicitly acquire an object hold (any function that calls
 553  * object_alloc() to supply an object for the caller) need to be carefully noted
 554  * so that the matching object_rele() is not neglected. Otherwise, leaked holds
 555  * prevent all objects hashed to the affected rwlocks from ever being moved.
 556  *
 557  * The pointer to a held object can be hashed to the holding rwlock even after
 558  * the object has been freed. Although it is possible to release the hold
 559  * after freeing the object, you may decide to release the hold implicitly in
 560  * whatever function frees the object, so as to release the hold as soon as
 561  * possible, and for the sake of symmetry with the function that implicitly
 562  * acquires the hold when it allocates the object. Here, object_free() releases
 563  * the hold acquired by object_alloc(). Its implicit object_rele() forms a
 564  * matching pair with object_hold():
 565  *
 566  *      void
 567  *      object_free(object_t *object)
 568  *      {
 569  *              container_t *container;
 570  *
 571  *              ASSERT(object_held(object));
 572  *              container = object->o_container;
 573  *              mutex_enter(&container->c_objects_lock);
 574  *              object->o_container =
 575  *                  (void *)((uintptr_t)object->o_container | 0x1);
 576  *              list_remove(&container->c_objects, object);
 577  *              mutex_exit(&container->c_objects_lock);
 578  *              object_rele(object);
 579  *              kmem_cache_free(object_cache, object);
 580  *      }
 581  *
 582  * Note that object_free() cannot safely accept an object pointer as an argument
 583  * unless the object is already held. Any function that calls object_free()
 584  * needs to be carefully noted since it similarly forms a matching pair with
 585  * object_hold().
 586  *
 587  * To complete the picture, the following callback function implements the
 588  * general solution by moving objects only if they are currently unheld:
 589  *
 590  *      static kmem_cbrc_t
 591  *      object_move(void *buf, void *newbuf, size_t size, void *arg)
 592  *      {
 593  *              object_t *op = buf, *np = newbuf;
 594  *              container_t *container;
 595  *
 596  *              container = op->o_container;
 597  *              if ((uintptr_t)container & 0x3) {
 598  *                      return (KMEM_CBRC_DONT_KNOW);
 599  *              }
 600  *
 601  *              // Ensure that the container structure does not go away.
 602  *              if (container_hold(container) == 0) {
 603  *                      return (KMEM_CBRC_DONT_KNOW);
 604  *              }
 605  *
 606  *              mutex_enter(&container->c_objects_lock);
 607  *              if (container != op->o_container) {
 608  *                      mutex_exit(&container->c_objects_lock);
 609  *                      container_rele(container);
 610  *                      return (KMEM_CBRC_DONT_KNOW);
 611  *              }
 612  *
 613  *              if (rw_tryenter(OBJECT_RWLOCK(op), RW_WRITER) == 0) {
 614  *                      mutex_exit(&container->c_objects_lock);
 615  *                      container_rele(container);
 616  *                      return (KMEM_CBRC_LATER);
 617  *              }
 618  *
 619  *              object_move_impl(op, np); // critical section
 620  *              rw_exit(OBJECT_RWLOCK(op));
 621  *
 622  *              op->o_container = (void *)((uintptr_t)op->o_container | 0x1);
 623  *              list_link_replace(&op->o_link_node, &np->o_link_node);
 624  *              mutex_exit(&container->c_objects_lock);
 625  *              container_rele(container);
 626  *              return (KMEM_CBRC_YES);
 627  *      }
 628  *
 629  * Note that object_move() must invalidate the designated o_container pointer of
 630  * the old object in the same way that object_free() does, since kmem will free
 631  * the object in response to the KMEM_CBRC_YES return value.
 632  *
 633  * The lock order in object_move() differs from object_alloc(), which locks
 634  * OBJECT_RWLOCK first and &container->c_objects_lock second, but as long as the
 635  * callback uses rw_tryenter() (preventing the deadlock described earlier), it's
 636  * not a problem. Holding the lock on the object list in the example above
 637  * through the entire callback not only prevents the object from going away, it
 638  * also allows you to lock the list elsewhere and know that none of its elements
 639  * will move during iteration.
 640  *
 641  * Adding an explicit hold everywhere an object from the cache is used is tricky
 642  * and involves much more change to client code than a cache-specific solution
 643  * that leverages existing state to decide whether or not an object is
 644  * movable. However, this approach has the advantage that no object remains
 645  * immovable for any significant length of time, making it extremely unlikely
 646  * that long-lived allocations can continue holding slabs hostage; and it works
 647  * for any cache.
 648  *
 649  * 3. Consolidator Implementation
 650  *
 651  * Once the client supplies a move function that a) recognizes known objects and
 652  * b) avoids moving objects that are actively in use, the remaining work is up
 653  * to the consolidator to decide which objects to move and when to issue
 654  * callbacks.
 655  *
 656  * The consolidator relies on the fact that a cache's slabs are ordered by
 657  * usage. Each slab has a fixed number of objects. Depending on the slab's
 658  * "color" (the offset of the first object from the beginning of the slab;
 659  * offsets are staggered to mitigate false sharing of cache lines) it is either
 660  * the maximum number of objects per slab determined at cache creation time or
 661  * else the number closest to the maximum that fits within the space remaining
 662  * after the initial offset. A completely allocated slab may contribute some
 663  * internal fragmentation (per-slab overhead) but no external fragmentation, so
 664  * it is of no interest to the consolidator. At the other extreme, slabs whose
 665  * objects have all been freed to the slab are released to the virtual memory
 666  * (VM) subsystem (objects freed to magazines are still allocated as far as the
 667  * slab is concerned). External fragmentation exists when there are slabs
 668  * somewhere between these extremes. A partial slab has at least one but not all
 669  * of its objects allocated. The more partial slabs, and the fewer allocated
 670  * objects on each of them, the higher the fragmentation. Hence the
 671  * consolidator's overall strategy is to reduce the number of partial slabs by
 672  * moving allocated objects from the least allocated slabs to the most allocated
 673  * slabs.
 674  *
 675  * Partial slabs are kept in an AVL tree ordered by usage. Completely allocated
 676  * slabs are kept separately in an unordered list. Since the majority of slabs
 677  * tend to be completely allocated (a typical unfragmented cache may have
 678  * thousands of complete slabs and only a single partial slab), separating
 679  * complete slabs improves the efficiency of partial slab ordering, since the
 680  * complete slabs do not affect the depth or balance of the AVL tree. This
 681  * ordered sequence of partial slabs acts as a "free list" supplying objects for
 682  * allocation requests.
 683  *
 684  * Objects are always allocated from the first partial slab in the free list,
 685  * where the allocation is most likely to eliminate a partial slab (by
 686  * completely allocating it). Conversely, when a single object from a completely
 687  * allocated slab is freed to the slab, that slab is added to the front of the
 688  * free list. Since most free list activity involves highly allocated slabs
 689  * coming and going at the front of the list, slabs tend naturally toward the
 690  * ideal order: highly allocated at the front, sparsely allocated at the back.
 691  * Slabs with few allocated objects are likely to become completely free if they
 692  * keep a safe distance away from the front of the free list. Slab misorders
 693  * interfere with the natural tendency of slabs to become completely free or
 694  * completely allocated. For example, a slab with a single allocated object
 695  * needs only a single free to escape the cache; its natural desire is
 696  * frustrated when it finds itself at the front of the list where a second
 697  * allocation happens just before the free could have released it. Another slab
 698  * with all but one object allocated might have supplied the buffer instead, so
 699  * that both (as opposed to neither) of the slabs would have been taken off the
 700  * free list.
 701  *
 702  * Although slabs tend naturally toward the ideal order, misorders allowed by a
 703  * simple list implementation defeat the consolidator's strategy of merging
 704  * least- and most-allocated slabs. Without an AVL tree to guarantee order, kmem
 705  * needs another way to fix misorders to optimize its callback strategy. One
 706  * approach is to periodically scan a limited number of slabs, advancing a
 707  * marker to hold the current scan position, and to move extreme misorders to
 708  * the front or back of the free list and to the front or back of the current
 709  * scan range. By making consecutive scan ranges overlap by one slab, the least
 710  * allocated slab in the current range can be carried along from the end of one
 711  * scan to the start of the next.
 712  *
 713  * Maintaining partial slabs in an AVL tree relieves kmem of this additional
 714  * task, however. Since most of the cache's activity is in the magazine layer,
 715  * and allocations from the slab layer represent only a startup cost, the
 716  * overhead of maintaining a balanced tree is not a significant concern compared
 717  * to the opportunity of reducing complexity by eliminating the partial slab
 718  * scanner just described. The overhead of an AVL tree is minimized by
 719  * maintaining only partial slabs in the tree and keeping completely allocated
 720  * slabs separately in a list. To avoid increasing the size of the slab
 * structure, the AVL linkage pointers are reused for the slab's list linkage,
 722  * since the slab will always be either partial or complete, never stored both
 * ways at the same time. To further minimize the AVL tree's overhead, the
 724  * compare function that orders partial slabs by usage divides the range of
 725  * allocated object counts into bins such that counts within the same bin are
 726  * considered equal. Binning partial slabs makes it less likely that allocating
 727  * or freeing a single object will change the slab's order, requiring a tree
 728  * reinsertion (an avl_remove() followed by an avl_add(), both potentially
 729  * requiring some rebalancing of the tree). Allocation counts closest to
 730  * completely free and completely allocated are left unbinned (finely sorted) to
 731  * better support the consolidator's strategy of merging slabs at either
 732  * extreme.
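 *
 * A minimal sketch of such a binned comparison, for a hypothetical slab_t
 * carrying a count of allocated objects (the actual compare function also
 * leaves the extremes unbinned, as described above, and orders slabs flagged
 * non-reclaimable specially, as described in section 3.3):
 *
 *      // BIN_WIDTH is a hypothetical constant, e.g. objects-per-slab / 8
 *      static int
 *      partial_slab_cmp(const void *a, const void *b)
 *      {
 *              const slab_t *sa = a, *sb = b;
 *              uint_t bin_a = sa->slab_refcnt / BIN_WIDTH;
 *              uint_t bin_b = sb->slab_refcnt / BIN_WIDTH;
 *
 *              if (bin_a > bin_b)      // more allocated sorts earlier
 *                      return (-1);
 *              if (bin_a < bin_b)
 *                      return (1);
 *              // break ties by address so that the order is total
 *              if ((uintptr_t)sa < (uintptr_t)sb)
 *                      return (-1);
 *              if ((uintptr_t)sa > (uintptr_t)sb)
 *                      return (1);
 *              return (0);
 *      }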
 733  *
 734  * 3.1 Assessing Fragmentation and Selecting Candidate Slabs
 735  *
 736  * The consolidator piggybacks on the kmem maintenance thread and is called on
 737  * the same interval as kmem_cache_update(), once per cache every fifteen
 738  * seconds. kmem maintains a running count of unallocated objects in the slab
 739  * layer (cache_bufslab). The consolidator checks whether that number exceeds
 740  * 12.5% (1/8) of the total objects in the cache (cache_buftotal), and whether
 741  * there is a significant number of slabs in the cache (arbitrarily a minimum
 742  * 101 total slabs). Unused objects that have fallen out of the magazine layer's
 743  * working set are included in the assessment, and magazines in the depot are
 744  * reaped if those objects would lift cache_bufslab above the fragmentation
 745  * threshold. Once the consolidator decides that a cache is fragmented, it looks
 746  * for a candidate slab to reclaim, starting at the end of the partial slab free
 747  * list and scanning backwards. At first the consolidator is choosy: only a slab
 748  * with fewer than 12.5% (1/8) of its objects allocated qualifies (or else a
 749  * single allocated object, regardless of percentage). If there is difficulty
 750  * finding a candidate slab, kmem raises the allocation threshold incrementally,
 751  * up to a maximum 87.5% (7/8), so that eventually the consolidator will reduce
 752  * external fragmentation (unused objects on the free list) below 12.5% (1/8),
 753  * even in the worst case of every slab in the cache being almost 7/8 allocated.
 754  * The threshold can also be lowered incrementally when candidate slabs are easy
 755  * to find, and the threshold is reset to the minimum 1/8 as soon as the cache
 756  * is no longer fragmented.
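 *
 * Ignoring depot reaping and the sliding candidate-slab threshold, the initial
 * fragmentation test reduces to something like the following sketch, where
 * cache_bufslab and cache_buftotal are the counts named above and the slab
 * count and function name are illustrative:
 *
 *      static int
 *      cache_is_fragmented(kmem_cache_t *cp, uint64_t total_slabs)
 *      {
 *              // more than 1/8 of the cache's objects sit unused in the
 *              // slab layer, and the cache has at least 101 slabs
 *              return (cp->cache_bufslab > (cp->cache_buftotal >> 3) &&
 *                  total_slabs > 100);
 *      }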
 757  *
 758  * 3.2 Generating Callbacks
 759  *
 760  * Once an eligible slab is chosen, a callback is generated for every allocated
 761  * object on the slab, in the hope that the client will move everything off the
 762  * slab and make it reclaimable. Objects selected as move destinations are
 763  * chosen from slabs at the front of the free list. Assuming slabs in the ideal
 764  * order (most allocated at the front, least allocated at the back) and a
 765  * cooperative client, the consolidator will succeed in removing slabs from both
 766  * ends of the free list, completely allocating on the one hand and completely
 767  * freeing on the other. Objects selected as move destinations are allocated in
 768  * the kmem maintenance thread where move requests are enqueued. A separate
 769  * callback thread removes pending callbacks from the queue and calls the
 770  * client. The separate thread ensures that client code (the move function) does
 771  * not interfere with internal kmem maintenance tasks. A map of pending
 772  * callbacks keyed by object address (the object to be moved) is checked to
 773  * ensure that duplicate callbacks are not generated for the same object.
 774  * Allocating the move destination (the object to move to) prevents subsequent
 775  * callbacks from selecting the same destination as an earlier pending callback.
 776  *
 777  * Move requests can also be generated by kmem_cache_reap() when the system is
 778  * desperate for memory and by kmem_cache_move_notify(), called by the client to
 779  * notify kmem that a move refused earlier with KMEM_CBRC_LATER is now possible.
 780  * The map of pending callbacks is protected by the same lock that protects the
 781  * slab layer.
 782  *
 783  * When the system is desperate for memory, kmem does not bother to determine
 784  * whether or not the cache exceeds the fragmentation threshold, but tries to
 785  * consolidate as many slabs as possible. Normally, the consolidator chews
 786  * slowly, one sparsely allocated slab at a time during each maintenance
 787  * interval that the cache is fragmented. When desperate, the consolidator
 788  * starts at the last partial slab and enqueues callbacks for every allocated
 789  * object on every partial slab, working backwards until it reaches the first
 790  * partial slab. The first partial slab, meanwhile, advances in pace with the
 791  * consolidator as allocations to supply move destinations for the enqueued
 792  * callbacks use up the highly allocated slabs at the front of the free list.
 793  * Ideally, the overgrown free list collapses like an accordion, starting at
 794  * both ends and ending at the center with a single partial slab.
 795  *
 796  * 3.3 Client Responses
 797  *
 798  * When the client returns KMEM_CBRC_NO in response to the move callback, kmem
 * marks the slab that supplied the stuck object non-reclaimable and moves it
 * to the front of the free list. The slab remains marked as long as it remains
 * on the free list, and it appears more allocated to the partial slab compare
 * function than any unmarked slab, no matter how many of its objects are
 * allocated.
 803  * Since even one immovable object ties up the entire slab, the goal is to
 804  * completely allocate any slab that cannot be completely freed. kmem does not
 805  * bother generating callbacks to move objects from a marked slab unless the
 806  * system is desperate.
 807  *
 808  * When the client responds KMEM_CBRC_LATER, kmem increments a count for the
 809  * slab. If the client responds LATER too many times, kmem disbelieves and
 810  * treats the response as a NO. The count is cleared when the slab is taken off
 811  * the partial slab list or when the client moves one of the slab's objects.
 812  *
 813  * 4. Observability
 814  *
 815  * A kmem cache's external fragmentation is best observed with 'mdb -k' using
 816  * the ::kmem_slabs dcmd. For a complete description of the command, enter
 817  * '::help kmem_slabs' at the mdb prompt.
 818  */
 819 
 820 #include <sys/kmem_impl.h>
 821 #include <sys/vmem_impl.h>
 822 #include <sys/param.h>
 823 #include <sys/sysmacros.h>
 824 #include <sys/vm.h>
 825 #include <sys/proc.h>
 826 #include <sys/tuneable.h>
 827 #include <sys/systm.h>
 828 #include <sys/cmn_err.h>
 829 #include <sys/debug.h>
 830 #include <sys/sdt.h>
 831 #include <sys/mutex.h>
 832 #include <sys/bitmap.h>
 833 #include <sys/atomic.h>
 834 #include <sys/kobj.h>
 835 #include <sys/disp.h>
 836 #include <vm/seg_kmem.h>
 837 #include <sys/log.h>
 838 #include <sys/callb.h>
 839 #include <sys/taskq.h>
 840 #include <sys/modctl.h>
 841 #include <sys/reboot.h>
 842 #include <sys/id32.h>
 843 #include <sys/zone.h>
 844 #include <sys/netstack.h>
 845 #ifdef  DEBUG
 846 #include <sys/random.h>
 847 #endif
 848 
 849 extern void streams_msg_init(void);
 850 extern int segkp_fromheap;
 851 extern void segkp_cache_free(void);
 852 extern int callout_init_done;
 853 
 854 struct kmem_cache_kstat {
 855         kstat_named_t   kmc_buf_size;
 856         kstat_named_t   kmc_align;
 857         kstat_named_t   kmc_chunk_size;
 858         kstat_named_t   kmc_slab_size;
 859         kstat_named_t   kmc_alloc;
 860         kstat_named_t   kmc_alloc_fail;
 861         kstat_named_t   kmc_free;
 862         kstat_named_t   kmc_depot_alloc;
 863         kstat_named_t   kmc_depot_free;
 864         kstat_named_t   kmc_depot_contention;
 865         kstat_named_t   kmc_slab_alloc;
 866         kstat_named_t   kmc_slab_free;
 867         kstat_named_t   kmc_buf_constructed;
 868         kstat_named_t   kmc_buf_avail;
 869         kstat_named_t   kmc_buf_inuse;
 870         kstat_named_t   kmc_buf_total;
 871         kstat_named_t   kmc_buf_max;
 872         kstat_named_t   kmc_slab_create;
 873         kstat_named_t   kmc_slab_destroy;
 874         kstat_named_t   kmc_vmem_source;
 875         kstat_named_t   kmc_hash_size;
 876         kstat_named_t   kmc_hash_lookup_depth;
 877         kstat_named_t   kmc_hash_rescale;
 878         kstat_named_t   kmc_full_magazines;
 879         kstat_named_t   kmc_empty_magazines;
 880         kstat_named_t   kmc_magazine_size;
 881         kstat_named_t   kmc_reap; /* number of kmem_cache_reap() calls */
 882         kstat_named_t   kmc_defrag; /* attempts to defrag all partial slabs */
 883         kstat_named_t   kmc_scan; /* attempts to defrag one partial slab */
 884         kstat_named_t   kmc_move_callbacks; /* sum of yes, no, later, dn, dk */
 885         kstat_named_t   kmc_move_yes;
 886         kstat_named_t   kmc_move_no;
 887         kstat_named_t   kmc_move_later;
 888         kstat_named_t   kmc_move_dont_need;
 889         kstat_named_t   kmc_move_dont_know; /* obj unrecognized by client ... */
 890         kstat_named_t   kmc_move_hunt_found; /* ... but found in mag layer */
 891         kstat_named_t   kmc_move_slabs_freed; /* slabs freed by consolidator */
 892         kstat_named_t   kmc_move_reclaimable; /* buffers, if consolidator ran */
 893 } kmem_cache_kstat = {
 894         { "buf_size",           KSTAT_DATA_UINT64 },
 895         { "align",              KSTAT_DATA_UINT64 },
 896         { "chunk_size",         KSTAT_DATA_UINT64 },
 897         { "slab_size",          KSTAT_DATA_UINT64 },
 898         { "alloc",              KSTAT_DATA_UINT64 },
 899         { "alloc_fail",         KSTAT_DATA_UINT64 },
 900         { "free",               KSTAT_DATA_UINT64 },
 901         { "depot_alloc",        KSTAT_DATA_UINT64 },
 902         { "depot_free",         KSTAT_DATA_UINT64 },
 903         { "depot_contention",   KSTAT_DATA_UINT64 },
 904         { "slab_alloc",         KSTAT_DATA_UINT64 },
 905         { "slab_free",          KSTAT_DATA_UINT64 },
 906         { "buf_constructed",    KSTAT_DATA_UINT64 },
 907         { "buf_avail",          KSTAT_DATA_UINT64 },
 908         { "buf_inuse",          KSTAT_DATA_UINT64 },
 909         { "buf_total",          KSTAT_DATA_UINT64 },
 910         { "buf_max",            KSTAT_DATA_UINT64 },
 911         { "slab_create",        KSTAT_DATA_UINT64 },
 912         { "slab_destroy",       KSTAT_DATA_UINT64 },
 913         { "vmem_source",        KSTAT_DATA_UINT64 },
 914         { "hash_size",          KSTAT_DATA_UINT64 },
 915         { "hash_lookup_depth",  KSTAT_DATA_UINT64 },
 916         { "hash_rescale",       KSTAT_DATA_UINT64 },
 917         { "full_magazines",     KSTAT_DATA_UINT64 },
 918         { "empty_magazines",    KSTAT_DATA_UINT64 },
 919         { "magazine_size",      KSTAT_DATA_UINT64 },
 920         { "reap",               KSTAT_DATA_UINT64 },
 921         { "defrag",             KSTAT_DATA_UINT64 },
 922         { "scan",               KSTAT_DATA_UINT64 },
 923         { "move_callbacks",     KSTAT_DATA_UINT64 },
 924         { "move_yes",           KSTAT_DATA_UINT64 },
 925         { "move_no",            KSTAT_DATA_UINT64 },
 926         { "move_later",         KSTAT_DATA_UINT64 },
 927         { "move_dont_need",     KSTAT_DATA_UINT64 },
 928         { "move_dont_know",     KSTAT_DATA_UINT64 },
 929         { "move_hunt_found",    KSTAT_DATA_UINT64 },
 930         { "move_slabs_freed",   KSTAT_DATA_UINT64 },
 931         { "move_reclaimable",   KSTAT_DATA_UINT64 },
 932 };
 933 
 934 static kmutex_t kmem_cache_kstat_lock;
 935 
 936 /*
 937  * The default set of caches to back kmem_alloc().
 938  * These sizes should be reevaluated periodically.
 939  *
 940  * We want allocations that are multiples of the coherency granularity
 941  * (64 bytes) to be satisfied from a cache which is a multiple of 64
 942  * bytes, so that it will be 64-byte aligned.  For all multiples of 64,
 943  * the next kmem_cache_size greater than or equal to it must be a
 944  * multiple of 64.
 945  *
 946  * We split the table into two sections:  size <= 4k and size > 4k.  This
 947  * saves a lot of space and cache footprint in our cache tables.
 948  */
 949 static const int kmem_alloc_sizes[] = {
 950         1 * 8,
 951         2 * 8,
 952         3 * 8,
 953         4 * 8,          5 * 8,          6 * 8,          7 * 8,
 954         4 * 16,         5 * 16,         6 * 16,         7 * 16,
 955         4 * 32,         5 * 32,         6 * 32,         7 * 32,
 956         4 * 64,         5 * 64,         6 * 64,         7 * 64,
 957         4 * 128,        5 * 128,        6 * 128,        7 * 128,
 958         P2ALIGN(8192 / 7, 64),
 959         P2ALIGN(8192 / 6, 64),
 960         P2ALIGN(8192 / 5, 64),
 961         P2ALIGN(8192 / 4, 64),
 962         P2ALIGN(8192 / 3, 64),
 963         P2ALIGN(8192 / 2, 64),
 964 };
 965 
 966 static const int kmem_big_alloc_sizes[] = {
 967         2 * 4096,       3 * 4096,
 968         2 * 8192,       3 * 8192,
 969         4 * 8192,       5 * 8192,       6 * 8192,       7 * 8192,
 970         8 * 8192,       9 * 8192,       10 * 8192,      11 * 8192,
 971         12 * 8192,      13 * 8192,      14 * 8192,      15 * 8192,
 972         16 * 8192
 973 };
 974 
 975 #define KMEM_MAXBUF             4096
 976 #define KMEM_BIG_MAXBUF_32BIT   32768
 977 #define KMEM_BIG_MAXBUF         131072
 978 
 979 #define KMEM_BIG_MULTIPLE       4096    /* big_alloc_sizes must be multiples of this */
 980 #define KMEM_BIG_SHIFT          12      /* lg(KMEM_BIG_MULTIPLE) */
 981 
 982 static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
 983 static kmem_cache_t *kmem_big_alloc_table[KMEM_BIG_MAXBUF >> KMEM_BIG_SHIFT];
 984 
 985 #define KMEM_ALLOC_TABLE_MAX    (KMEM_MAXBUF >> KMEM_ALIGN_SHIFT)
 986 static size_t kmem_big_alloc_table_max = 0;     /* # of filled elements */
 987 
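/*
 * An illustrative sketch (not the verbatim implementation) of how these tables
 * are consulted: kmem_alloc() can map a request size to its backing cache with
 * a shift and a table lookup, falling back to the oversize arena for anything
 * larger than kmem_max_cached:
 *
 *	size_t index = (size - 1) >> KMEM_ALIGN_SHIFT;
 *
 *	if (index < KMEM_ALLOC_TABLE_MAX)
 *		cp = kmem_alloc_table[index];
 *	else if (size <= kmem_max_cached)
 *		cp = kmem_big_alloc_table[(size - 1) >> KMEM_BIG_SHIFT];
 *	else
 *		return (kmem_oversize_arena allocation);
 */
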
 988 static kmem_magtype_t kmem_magtype[] = {
 989         { 1,    8,      3200,   65536   },
 990         { 3,    16,     256,    32768   },
 991         { 7,    32,     64,     16384   },
 992         { 15,   64,     0,      8192    },
 993         { 31,   64,     0,      4096    },
 994         { 47,   64,     0,      2048    },
 995         { 63,   64,     0,      1024    },
 996         { 95,   64,     0,      512     },
 997         { 143,  64,     0,      0       },
 998 };
 999 
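/*
 * Note (an assumption from the kmem_magtype_t field order, not spelled out
 * here): each row above is roughly { rounds per magazine, magazine alignment,
 * minimum buffer size, maximum buffer size }, so caches of large objects start
 * out with tiny magazines and only caches of small objects can ever be
 * promoted to the largest (143-round) magazines.
 */
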
1000 static uint32_t kmem_reaping;
1001 static uint32_t kmem_reaping_idspace;
1002 
1003 /*
1004  * kmem tunables
1005  */
1006 clock_t kmem_reap_interval;     /* cache reaping rate [15 * HZ ticks] */
1007 int kmem_depot_contention = 3;  /* max failed tryenters per real interval */
1008 pgcnt_t kmem_reapahead = 0;     /* start reaping N pages before pageout */
1009 int kmem_panic = 1;             /* whether to panic on error */
1010 int kmem_logging = 1;           /* kmem_log_enter() override */
1011 uint32_t kmem_mtbf = 0;         /* mean time between failures [default: off] */
1012 size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
1013 size_t kmem_content_log_size;   /* content log size [2% of memory] */
1014 size_t kmem_failure_log_size;   /* failure log [4 pages per CPU] */
1015 size_t kmem_slab_log_size;      /* slab create log [4 pages per CPU] */
1016 size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */
1017 size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
1018 size_t kmem_lite_minsize = 0;   /* minimum buffer size for KMF_LITE */
1019 size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
1020 int kmem_lite_pcs = 4;          /* number of PCs to store in KMF_LITE mode */
1021 size_t kmem_maxverify;          /* maximum bytes to inspect in debug routines */
1022 size_t kmem_minfirewall;        /* hardware-enforced redzone threshold */
1023 
1024 #ifdef DEBUG
1025 int kmem_warn_zerosized = 1;    /* whether to warn on zero-sized KM_SLEEP */
1026 #else
1027 int kmem_warn_zerosized = 0;    /* whether to warn on zero-sized KM_SLEEP */
1028 #endif
1029 
1030 int kmem_panic_zerosized = 0;   /* whether to panic on zero-sized KM_SLEEP */
1031 
1032 #ifdef _LP64
1033 size_t  kmem_max_cached = KMEM_BIG_MAXBUF;      /* maximum kmem_alloc cache */
1034 #else
1035 size_t  kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */
1036 #endif
1037 
1038 #ifdef DEBUG
1039 int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
1040 #else
1041 int kmem_flags = 0;
1042 #endif
1043 int kmem_ready;
1044 
1045 static kmem_cache_t     *kmem_slab_cache;
1046 static kmem_cache_t     *kmem_bufctl_cache;
1047 static kmem_cache_t     *kmem_bufctl_audit_cache;
1048 
1049 static kmutex_t         kmem_cache_lock;        /* inter-cache linkage only */
1050 static list_t           kmem_caches;
1051 
1052 static taskq_t          *kmem_taskq;
1053 static kmutex_t         kmem_flags_lock;
1054 static vmem_t           *kmem_metadata_arena;
1055 static vmem_t           *kmem_msb_arena;        /* arena for metadata caches */
1056 static vmem_t           *kmem_cache_arena;
1057 static vmem_t           *kmem_hash_arena;
1058 static vmem_t           *kmem_log_arena;
1059 static vmem_t           *kmem_oversize_arena;
1060 static vmem_t           *kmem_va_arena;
1061 static vmem_t           *kmem_default_arena;
1062 static vmem_t           *kmem_firewall_va_arena;
1063 static vmem_t           *kmem_firewall_arena;
1064 
1065 static int              kmem_zerosized;         /* # of zero-sized allocs */
1066 
1067 /*
1068  * kmem slab consolidator thresholds (tunables)
1069  */
1070 size_t kmem_frag_minslabs = 101;        /* minimum total slabs */
1071 size_t kmem_frag_numer = 1;             /* free buffers (numerator) */
1072 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1073 /*
1074  * Maximum number of slabs from which to move buffers during a single
1075  * maintenance interval while the system is not low on memory.
1076  */
1077 size_t kmem_reclaim_max_slabs = 1;
1078 /*
1079  * Number of slabs to scan backwards from the end of the partial slab list
1080  * when searching for buffers to relocate.
1081  */
1082 size_t kmem_reclaim_scan_range = 12;
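
/*
 * Roughly speaking, a cache becomes a defragmentation candidate once it has at
 * least kmem_frag_minslabs slabs and more than kmem_frag_numer /
 * kmem_frag_denom of its buffers are sitting free on its slabs:
 *
 *	fragmented ~= (total_slabs >= kmem_frag_minslabs) &&
 *	    (free_bufs * kmem_frag_denom > total_bufs * kmem_frag_numer)
 */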
1083 
1084 /* consolidator knobs */
1085 boolean_t kmem_move_noreap;
1086 boolean_t kmem_move_blocked;
1087 boolean_t kmem_move_fulltilt;
1088 boolean_t kmem_move_any_partial;
1089 
1090 #ifdef  DEBUG
1091 /*
1092  * kmem consolidator debug tunables:
1093  * Ensure code coverage by occasionally running the consolidator even when the
1094  * caches are not fragmented (they may never be). These intervals are mean
1095  * times expressed in cache maintenance intervals (kmem_cache_update).
1096  */
1097 uint32_t kmem_mtb_move = 60;    /* defrag 1 slab (~15min) */
1098 uint32_t kmem_mtb_reap = 1800;  /* defrag all slabs (~7.5hrs) */
1099 #endif  /* DEBUG */
1100 
1101 static kmem_cache_t     *kmem_defrag_cache;
1102 static kmem_cache_t     *kmem_move_cache;
1103 static taskq_t          *kmem_move_taskq;
1104 
1105 static void kmem_cache_scan(kmem_cache_t *);
1106 static void kmem_cache_defrag(kmem_cache_t *);
1107 static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
1108 
1109 
1110 kmem_log_header_t       *kmem_transaction_log;
1111 kmem_log_header_t       *kmem_content_log;
1112 kmem_log_header_t       *kmem_failure_log;
1113 kmem_log_header_t       *kmem_slab_log;
1114 kmem_log_header_t       *kmem_zerosized_log;
1115 
1116 static int              kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
1117 
1118 #define KMEM_BUFTAG_LITE_ENTER(bt, count, caller)                       \
1119         if ((count) > 0) {                                           \
1120                 pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \
1121                 pc_t *_e;                                               \
1122                 /* memmove() the old entries down one notch */          \
1123                 for (_e = &_s[(count) - 1]; _e > _s; _e--)               \
1124                         *_e = *(_e - 1);                                \
1125                 *_s = (uintptr_t)(caller);                              \
1126         }
1127 
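/*
 * For example, with (count) == 4 and an existing history of { A, B, C, D },
 * KMEM_BUFTAG_LITE_ENTER(bt, 4, E) leaves bt_history as { E, A, B, C }: the
 * newest caller is always bt_history[0] and the oldest entry falls off the end.
 */
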
1128 #define KMERR_MODIFIED  0       /* buffer modified while on freelist */
1129 #define KMERR_REDZONE   1       /* redzone violation (write past end of buf) */
1130 #define KMERR_DUPFREE   2       /* freed a buffer twice */
1131 #define KMERR_BADADDR   3       /* freed a bad (unallocated) address */
1132 #define KMERR_BADBUFTAG 4       /* buftag corrupted */
1133 #define KMERR_BADBUFCTL 5       /* bufctl corrupted */
1134 #define KMERR_BADCACHE  6       /* freed a buffer to the wrong cache */
1135 #define KMERR_BADSIZE   7       /* alloc size != free size */
1136 #define KMERR_BADBASE   8       /* buffer base address wrong */
1137 
1138 struct {
1139         hrtime_t        kmp_timestamp;  /* timestamp of panic */
1140         int             kmp_error;      /* type of kmem error */
1141         void            *kmp_buffer;    /* buffer that induced panic */
1142         void            *kmp_realbuf;   /* real start address for buffer */
1143         kmem_cache_t    *kmp_cache;     /* buffer's cache according to client */
1144         kmem_cache_t    *kmp_realcache; /* actual cache containing buffer */
1145         kmem_slab_t     *kmp_slab;      /* slab according to kmem_findslab() */
1146         kmem_bufctl_t   *kmp_bufctl;    /* bufctl */
1147 } kmem_panic_info;
1148 
1149 
1150 static void
1151 copy_pattern(uint64_t pattern, void *buf_arg, size_t size)
1152 {
1153         uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
1154         uint64_t *buf = buf_arg;
1155 
1156         while (buf < bufend)
1157                 *buf++ = pattern;
1158 }
1159 
1160 static void *
1161 verify_pattern(uint64_t pattern, void *buf_arg, size_t size)
1162 {
1163         uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
1164         uint64_t *buf;
1165 
1166         for (buf = buf_arg; buf < bufend; buf++)
1167                 if (*buf != pattern)
1168                         return (buf);
1169         return (NULL);
1170 }
1171 
1172 static void *
1173 verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size)
1174 {
1175         uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
1176         uint64_t *buf;
1177 
1178         for (buf = buf_arg; buf < bufend; buf++) {
1179                 if (*buf != old) {
1180                         copy_pattern(old, buf_arg,
1181                             (char *)buf - (char *)buf_arg);
1182                         return (buf);
1183                 }
1184                 *buf = new;
1185         }
1186 
1187         return (NULL);
1188 }
1189 
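/*
 * Together these three helpers implement the KMF_DEADBEEF life cycle used by
 * the debug paths below: copy_pattern() stamps KMEM_FREE_PATTERN over a buffer
 * when it is freed, verify_pattern() later confirms that nothing scribbled on
 * the buffer while it sat on the freelist, and verify_and_copy_pattern()
 * performs that check and re-stamps KMEM_UNINITIALIZED_PATTERN in a single
 * pass at allocation time.
 */
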
1190 static void
1191 kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
1192 {
1193         kmem_cache_t *cp;
1194 
1195         mutex_enter(&kmem_cache_lock);
1196         for (cp = list_head(&kmem_caches); cp != NULL;
1197             cp = list_next(&kmem_caches, cp))
1198                 if (tq != NULL)
1199                         (void) taskq_dispatch(tq, (task_func_t *)func, cp,
1200                             tqflag);
1201                 else
1202                         func(cp);
1203         mutex_exit(&kmem_cache_lock);
1204 }
1205 
1206 static void
1207 kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
1208 {
1209         kmem_cache_t *cp;
1210 
1211         mutex_enter(&kmem_cache_lock);
1212         for (cp = list_head(&kmem_caches); cp != NULL;
1213             cp = list_next(&kmem_caches, cp)) {
1214                 if (!(cp->cache_cflags & KMC_IDENTIFIER))
1215                         continue;
1216                 if (tq != NULL)
1217                         (void) taskq_dispatch(tq, (task_func_t *)func, cp,
1218                             tqflag);
1219                 else
1220                         func(cp);
1221         }
1222         mutex_exit(&kmem_cache_lock);
1223 }
1224 
1225 /*
1226  * Debugging support.  Given a buffer address, find its slab.
1227  */
1228 static kmem_slab_t *
1229 kmem_findslab(kmem_cache_t *cp, void *buf)
1230 {
1231         kmem_slab_t *sp;
1232 
1233         mutex_enter(&cp->cache_lock);
1234         for (sp = list_head(&cp->cache_complete_slabs); sp != NULL;
1235             sp = list_next(&cp->cache_complete_slabs, sp)) {
1236                 if (KMEM_SLAB_MEMBER(sp, buf)) {
1237                         mutex_exit(&cp->cache_lock);
1238                         return (sp);
1239                 }
1240         }
1241         for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL;
1242             sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) {
1243                 if (KMEM_SLAB_MEMBER(sp, buf)) {
1244                         mutex_exit(&cp->cache_lock);
1245                         return (sp);
1246                 }
1247         }
1248         mutex_exit(&cp->cache_lock);
1249 
1250         return (NULL);
1251 }
1252 
1253 static void
1254 kmem_error(int error, kmem_cache_t *cparg, void *bufarg)
1255 {
1256         kmem_buftag_t *btp = NULL;
1257         kmem_bufctl_t *bcp = NULL;
1258         kmem_cache_t *cp = cparg;
1259         kmem_slab_t *sp;
1260         uint64_t *off;
1261         void *buf = bufarg;
1262 
1263         kmem_logging = 0;       /* stop logging when a bad thing happens */
1264 
1265         kmem_panic_info.kmp_timestamp = gethrtime();
1266 
1267         sp = kmem_findslab(cp, buf);
1268         if (sp == NULL) {
1269                 for (cp = list_tail(&kmem_caches); cp != NULL;
1270                     cp = list_prev(&kmem_caches, cp)) {
1271                         if ((sp = kmem_findslab(cp, buf)) != NULL)
1272                                 break;
1273                 }
1274         }
1275 
1276         if (sp == NULL) {
1277                 cp = NULL;
1278                 error = KMERR_BADADDR;
1279         } else {
1280                 if (cp != cparg)
1281                         error = KMERR_BADCACHE;
1282                 else
1283                         buf = (char *)bufarg - ((uintptr_t)bufarg -
1284                             (uintptr_t)sp->slab_base) % cp->cache_chunksize;
1285                 if (buf != bufarg)
1286                         error = KMERR_BADBASE;
1287                 if (cp->cache_flags & KMF_BUFTAG)
1288                         btp = KMEM_BUFTAG(cp, buf);
1289                 if (cp->cache_flags & KMF_HASH) {
1290                         mutex_enter(&cp->cache_lock);
1291                         for (bcp = *KMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next)
1292                                 if (bcp->bc_addr == buf)
1293                                         break;
1294                         mutex_exit(&cp->cache_lock);
1295                         if (bcp == NULL && btp != NULL)
1296                                 bcp = btp->bt_bufctl;
1297                         if (kmem_findslab(cp->cache_bufctl_cache, bcp) ==
1298                             NULL || P2PHASE((uintptr_t)bcp, KMEM_ALIGN) ||
1299                             bcp->bc_addr != buf) {
1300                                 error = KMERR_BADBUFCTL;
1301                                 bcp = NULL;
1302                         }
1303                 }
1304         }
1305 
1306         kmem_panic_info.kmp_error = error;
1307         kmem_panic_info.kmp_buffer = bufarg;
1308         kmem_panic_info.kmp_realbuf = buf;
1309         kmem_panic_info.kmp_cache = cparg;
1310         kmem_panic_info.kmp_realcache = cp;
1311         kmem_panic_info.kmp_slab = sp;
1312         kmem_panic_info.kmp_bufctl = bcp;
1313 
1314         printf("kernel memory allocator: ");
1315 
1316         switch (error) {
1317 
1318         case KMERR_MODIFIED:
1319                 printf("buffer modified after being freed\n");
1320                 off = verify_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
1321                 if (off == NULL)        /* shouldn't happen */
1322                         off = buf;
1323                 printf("modification occurred at offset 0x%lx "
1324                     "(0x%llx replaced by 0x%llx)\n",
1325                     (uintptr_t)off - (uintptr_t)buf,
1326                     (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off);
1327                 break;
1328 
1329         case KMERR_REDZONE:
1330                 printf("redzone violation: write past end of buffer\n");
1331                 break;
1332 
1333         case KMERR_BADADDR:
1334                 printf("invalid free: buffer not in cache\n");
1335                 break;
1336 
1337         case KMERR_DUPFREE:
1338                 printf("duplicate free: buffer freed twice\n");
1339                 break;
1340 
1341         case KMERR_BADBUFTAG:
1342                 printf("boundary tag corrupted\n");
1343                 printf("bcp ^ bxstat = %lx, should be %lx\n",
1344                     (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat,
1345                     KMEM_BUFTAG_FREE);
1346                 break;
1347 
1348         case KMERR_BADBUFCTL:
1349                 printf("bufctl corrupted\n");
1350                 break;
1351 
1352         case KMERR_BADCACHE:
1353                 printf("buffer freed to wrong cache\n");
1354                 printf("buffer was allocated from %s,\n", cp->cache_name);
1355                 printf("caller attempting free to %s.\n", cparg->cache_name);
1356                 break;
1357 
1358         case KMERR_BADSIZE:
1359                 printf("bad free: free size (%u) != alloc size (%u)\n",
1360                     KMEM_SIZE_DECODE(((uint32_t *)btp)[0]),
1361                     KMEM_SIZE_DECODE(((uint32_t *)btp)[1]));
1362                 break;
1363 
1364         case KMERR_BADBASE:
1365                 printf("bad free: free address (%p) != alloc address (%p)\n",
1366                     bufarg, buf);
1367                 break;
1368         }
1369 
1370         printf("buffer=%p  bufctl=%p  cache: %s\n",
1371             bufarg, (void *)bcp, cparg->cache_name);
1372 
1373         if (bcp != NULL && (cp->cache_flags & KMF_AUDIT) &&
1374             error != KMERR_BADBUFCTL) {
1375                 int d;
1376                 timestruc_t ts;
1377                 kmem_bufctl_audit_t *bcap = (kmem_bufctl_audit_t *)bcp;
1378 
1379                 hrt2ts(kmem_panic_info.kmp_timestamp - bcap->bc_timestamp, &ts);
1380                 printf("previous transaction on buffer %p:\n", buf);
1381                 printf("thread=%p  time=T-%ld.%09ld  slab=%p  cache: %s\n",
1382                     (void *)bcap->bc_thread, ts.tv_sec, ts.tv_nsec,
1383                     (void *)sp, cp->cache_name);
1384                 for (d = 0; d < MIN(bcap->bc_depth, KMEM_STACK_DEPTH); d++) {
1385                         ulong_t off;
1386                         char *sym = kobj_getsymname(bcap->bc_stack[d], &off);
1387                         printf("%s+%lx\n", sym ? sym : "?", off);
1388                 }
1389         }
1390         if (kmem_panic > 0)
1391                 panic("kernel heap corruption detected");
1392         if (kmem_panic == 0)
1393                 debug_enter(NULL);
1394         kmem_logging = 1;       /* resume logging */
1395 }
1396 
1397 static kmem_log_header_t *
1398 kmem_log_init(size_t logsize)
1399 {
1400         kmem_log_header_t *lhp;
1401         int nchunks = 4 * max_ncpus;
1402         size_t lhsize = (size_t)&((kmem_log_header_t *)0)->lh_cpu[max_ncpus];
1403         int i;
1404 
1405         /*
1406          * Make sure that lhp->lh_cpu[] is nicely aligned
1407          * to prevent false sharing of cache lines.
1408          */
1409         lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN);
1410         lhp = vmem_xalloc(kmem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0,
1411             NULL, NULL, VM_SLEEP);
1412         bzero(lhp, lhsize);
1413 
1414         mutex_init(&lhp->lh_lock, NULL, MUTEX_DEFAULT, NULL);
1415         lhp->lh_nchunks = nchunks;
1416         lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks + 1, PAGESIZE);
1417         lhp->lh_base = vmem_alloc(kmem_log_arena,
1418             lhp->lh_chunksize * nchunks, VM_SLEEP);
1419         lhp->lh_free = vmem_alloc(kmem_log_arena,
1420             nchunks * sizeof (int), VM_SLEEP);
1421         bzero(lhp->lh_base, lhp->lh_chunksize * nchunks);
1422 
1423         for (i = 0; i < max_ncpus; i++) {
1424                 kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i];
1425                 mutex_init(&clhp->clh_lock, NULL, MUTEX_DEFAULT, NULL);
1426                 clhp->clh_chunk = i;
1427         }
1428 
1429         for (i = max_ncpus; i < nchunks; i++)
1430                 lhp->lh_free[i] = i;
1431 
1432         lhp->lh_head = max_ncpus;
1433         lhp->lh_tail = 0;
1434 
1435         return (lhp);
1436 }
1437 
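/*
 * Record size bytes of data in lhp on behalf of the current CPU and return a
 * pointer to the logged copy (or NULL if logging is disabled or we are
 * panicking).  Each CPU writes into its own chunk of the circular log; only
 * when that chunk fills does the CPU take the global lock to trade it for a
 * fresh chunk from the free ring, so steady-state logging contends only on the
 * per-CPU lock.
 */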
1438 static void *
1439 kmem_log_enter(kmem_log_header_t *lhp, void *data, size_t size)
1440 {
1441         void *logspace;
1442         kmem_cpu_log_header_t *clhp;
1443 
1444         if (lhp == NULL || kmem_logging == 0 || panicstr)
1445                 return (NULL);
1446 
1447         clhp = &lhp->lh_cpu[CPU->cpu_seqid];
1448 
1449         mutex_enter(&clhp->clh_lock);
1450         clhp->clh_hits++;
1451         if (size > clhp->clh_avail) {
1452                 mutex_enter(&lhp->lh_lock);
1453                 lhp->lh_hits++;
1454                 lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk;
1455                 lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks;
1456                 clhp->clh_chunk = lhp->lh_free[lhp->lh_head];
1457                 lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks;
1458                 clhp->clh_current = lhp->lh_base +
1459                     clhp->clh_chunk * lhp->lh_chunksize;
1460                 clhp->clh_avail = lhp->lh_chunksize;
1461                 if (size > lhp->lh_chunksize)
1462                         size = lhp->lh_chunksize;
1463                 mutex_exit(&lhp->lh_lock);
1464         }
1465         logspace = clhp->clh_current;
1466         clhp->clh_current += size;
1467         clhp->clh_avail -= size;
1468         bcopy(data, logspace, size);
1469         mutex_exit(&clhp->clh_lock);
1470         return (logspace);
1471 }
1472 
1473 #define KMEM_AUDIT(lp, cp, bcp)                                         \
1474 {                                                                       \
1475         kmem_bufctl_audit_t *_bcp = (kmem_bufctl_audit_t *)(bcp);       \
1476         _bcp->bc_timestamp = gethrtime();                            \
1477         _bcp->bc_thread = curthread;                                 \
1478         _bcp->bc_depth = getpcstack(_bcp->bc_stack, KMEM_STACK_DEPTH);    \
1479         _bcp->bc_lastlog = kmem_log_enter((lp), _bcp, sizeof (*_bcp));       \
1480 }
1481 
1482 static void
1483 kmem_log_event(kmem_log_header_t *lp, kmem_cache_t *cp,
1484     kmem_slab_t *sp, void *addr)
1485 {
1486         kmem_bufctl_audit_t bca;
1487 
1488         bzero(&bca, sizeof (kmem_bufctl_audit_t));
1489         bca.bc_addr = addr;
1490         bca.bc_slab = sp;
1491         bca.bc_cache = cp;
1492         KMEM_AUDIT(lp, cp, &bca);
1493 }
1494 
1495 /*
1496  * Create a new slab for cache cp.
1497  */
1498 static kmem_slab_t *
1499 kmem_slab_create(kmem_cache_t *cp, int kmflag)
1500 {
1501         size_t slabsize = cp->cache_slabsize;
1502         size_t chunksize = cp->cache_chunksize;
1503         int cache_flags = cp->cache_flags;
1504         size_t color, chunks;
1505         char *buf, *slab;
1506         kmem_slab_t *sp;
1507         kmem_bufctl_t *bcp;
1508         vmem_t *vmp = cp->cache_arena;
1509 
1510         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
1511 
1512         color = cp->cache_color + cp->cache_align;
1513         if (color > cp->cache_maxcolor)
1514                 color = cp->cache_mincolor;
1515         cp->cache_color = color;
1516 
1517         slab = vmem_alloc(vmp, slabsize, kmflag & KM_VMFLAGS);
1518 
1519         if (slab == NULL)
1520                 goto vmem_alloc_failure;
1521 
1522         ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0);
1523 
1524         /*
1525          * Reverify what was already checked in kmem_cache_set_move(), since the
1526          * consolidator depends (for correctness) on slabs being initialized
1527          * with the 0xbaddcafe memory pattern (setting a low order bit usable by
1528          * clients to distinguish uninitialized memory from known objects).
1529          */
1530         ASSERT((cp->cache_move == NULL) || !(cp->cache_cflags & KMC_NOTOUCH));
1531         if (!(cp->cache_cflags & KMC_NOTOUCH))
1532                 copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize);
1533 
1534         if (cache_flags & KMF_HASH) {
1535                 if ((sp = kmem_cache_alloc(kmem_slab_cache, kmflag)) == NULL)
1536                         goto slab_alloc_failure;
1537                 chunks = (slabsize - color) / chunksize;
1538         } else {
1539                 sp = KMEM_SLAB(cp, slab);
1540                 chunks = (slabsize - sizeof (kmem_slab_t) - color) / chunksize;
1541         }
1542 
1543         sp->slab_cache       = cp;
1544         sp->slab_head        = NULL;
1545         sp->slab_refcnt      = 0;
1546         sp->slab_base        = buf = slab + color;
1547         sp->slab_chunks      = chunks;
1548         sp->slab_stuck_offset = (uint32_t)-1;
1549         sp->slab_later_count = 0;
1550         sp->slab_flags = 0;
1551 
1552         ASSERT(chunks > 0);
1553         while (chunks-- != 0) {
1554                 if (cache_flags & KMF_HASH) {
1555                         bcp = kmem_cache_alloc(cp->cache_bufctl_cache, kmflag);
1556                         if (bcp == NULL)
1557                                 goto bufctl_alloc_failure;
1558                         if (cache_flags & KMF_AUDIT) {
1559                                 kmem_bufctl_audit_t *bcap =
1560                                     (kmem_bufctl_audit_t *)bcp;
1561                                 bzero(bcap, sizeof (kmem_bufctl_audit_t));
1562                                 bcap->bc_cache = cp;
1563                         }
1564                         bcp->bc_addr = buf;
1565                         bcp->bc_slab = sp;
1566                 } else {
1567                         bcp = KMEM_BUFCTL(cp, buf);
1568                 }
1569                 if (cache_flags & KMF_BUFTAG) {
1570                         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1571                         btp->bt_redzone = KMEM_REDZONE_PATTERN;
1572                         btp->bt_bufctl = bcp;
1573                         btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
1574                         if (cache_flags & KMF_DEADBEEF) {
1575                                 copy_pattern(KMEM_FREE_PATTERN, buf,
1576                                     cp->cache_verify);
1577                         }
1578                 }
1579                 bcp->bc_next = sp->slab_head;
1580                 sp->slab_head = bcp;
1581                 buf += chunksize;
1582         }
1583 
1584         kmem_log_event(kmem_slab_log, cp, sp, slab);
1585 
1586         return (sp);
1587 
1588 bufctl_alloc_failure:
1589 
1590         while ((bcp = sp->slab_head) != NULL) {
1591                 sp->slab_head = bcp->bc_next;
1592                 kmem_cache_free(cp->cache_bufctl_cache, bcp);
1593         }
1594         kmem_cache_free(kmem_slab_cache, sp);
1595 
1596 slab_alloc_failure:
1597 
1598         vmem_free(vmp, slab, slabsize);
1599 
1600 vmem_alloc_failure:
1601 
1602         kmem_log_event(kmem_failure_log, cp, NULL, NULL);
1603         atomic_inc_64(&cp->cache_alloc_fail);
1604 
1605         return (NULL);
1606 }
1607 
1608 /*
1609  * Destroy a slab.
1610  */
1611 static void
1612 kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp)
1613 {
1614         vmem_t *vmp = cp->cache_arena;
1615         void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum);
1616 
1617         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
1618         ASSERT(sp->slab_refcnt == 0);
1619 
1620         if (cp->cache_flags & KMF_HASH) {
1621                 kmem_bufctl_t *bcp;
1622                 while ((bcp = sp->slab_head) != NULL) {
1623                         sp->slab_head = bcp->bc_next;
1624                         kmem_cache_free(cp->cache_bufctl_cache, bcp);
1625                 }
1626                 kmem_cache_free(kmem_slab_cache, sp);
1627         }
1628         vmem_free(vmp, slab, cp->cache_slabsize);
1629 }
1630 
1631 static void *
1632 kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp, boolean_t prefill)
1633 {
1634         kmem_bufctl_t *bcp, **hash_bucket;
1635         void *buf;
1636         boolean_t new_slab = (sp->slab_refcnt == 0);
1637 
1638         ASSERT(MUTEX_HELD(&cp->cache_lock));
1639         /*
1640          * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we
1641          * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the
1642          * slab is newly created.
1643          */
1644         ASSERT(new_slab || (KMEM_SLAB_IS_PARTIAL(sp) &&
1645             (sp == avl_first(&cp->cache_partial_slabs))));
1646         ASSERT(sp->slab_cache == cp);
1647 
1648         cp->cache_slab_alloc++;
1649         cp->cache_bufslab--;
1650         sp->slab_refcnt++;
1651 
1652         bcp = sp->slab_head;
1653         sp->slab_head = bcp->bc_next;
1654 
1655         if (cp->cache_flags & KMF_HASH) {
1656                 /*
1657                  * Add buffer to allocated-address hash table.
1658                  */
1659                 buf = bcp->bc_addr;
1660                 hash_bucket = KMEM_HASH(cp, buf);
1661                 bcp->bc_next = *hash_bucket;
1662                 *hash_bucket = bcp;
1663                 if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
1664                         KMEM_AUDIT(kmem_transaction_log, cp, bcp);
1665                 }
1666         } else {
1667                 buf = KMEM_BUF(cp, bcp);
1668         }
1669 
1670         ASSERT(KMEM_SLAB_MEMBER(sp, buf));
1671 
1672         if (sp->slab_head == NULL) {
1673                 ASSERT(KMEM_SLAB_IS_ALL_USED(sp));
1674                 if (new_slab) {
1675                         ASSERT(sp->slab_chunks == 1);
1676                 } else {
1677                         ASSERT(sp->slab_chunks > 1); /* the slab was partial */
1678                         avl_remove(&cp->cache_partial_slabs, sp);
1679                         sp->slab_later_count = 0; /* clear history */
1680                         sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
1681                         sp->slab_stuck_offset = (uint32_t)-1;
1682                 }
1683                 list_insert_head(&cp->cache_complete_slabs, sp);
1684                 cp->cache_complete_slab_count++;
1685                 return (buf);
1686         }
1687 
1688         ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
1689         /*
1690          * Peek to see if the magazine layer is enabled before
1691          * we prefill.  We're not holding the cpu cache lock,
1692          * so the peek could be wrong, but there's no harm in it.
1693          */
1694         if (new_slab && prefill && (cp->cache_flags & KMF_PREFILL) &&
1695             (KMEM_CPU_CACHE(cp)->cc_magsize != 0))  {
1696                 kmem_slab_prefill(cp, sp);
1697                 return (buf);
1698         }
1699 
1700         if (new_slab) {
1701                 avl_add(&cp->cache_partial_slabs, sp);
1702                 return (buf);
1703         }
1704 
1705         /*
1706          * The slab is now more allocated than it was, so the
1707          * order remains unchanged.
1708          */
1709         ASSERT(!avl_update(&cp->cache_partial_slabs, sp));
1710         return (buf);
1711 }
1712 
1713 /*
1714  * Allocate a raw (unconstructed) buffer from cp's slab layer.
1715  */
1716 static void *
1717 kmem_slab_alloc(kmem_cache_t *cp, int kmflag)
1718 {
1719         kmem_slab_t *sp;
1720         void *buf;
1721         boolean_t test_destructor;
1722 
1723         mutex_enter(&cp->cache_lock);
1724         test_destructor = (cp->cache_slab_alloc == 0);
1725         sp = avl_first(&cp->cache_partial_slabs);
1726         if (sp == NULL) {
1727                 ASSERT(cp->cache_bufslab == 0);
1728 
1729                 /*
1730                  * The freelist is empty.  Create a new slab.
1731                  */
1732                 mutex_exit(&cp->cache_lock);
1733                 if ((sp = kmem_slab_create(cp, kmflag)) == NULL) {
1734                         return (NULL);
1735                 }
1736                 mutex_enter(&cp->cache_lock);
1737                 cp->cache_slab_create++;
1738                 if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax)
1739                         cp->cache_bufmax = cp->cache_buftotal;
1740                 cp->cache_bufslab += sp->slab_chunks;
1741         }
1742 
1743         buf = kmem_slab_alloc_impl(cp, sp, B_TRUE);
1744         ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1745             (cp->cache_complete_slab_count +
1746             avl_numnodes(&cp->cache_partial_slabs) +
1747             (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1748         mutex_exit(&cp->cache_lock);
1749 
1750         if (test_destructor && cp->cache_destructor != NULL) {
1751                 /*
1752                  * On the first kmem_slab_alloc(), assert that it is valid to
1753                  * call the destructor on a newly constructed object without any
1754                  * client involvement.
1755                  */
1756                 if ((cp->cache_constructor == NULL) ||
1757                     cp->cache_constructor(buf, cp->cache_private,
1758                     kmflag) == 0) {
1759                         cp->cache_destructor(buf, cp->cache_private);
1760                 }
1761                 copy_pattern(KMEM_UNINITIALIZED_PATTERN, buf,
1762                     cp->cache_bufsize);
1763                 if (cp->cache_flags & KMF_DEADBEEF) {
1764                         copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
1765                 }
1766         }
1767 
1768         return (buf);
1769 }
1770 
1771 static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *);
1772 
1773 /*
1774  * Free a raw (unconstructed) buffer to cp's slab layer.
1775  */
1776 static void
1777 kmem_slab_free(kmem_cache_t *cp, void *buf)
1778 {
1779         kmem_slab_t *sp;
1780         kmem_bufctl_t *bcp, **prev_bcpp;
1781 
1782         ASSERT(buf != NULL);
1783 
1784         mutex_enter(&cp->cache_lock);
1785         cp->cache_slab_free++;
1786 
1787         if (cp->cache_flags & KMF_HASH) {
1788                 /*
1789                  * Look up buffer in allocated-address hash table.
1790                  */
1791                 prev_bcpp = KMEM_HASH(cp, buf);
1792                 while ((bcp = *prev_bcpp) != NULL) {
1793                         if (bcp->bc_addr == buf) {
1794                                 *prev_bcpp = bcp->bc_next;
1795                                 sp = bcp->bc_slab;
1796                                 break;
1797                         }
1798                         cp->cache_lookup_depth++;
1799                         prev_bcpp = &bcp->bc_next;
1800                 }
1801         } else {
1802                 bcp = KMEM_BUFCTL(cp, buf);
1803                 sp = KMEM_SLAB(cp, buf);
1804         }
1805 
1806         if (bcp == NULL || sp->slab_cache != cp || !KMEM_SLAB_MEMBER(sp, buf)) {
1807                 mutex_exit(&cp->cache_lock);
1808                 kmem_error(KMERR_BADADDR, cp, buf);
1809                 return;
1810         }
1811 
1812         if (KMEM_SLAB_OFFSET(sp, buf) == sp->slab_stuck_offset) {
1813                 /*
1814                  * If this is the buffer that prevented the consolidator from
1815                  * clearing the slab, we can reset the slab flags now that the
1816                  * buffer is freed. (It makes sense to do this in
1817                  * kmem_cache_free(), where the client gives up ownership of the
1818                  * buffer, but on the hot path the test is too expensive.)
1819                  */
1820                 kmem_slab_move_yes(cp, sp, buf);
1821         }
1822 
1823         if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
1824                 if (cp->cache_flags & KMF_CONTENTS)
1825                         ((kmem_bufctl_audit_t *)bcp)->bc_contents =
1826                             kmem_log_enter(kmem_content_log, buf,
1827                             cp->cache_contents);
1828                 KMEM_AUDIT(kmem_transaction_log, cp, bcp);
1829         }
1830 
1831         bcp->bc_next = sp->slab_head;
1832         sp->slab_head = bcp;
1833 
1834         cp->cache_bufslab++;
1835         ASSERT(sp->slab_refcnt >= 1);
1836 
1837         if (--sp->slab_refcnt == 0) {
1838                 /*
1839                  * There are no outstanding allocations from this slab,
1840                  * so we can reclaim the memory.
1841                  */
1842                 if (sp->slab_chunks == 1) {
1843                         list_remove(&cp->cache_complete_slabs, sp);
1844                         cp->cache_complete_slab_count--;
1845                 } else {
1846                         avl_remove(&cp->cache_partial_slabs, sp);
1847                 }
1848 
1849                 cp->cache_buftotal -= sp->slab_chunks;
1850                 cp->cache_bufslab -= sp->slab_chunks;
1851                 /*
1852                  * Defer releasing the slab to the virtual memory subsystem
1853                  * while there is a pending move callback, since we guarantee
1854                  * that buffers passed to the move callback have only been
1855                  * touched by kmem or by the client itself. Since the memory
1856                  * patterns baddcafe (uninitialized) and deadbeef (freed) both
1857                  * set at least one of the two lowest order bits, the client can
1858                  * test those bits in the move callback to determine whether or
1859                  * not it knows about the buffer (assuming that the client also
1860                  * sets one of those low order bits whenever it frees a buffer).
1861                  */
1862                 if (cp->cache_defrag == NULL ||
1863                     (avl_is_empty(&cp->cache_defrag->kmd_moves_pending) &&
1864                     !(sp->slab_flags & KMEM_SLAB_MOVE_PENDING))) {
1865                         cp->cache_slab_destroy++;
1866                         mutex_exit(&cp->cache_lock);
1867                         kmem_slab_destroy(cp, sp);
1868                 } else {
1869                         list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
1870                         /*
1871                          * Slabs are inserted at both ends of the deadlist to
1872                          * distinguish between slabs freed while move callbacks
1873                          * are pending (list head) and a slab freed while the
1874                          * lock is dropped in kmem_move_buffers() (list tail) so
1875                          * that in both cases slab_destroy() is called from the
1876                          * right context.
1877                          */
1878                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1879                                 list_insert_tail(deadlist, sp);
1880                         } else {
1881                                 list_insert_head(deadlist, sp);
1882                         }
1883                         cp->cache_defrag->kmd_deadcount++;
1884                         mutex_exit(&cp->cache_lock);
1885                 }
1886                 return;
1887         }
1888 
1889         if (bcp->bc_next == NULL) {
1890                 /* Transition the slab from completely allocated to partial. */
1891                 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1892                 ASSERT(sp->slab_chunks > 1);
1893                 list_remove(&cp->cache_complete_slabs, sp);
1894                 cp->cache_complete_slab_count--;
1895                 avl_add(&cp->cache_partial_slabs, sp);
1896         } else {
1897                 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1898         }
1899 
1900         ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1901             (cp->cache_complete_slab_count +
1902             avl_numnodes(&cp->cache_partial_slabs) +
1903             (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1904         mutex_exit(&cp->cache_lock);
1905 }
1906 
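/*
 * A minimal sketch (hypothetical object type and field name) of the
 * client-side convention that the deferral comment in kmem_slab_free() above
 * relies on: if the client stores an odd sentinel in a known field whenever it
 * frees an object, then a set low-order bit in that field means the buffer is
 * either client-freed, uninitialized (0xbaddcafe), or kmem-freed (0xdeadbeef),
 * and the move callback can safely decline it:
 *
 *	kmem_cbrc_t
 *	object_move(void *old, void *new, size_t size, void *arg)
 *	{
 *		object_t *op = old;
 *
 *		if ((uintptr_t)op->o_owner & 0x1)
 *			return (KMEM_CBRC_DONT_KNOW);
 *
 *		... relocate the object from old to new ...
 *		return (KMEM_CBRC_YES);
 *	}
 */
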
1907 /*
1908  * Return -1 if a kmem error was detected, 1 if the constructor fails, 0 on success.
1909  */
1910 static int
1911 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1912     caddr_t caller)
1913 {
1914         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1915         kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1916         uint32_t mtbf;
1917 
1918         if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
1919                 kmem_error(KMERR_BADBUFTAG, cp, buf);
1920                 return (-1);
1921         }
1922 
1923         btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_ALLOC;
1924 
1925         if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
1926                 kmem_error(KMERR_BADBUFCTL, cp, buf);
1927                 return (-1);
1928         }
1929 
1930         if (cp->cache_flags & KMF_DEADBEEF) {
1931                 if (!construct && (cp->cache_flags & KMF_LITE)) {
1932                         if (*(uint64_t *)buf != KMEM_FREE_PATTERN) {
1933                                 kmem_error(KMERR_MODIFIED, cp, buf);
1934                                 return (-1);
1935                         }
1936                         if (cp->cache_constructor != NULL)
1937                                 *(uint64_t *)buf = btp->bt_redzone;
1938                         else
1939                                 *(uint64_t *)buf = KMEM_UNINITIALIZED_PATTERN;
1940                 } else {
1941                         construct = 1;
1942                         if (verify_and_copy_pattern(KMEM_FREE_PATTERN,
1943                             KMEM_UNINITIALIZED_PATTERN, buf,
1944                             cp->cache_verify)) {
1945                                 kmem_error(KMERR_MODIFIED, cp, buf);
1946                                 return (-1);
1947                         }
1948                 }
1949         }
1950         btp->bt_redzone = KMEM_REDZONE_PATTERN;
1951 
1952         if ((mtbf = kmem_mtbf | cp->cache_mtbf) != 0 &&
1953             gethrtime() % mtbf == 0 &&
1954             (kmflag & (KM_NOSLEEP | KM_PANIC)) == KM_NOSLEEP) {
1955                 kmem_log_event(kmem_failure_log, cp, NULL, NULL);
1956                 if (!construct && cp->cache_destructor != NULL)
1957                         cp->cache_destructor(buf, cp->cache_private);
1958         } else {
1959                 mtbf = 0;
1960         }
1961 
1962         if (mtbf || (construct && cp->cache_constructor != NULL &&
1963             cp->cache_constructor(buf, cp->cache_private, kmflag) != 0)) {
1964                 atomic_inc_64(&cp->cache_alloc_fail);
1965                 btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
1966                 if (cp->cache_flags & KMF_DEADBEEF)
1967                         copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
1968                 kmem_slab_free(cp, buf);
1969                 return (1);
1970         }
1971 
1972         if (cp->cache_flags & KMF_AUDIT) {
1973                 KMEM_AUDIT(kmem_transaction_log, cp, bcp);
1974         }
1975 
1976         if ((cp->cache_flags & KMF_LITE) &&
1977             !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
1978                 KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
1979         }
1980 
1981         return (0);
1982 }
1983 
1984 static int
1985 kmem_cache_free_debug(kmem_cache_t *cp, void *buf, caddr_t caller)
1986 {
1987         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1988         kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1989         kmem_slab_t *sp;
1990 
1991         if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_ALLOC)) {
1992                 if (btp->bt_bxstat == ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
1993                         kmem_error(KMERR_DUPFREE, cp, buf);
1994                         return (-1);
1995                 }
1996                 sp = kmem_findslab(cp, buf);
1997                 if (sp == NULL || sp->slab_cache != cp)
1998                         kmem_error(KMERR_BADADDR, cp, buf);
1999                 else
2000                         kmem_error(KMERR_REDZONE, cp, buf);
2001                 return (-1);
2002         }
2003 
2004         btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
2005 
2006         if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
2007                 kmem_error(KMERR_BADBUFCTL, cp, buf);
2008                 return (-1);
2009         }
2010 
2011         if (btp->bt_redzone != KMEM_REDZONE_PATTERN) {
2012                 kmem_error(KMERR_REDZONE, cp, buf);
2013                 return (-1);
2014         }
2015 
2016         if (cp->cache_flags & KMF_AUDIT) {
2017                 if (cp->cache_flags & KMF_CONTENTS)
2018                         bcp->bc_contents = kmem_log_enter(kmem_content_log,
2019                             buf, cp->cache_contents);
2020                 KMEM_AUDIT(kmem_transaction_log, cp, bcp);
2021         }
2022 
2023         if ((cp->cache_flags & KMF_LITE) &&
2024             !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
2025                 KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
2026         }
2027 
2028         if (cp->cache_flags & KMF_DEADBEEF) {
2029                 if (cp->cache_flags & KMF_LITE)
2030                         btp->bt_redzone = *(uint64_t *)buf;
2031                 else if (cp->cache_destructor != NULL)
2032                         cp->cache_destructor(buf, cp->cache_private);
2033 
2034                 copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
2035         }
2036 
2037         return (0);
2038 }
2039 
2040 /*
2041  * Free each object in magazine mp to cp's slab layer, and free mp itself.
2042  */
2043 static void
2044 kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds)
2045 {
2046         int round;
2047 
2048         ASSERT(!list_link_active(&cp->cache_link) ||
2049             taskq_member(kmem_taskq, curthread));
2050 
2051         for (round = 0; round < nrounds; round++) {
2052                 void *buf = mp->mag_round[round];
2053 
2054                 if (cp->cache_flags & KMF_DEADBEEF) {
2055                         if (verify_pattern(KMEM_FREE_PATTERN, buf,
2056                             cp->cache_verify) != NULL) {
2057                                 kmem_error(KMERR_MODIFIED, cp, buf);
2058                                 continue;
2059                         }
2060                         if ((cp->cache_flags & KMF_LITE) &&
2061                             cp->cache_destructor != NULL) {
2062                                 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2063                                 *(uint64_t *)buf = btp->bt_redzone;
2064                                 cp->cache_destructor(buf, cp->cache_private);
2065                                 *(uint64_t *)buf = KMEM_FREE_PATTERN;
2066                         }
2067                 } else if (cp->cache_destructor != NULL) {
2068                         cp->cache_destructor(buf, cp->cache_private);
2069                 }
2070 
2071                 kmem_slab_free(cp, buf);
2072         }
2073         ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
2074         kmem_cache_free(cp->cache_magtype->mt_cache, mp);
2075 }
2076 
2077 /*
2078  * Allocate a magazine from the depot.
2079  */
2080 static kmem_magazine_t *
2081 kmem_depot_alloc(kmem_cache_t *cp, kmem_maglist_t *mlp)
2082 {
2083         kmem_magazine_t *mp;
2084 
2085         /*
2086          * If we can't get the depot lock without contention,
2087          * update our contention count.  We use the depot
2088          * contention rate to determine whether we need to
2089          * increase the magazine size for better scalability.
2090          */
2091         if (!mutex_tryenter(&cp->cache_depot_lock)) {
2092                 mutex_enter(&cp->cache_depot_lock);
2093                 cp->cache_depot_contention++;
2094         }
2095 
2096         if ((mp = mlp->ml_list) != NULL) {
2097                 ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
2098                 mlp->ml_list = mp->mag_next;
2099                 if (--mlp->ml_total < mlp->ml_min)
2100                         mlp->ml_min = mlp->ml_total;
2101                 mlp->ml_alloc++;
2102         }
2103 
2104         mutex_exit(&cp->cache_depot_lock);
2105 
2106         return (mp);
2107 }
2108 
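/*
 * The contention count accumulated above is consumed by the periodic cache
 * maintenance code: roughly, if the number of failed tryenters during one
 * interval exceeds kmem_depot_contention, the cache becomes a candidate for a
 * larger magazine size (see that tunable's comment earlier in this file).
 */
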
2109 /*
2110  * Free a magazine to the depot.
2111  */
2112 static void
2113 kmem_depot_free(kmem_cache_t *cp, kmem_maglist_t *mlp, kmem_magazine_t *mp)
2114 {
2115         mutex_enter(&cp->cache_depot_lock);
2116         ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
2117         mp->mag_next = mlp->ml_list;
2118         mlp->ml_list = mp;
2119         mlp->ml_total++;
2120         mutex_exit(&cp->cache_depot_lock);
2121 }
2122 
2123 /*
2124  * Update the working set statistics for cp's depot.
2125  */
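/*
 * The working set is computed without per-magazine timestamps: ml_min tracks
 * the shortest each depot list has been since the last update, so any
 * magazines beyond that low-water mark went unused for the entire interval.
 * At each update the previous low-water mark becomes ml_reaplimit, and
 * kmem_depot_ws_reap() below frees at most MIN(ml_reaplimit, ml_min)
 * magazines from each list.
 */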
2126 static void
2127 kmem_depot_ws_update(kmem_cache_t *cp)
2128 {
2129         mutex_enter(&cp->cache_depot_lock);
2130         cp->cache_full.ml_reaplimit = cp->cache_full.ml_min;
2131         cp->cache_full.ml_min = cp->cache_full.ml_total;
2132         cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min;
2133         cp->cache_empty.ml_min = cp->cache_empty.ml_total;
2134         mutex_exit(&cp->cache_depot_lock);
2135 }
2136 
2137 /*
2138  * Set the working set statistics for cp's depot to zero.  (Everything is
2139  * eligible for reaping.)
2140  */
2141 static void
2142 kmem_depot_ws_zero(kmem_cache_t *cp)
2143 {
2144         mutex_enter(&cp->cache_depot_lock);
2145         cp->cache_full.ml_reaplimit = cp->cache_full.ml_total;
2146         cp->cache_full.ml_min = cp->cache_full.ml_total;
2147         cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_total;
2148         cp->cache_empty.ml_min = cp->cache_empty.ml_total;
2149         mutex_exit(&cp->cache_depot_lock);
2150 }
2151 
2152 /*
2153  * The number of bytes to reap before we call kpreempt(). The default (1MB)
2154  * causes us to preempt reaping up to hundreds of times per second. Using a
2155  * larger value (1GB) causes this to have virtually no effect.
2156  */
2157 size_t kmem_reap_preempt_bytes = 1024 * 1024;
2158 
2159 /*
2160  * Reap all magazines that have fallen out of the depot's working set.
2161  */
2162 static void
2163 kmem_depot_ws_reap(kmem_cache_t *cp)
2164 {
2165         size_t bytes = 0;
2166         long reap;
2167         kmem_magazine_t *mp;
2168 
2169         ASSERT(!list_link_active(&cp->cache_link) ||
2170             taskq_member(kmem_taskq, curthread));
2171 
2172         reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
2173         while (reap-- &&
2174             (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) {
2175                 kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
2176                 bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize;
2177                 if (bytes > kmem_reap_preempt_bytes) {
2178                         kpreempt(KPREEMPT_SYNC);
2179                         bytes = 0;
2180                 }
2181         }
2182 
2183         reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min);
2184         while (reap-- &&
2185             (mp = kmem_depot_alloc(cp, &cp->cache_empty)) != NULL) {
2186                 kmem_magazine_destroy(cp, mp, 0);
2187                 bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize;
2188                 if (bytes > kmem_reap_preempt_bytes) {
2189                         kpreempt(KPREEMPT_SYNC);
2190                         bytes = 0;
2191                 }
2192         }
2193 }
2194 
2195 static void
2196 kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds)
2197 {
2198         ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) ||
2199             (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize));
2200         ASSERT(ccp->cc_magsize > 0);
2201 
2202         ccp->cc_ploaded = ccp->cc_loaded;
2203         ccp->cc_prounds = ccp->cc_rounds;
2204         ccp->cc_loaded = mp;
2205         ccp->cc_rounds = rounds;
2206 }
2207 
2208 /*
2209  * Intercept kmem alloc/free calls during crash dump in order to avoid
2210  * changing kmem state while memory is being saved to the dump device.
2211  * Otherwise, ::kmem_verify will report "corrupt buffers".  Note that
2212  * there are no locks because only one CPU calls kmem during a crash
2213  * dump. To enable this feature, first create the associated vmem
2214  * arena with VMC_DUMPSAFE.
2215  */
2216 static void *kmem_dump_start;   /* start of pre-reserved heap */
2217 static void *kmem_dump_end;     /* end of heap area */
2218 static void *kmem_dump_curr;    /* current free heap pointer */
2219 static size_t kmem_dump_size;   /* size of heap area */
2220 
2221 /* append to each buf created in the pre-reserved heap */
2222 typedef struct kmem_dumpctl {
2223         void    *kdc_next;      /* cache dump free list linkage */
2224 } kmem_dumpctl_t;
2225 
2226 #define KMEM_DUMPCTL(cp, buf)   \
2227         ((kmem_dumpctl_t *)P2ROUNDUP((uintptr_t)(buf) + (cp)->cache_bufsize, \
2228             sizeof (void *)))
2229 
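/*
 * For example, a cache with cache_bufsize == 24 places the kmem_dumpctl_t for
 * a buffer at buf + 24 (already pointer-aligned on a 64-bit kernel), so each
 * dump-time allocation carved out of the reserved heap is roughly
 * cache_bufsize + sizeof (kmem_dumpctl_t) bytes, and freed objects are simply
 * threaded back onto cache_dump.kd_freelist through that trailer.
 */
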
2230 /* set non-zero for a full report */
2231 uint_t kmem_dump_verbose = 0;
2232 
2233 /* stats for the oversize heap */
2234 uint_t kmem_dump_oversize_allocs = 0;
2235 uint_t kmem_dump_oversize_max = 0;
2236 
2237 static void
2238 kmem_dumppr(char **pp, char *e, const char *format, ...)
2239 {
2240         char *p = *pp;
2241 
2242         if (p < e) {
2243                 int n;
2244                 va_list ap;
2245 
2246                 va_start(ap, format);
2247                 n = vsnprintf(p, e - p, format, ap);
2248                 va_end(ap);
2249                 *pp = p + n;
2250         }
2251 }
2252 
2253 /*
2254  * Called when dumpadm(8) configures dump parameters.
2255  */
2256 void
2257 kmem_dump_init(size_t size)
2258 {
2259         /* Our caller ensures size is always set. */
2260         ASSERT3U(size, >, 0);
2261 
2262         if (kmem_dump_start != NULL)
2263                 kmem_free(kmem_dump_start, kmem_dump_size);
2264 
2265         kmem_dump_start = kmem_alloc(size, KM_SLEEP);
2266         kmem_dump_size = size;
2267         kmem_dump_curr = kmem_dump_start;
2268         kmem_dump_end = (void *)((char *)kmem_dump_start + size);
2269         copy_pattern(KMEM_UNINITIALIZED_PATTERN, kmem_dump_start, size);
2270 }
2271 
2272 /*
2273  * Set a flag on each kmem_cache_t indicating whether it is safe to use the
2274  * alternate dump memory. Called just before the panic crash dump starts; the
2275  * per-CPU flags are set for the calling CPU only.
2276  */
2277 void
2278 kmem_dump_begin(void)
2279 {
2280         kmem_cache_t *cp;
2281 
2282         ASSERT(panicstr != NULL);
2283 
2284         for (cp = list_head(&kmem_caches); cp != NULL;
2285             cp = list_next(&kmem_caches, cp)) {
2286                 kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2287 
2288                 if (cp->cache_arena->vm_cflags & VMC_DUMPSAFE) {
2289                         cp->cache_flags |= KMF_DUMPDIVERT;
2290                         ccp->cc_flags |= KMF_DUMPDIVERT;
2291                         ccp->cc_dump_rounds = ccp->cc_rounds;
2292                         ccp->cc_dump_prounds = ccp->cc_prounds;
2293                         ccp->cc_rounds = ccp->cc_prounds = -1;
2294                 } else {
2295                         cp->cache_flags |= KMF_DUMPUNSAFE;
2296                         ccp->cc_flags |= KMF_DUMPUNSAFE;
2297                 }
2298         }
2299 }
2300 
2301 /*
2302  * Finished with the dump intercept.
2303  * Print any warnings on the console and return verbose information to
2304  * dumpsys() in the given buffer.
2305  */
2306 size_t
2307 kmem_dump_finish(char *buf, size_t size)
2308 {
2309         int percent = 0;
2310         size_t used;
2311         char *e = buf + size;
2312         char *p = buf;
2313 
2314         if (kmem_dump_curr == kmem_dump_end) {
2315                 cmn_err(CE_WARN, "exceeded kmem_dump space of %lu "
2316                     "bytes: kmem state in dump may be inconsistent",
2317                     kmem_dump_size);
2318         }
2319 
2320         if (kmem_dump_verbose == 0)
2321                 return (0);
2322 
2323         used = (char *)kmem_dump_curr - (char *)kmem_dump_start;
2324         percent = (used * 100) / kmem_dump_size;
2325 
2326         kmem_dumppr(&p, e, "%% heap used,%d\n", percent);
2327         kmem_dumppr(&p, e, "used bytes,%ld\n", used);
2328         kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size);
2329         kmem_dumppr(&p, e, "Oversize allocs,%d\n",
2330             kmem_dump_oversize_allocs);
2331         kmem_dumppr(&p, e, "Oversize max size,%ld\n",
2332             kmem_dump_oversize_max);
2333 
2334         /* return buffer size used */
2335         if (p < e)
2336                 bzero(p, e - p);
2337         return (p - buf);
2338 }
2339 
2340 /*
2341  * Allocate a constructed object from alternate dump memory.
2342  */
2343 void *
2344 kmem_cache_alloc_dump(kmem_cache_t *cp, int kmflag)
2345 {
2346         void *buf;
2347         void *curr;
2348         char *bufend;
2349 
2350         /* return a constructed object */
2351         if ((buf = cp->cache_dump.kd_freelist) != NULL) {
2352                 cp->cache_dump.kd_freelist = KMEM_DUMPCTL(cp, buf)->kdc_next;
2353                 return (buf);
2354         }
2355 
2356         /* create a new constructed object */
2357         curr = kmem_dump_curr;
2358         buf = (void *)P2ROUNDUP((uintptr_t)curr, cp->cache_align);
2359         bufend = (char *)KMEM_DUMPCTL(cp, buf) + sizeof (kmem_dumpctl_t);
2360 
2361         /* hat layer objects cannot cross a page boundary */
2362         if (cp->cache_align < PAGESIZE) {
2363                 char *page = (char *)P2ROUNDUP((uintptr_t)buf, PAGESIZE);
2364                 if (bufend > page) {
2365                         bufend += page - (char *)buf;
2366                         buf = (void *)page;
2367                 }
2368         }
2369 
2370         /* fall back to normal alloc if reserved area is used up */
2371         if (bufend > (char *)kmem_dump_end) {
2372                 kmem_dump_curr = kmem_dump_end;
2373                 cp->cache_dump.kd_alloc_fails++;
2374                 return (NULL);
2375         }
2376 
2377         /*
2378          * Must advance curr pointer before calling a constructor that
2379          * may also allocate memory.
2380          */
2381         kmem_dump_curr = bufend;
2382 
2383         /* run constructor */
2384         if (cp->cache_constructor != NULL &&
2385             cp->cache_constructor(buf, cp->cache_private, kmflag)
2386             != 0) {
2387 #ifdef DEBUG
2388                 printf("name='%s' cache=0x%p: kmem cache constructor failed\n",
2389                     cp->cache_name, (void *)cp);
2390 #endif
2391                 /* reset curr pointer iff no allocs were done */
2392                 if (kmem_dump_curr == bufend)
2393                         kmem_dump_curr = curr;
2394 
2395                 cp->cache_dump.kd_alloc_fails++;
2396                 /* fall back to normal alloc if the constructor fails */
2397                 return (NULL);
2398         }
2399 
2400         return (buf);
2401 }
2402 
2403 /*
2404  * Free a constructed object in alternate dump memory.
2405  */
2406 int
2407 kmem_cache_free_dump(kmem_cache_t *cp, void *buf)
2408 {
2409         /* save constructed buffers for next time */
2410         if ((char *)buf >= (char *)kmem_dump_start &&
2411             (char *)buf < (char *)kmem_dump_end) {
2412                 KMEM_DUMPCTL(cp, buf)->kdc_next = cp->cache_dump.kd_freelist;
2413                 cp->cache_dump.kd_freelist = buf;
2414                 return (0);
2415         }
2416 
2417         /* just drop buffers that were allocated before dump started */
2418         if (kmem_dump_curr < kmem_dump_end)
2419                 return (0);
2420 
2421         /* fall back to normal free if reserved area is used up */
2422         return (1);
2423 }
2424 
2425 /*
2426  * Allocate a constructed object from cache cp.
2427  */
2428 void *
2429 kmem_cache_alloc(kmem_cache_t *cp, int kmflag)
2430 {
2431         kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2432         kmem_magazine_t *fmp;
2433         void *buf;
2434 
2435         mutex_enter(&ccp->cc_lock);
2436         for (;;) {
2437                 /*
2438                  * If there's an object available in the current CPU's
2439                  * loaded magazine, just take it and return.
2440                  */
2441                 if (ccp->cc_rounds > 0) {
2442                         buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds];
2443                         ccp->cc_alloc++;
2444                         mutex_exit(&ccp->cc_lock);
2445                         if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPUNSAFE)) {
2446                                 if (ccp->cc_flags & KMF_DUMPUNSAFE) {
2447                                         ASSERT(!(ccp->cc_flags &
2448                                             KMF_DUMPDIVERT));
2449                                         cp->cache_dump.kd_unsafe++;
2450                                 }
2451                                 if ((ccp->cc_flags & KMF_BUFTAG) &&
2452                                     kmem_cache_alloc_debug(cp, buf, kmflag, 0,
2453                                     caller()) != 0) {
2454                                         if (kmflag & KM_NOSLEEP)
2455                                                 return (NULL);
2456                                         mutex_enter(&ccp->cc_lock);
2457                                         continue;
2458                                 }
2459                         }
2460                         return (buf);
2461                 }
2462 
2463                 /*
2464                  * The loaded magazine is empty.  If the previously loaded
2465                  * magazine was full, exchange them and try again.
2466                  */
2467                 if (ccp->cc_prounds > 0) {
2468                         kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
2469                         continue;
2470                 }
2471 
2472                 /*
2473                  * Return an alternate buffer at dump time to preserve
2474                  * the heap.
2475                  */
2476                 if (ccp->cc_flags & (KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) {
2477                         if (ccp->cc_flags & KMF_DUMPUNSAFE) {
2478                                 ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT));
2479                                 /* log it so that we can warn about it */
2480                                 cp->cache_dump.kd_unsafe++;
2481                         } else {
2482                                 if ((buf = kmem_cache_alloc_dump(cp, kmflag)) !=
2483                                     NULL) {
2484                                         mutex_exit(&ccp->cc_lock);
2485                                         return (buf);
2486                                 }
2487                                 break;          /* fall back to slab layer */
2488                         }
2489                 }
2490 
2491                 /*
2492                  * If the magazine layer is disabled, break out now.
2493                  */
2494                 if (ccp->cc_magsize == 0)
2495                         break;
2496 
2497                 /*
2498                  * Try to get a full magazine from the depot.
2499                  */
2500                 fmp = kmem_depot_alloc(cp, &cp->cache_full);
2501                 if (fmp != NULL) {
2502                         if (ccp->cc_ploaded != NULL)
2503                                 kmem_depot_free(cp, &cp->cache_empty,
2504                                     ccp->cc_ploaded);
2505                         kmem_cpu_reload(ccp, fmp, ccp->cc_magsize);
2506                         continue;
2507                 }
2508 
2509                 /*
2510                  * There are no full magazines in the depot,
2511                  * so fall through to the slab layer.
2512                  */
2513                 break;
2514         }
2515         mutex_exit(&ccp->cc_lock);
2516 
2517         /*
2518          * We couldn't allocate a constructed object from the magazine layer,
2519          * so get a raw buffer from the slab layer and apply its constructor.
2520          */
2521         buf = kmem_slab_alloc(cp, kmflag);
2522 
2523         if (buf == NULL)
2524                 return (NULL);
2525 
2526         if (cp->cache_flags & KMF_BUFTAG) {
2527                 /*
2528                  * Make kmem_cache_alloc_debug() apply the constructor for us.
2529                  */
2530                 int rc = kmem_cache_alloc_debug(cp, buf, kmflag, 1, caller());
2531                 if (rc != 0) {
2532                         if (kmflag & KM_NOSLEEP)
2533                                 return (NULL);
2534                         /*
2535                          * kmem_cache_alloc_debug() detected corruption
2536                          * but didn't panic (kmem_panic <= 0). We should not be
2537                          * here because of a constructor failure (indicated
2538                          * by a return code of 1). Try again.
2539                          */
2540                         ASSERT(rc == -1);
2541                         return (kmem_cache_alloc(cp, kmflag));
2542                 }
2543                 return (buf);
2544         }
2545 
2546         if (cp->cache_constructor != NULL &&
2547             cp->cache_constructor(buf, cp->cache_private, kmflag) != 0) {
2548                 atomic_inc_64(&cp->cache_alloc_fail);
2549                 kmem_slab_free(cp, buf);
2550                 return (NULL);
2551         }
2552 
2553         return (buf);
2554 }
2555 
2556 /*
2557  * The freed argument tells whether or not kmem_cache_free_debug() has already
2558  * been called so that we can avoid the duplicate free error. For example, a
2559  * buffer on a magazine has already been freed by the client but is still
2560  * constructed.
2561  */
2562 static void
2563 kmem_slab_free_constructed(kmem_cache_t *cp, void *buf, boolean_t freed)
2564 {
2565         if (!freed && (cp->cache_flags & KMF_BUFTAG))
2566                 if (kmem_cache_free_debug(cp, buf, caller()) == -1)
2567                         return;
2568 
2569         /*
2570          * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not,
2571          * kmem_cache_free_debug() will have already applied the destructor.
2572          */
2573         if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF &&
2574             cp->cache_destructor != NULL) {
2575                 if (cp->cache_flags & KMF_DEADBEEF) {    /* KMF_LITE implied */
2576                         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2577                         *(uint64_t *)buf = btp->bt_redzone;
2578                         cp->cache_destructor(buf, cp->cache_private);
2579                         *(uint64_t *)buf = KMEM_FREE_PATTERN;
2580                 } else {
2581                         cp->cache_destructor(buf, cp->cache_private);
2582                 }
2583         }
2584 
2585         kmem_slab_free(cp, buf);
2586 }
2587 
2588 /*
2589  * Used when there's no room to free a buffer to the per-CPU cache.
2590  * Drops and re-acquires &ccp->cc_lock, and returns non-zero if the
2591  * caller should try freeing to the per-CPU cache again.
2592  * Note that we don't directly install the magazine in the cpu cache,
2593  * since its state may have changed wildly while the lock was dropped.
2594  */
2595 static int
2596 kmem_cpucache_magazine_alloc(kmem_cpu_cache_t *ccp, kmem_cache_t *cp)
2597 {
2598         kmem_magazine_t *emp;
2599         kmem_magtype_t *mtp;
2600 
2601         ASSERT(MUTEX_HELD(&ccp->cc_lock));
2602         ASSERT(((uint_t)ccp->cc_rounds == ccp->cc_magsize ||
2603             ((uint_t)ccp->cc_rounds == -1)) &&
2604             ((uint_t)ccp->cc_prounds == ccp->cc_magsize ||
2605             ((uint_t)ccp->cc_prounds == -1)));
2606 
2607         emp = kmem_depot_alloc(cp, &cp->cache_empty);
2608         if (emp != NULL) {
2609                 if (ccp->cc_ploaded != NULL)
2610                         kmem_depot_free(cp, &cp->cache_full,
2611                             ccp->cc_ploaded);
2612                 kmem_cpu_reload(ccp, emp, 0);
2613                 return (1);
2614         }
2615         /*
2616          * There are no empty magazines in the depot,
2617          * so try to allocate a new one.  We must drop all locks
2618          * across kmem_cache_alloc() because lower layers may
2619          * attempt to allocate from this cache.
2620          */
2621         mtp = cp->cache_magtype;
2622         mutex_exit(&ccp->cc_lock);
2623         emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
2624         mutex_enter(&ccp->cc_lock);
2625 
2626         if (emp != NULL) {
2627                 /*
2628                  * We successfully allocated an empty magazine.
2629                  * However, we had to drop ccp->cc_lock to do it,
2630                  * so the cache's magazine size may have changed.
2631                  * If so, free the magazine and try again.
2632                  */
2633                 if (ccp->cc_magsize != mtp->mt_magsize) {
2634                         mutex_exit(&ccp->cc_lock);
2635                         kmem_cache_free(mtp->mt_cache, emp);
2636                         mutex_enter(&ccp->cc_lock);
2637                         return (1);
2638                 }
2639 
2640                 /*
2641                  * We got a magazine of the right size.  Add it to
2642                  * the depot and try the whole dance again.
2643                  */
2644                 kmem_depot_free(cp, &cp->cache_empty, emp);
2645                 return (1);
2646         }
2647 
2648         /*
2649          * We couldn't allocate an empty magazine,
2650          * so fall through to the slab layer.
2651          */
2652         return (0);
2653 }
2654 
2655 /*
2656  * Free a constructed object to cache cp.
2657  */
2658 void
2659 kmem_cache_free(kmem_cache_t *cp, void *buf)
2660 {
2661         kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2662 
2663         /*
2664          * The client must not free either of the buffers passed to the move
2665          * callback function.
2666          */
2667         ASSERT(cp->cache_defrag == NULL ||
2668             cp->cache_defrag->kmd_thread != curthread ||
2669             (buf != cp->cache_defrag->kmd_from_buf &&
2670             buf != cp->cache_defrag->kmd_to_buf));
2671 
2672         if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) {
2673                 if (ccp->cc_flags & KMF_DUMPUNSAFE) {
2674                         ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT));
2675                         /* log it so that we can warn about it */
2676                         cp->cache_dump.kd_unsafe++;
2677                 } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) {
2678                         return;
2679                 }
2680                 if (ccp->cc_flags & KMF_BUFTAG) {
2681                         if (kmem_cache_free_debug(cp, buf, caller()) == -1)
2682                                 return;
2683                 }
2684         }
2685 
2686         mutex_enter(&ccp->cc_lock);
2687         /*
2688          * Any changes to this logic should be reflected in kmem_slab_prefill()
2689          */
2690         for (;;) {
2691                 /*
2692                  * If there's a slot available in the current CPU's
2693                  * loaded magazine, just put the object there and return.
2694                  */
2695                 if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
2696                         ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf;
2697                         ccp->cc_free++;
2698                         mutex_exit(&ccp->cc_lock);
2699                         return;
2700                 }
2701 
2702                 /*
2703                  * The loaded magazine is full.  If the previously loaded
2704                  * magazine was empty, exchange them and try again.
2705                  */
2706                 if (ccp->cc_prounds == 0) {
2707                         kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
2708                         continue;
2709                 }
2710 
2711                 /*
2712                  * If the magazine layer is disabled, break out now.
2713                  */
2714                 if (ccp->cc_magsize == 0)
2715                         break;
2716 
2717                 if (!kmem_cpucache_magazine_alloc(ccp, cp)) {
2718                         /*
2719                          * We couldn't free our constructed object to the
2720                          * magazine layer, so apply its destructor and free it
2721                          * to the slab layer.
2722                          */
2723                         break;
2724                 }
2725         }
2726         mutex_exit(&ccp->cc_lock);
2727         kmem_slab_free_constructed(cp, buf, B_TRUE);
2728 }
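
/*
 * A minimal usage sketch of the object-cache interface above (the cache,
 * type, and flag choices are hypothetical, not taken from this file):
 *
 *      foo_t *fp = kmem_cache_alloc(foo_cache, KM_SLEEP);
 *      ... use the constructed object ...
 *      kmem_cache_free(foo_cache, fp);
 *
 * The object returned by kmem_cache_alloc() is constructed, and the caller
 * must return it to the same cache in a constructed state, since it may be
 * handed out again without the constructor being re-applied.
 */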
2729 
2730 static void
2731 kmem_slab_prefill(kmem_cache_t *cp, kmem_slab_t *sp)
2732 {
2733         kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2734         int cache_flags = cp->cache_flags;
2735 
2736         kmem_bufctl_t *next, *head;
2737         size_t nbufs;
2738 
2739         /*
2740          * Completely allocate the newly created slab and put the pre-allocated
2741          * buffers in magazines. Any of the buffers that cannot be put in
2742          * magazines must be returned to the slab.
2743          */
2744         ASSERT(MUTEX_HELD(&cp->cache_lock));
2745         ASSERT((cache_flags & (KMF_PREFILL|KMF_BUFTAG)) == KMF_PREFILL);
2746         ASSERT(cp->cache_constructor == NULL);
2747         ASSERT(sp->slab_cache == cp);
2748         ASSERT(sp->slab_refcnt == 1);
2749         ASSERT(sp->slab_head != NULL && sp->slab_chunks > sp->slab_refcnt);
2750         ASSERT(avl_find(&cp->cache_partial_slabs, sp, NULL) == NULL);
2751 
2752         head = sp->slab_head;
2753         nbufs = (sp->slab_chunks - sp->slab_refcnt);
2754         sp->slab_head = NULL;
2755         sp->slab_refcnt += nbufs;
2756         cp->cache_bufslab -= nbufs;
2757         cp->cache_slab_alloc += nbufs;
2758         list_insert_head(&cp->cache_complete_slabs, sp);
2759         cp->cache_complete_slab_count++;
2760         mutex_exit(&cp->cache_lock);
2761         mutex_enter(&ccp->cc_lock);
2762 
2763         while (head != NULL) {
2764                 void *buf = KMEM_BUF(cp, head);
2765                 /*
2766                  * If there's a slot available in the current CPU's
2767                  * loaded magazine, just put the object there and
2768                  * continue.
2769                  */
2770                 if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
2771                         ccp->cc_loaded->mag_round[ccp->cc_rounds++] =
2772                             buf;
2773                         ccp->cc_free++;
2774                         nbufs--;
2775                         head = head->bc_next;
2776                         continue;
2777                 }
2778 
2779                 /*
2780                  * The loaded magazine is full.  If the previously
2781                  * loaded magazine was empty, exchange them and try
2782                  * again.
2783                  */
2784                 if (ccp->cc_prounds == 0) {
2785                         kmem_cpu_reload(ccp, ccp->cc_ploaded,
2786                             ccp->cc_prounds);
2787                         continue;
2788                 }
2789 
2790                 /*
2791                  * If the magazine layer is disabled, break out now.
2792                  */
2793 
2794                 if (ccp->cc_magsize == 0) {
2795                         break;
2796                 }
2797 
2798                 if (!kmem_cpucache_magazine_alloc(ccp, cp))
2799                         break;
2800         }
2801         mutex_exit(&ccp->cc_lock);
2802         if (nbufs != 0) {
2803                 ASSERT(head != NULL);
2804 
2805                 /*
2806                  * If there was a failure, return the remaining objects
2807                  * to the slab.
2808                  */
2809                 while (head != NULL) {
2810                         ASSERT(nbufs != 0);
2811                         next = head->bc_next;
2812                         head->bc_next = NULL;
2813                         kmem_slab_free(cp, KMEM_BUF(cp, head));
2814                         head = next;
2815                         nbufs--;
2816                 }
2817         }
2818         ASSERT(head == NULL);
2819         ASSERT(nbufs == 0);
2820         mutex_enter(&cp->cache_lock);
2821 }
2822 
2823 void *
2824 kmem_zalloc(size_t size, int kmflag)
2825 {
2826         size_t index;
2827         void *buf;
2828 
2829         if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2830                 kmem_cache_t *cp = kmem_alloc_table[index];
2831                 buf = kmem_cache_alloc(cp, kmflag);
2832                 if (buf != NULL) {
2833                         if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) {
2834                                 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2835                                 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2836                                 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2837 
2838                                 if (cp->cache_flags & KMF_LITE) {
2839                                         KMEM_BUFTAG_LITE_ENTER(btp,
2840                                             kmem_lite_count, caller());
2841                                 }
2842                         }
2843                         bzero(buf, size);
2844                 }
2845         } else {
2846                 buf = kmem_alloc(size, kmflag);
2847                 if (buf != NULL)
2848                         bzero(buf, size);
2849         }
2850         return (buf);
2851 }
2852 
2853 void *
2854 kmem_alloc(size_t size, int kmflag)
2855 {
2856         size_t index;
2857         kmem_cache_t *cp;
2858         void *buf;
2859 
2860         if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2861                 cp = kmem_alloc_table[index];
2862                 /* fall through to kmem_cache_alloc() */
2863 
2864         } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2865             kmem_big_alloc_table_max) {
2866                 cp = kmem_big_alloc_table[index];
2867                 /* fall through to kmem_cache_alloc() */
2868 
2869         } else {
2870                 if (size == 0) {
2871                         if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
2872                                 return (NULL);
2873 
2874                         /*
2875                          * If this is a sleeping allocation or one that has
2876                          * been specified to panic on allocation failure, we
2877                          * consider it to be deprecated behavior to allocate
2878                          * 0 bytes.  If we have been configured to panic under
2879                          * this condition, we panic; if to warn, we warn -- and
2880                          * regardless, we log to the kmem_zerosized_log that
2881                          * this condition has occurred (which gives us
2882                          * enough information to be able to debug it).
2883                          */
2884                         if (kmem_panic && kmem_panic_zerosized)
2885                                 panic("attempted to kmem_alloc() size of 0");
2886 
2887                         if (kmem_warn_zerosized) {
2888                                 cmn_err(CE_WARN, "kmem_alloc(): sleeping "
2889                                     "allocation with size of 0; "
2890                                     "see kmem_zerosized_log for details");
2891                         }
2892 
2893                         kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
2894 
2895                         return (NULL);
2896                 }
2897 
2898                 buf = vmem_alloc(kmem_oversize_arena, size,
2899                     kmflag & KM_VMFLAGS);
2900                 if (buf == NULL)
2901                         kmem_log_event(kmem_failure_log, NULL, NULL,
2902                             (void *)size);
2903                 else if (KMEM_DUMP(kmem_slab_cache)) {
2904                         /* stats for dump intercept */
2905                         kmem_dump_oversize_allocs++;
2906                         if (size > kmem_dump_oversize_max)
2907                                 kmem_dump_oversize_max = size;
2908                 }
2909                 return (buf);
2910         }
2911 
2912         buf = kmem_cache_alloc(cp, kmflag);
2913         if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) {
2914                 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2915                 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2916                 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2917 
2918                 if (cp->cache_flags & KMF_LITE) {
2919                         KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller());
2920                 }
2921         }
2922         return (buf);
2923 }
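
/*
 * For example (assuming KMEM_ALIGN_SHIFT is 3, i.e. KMEM_ALIGN is 8): a
 * 100-byte request computes index (100 - 1) >> 3 == 12, so it is satisfied
 * from whichever fixed-size cache kmem_alloc_table[12] points at; that
 * cache's buffers are at least 104 bytes, so the request fits.
 */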
2924 
2925 void
2926 kmem_free(void *buf, size_t size)
2927 {
2928         size_t index;
2929         kmem_cache_t *cp;
2930 
2931         if ((index = (size - 1) >> KMEM_ALIGN_SHIFT) < KMEM_ALLOC_TABLE_MAX) {
2932                 cp = kmem_alloc_table[index];
2933                 /* fall through to kmem_cache_free() */
2934 
2935         } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2936             kmem_big_alloc_table_max) {
2937                 cp = kmem_big_alloc_table[index];
2938                 /* fall through to kmem_cache_free() */
2939 
2940         } else {
2941                 EQUIV(buf == NULL, size == 0);
2942                 if (buf == NULL && size == 0)
2943                         return;
2944                 vmem_free(kmem_oversize_arena, buf, size);
2945                 return;
2946         }
2947 
2948         if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) {
2949                 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2950                 uint32_t *ip = (uint32_t *)btp;
2951                 if (ip[1] != KMEM_SIZE_ENCODE(size)) {
2952                         if (*(uint64_t *)buf == KMEM_FREE_PATTERN) {
2953                                 kmem_error(KMERR_DUPFREE, cp, buf);
2954                                 return;
2955                         }
2956                         if (KMEM_SIZE_VALID(ip[1])) {
2957                                 ip[0] = KMEM_SIZE_ENCODE(size);
2958                                 kmem_error(KMERR_BADSIZE, cp, buf);
2959                         } else {
2960                                 kmem_error(KMERR_REDZONE, cp, buf);
2961                         }
2962                         return;
2963                 }
2964                 if (((uint8_t *)buf)[size] != KMEM_REDZONE_BYTE) {
2965                         kmem_error(KMERR_REDZONE, cp, buf);
2966                         return;
2967                 }
2968                 btp->bt_redzone = KMEM_REDZONE_PATTERN;
2969                 if (cp->cache_flags & KMF_LITE) {
2970                         KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count,
2971                             caller());
2972                 }
2973         }
2974         kmem_cache_free(cp, buf);
2975 }
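
/*
 * A minimal usage sketch for kmem_alloc()/kmem_free() (the buffer and length
 * names are hypothetical):
 *
 *      char *buf = kmem_alloc(len, KM_SLEEP);
 *      ... use buf ...
 *      kmem_free(buf, len);
 *
 * The size passed to kmem_free() must match the size originally requested;
 * it is how the allocator finds the owning cache (or the oversize arena),
 * and under KMF_BUFTAG it is checked against the size encoded at allocation
 * time.
 */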
2976 
2977 void *
2978 kmem_firewall_va_alloc(vmem_t *vmp, size_t size, int vmflag)
2979 {
2980         size_t realsize = size + vmp->vm_quantum;
2981         void *addr;
2982 
2983         /*
2984          * Annoying edge case: if 'size' is just shy of ULONG_MAX, adding
2985          * vm_quantum will cause integer wraparound.  Check for this, and
2986          * blow off the firewall page in this case.  Note that such a
2987          * giant allocation (the entire kernel address space) can never
2988          * be satisfied, so it will either fail immediately (VM_NOSLEEP)
2989          * or sleep forever (VM_SLEEP).  Thus, there is no need for a
2990          * corresponding check in kmem_firewall_va_free().
2991          */
2992         if (realsize < size)
2993                 realsize = size;
2994 
2995         /*
2996          * While boot still owns resource management, make sure that this
2997          * redzone virtual address allocation is properly accounted for in
2998  * OBP's "virtual-memory" "available" lists because we're
2999          * effectively claiming them for a red zone.  If we don't do this,
3000          * the available lists become too fragmented and too large for the
3001          * current boot/kernel memory list interface.
3002          */
3003         addr = vmem_alloc(vmp, realsize, vmflag | VM_NEXTFIT);
3004 
3005         if (addr != NULL && kvseg.s_base == NULL && realsize != size)
3006                 (void) boot_virt_alloc((char *)addr + size, vmp->vm_quantum);
3007 
3008         return (addr);
3009 }
3010 
3011 void
3012 kmem_firewall_va_free(vmem_t *vmp, void *addr, size_t size)
3013 {
3014         ASSERT((kvseg.s_base == NULL ?
3015             va_to_pfn((char *)addr + size) :
3016             hat_getpfnum(kas.a_hat, (caddr_t)addr + size)) == PFN_INVALID);
3017 
3018         vmem_free(vmp, addr, size + vmp->vm_quantum);
3019 }
3020 
3021 /*
3022  * Try to allocate at least `size' bytes of memory without sleeping or
3023  * panicking. Return the actual allocated size in `asize'. If every attempt
3024  * fails, make a final allocation with sleep or panic allowed.
3025  */
3026 void *
3027 kmem_alloc_tryhard(size_t size, size_t *asize, int kmflag)
3028 {
3029         void *p;
3030 
3031         *asize = P2ROUNDUP(size, KMEM_ALIGN);
3032         do {
3033                 p = kmem_alloc(*asize, (kmflag | KM_NOSLEEP) & ~KM_PANIC);
3034                 if (p != NULL)
3035                         return (p);
3036                 *asize += KMEM_ALIGN;
3037         } while (*asize <= PAGESIZE);
3038 
3039         *asize = P2ROUNDUP(size, KMEM_ALIGN);
3040         return (kmem_alloc(*asize, kmflag));
3041 }
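
/*
 * For example, a caller that can make use of extra space might do the
 * following (names are hypothetical); note that the returned asize, not the
 * requested size, must later be passed to kmem_free():
 *
 *      size_t asize;
 *      void *buf = kmem_alloc_tryhard(len, &asize, KM_SLEEP);
 *      ... use up to asize bytes of buf ...
 *      kmem_free(buf, asize);
 */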
3042 
3043 /*
3044  * Reclaim all unused memory from a cache.
3045  */
3046 static void
3047 kmem_cache_reap(kmem_cache_t *cp)
3048 {
3049         ASSERT(taskq_member(kmem_taskq, curthread));
3050         cp->cache_reap++;
3051 
3052         /*
3053          * Ask the cache's owner to free some memory if possible.
3054          * The idea is to handle things like the inode cache, which
3055          * typically sits on a bunch of memory that it doesn't truly
3056          * *need*.  Reclaim policy is entirely up to the owner; this
3057          * callback is just an advisory plea for help.
3058          */
3059         if (cp->cache_reclaim != NULL) {
3060                 long delta;
3061 
3062                 /*
3063                  * Reclaimed memory should be reapable (not included in the
3064                  * depot's working set).
3065                  */
3066                 delta = cp->cache_full.ml_total;
3067                 cp->cache_reclaim(cp->cache_private);
3068                 delta = cp->cache_full.ml_total - delta;
3069                 if (delta > 0) {
3070                         mutex_enter(&cp->cache_depot_lock);
3071                         cp->cache_full.ml_reaplimit += delta;
3072                         cp->cache_full.ml_min += delta;
3073                         mutex_exit(&cp->cache_depot_lock);
3074                 }
3075         }
3076 
3077         kmem_depot_ws_reap(cp);
3078 
3079         if (cp->cache_defrag != NULL && !kmem_move_noreap) {
3080                 kmem_cache_defrag(cp);
3081         }
3082 }
3083 
3084 static void
3085 kmem_reap_timeout(void *flag_arg)
3086 {
3087         uint32_t *flag = (uint32_t *)flag_arg;
3088 
3089         ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
3090         *flag = 0;
3091 }
3092 
3093 static void
3094 kmem_reap_done(void *flag)
3095 {
3096         if (!callout_init_done) {
3097                 /* can't schedule a timeout at this point */
3098                 kmem_reap_timeout(flag);
3099         } else {
3100                 (void) timeout(kmem_reap_timeout, flag, kmem_reap_interval);
3101         }
3102 }
3103 
3104 static void
3105 kmem_reap_start(void *flag)
3106 {
3107         ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
3108 
3109         if (flag == &kmem_reaping) {
3110                 kmem_cache_applyall(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP);
3111                 /*
3112                  * if we have segkp under heap, reap segkp cache.
3113                  */
3114                 if (segkp_fromheap)
3115                         segkp_cache_free();
3116         } else
3117                 kmem_cache_applyall_id(kmem_cache_reap, kmem_taskq,
3118                     TQ_NOSLEEP);
3119 
3120         /*
3121          * We use taskq_dispatch() to schedule a timeout to clear
3122          * the flag so that kmem_reap() becomes self-throttling:
3123          * we won't reap again until the current reap completes *and*
3124          * at least kmem_reap_interval ticks have elapsed.
3125          */
3126         if (taskq_dispatch(kmem_taskq, kmem_reap_done, flag, TQ_NOSLEEP) ==
3127             TASKQID_INVALID)
3128                 kmem_reap_done(flag);
3129 }
3130 
3131 static void
3132 kmem_reap_common(void *flag_arg)
3133 {
3134         uint32_t *flag = (uint32_t *)flag_arg;
3135 
3136         if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL ||
3137             atomic_cas_32(flag, 0, 1) != 0)
3138                 return;
3139 
3140         /*
3141          * It may not be kosher to do memory allocation when a reap is called
3142          * (for example, if vmem_populate() is in the call chain).  So we
3143          * start the reap going with a TQ_NOALLOC dispatch.  If the dispatch
3144          * fails, we reset the flag, and the next reap will try again.
3145          */
3146         if (taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC) ==
3147             TASKQID_INVALID)
3148                 *flag = 0;
3149 }
3150 
3151 /*
3152  * Reclaim all unused memory from all caches.  Called from the VM system
3153  * when memory gets tight.
3154  */
3155 void
3156 kmem_reap(void)
3157 {
3158         kmem_reap_common(&kmem_reaping);
3159 }
3160 
3161 /*
3162  * Reclaim all unused memory from identifier arenas, called when a vmem
3163  * arena not backed by memory is exhausted.  Since reaping memory-backed caches
3164  * cannot help with identifier exhaustion, we avoid both a large amount of
3165  * work and unwanted side-effects from reclaim callbacks.
3166  */
3167 void
3168 kmem_reap_idspace(void)
3169 {
3170         kmem_reap_common(&kmem_reaping_idspace);
3171 }
3172 
3173 /*
3174  * Purge all magazines from a cache and set its magazine limit to zero.
3175  * All calls are serialized by the kmem_taskq lock, except for the final
3176  * call from kmem_cache_destroy().
3177  */
3178 static void
3179 kmem_cache_magazine_purge(kmem_cache_t *cp)
3180 {
3181         kmem_cpu_cache_t *ccp;
3182         kmem_magazine_t *mp, *pmp;
3183         int rounds, prounds, cpu_seqid;
3184 
3185         ASSERT(!list_link_active(&cp->cache_link) ||
3186             taskq_member(kmem_taskq, curthread));
3187         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
3188 
3189         for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3190                 ccp = &cp->cache_cpu[cpu_seqid];
3191 
3192                 mutex_enter(&ccp->cc_lock);
3193                 mp = ccp->cc_loaded;
3194                 pmp = ccp->cc_ploaded;
3195                 rounds = ccp->cc_rounds;
3196                 prounds = ccp->cc_prounds;
3197                 ccp->cc_loaded = NULL;
3198                 ccp->cc_ploaded = NULL;
3199                 ccp->cc_rounds = -1;
3200                 ccp->cc_prounds = -1;
3201                 ccp->cc_magsize = 0;
3202                 mutex_exit(&ccp->cc_lock);
3203 
3204                 if (mp)
3205                         kmem_magazine_destroy(cp, mp, rounds);
3206                 if (pmp)
3207                         kmem_magazine_destroy(cp, pmp, prounds);
3208         }
3209 
3210         kmem_depot_ws_zero(cp);
3211         kmem_depot_ws_reap(cp);
3212 }
3213 
3214 /*
3215  * Enable per-cpu magazines on a cache.
3216  */
3217 static void
3218 kmem_cache_magazine_enable(kmem_cache_t *cp)
3219 {
3220         int cpu_seqid;
3221 
3222         if (cp->cache_flags & KMF_NOMAGAZINE)
3223                 return;
3224 
3225         for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3226                 kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
3227                 mutex_enter(&ccp->cc_lock);
3228                 ccp->cc_magsize = cp->cache_magtype->mt_magsize;
3229                 mutex_exit(&ccp->cc_lock);
3230         }
3231 
3232 }
3233 
3234 /*
3235  * Allow our caller to determine if there are running reaps.
3236  *
3237  * This call is very conservative and may return B_TRUE even when no
3238  * reaping is actually in progress. If it returns B_FALSE, then reaping
3239  * is definitely not in progress.
3240  */
3241 boolean_t
3242 kmem_cache_reap_active(void)
3243 {
3244         return (!taskq_empty(kmem_taskq));
3245 }
3246 
3247 /*
3248  * Reap (almost) everything soon.
3249  *
3250  * Note: this does not wait for the reap-tasks to complete. Caller
3251  * should use kmem_cache_reap_active() (above) and/or moderation to
3252  * avoid scheduling too many reap-tasks.
3253  */
3254 void
3255 kmem_cache_reap_soon(kmem_cache_t *cp)
3256 {
3257         ASSERT(list_link_active(&cp->cache_link));
3258 
3259         kmem_depot_ws_zero(cp);
3260 
3261         (void) taskq_dispatch(kmem_taskq,
3262             (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP);
3263 }
3264 
3265 /*
3266  * Recompute a cache's magazine size.  The trade-off is that larger magazines
3267  * provide a higher transfer rate with the depot, while smaller magazines
3268  * reduce memory consumption.  Magazine resizing is an expensive operation;
3269  * it should not be done frequently.
3270  *
3271  * Changes to the magazine size are serialized by the kmem_taskq lock.
3272  *
3273  * Note: at present this only grows the magazine size.  It might be useful
3274  * to allow shrinkage too.
3275  */
3276 static void
3277 kmem_cache_magazine_resize(kmem_cache_t *cp)
3278 {
3279         kmem_magtype_t *mtp = cp->cache_magtype;
3280 
3281         ASSERT(taskq_member(kmem_taskq, curthread));
3282 
3283         if (cp->cache_chunksize < mtp->mt_maxbuf) {
3284                 kmem_cache_magazine_purge(cp);
3285                 mutex_enter(&cp->cache_depot_lock);
3286                 cp->cache_magtype = ++mtp;
3287                 cp->cache_depot_contention_prev =
3288                     cp->cache_depot_contention + INT_MAX;
3289                 mutex_exit(&cp->cache_depot_lock);
3290                 kmem_cache_magazine_enable(cp);
3291         }
3292 }
3293 
3294 /*
3295  * Rescale a cache's hash table, so that the table size is roughly the
3296  * cache size.  We want the average lookup time to be extremely small.
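 *
 * As a worked illustration with a hypothetical buffer count: for
 * cache_buftotal == 1000, 3 * 1000 + 4 == 3004 and highbit(3004) == 12, so
 * the candidate new_size below is 1 << (12 - 2) == 1024, a power of two
 * close to the number of buffers, which keeps the average hash chain short.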
3297  */
3298 static void
3299 kmem_hash_rescale(kmem_cache_t *cp)
3300 {
3301         kmem_bufctl_t **old_table, **new_table, *bcp;
3302         size_t old_size, new_size, h;
3303 
3304         ASSERT(taskq_member(kmem_taskq, curthread));
3305 
3306         new_size = MAX(KMEM_HASH_INITIAL,
3307             1 << (highbit(3 * cp->cache_buftotal + 4) - 2));
3308         old_size = cp->cache_hash_mask + 1;
3309 
3310         if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
3311                 return;
3312 
3313         new_table = vmem_alloc(kmem_hash_arena, new_size * sizeof (void *),
3314             VM_NOSLEEP);
3315         if (new_table == NULL)
3316                 return;
3317         bzero(new_table, new_size * sizeof (void *));
3318 
3319         mutex_enter(&cp->cache_lock);
3320 
3321         old_size = cp->cache_hash_mask + 1;
3322         old_table = cp->cache_hash_table;
3323 
3324         cp->cache_hash_mask = new_size - 1;
3325         cp->cache_hash_table = new_table;
3326         cp->cache_rescale++;
3327 
3328         for (h = 0; h < old_size; h++) {
3329                 bcp = old_table[h];
3330                 while (bcp != NULL) {
3331                         void *addr = bcp->bc_addr;
3332                         kmem_bufctl_t *next_bcp = bcp->bc_next;
3333                         kmem_bufctl_t **hash_bucket = KMEM_HASH(cp, addr);
3334                         bcp->bc_next = *hash_bucket;
3335                         *hash_bucket = bcp;
3336                         bcp = next_bcp;
3337                 }
3338         }
3339 
3340         mutex_exit(&cp->cache_lock);
3341 
3342         vmem_free(kmem_hash_arena, old_table, old_size * sizeof (void *));
3343 }
3344 
3345 /*
3346  * Perform periodic maintenance on a cache: hash rescaling, depot working-set
3347  * update, magazine resizing, and slab consolidation.
3348  */
3349 static void
3350 kmem_cache_update(kmem_cache_t *cp)
3351 {
3352         int need_hash_rescale = 0;
3353         int need_magazine_resize = 0;
3354 
3355         ASSERT(MUTEX_HELD(&kmem_cache_lock));
3356 
3357         /*
3358          * If the cache has become much larger or smaller than its hash table,
3359          * fire off a request to rescale the hash table.
3360          */
3361         mutex_enter(&cp->cache_lock);
3362 
3363         if ((cp->cache_flags & KMF_HASH) &&
3364             (cp->cache_buftotal > (cp->cache_hash_mask << 1) ||
3365             (cp->cache_buftotal < (cp->cache_hash_mask >> 1) &&
3366             cp->cache_hash_mask > KMEM_HASH_INITIAL)))
3367                 need_hash_rescale = 1;
3368 
3369         mutex_exit(&cp->cache_lock);
3370 
3371         /*
3372          * Update the depot working set statistics.
3373          */
3374         kmem_depot_ws_update(cp);
3375 
3376         /*
3377          * If there's a lot of contention in the depot,
3378          * increase the magazine size.
3379          */
3380         mutex_enter(&cp->cache_depot_lock);
3381 
3382         if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf &&
3383             (int)(cp->cache_depot_contention -
3384             cp->cache_depot_contention_prev) > kmem_depot_contention)
3385                 need_magazine_resize = 1;
3386 
3387         cp->cache_depot_contention_prev = cp->cache_depot_contention;
3388 
3389         mutex_exit(&cp->cache_depot_lock);
3390 
3391         if (need_hash_rescale)
3392                 (void) taskq_dispatch(kmem_taskq,
3393                     (task_func_t *)kmem_hash_rescale, cp, TQ_NOSLEEP);
3394 
3395         if (need_magazine_resize)
3396                 (void) taskq_dispatch(kmem_taskq,
3397                     (task_func_t *)kmem_cache_magazine_resize, cp, TQ_NOSLEEP);
3398 
3399         if (cp->cache_defrag != NULL)
3400                 (void) taskq_dispatch(kmem_taskq,
3401                     (task_func_t *)kmem_cache_scan, cp, TQ_NOSLEEP);
3402 }
3403 
3404 static void kmem_update(void *);
3405 
3406 static void
3407 kmem_update_timeout(void *dummy)
3408 {
3409         (void) timeout(kmem_update, dummy, kmem_reap_interval);
3410 }
3411 
3412 static void
3413 kmem_update(void *dummy)
3414 {
3415         kmem_cache_applyall(kmem_cache_update, NULL, TQ_NOSLEEP);
3416 
3417         /*
3418          * We use taskq_dispatch() to reschedule the timeout so that
3419          * kmem_update() becomes self-throttling: it won't schedule
3420          * new tasks until all previous tasks have completed.
3421          */
3422         if (taskq_dispatch(kmem_taskq, kmem_update_timeout, dummy, TQ_NOSLEEP)
3423             == TASKQID_INVALID)
3424                 kmem_update_timeout(NULL);
3425 }
3426 
3427 static int
3428 kmem_cache_kstat_update(kstat_t *ksp, int rw)
3429 {
3430         struct kmem_cache_kstat *kmcp = &kmem_cache_kstat;
3431         kmem_cache_t *cp = ksp->ks_private;
3432         uint64_t cpu_buf_avail;
3433         uint64_t buf_avail = 0;
3434         int cpu_seqid;
3435         long reap;
3436 
3437         ASSERT(MUTEX_HELD(&kmem_cache_kstat_lock));
3438 
3439         if (rw == KSTAT_WRITE)
3440                 return (EACCES);
3441 
3442         mutex_enter(&cp->cache_lock);
3443 
3444         kmcp->kmc_alloc_fail.value.ui64              = cp->cache_alloc_fail;
3445         kmcp->kmc_alloc.value.ui64           = cp->cache_slab_alloc;
3446         kmcp->kmc_free.value.ui64            = cp->cache_slab_free;
3447         kmcp->kmc_slab_alloc.value.ui64              = cp->cache_slab_alloc;
3448         kmcp->kmc_slab_free.value.ui64               = cp->cache_slab_free;
3449 
3450         for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3451                 kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
3452 
3453                 mutex_enter(&ccp->cc_lock);
3454 
3455                 cpu_buf_avail = 0;
3456                 if (ccp->cc_rounds > 0)
3457                         cpu_buf_avail += ccp->cc_rounds;
3458                 if (ccp->cc_prounds > 0)
3459                         cpu_buf_avail += ccp->cc_prounds;
3460 
3461                 kmcp->kmc_alloc.value.ui64   += ccp->cc_alloc;
3462                 kmcp->kmc_free.value.ui64    += ccp->cc_free;
3463                 buf_avail                       += cpu_buf_avail;
3464 
3465                 mutex_exit(&ccp->cc_lock);
3466         }
3467 
3468         mutex_enter(&cp->cache_depot_lock);
3469 
3470         kmcp->kmc_depot_alloc.value.ui64     = cp->cache_full.ml_alloc;
3471         kmcp->kmc_depot_free.value.ui64              = cp->cache_empty.ml_alloc;
3472         kmcp->kmc_depot_contention.value.ui64        = cp->cache_depot_contention;
3473         kmcp->kmc_full_magazines.value.ui64  = cp->cache_full.ml_total;
3474         kmcp->kmc_empty_magazines.value.ui64 = cp->cache_empty.ml_total;
3475         kmcp->kmc_magazine_size.value.ui64   =
3476             (cp->cache_flags & KMF_NOMAGAZINE) ?
3477             0 : cp->cache_magtype->mt_magsize;
3478 
3479         kmcp->kmc_alloc.value.ui64           += cp->cache_full.ml_alloc;
3480         kmcp->kmc_free.value.ui64            += cp->cache_empty.ml_alloc;
3481         buf_avail += cp->cache_full.ml_total * cp->cache_magtype->mt_magsize;
3482 
3483         reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
3484         reap = MIN(reap, cp->cache_full.ml_total);
3485 
3486         mutex_exit(&cp->cache_depot_lock);
3487 
3488         kmcp->kmc_buf_size.value.ui64        = cp->cache_bufsize;
3489         kmcp->kmc_align.value.ui64   = cp->cache_align;
3490         kmcp->kmc_chunk_size.value.ui64      = cp->cache_chunksize;
3491         kmcp->kmc_slab_size.value.ui64       = cp->cache_slabsize;
3492         kmcp->kmc_buf_constructed.value.ui64 = buf_avail;
3493         buf_avail += cp->cache_bufslab;
3494         kmcp->kmc_buf_avail.value.ui64       = buf_avail;
3495         kmcp->kmc_buf_inuse.value.ui64       = cp->cache_buftotal - buf_avail;
3496         kmcp->kmc_buf_total.value.ui64       = cp->cache_buftotal;
3497         kmcp->kmc_buf_max.value.ui64 = cp->cache_bufmax;
3498         kmcp->kmc_slab_create.value.ui64     = cp->cache_slab_create;
3499         kmcp->kmc_slab_destroy.value.ui64    = cp->cache_slab_destroy;
3500         kmcp->kmc_hash_size.value.ui64       = (cp->cache_flags & KMF_HASH) ?
3501             cp->cache_hash_mask + 1 : 0;
3502         kmcp->kmc_hash_lookup_depth.value.ui64       = cp->cache_lookup_depth;
3503         kmcp->kmc_hash_rescale.value.ui64    = cp->cache_rescale;
3504         kmcp->kmc_vmem_source.value.ui64     = cp->cache_arena->vm_id;
3505         kmcp->kmc_reap.value.ui64    = cp->cache_reap;
3506 
3507         if (cp->cache_defrag == NULL) {
3508                 kmcp->kmc_move_callbacks.value.ui64  = 0;
3509                 kmcp->kmc_move_yes.value.ui64                = 0;
3510                 kmcp->kmc_move_no.value.ui64         = 0;
3511                 kmcp->kmc_move_later.value.ui64              = 0;
3512                 kmcp->kmc_move_dont_need.value.ui64  = 0;
3513                 kmcp->kmc_move_dont_know.value.ui64  = 0;
3514                 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3515                 kmcp->kmc_move_slabs_freed.value.ui64        = 0;
3516                 kmcp->kmc_defrag.value.ui64          = 0;
3517                 kmcp->kmc_scan.value.ui64            = 0;
3518                 kmcp->kmc_move_reclaimable.value.ui64        = 0;
3519         } else {
3520                 int64_t reclaimable;
3521 
3522                 kmem_defrag_t *kd = cp->cache_defrag;
3523                 kmcp->kmc_move_callbacks.value.ui64  = kd->kmd_callbacks;
3524                 kmcp->kmc_move_yes.value.ui64                = kd->kmd_yes;
3525                 kmcp->kmc_move_no.value.ui64         = kd->kmd_no;
3526                 kmcp->kmc_move_later.value.ui64              = kd->kmd_later;
3527                 kmcp->kmc_move_dont_need.value.ui64  = kd->kmd_dont_need;
3528                 kmcp->kmc_move_dont_know.value.ui64  = kd->kmd_dont_know;
3529                 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3530                 kmcp->kmc_move_slabs_freed.value.ui64        = kd->kmd_slabs_freed;
3531                 kmcp->kmc_defrag.value.ui64          = kd->kmd_defrags;
3532                 kmcp->kmc_scan.value.ui64            = kd->kmd_scans;
3533 
3534                 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3535                 reclaimable = MAX(reclaimable, 0);
3536                 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3537                 kmcp->kmc_move_reclaimable.value.ui64        = reclaimable;
3538         }
3539 
3540         mutex_exit(&cp->cache_lock);
3541         return (0);
3542 }
3543 
3544 /*
3545  * Return a named statistic about a particular cache.
3546  * This shouldn't be called very often, so it's currently designed for
3547  * simplicity (leverages existing kstat support) rather than efficiency.
3548  */
3549 uint64_t
3550 kmem_cache_stat(kmem_cache_t *cp, char *name)
3551 {
3552         int i;
3553         kstat_t *ksp = cp->cache_kstat;
3554         kstat_named_t *knp = (kstat_named_t *)&kmem_cache_kstat;
3555         uint64_t value = 0;
3556 
3557         if (ksp != NULL) {
3558                 mutex_enter(&kmem_cache_kstat_lock);
3559                 (void) kmem_cache_kstat_update(ksp, KSTAT_READ);
3560                 for (i = 0; i < ksp->ks_ndata; i++) {
3561                         if (strcmp(knp[i].name, name) == 0) {
3562                                 value = knp[i].value.ui64;
3563                                 break;
3564                         }
3565                 }
3566                 mutex_exit(&kmem_cache_kstat_lock);
3567         }
3568         return (value);
3569 }
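
/*
 * For example, a caller could fetch the number of allocated buffers with
 * something like the following, where the name must match one of the
 * kstat_named entries in kmem_cache_kstat (e.g. "buf_inuse"):
 *
 *      uint64_t inuse = kmem_cache_stat(cp, "buf_inuse");
 */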
3570 
3571 /*
3572  * Return an estimate of currently available kernel heap memory.
3573  * On 32-bit systems, physical memory may exceed virtual memory, so
3574  * we just truncate the result at 1GB.
3575  */
3576 size_t
3577 kmem_avail(void)
3578 {
3579         spgcnt_t rmem = availrmem - tune.t_minarmem;
3580         spgcnt_t fmem = freemem - minfree;
3581 
3582         return ((size_t)ptob(MIN(MAX(MIN(rmem, fmem), 0),
3583             1 << (30 - PAGESHIFT))));
3584 }
3585 
3586 /*
3587  * Return the maximum amount of memory that is (in theory) allocatable
3588  * from the heap. This may be used as an estimate only since there
3589  * is no guarantee this space will still be available when an allocation
3590  * request is made, nor that the space can be allocated in one big request
3591  * due to kernel heap fragmentation.
3592  */
3593 size_t
3594 kmem_maxavail(void)
3595 {
3596         spgcnt_t pmem = availrmem - tune.t_minarmem;
3597         spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE));
3598 
3599         return ((size_t)ptob(MAX(MIN(pmem, vmem), 0)));
3600 }
3601 
3602 /*
3603  * Indicate whether memory-intensive kmem debugging is enabled.
3604  */
3605 int
3606 kmem_debugging(void)
3607 {
3608         return (kmem_flags & (KMF_AUDIT | KMF_REDZONE));
3609 }
3610 
3611 /* binning function, sorts finely at the two extremes */
3612 #define KMEM_PARTIAL_SLAB_WEIGHT(sp, binshift)                          \
3613         ((((sp)->slab_refcnt <= (binshift)) ||                            \
3614             (((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift)))   \
3615             ? -(sp)->slab_refcnt                                     \
3616             : -((binshift) + ((sp)->slab_refcnt >> (binshift))))
3617 
3618 /*
3619  * Minimizing the number of partial slabs on the freelist minimizes
3620  * fragmentation (the ratio of unused buffers held by the slab layer). There are
3621  * two ways to get a slab off of the freelist: 1) free all the buffers on the
3622  * slab, and 2) allocate all the buffers on the slab. It follows that we want
3623  * the most-used slabs at the front of the list where they have the best chance
3624  * of being completely allocated, and the least-used slabs at a safe distance
3625  * from the front to improve the odds that the few remaining buffers will all be
3626  * freed before another allocation can tie up the slab. For that reason a slab
3627  * with a higher slab_refcnt sorts less than a slab with a lower
3628  * slab_refcnt.
3629  *
3630  * However, if a slab has at least one buffer that is deemed unfreeable, we
3631  * would rather have that slab at the front of the list regardless of
3632  * slab_refcnt, since even one unfreeable buffer makes the entire slab
3633  * unfreeable. If the client returns KMEM_CBRC_NO in response to a cache_move()
3634  * callback, the slab is marked unfreeable for as long as it remains on the
3635  * freelist.
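 *
 * As a worked illustration of the binning macro above, with hypothetical
 * values binshift == 5 and slab_chunks == 64: a nearly full slab with
 * slab_refcnt == 62 has 64 - 62 == 2 <= 5, so its weight is -62; a nearly
 * empty slab with slab_refcnt == 3 has weight -3; and a middling slab with
 * slab_refcnt == 40 falls into a coarse bin with weight
 * -(5 + (40 >> 5)) == -6.  Lower weights sort first, so the nearly full
 * slab lands at the front of the freelist, the nearly empty slab at the
 * back, and middling slabs are only coarsely ordered in between.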
3636  */
3637 static int
3638 kmem_partial_slab_cmp(const void *p0, const void *p1)
3639 {
3640         const kmem_cache_t *cp;
3641         const kmem_slab_t *s0 = p0;
3642         const kmem_slab_t *s1 = p1;
3643         int w0, w1;
3644         size_t binshift;
3645 
3646         ASSERT(KMEM_SLAB_IS_PARTIAL(s0));
3647         ASSERT(KMEM_SLAB_IS_PARTIAL(s1));
3648         ASSERT(s0->slab_cache == s1->slab_cache);
3649         cp = s1->slab_cache;
3650         ASSERT(MUTEX_HELD(&cp->cache_lock));
3651         binshift = cp->cache_partial_binshift;
3652 
3653         /* weight of first slab */
3654         w0 = KMEM_PARTIAL_SLAB_WEIGHT(s0, binshift);
3655         if (s0->slab_flags & KMEM_SLAB_NOMOVE) {
3656                 w0 -= cp->cache_maxchunks;
3657         }
3658 
3659         /* weight of second slab */
3660         w1 = KMEM_PARTIAL_SLAB_WEIGHT(s1, binshift);
3661         if (s1->slab_flags & KMEM_SLAB_NOMOVE) {
3662                 w1 -= cp->cache_maxchunks;
3663         }
3664 
3665         if (w0 < w1)
3666                 return (-1);
3667         if (w0 > w1)
3668                 return (1);
3669 
3670         /* compare pointer values */
3671         if ((uintptr_t)s0 < (uintptr_t)s1)
3672                 return (-1);
3673         if ((uintptr_t)s0 > (uintptr_t)s1)
3674                 return (1);
3675 
3676         return (0);
3677 }
3678 
3679 /*
3680  * It must be valid to call the destructor (if any) on a newly created object.
3681  * That is, the constructor (if any) must leave the object in a valid state for
3682  * the destructor.
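 *
 * A minimal creation sketch (the cache name, type, and callbacks are
 * hypothetical, not taken from this file):
 *
 *      foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
 *          foo_constructor, foo_destructor, NULL, NULL, NULL, 0);
 *
 * Passing 0 for align selects the default KMEM_ALIGN, and passing NULL for
 * vmp selects kmem_default_arena, as handled below.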
3683  */
3684 kmem_cache_t *
3685 kmem_cache_create(
3686         char *name,             /* descriptive name for this cache */
3687         size_t bufsize,         /* size of the objects it manages */
3688         size_t align,           /* required object alignment */
3689         int (*constructor)(void *, void *, int), /* object constructor */
3690         void (*destructor)(void *, void *),     /* object destructor */
3691         void (*reclaim)(void *), /* memory reclaim callback */
3692         void *private,          /* pass-thru arg for constr/destr/reclaim */
3693         vmem_t *vmp,            /* vmem source for slab allocation */
3694         int cflags)             /* cache creation flags */
3695 {
3696         int cpu_seqid;
3697         size_t chunksize;
3698         kmem_cache_t *cp;
3699         kmem_magtype_t *mtp;
3700         size_t csize = KMEM_CACHE_SIZE(max_ncpus);
3701 
3702 #ifdef  DEBUG
3703         /*
3704          * Cache names should conform to the rules for valid C identifiers
3705          */
3706         if (!strident_valid(name)) {
3707                 cmn_err(CE_CONT,
3708                     "kmem_cache_create: '%s' is an invalid cache name\n"
3709                     "cache names must conform to the rules for "
3710                     "C identifiers\n", name);
3711         }
3712 #endif  /* DEBUG */
3713 
3714         if (vmp == NULL)
3715                 vmp = kmem_default_arena;
3716 
3717         /*
3718  * If this kmem cache has an identifier vmem arena as its source, mark it
3719  * as such so that kmem_reap_idspace() can reap it.
3720          */
3721         ASSERT(!(cflags & KMC_IDENTIFIER));   /* consumer should not set this */
3722         if (vmp->vm_cflags & VMC_IDENTIFIER)
3723                 cflags |= KMC_IDENTIFIER;
3724 
3725         /*
3726          * Get a kmem_cache structure.  We arrange that cp->cache_cpu[]
3727          * is aligned on a KMEM_CPU_CACHE_SIZE boundary to prevent
3728          * false sharing of per-CPU data.
3729          */
3730         cp = vmem_xalloc(kmem_cache_arena, csize, KMEM_CPU_CACHE_SIZE,
3731             P2NPHASE(csize, KMEM_CPU_CACHE_SIZE), 0, NULL, NULL, VM_SLEEP);
3732         bzero(cp, csize);
3733         list_link_init(&cp->cache_link);
3734 
3735         if (align == 0)
3736                 align = KMEM_ALIGN;
3737 
3738         /*
3739          * If we're not at least KMEM_ALIGN aligned, we can't use free
3740          * memory to hold bufctl information (because we can't safely
3741          * perform word loads and stores on it).
3742          */
3743         if (align < KMEM_ALIGN)
3744                 cflags |= KMC_NOTOUCH;
3745 
3746         if (!ISP2(align) || align > vmp->vm_quantum)
3747                 panic("kmem_cache_create: bad alignment %lu", align);
3748 
3749         mutex_enter(&kmem_flags_lock);
3750         if (kmem_flags & KMF_RANDOMIZE)
3751                 kmem_flags = (((kmem_flags | ~KMF_RANDOM) + 1) & KMF_RANDOM) |
3752                     KMF_RANDOMIZE;
3753         cp->cache_flags = (kmem_flags | cflags) & KMF_DEBUG;
3754         mutex_exit(&kmem_flags_lock);
3755 
3756         /*
3757          * Make sure all the various flags are reasonable.
3758          */
3759         ASSERT(!(cflags & KMC_NOHASH) || !(cflags & KMC_NOTOUCH));
3760 
3761         if (cp->cache_flags & KMF_LITE) {
3762                 if (bufsize >= kmem_lite_minsize &&
3763                     align <= kmem_lite_maxalign &&
3764                     P2PHASE(bufsize, kmem_lite_maxalign) != 0) {
3765                         cp->cache_flags |= KMF_BUFTAG;
3766                         cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
3767                 } else {
3768                         cp->cache_flags &= ~KMF_DEBUG;
3769                 }
3770         }
3771 
3772         if (cp->cache_flags & KMF_DEADBEEF)
3773                 cp->cache_flags |= KMF_REDZONE;
3774 
3775         if ((cflags & KMC_QCACHE) && (cp->cache_flags & KMF_AUDIT))
3776                 cp->cache_flags |= KMF_NOMAGAZINE;
3777 
3778         if (cflags & KMC_NODEBUG)
3779                 cp->cache_flags &= ~KMF_DEBUG;
3780 
3781         if (cflags & KMC_NOTOUCH)
3782                 cp->cache_flags &= ~KMF_TOUCH;
3783 
3784         if (cflags & KMC_PREFILL)
3785                 cp->cache_flags |= KMF_PREFILL;
3786 
3787         if (cflags & KMC_NOHASH)
3788                 cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
3789 
3790         if (cflags & KMC_NOMAGAZINE)
3791                 cp->cache_flags |= KMF_NOMAGAZINE;
3792 
3793         if ((cp->cache_flags & KMF_AUDIT) && !(cflags & KMC_NOTOUCH))
3794                 cp->cache_flags |= KMF_REDZONE;
3795 
3796         if (!(cp->cache_flags & KMF_AUDIT))
3797                 cp->cache_flags &= ~KMF_CONTENTS;
3798 
3799         if ((cp->cache_flags & KMF_BUFTAG) && bufsize >= kmem_minfirewall &&
3800             !(cp->cache_flags & KMF_LITE) && !(cflags & KMC_NOHASH))
3801                 cp->cache_flags |= KMF_FIREWALL;
3802 
3803         if (vmp != kmem_default_arena || kmem_firewall_arena == NULL)
3804                 cp->cache_flags &= ~KMF_FIREWALL;
3805 
3806         if (cp->cache_flags & KMF_FIREWALL) {
3807                 cp->cache_flags &= ~KMF_BUFTAG;
3808                 cp->cache_flags |= KMF_NOMAGAZINE;
3809                 ASSERT(vmp == kmem_default_arena);
3810                 vmp = kmem_firewall_arena;
3811         }
3812 
3813         /*
3814          * Set cache properties.
3815          */
3816         (void) strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN);
3817         strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN + 1);
3818         cp->cache_bufsize = bufsize;
3819         cp->cache_align = align;
3820         cp->cache_constructor = constructor;
3821         cp->cache_destructor = destructor;
3822         cp->cache_reclaim = reclaim;
3823         cp->cache_private = private;
3824         cp->cache_arena = vmp;
3825         cp->cache_cflags = cflags;
3826 
3827         /*
3828          * Determine the chunk size.
3829          */
3830         chunksize = bufsize;
3831 
3832         if (align >= KMEM_ALIGN) {
3833                 chunksize = P2ROUNDUP(chunksize, KMEM_ALIGN);
3834                 cp->cache_bufctl = chunksize - KMEM_ALIGN;
3835         }
3836 
3837         if (cp->cache_flags & KMF_BUFTAG) {
3838                 cp->cache_bufctl = chunksize;
3839                 cp->cache_buftag = chunksize;
3840                 if (cp->cache_flags & KMF_LITE)
3841                         chunksize += KMEM_BUFTAG_LITE_SIZE(kmem_lite_count);
3842                 else
3843                         chunksize += sizeof (kmem_buftag_t);
3844         }
3845 
3846         if (cp->cache_flags & KMF_DEADBEEF) {
3847                 cp->cache_verify = MIN(cp->cache_buftag, kmem_maxverify);
3848                 if (cp->cache_flags & KMF_LITE)
3849                         cp->cache_verify = sizeof (uint64_t);
3850         }
3851 
3852         cp->cache_contents = MIN(cp->cache_bufctl, kmem_content_maxsave);
3853 
3854         cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align);
3855 
3856         /*
3857          * Now that we know the chunk size, determine the optimal slab size.
3858          */
3859         if (vmp == kmem_firewall_arena) {
3860                 cp->cache_slabsize = P2ROUNDUP(chunksize, vmp->vm_quantum);
3861                 cp->cache_mincolor = cp->cache_slabsize - chunksize;
3862                 cp->cache_maxcolor = cp->cache_mincolor;
3863                 cp->cache_flags |= KMF_HASH;
3864                 ASSERT(!(cp->cache_flags & KMF_BUFTAG));
3865         } else if ((cflags & KMC_NOHASH) || (!(cflags & KMC_NOTOUCH) &&
3866             !(cp->cache_flags & KMF_AUDIT) &&
3867             chunksize < vmp->vm_quantum / KMEM_VOID_FRACTION)) {
3868                 cp->cache_slabsize = vmp->vm_quantum;
3869                 cp->cache_mincolor = 0;
3870                 cp->cache_maxcolor =
3871                     (cp->cache_slabsize - sizeof (kmem_slab_t)) % chunksize;
3872                 ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize);
3873                 ASSERT(!(cp->cache_flags & KMF_AUDIT));
3874         } else {
3875                 size_t chunks, bestfit, waste, slabsize;
3876                 size_t minwaste = LONG_MAX;
3877 
3878                 bestfit = 0;
3879                 for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) {
3880                         slabsize = P2ROUNDUP(chunksize * chunks,
3881                             vmp->vm_quantum);
3882                         chunks = slabsize / chunksize;
3883                         waste = (slabsize % chunksize) / chunks;
3884                         if (waste < minwaste) {
3885                                 minwaste = waste;
3886                                 bestfit = slabsize;
3887                         }
3888                 }
3889                 if (cflags & KMC_QCACHE)
3890                         bestfit = VMEM_QCACHE_SLABSIZE(vmp->vm_qcache_max);
3891                 cp->cache_slabsize = bestfit;
3892                 cp->cache_mincolor = 0;
3893                 cp->cache_maxcolor = bestfit % chunksize;
3894                 cp->cache_flags |= KMF_HASH;
3895         }
3896 
3897         cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize);
3898         cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1;
3899 
3900         /*
3901          * Disallowing prefill when either the DEBUG or HASH flag is set or when
3902          * there is a constructor avoids some tricky issues with debug setup
3903          * that may be revisited later. We cannot allow prefill in a
3904          * metadata cache because of potential recursion.
3905          */
3906         if (vmp == kmem_msb_arena ||
3907             cp->cache_flags & (KMF_HASH | KMF_BUFTAG) ||
3908             cp->cache_constructor != NULL)
3909                 cp->cache_flags &= ~KMF_PREFILL;
3910 
3911         if (cp->cache_flags & KMF_HASH) {
3912                 ASSERT(!(cflags & KMC_NOHASH));
3913                 cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ?
3914                     kmem_bufctl_audit_cache : kmem_bufctl_cache;
3915         }
3916 
3917         if (cp->cache_maxcolor >= vmp->vm_quantum)
3918                 cp->cache_maxcolor = vmp->vm_quantum - 1;
3919 
3920         cp->cache_color = cp->cache_mincolor;
3921 
3922         /*
3923          * Initialize the rest of the slab layer.
3924          */
3925         mutex_init(&cp->cache_lock, NULL, MUTEX_DEFAULT, NULL);
3926 
3927         avl_create(&cp->cache_partial_slabs, kmem_partial_slab_cmp,
3928             sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link));
3929         /* LINTED: E_TRUE_LOGICAL_EXPR */
3930         ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t));
3931         /* reuse partial slab AVL linkage for complete slab list linkage */
3932         list_create(&cp->cache_complete_slabs,
3933             sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link));
3934 
3935         if (cp->cache_flags & KMF_HASH) {
3936                 cp->cache_hash_table = vmem_alloc(kmem_hash_arena,
3937                     KMEM_HASH_INITIAL * sizeof (void *), VM_SLEEP);
3938                 bzero(cp->cache_hash_table,
3939                     KMEM_HASH_INITIAL * sizeof (void *));
3940                 cp->cache_hash_mask = KMEM_HASH_INITIAL - 1;
3941                 cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1;
3942         }
3943 
3944         /*
3945          * Initialize the depot.
3946          */
3947         mutex_init(&cp->cache_depot_lock, NULL, MUTEX_DEFAULT, NULL);
3948 
3949         for (mtp = kmem_magtype; chunksize <= mtp->mt_minbuf; mtp++)
3950                 continue;
3951 
3952         cp->cache_magtype = mtp;
3953 
3954         /*
3955          * Initialize the CPU layer.
3956          */
3957         for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3958                 kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
3959                 mutex_init(&ccp->cc_lock, NULL, MUTEX_DEFAULT, NULL);
3960                 ccp->cc_flags = cp->cache_flags;
3961                 ccp->cc_rounds = -1;
3962                 ccp->cc_prounds = -1;
3963         }
3964 
3965         /*
3966          * Create the cache's kstats.
3967          */
3968         if ((cp->cache_kstat = kstat_create("unix", 0, cp->cache_name,
3969             "kmem_cache", KSTAT_TYPE_NAMED,
3970             sizeof (kmem_cache_kstat) / sizeof (kstat_named_t),
3971             KSTAT_FLAG_VIRTUAL)) != NULL) {
3972                 cp->cache_kstat->ks_data = &kmem_cache_kstat;
3973                 cp->cache_kstat->ks_update = kmem_cache_kstat_update;
3974                 cp->cache_kstat->ks_private = cp;
3975                 cp->cache_kstat->ks_lock = &kmem_cache_kstat_lock;
3976                 kstat_install(cp->cache_kstat);
3977         }
3978 
3979         /*
3980          * Add the cache to the global list.  This makes it visible
3981          * to kmem_update(), so the cache must be ready for business.
3982          */
3983         mutex_enter(&kmem_cache_lock);
3984         list_insert_tail(&kmem_caches, cp);
3985         mutex_exit(&kmem_cache_lock);
3986 
3987         if (kmem_ready)
3988                 kmem_cache_magazine_enable(cp);
3989 
3990         return (cp);
3991 }
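
/*
 * Illustrative example (hypothetical client code, not part of this file):
 * a cache of foo_t objects whose constructor leaves every object in a state
 * that the destructor can always handle, per the contract noted above.  The
 * foo_t type, the foo_* functions, and the "foo_cache" name are invented
 * for this sketch.
 *
 *	typedef struct foo {
 *		kmutex_t	foo_lock;
 *		list_node_t	foo_link;
 *		void		*foo_data;
 *	} foo_t;
 *
 *	static kmem_cache_t *foo_cache;
 *
 *	static int
 *	foo_construct(void *buf, void *arg, int kmflags)
 *	{
 *		foo_t *fp = buf;
 *
 *		mutex_init(&fp->foo_lock, NULL, MUTEX_DEFAULT, NULL);
 *		list_link_init(&fp->foo_link);
 *		fp->foo_data = NULL;
 *		return (0);
 *	}
 *
 *	static void
 *	foo_destruct(void *buf, void *arg)
 *	{
 *		foo_t *fp = buf;
 *
 *		ASSERT(fp->foo_data == NULL);
 *		mutex_destroy(&fp->foo_lock);
 *	}
 *
 *	void
 *	foo_init(void)
 *	{
 *		foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
 *		    foo_construct, foo_destruct, NULL, NULL, NULL, 0);
 *	}
 *
 * Because foo_construct() never leaves foo_data pointing at anything,
 * foo_destruct() is valid on a freshly constructed object, as required.
 */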
3992 
3993 static int
3994 kmem_move_cmp(const void *buf, const void *p)
3995 {
3996         const kmem_move_t *kmm = p;
3997         uintptr_t v1 = (uintptr_t)buf;
3998         uintptr_t v2 = (uintptr_t)kmm->kmm_from_buf;
3999         return (v1 < v2 ? -1 : (v1 > v2 ? 1 : 0));
4000 }
4001 
4002 static void
4003 kmem_reset_reclaim_threshold(kmem_defrag_t *kmd)
4004 {
4005         kmd->kmd_reclaim_numer = 1;
4006 }
4007 
4008 /*
4009  * Initially, when choosing candidate slabs for buffers to move, we want to be
4010  * very selective and take only slabs that are less than
4011  * (1 / KMEM_VOID_FRACTION) allocated. If we have difficulty finding candidate
4012  * slabs, then we raise the allocation ceiling incrementally. The reclaim
4013  * threshold is reset to (1 / KMEM_VOID_FRACTION) as soon as the cache is no
4014  * longer fragmented.
4015  */
4016 static void
4017 kmem_adjust_reclaim_threshold(kmem_defrag_t *kmd, int direction)
4018 {
4019         if (direction > 0) {
4020                 /* make it easier to find a candidate slab */
4021                 if (kmd->kmd_reclaim_numer < (KMEM_VOID_FRACTION - 1)) {
4022                         kmd->kmd_reclaim_numer++;
4023                 }
4024         } else {
4025                 /* be more selective */
4026                 if (kmd->kmd_reclaim_numer > 1) {
4027                         kmd->kmd_reclaim_numer--;
4028                 }
4029         }
4030 }
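
/*
 * Worked illustration (assuming KMEM_VOID_FRACTION is 8 purely for the
 * arithmetic; the exact value does not change the idea): the candidate
 * ceiling starts at 1/8 allocated.  Each call with direction > 0 raises it
 * one step, to 2/8, 3/8, and so on, up to at most 7/8 (that is,
 * (KMEM_VOID_FRACTION - 1) / KMEM_VOID_FRACTION); each call with
 * direction <= 0 lowers it one step, never below the initial 1/8 set by
 * kmem_reset_reclaim_threshold().
 */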
4031 
4032 void
4033 kmem_cache_set_move(kmem_cache_t *cp,
4034     kmem_cbrc_t (*move)(void *, void *, size_t, void *))
4035 {
4036         kmem_defrag_t *defrag;
4037 
4038         ASSERT(move != NULL);
4039         /*
4040          * The consolidator does not support NOTOUCH caches because kmem cannot
4041          * initialize their slabs with the 0xbaddcafe memory pattern, which sets
4042          * a low order bit usable by clients to distinguish uninitialized memory
4043          * from known objects (see kmem_slab_create).
4044          */
4045         ASSERT(!(cp->cache_cflags & KMC_NOTOUCH));
4046         ASSERT(!(cp->cache_cflags & KMC_IDENTIFIER));
4047 
4048         /*
4049          * We should not be holding anyone's cache lock when calling
4050          * kmem_cache_alloc(), so allocate in all cases before acquiring the
4051          * lock.
4052          */
4053         defrag = kmem_cache_alloc(kmem_defrag_cache, KM_SLEEP);
4054 
4055         mutex_enter(&cp->cache_lock);
4056 
4057         if (KMEM_IS_MOVABLE(cp)) {
4058                 if (cp->cache_move == NULL) {
4059                         ASSERT(cp->cache_slab_alloc == 0);
4060 
4061                         cp->cache_defrag = defrag;
4062                         defrag = NULL; /* nothing to free */
4063                         bzero(cp->cache_defrag, sizeof (kmem_defrag_t));
4064                         avl_create(&cp->cache_defrag->kmd_moves_pending,
4065                             kmem_move_cmp, sizeof (kmem_move_t),
4066                             offsetof(kmem_move_t, kmm_entry));
4067                         /* LINTED: E_TRUE_LOGICAL_EXPR */
4068                         ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t));
4069                         /* reuse the slab's AVL linkage for deadlist linkage */
4070                         list_create(&cp->cache_defrag->kmd_deadlist,
4071                             sizeof (kmem_slab_t),
4072                             offsetof(kmem_slab_t, slab_link));
4073                         kmem_reset_reclaim_threshold(cp->cache_defrag);
4074                 }
4075                 cp->cache_move = move;
4076         }
4077 
4078         mutex_exit(&cp->cache_lock);
4079 
4080         if (defrag != NULL) {
4081                 kmem_cache_free(kmem_defrag_cache, defrag); /* unused */
4082         }
4083 }
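
/*
 * Illustrative example (hypothetical client code, not part of this file):
 * registering a move callback, typically right after kmem_cache_create()
 * and before the cache has allocated any slabs.  foo_cache and the foo_*
 * helpers are invented for this sketch; a real callback must decide,
 * without kmem's help, whether it still owns the old buffer, and must not
 * free either buffer itself.
 *
 *	static kmem_cbrc_t
 *	foo_move(void *old, void *new, size_t size, void *arg)
 *	{
 *		foo_t *fp = old;
 *
 *		if (!foo_is_ours(fp))
 *			return (KMEM_CBRC_DONT_KNOW);
 *		if (foo_is_no_longer_needed(fp))
 *			return (KMEM_CBRC_DONT_NEED);
 *		if (foo_is_busy(fp))
 *			return (KMEM_CBRC_LATER);
 *
 *		bcopy(old, new, size);
 *		foo_switch_references(fp, new);
 *		return (KMEM_CBRC_YES);
 *	}
 *
 *	kmem_cache_set_move(foo_cache, foo_move);
 *
 * Responding YES or DONT_NEED lets kmem free the old buffer; NO, LATER, and
 * DONT_KNOW leave it in place (see the comment above kmem_move_buffer()
 * below for the full set of responses).
 */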
4084 
4085 void
4086 kmem_cache_destroy(kmem_cache_t *cp)
4087 {
4088         int cpu_seqid;
4089 
4090         /*
4091          * Remove the cache from the global cache list so that no one else
4092          * can schedule tasks on its behalf, wait for any pending tasks to
4093          * complete, purge the cache, and then destroy it.
4094          */
4095         mutex_enter(&kmem_cache_lock);
4096         list_remove(&kmem_caches, cp);
4097         mutex_exit(&kmem_cache_lock);
4098 
4099         if (kmem_taskq != NULL)
4100                 taskq_wait(kmem_taskq);
4101 
4102         if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)
4103                 taskq_wait(kmem_move_taskq);
4104 
4105         kmem_cache_magazine_purge(cp);
4106 
4107         mutex_enter(&cp->cache_lock);
4108         if (cp->cache_buftotal != 0)
4109                 cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty",
4110                     cp->cache_name, (void *)cp);
4111         if (cp->cache_defrag != NULL) {
4112                 avl_destroy(&cp->cache_defrag->kmd_moves_pending);
4113                 list_destroy(&cp->cache_defrag->kmd_deadlist);
4114                 kmem_cache_free(kmem_defrag_cache, cp->cache_defrag);
4115                 cp->cache_defrag = NULL;
4116         }
4117         /*
4118          * The cache is now dead.  There should be no further activity.  We
4119          * enforce this by setting land mines in the constructor, destructor,
4120          * reclaim, and move routines that induce a kernel text fault if
4121          * invoked.
4122          */
4123         cp->cache_constructor = (int (*)(void *, void *, int))1;
4124         cp->cache_destructor = (void (*)(void *, void *))2;
4125         cp->cache_reclaim = (void (*)(void *))3;
4126         cp->cache_move = (kmem_cbrc_t (*)(void *, void *, size_t, void *))4;
4127         mutex_exit(&cp->cache_lock);
4128 
4129         kstat_delete(cp->cache_kstat);
4130 
4131         if (cp->cache_hash_table != NULL)
4132                 vmem_free(kmem_hash_arena, cp->cache_hash_table,
4133                     (cp->cache_hash_mask + 1) * sizeof (void *));
4134 
4135         for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++)
4136                 mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
4137 
4138         mutex_destroy(&cp->cache_depot_lock);
4139         mutex_destroy(&cp->cache_lock);
4140 
4141         vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus));
4142 }
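
/*
 * Illustrative teardown sketch (hypothetical client code): all outstanding
 * objects must be freed back to the cache before kmem_cache_destroy() is
 * called, otherwise the "not empty" warning above fires and any later free
 * of a leaked object would operate on a destroyed cache.  foo_cache and
 * foo_list are invented for this sketch.
 *
 *	void
 *	foo_fini(void)
 *	{
 *		foo_t *fp;
 *
 *		while ((fp = list_remove_head(&foo_list)) != NULL)
 *			kmem_cache_free(foo_cache, fp);
 *		list_destroy(&foo_list);
 *
 *		kmem_cache_destroy(foo_cache);
 *		foo_cache = NULL;
 *	}
 */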
4143 
4144 /*ARGSUSED*/
4145 static int
4146 kmem_cpu_setup(cpu_setup_t what, int id, void *arg)
4147 {
4148         ASSERT(MUTEX_HELD(&cpu_lock));
4149         if (what == CPU_UNCONFIG) {
4150                 kmem_cache_applyall(kmem_cache_magazine_purge,
4151                     kmem_taskq, TQ_SLEEP);
4152                 kmem_cache_applyall(kmem_cache_magazine_enable,
4153                     kmem_taskq, TQ_SLEEP);
4154         }
4155         return (0);
4156 }
4157 
4158 static void
4159 kmem_alloc_caches_create(const int *array, size_t count,
4160     kmem_cache_t **alloc_table, size_t maxbuf, uint_t shift)
4161 {
4162         char name[KMEM_CACHE_NAMELEN + 1];
4163         size_t table_unit = (1 << shift); /* range of one alloc_table entry */
4164         size_t size = table_unit;
4165         int i;
4166 
4167         for (i = 0; i < count; i++) {
4168                 size_t cache_size = array[i];
4169                 size_t align = KMEM_ALIGN;
4170                 kmem_cache_t *cp;
4171 
4172                 /* if the table has an entry for maxbuf, we're done */
4173                 if (size > maxbuf)
4174                         break;
4175 
4176                 /* cache size must be a multiple of the table unit */
4177                 ASSERT(P2PHASE(cache_size, table_unit) == 0);
4178 
4179                 /*
4180                  * If they allocate a multiple of the coherency granularity,
4181                  * they get a coherency-granularity-aligned address.
4182                  */
4183                 if (IS_P2ALIGNED(cache_size, 64))
4184                         align = 64;
4185                 if (IS_P2ALIGNED(cache_size, PAGESIZE))
4186                         align = PAGESIZE;
4187                 (void) snprintf(name, sizeof (name),
4188                     "kmem_alloc_%lu", cache_size);
4189                 cp = kmem_cache_create(name, cache_size, align,
4190                     NULL, NULL, NULL, NULL, NULL, KMC_KMEM_ALLOC);
4191 
4192                 while (size <= cache_size) {
4193                         alloc_table[(size - 1) >> shift] = cp;
4194                         size += table_unit;
4195                 }
4196         }
4197 
4198         ASSERT(size > maxbuf);               /* i.e. maxbuf <= max(cache_size) */
4199 }
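
/*
 * Illustrative lookup (the real kmem_alloc() fast path lives elsewhere in
 * this file): once the table is filled in, mapping a request size to its
 * cache is a single shift and index.  For example, with a KMEM_ALIGN_SHIFT
 * of 3 (8-byte table units), a 100-byte request indexes entry
 * (100 - 1) >> 3 == 12, which the loop above pointed at the smallest cache
 * of at least 104 bytes:
 *
 *	kmem_cache_t *cp = kmem_alloc_table[(size - 1) >> KMEM_ALIGN_SHIFT];
 *	void *buf = kmem_cache_alloc(cp, KM_SLEEP);
 *
 * The numeric values here are illustrative and depend on kmem_alloc_sizes.
 */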
4200 
4201 static void
4202 kmem_cache_init(int pass, int use_large_pages)
4203 {
4204         int i;
4205         size_t maxbuf;
4206         kmem_magtype_t *mtp;
4207 
4208         for (i = 0; i < sizeof (kmem_magtype) / sizeof (*mtp); i++) {
4209                 char name[KMEM_CACHE_NAMELEN + 1];
4210 
4211                 mtp = &kmem_magtype[i];
4212                 (void) sprintf(name, "kmem_magazine_%d", mtp->mt_magsize);
4213                 mtp->mt_cache = kmem_cache_create(name,
4214                     (mtp->mt_magsize + 1) * sizeof (void *),
4215                     mtp->mt_align, NULL, NULL, NULL, NULL,
4216                     kmem_msb_arena, KMC_NOHASH);
4217         }
4218 
4219         kmem_slab_cache = kmem_cache_create("kmem_slab_cache",
4220             sizeof (kmem_slab_t), 0, NULL, NULL, NULL, NULL,
4221             kmem_msb_arena, KMC_NOHASH);
4222 
4223         kmem_bufctl_cache = kmem_cache_create("kmem_bufctl_cache",
4224             sizeof (kmem_bufctl_t), 0, NULL, NULL, NULL, NULL,
4225             kmem_msb_arena, KMC_NOHASH);
4226 
4227         kmem_bufctl_audit_cache = kmem_cache_create("kmem_bufctl_audit_cache",
4228             sizeof (kmem_bufctl_audit_t), 0, NULL, NULL, NULL, NULL,
4229             kmem_msb_arena, KMC_NOHASH);
4230 
4231         if (pass == 2) {
4232                 kmem_va_arena = vmem_create("kmem_va",
4233                     NULL, 0, PAGESIZE,
4234                     vmem_alloc, vmem_free, heap_arena,
4235                     8 * PAGESIZE, VM_SLEEP);
4236 
4237                 if (use_large_pages) {
4238                         kmem_default_arena = vmem_xcreate("kmem_default",
4239                             NULL, 0, PAGESIZE,
4240                             segkmem_alloc_lp, segkmem_free_lp, kmem_va_arena,
4241                             0, VMC_DUMPSAFE | VM_SLEEP);
4242                 } else {
4243                         kmem_default_arena = vmem_create("kmem_default",
4244                             NULL, 0, PAGESIZE,
4245                             segkmem_alloc, segkmem_free, kmem_va_arena,
4246                             0, VMC_DUMPSAFE | VM_SLEEP);
4247                 }
4248 
4249                 /* Figure out what our maximum cache size is */
4250                 maxbuf = kmem_max_cached;
4251                 if (maxbuf <= KMEM_MAXBUF) {
4252                         maxbuf = 0;
4253                         kmem_max_cached = KMEM_MAXBUF;
4254                 } else {
4255                         size_t size = 0;
4256                         size_t max =
4257                             sizeof (kmem_big_alloc_sizes) / sizeof (int);
4258                         /*
4259                          * Round maxbuf up to an existing cache size.  If maxbuf
4260                          * is larger than the largest cache, we truncate it to
4261                          * the largest cache's size.
4262                          */
4263                         for (i = 0; i < max; i++) {
4264                                 size = kmem_big_alloc_sizes[i];
4265                                 if (maxbuf <= size)
4266                                         break;
4267                         }
4268                         kmem_max_cached = maxbuf = size;
4269                 }
4270 
4271                 /*
4272                  * The big alloc table may not be completely overwritten, so
4273                  * we clear out any stale cache pointers from the first pass.
4274                  */
4275                 bzero(kmem_big_alloc_table, sizeof (kmem_big_alloc_table));
4276         } else {
4277                 /*
4278                  * During the first pass, the kmem_alloc_* caches
4279                  * are treated as metadata.
4280                  */
4281                 kmem_default_arena = kmem_msb_arena;
4282                 maxbuf = KMEM_BIG_MAXBUF_32BIT;
4283         }
4284 
4285         /*
4286          * Set up the default caches to back kmem_alloc()
4287          */
4288         kmem_alloc_caches_create(
4289             kmem_alloc_sizes, sizeof (kmem_alloc_sizes) / sizeof (int),
4290             kmem_alloc_table, KMEM_MAXBUF, KMEM_ALIGN_SHIFT);
4291 
4292         kmem_alloc_caches_create(
4293             kmem_big_alloc_sizes, sizeof (kmem_big_alloc_sizes) / sizeof (int),
4294             kmem_big_alloc_table, maxbuf, KMEM_BIG_SHIFT);
4295 
4296         kmem_big_alloc_table_max = maxbuf >> KMEM_BIG_SHIFT;
4297 }
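
/*
 * Worked illustration of the maxbuf rounding above (values invented for the
 * example): if kmem_big_alloc_sizes contained { 16K, 24K, 32K } and the
 * kmem_max_cached tunable were set to 20000, the loop would stop at 24K, so
 * both maxbuf and kmem_max_cached become 24576.  A tunable larger than every
 * entry falls out of the loop with size equal to the last (largest) entry,
 * clamping kmem_max_cached to that size.
 */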
4298 
4299 void
4300 kmem_init(void)
4301 {
4302         kmem_cache_t *cp;
4303         int old_kmem_flags = kmem_flags;
4304         int use_large_pages = 0;
4305         size_t maxverify, minfirewall;
4306 
4307         kstat_init();
4308 
4309         /*
4310          * Don't do firewalled allocations if the heap is less than 1TB
4311          * (i.e. on a 32-bit kernel): the resulting VM_NEXTFIT
4312          * allocations would create too much fragmentation in a small
4313          * heap.
4314          */
4315 #if defined(_LP64)
4316         maxverify = minfirewall = PAGESIZE / 2;
4317 #else
4318         maxverify = minfirewall = ULONG_MAX;
4319 #endif
4320 
4321         /* LINTED */
4322         ASSERT(sizeof (kmem_cpu_cache_t) == KMEM_CPU_CACHE_SIZE);
4323 
4324         list_create(&kmem_caches, sizeof (kmem_cache_t),
4325             offsetof(kmem_cache_t, cache_link));
4326 
4327         kmem_metadata_arena = vmem_create("kmem_metadata", NULL, 0, PAGESIZE,
4328             vmem_alloc, vmem_free, heap_arena, 8 * PAGESIZE,
4329             VM_SLEEP | VMC_NO_QCACHE);
4330 
4331         kmem_msb_arena = vmem_create("kmem_msb", NULL, 0,
4332             PAGESIZE, segkmem_alloc, segkmem_free, kmem_metadata_arena, 0,
4333             VMC_DUMPSAFE | VM_SLEEP);
4334 
4335         kmem_cache_arena = vmem_create("kmem_cache", NULL, 0, KMEM_ALIGN,
4336             segkmem_alloc, segkmem_free, kmem_metadata_arena, 0, VM_SLEEP);
4337 
4338         kmem_hash_arena = vmem_create("kmem_hash", NULL, 0, KMEM_ALIGN,
4339             segkmem_alloc, segkmem_free, kmem_metadata_arena, 0, VM_SLEEP);
4340 
4341         kmem_log_arena = vmem_create("kmem_log", NULL, 0, KMEM_ALIGN,
4342             segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
4343 
4344         kmem_firewall_va_arena = vmem_create("kmem_firewall_va",
4345             NULL, 0, PAGESIZE,
4346             kmem_firewall_va_alloc, kmem_firewall_va_free, heap_arena,
4347             0, VM_SLEEP);
4348 
4349         kmem_firewall_arena = vmem_create("kmem_firewall", NULL, 0, PAGESIZE,
4350             segkmem_alloc, segkmem_free, kmem_firewall_va_arena, 0,
4351             VMC_DUMPSAFE | VM_SLEEP);
4352 
4353         /* temporary oversize arena for mod_read_system_file */
4354         kmem_oversize_arena = vmem_create("kmem_oversize", NULL, 0, PAGESIZE,
4355             segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
4356 
4357         kmem_reap_interval = 15 * hz;
4358 
4359         /*
4360          * Read /etc/system.  This is a chicken-and-egg problem because
4361          * kmem_flags may be set in /etc/system, but mod_read_system_file()
4362          * needs to use the allocator.  The simplest solution is to create
4363          * all the standard kmem caches, read /etc/system, destroy all the
4364          * caches we just created, and then create them all again in light
4365          * of the (possibly) new kmem_flags and other kmem tunables.
4366          */
4367         kmem_cache_init(1, 0);
4368 
4369         mod_read_system_file(boothowto & RB_ASKNAME);
4370 
4371         while ((cp = list_tail(&kmem_caches)) != NULL)
4372                 kmem_cache_destroy(cp);
4373 
4374         vmem_destroy(kmem_oversize_arena);
4375 
4376         if (old_kmem_flags & KMF_STICKY)
4377                 kmem_flags = old_kmem_flags;
4378 
4379         if (!(kmem_flags & KMF_AUDIT))
4380                 vmem_seg_size = offsetof(vmem_seg_t, vs_thread);
4381 
4382         if (kmem_maxverify == 0)
4383                 kmem_maxverify = maxverify;
4384 
4385         if (kmem_minfirewall == 0)
4386                 kmem_minfirewall = minfirewall;
4387 
4388         /*
4389          * Give segkmem a chance to figure out whether we are using large
4390          * pages for the kernel heap.
4391          */
4392         use_large_pages = segkmem_lpsetup();
4393 
4394         /*
4395          * To protect against corruption, we keep the actual number of callers
4396          * KMF_LITE records separate from the tunable.  We arbitrarily clamp
4397          * it to 16, since the overhead for small buffers quickly gets out of
4398          * hand.
4399          *
4400          * The real limit would depend on the needs of the largest KMC_NOHASH
4401          * cache.
4402          */
4403         kmem_lite_count = MIN(MAX(0, kmem_lite_pcs), 16);
4404         kmem_lite_pcs = kmem_lite_count;
4405 
4406         /*
4407          * Normally, we firewall oversized allocations when possible, but
4408          * if we are using large pages for kernel memory, and we don't have
4409          * any non-LITE debugging flags set, we want to allocate oversized
4410          * buffers from large pages, and so skip the firewalling.
4411          */
4412         if (use_large_pages &&
4413             ((kmem_flags & KMF_LITE) || !(kmem_flags & KMF_DEBUG))) {
4414                 kmem_oversize_arena = vmem_xcreate("kmem_oversize", NULL, 0,
4415                     PAGESIZE, segkmem_alloc_lp, segkmem_free_lp, heap_arena,
4416                     0, VMC_DUMPSAFE | VM_SLEEP);
4417         } else {
4418                 kmem_oversize_arena = vmem_create("kmem_oversize",
4419                     NULL, 0, PAGESIZE,
4420                     segkmem_alloc, segkmem_free, kmem_minfirewall < ULONG_MAX?
4421                     kmem_firewall_va_arena : heap_arena, 0, VMC_DUMPSAFE |
4422                     VM_SLEEP);
4423         }
4424 
4425         kmem_cache_init(2, use_large_pages);
4426 
4427         if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) {
4428                 if (kmem_transaction_log_size == 0)
4429                         kmem_transaction_log_size = kmem_maxavail() / 50;
4430                 kmem_transaction_log = kmem_log_init(kmem_transaction_log_size);
4431         }
4432 
4433         if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) {
4434                 if (kmem_content_log_size == 0)
4435                         kmem_content_log_size = kmem_maxavail() / 50;
4436                 kmem_content_log = kmem_log_init(kmem_content_log_size);
4437         }
4438 
4439         kmem_failure_log = kmem_log_init(kmem_failure_log_size);
4440         kmem_slab_log = kmem_log_init(kmem_slab_log_size);
4441         kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
4442 
4443         /*
4444          * Initialize STREAMS message caches so allocb() is available.
4445          * This allows us to initialize the logging framework (cmn_err(9F),
4446          * strlog(9F), etc) so we can start recording messages.
4447          */
4448         streams_msg_init();
4449 
4450         /*
4451          * Initialize the ZSD framework in Zones so modules loaded henceforth
4452          * can register their callbacks.
4453          */
4454         zone_zsd_init();
4455 
4456         log_init();
4457         taskq_init();
4458 
4459         /*
4460          * Warn about invalid or dangerous values of kmem_flags.
4461          * Always warn about unsupported values.
4462          */
4463         if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE |
4464             KMF_CONTENTS | KMF_LITE)) != 0) ||
4465             ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE))
4466                 cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x.",
4467                     kmem_flags);
4468 
4469 #ifdef DEBUG
4470         if ((kmem_flags & KMF_DEBUG) == 0)
4471                 cmn_err(CE_NOTE, "kmem debugging disabled.");
4472 #else
4473         /*
4474          * For non-debug kernels, the only "normal" flags are 0, KMF_LITE,
4475          * KMF_REDZONE, and KMF_CONTENTS (the last because it is only enabled
4476          * if KMF_AUDIT is set). We should warn the user about the performance
4477          * penalty of KMF_AUDIT or KMF_DEADBEEF if they are set and KMF_LITE
4478          * isn't set (since that disables AUDIT).
4479          */
4480         if (!(kmem_flags & KMF_LITE) &&
4481             (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0)
4482                 cmn_err(CE_WARN, "High-overhead kmem debugging features "
4483                     "enabled (kmem_flags = 0x%x).  Performance degradation "
4484                     "and large memory overhead possible.", kmem_flags);
4485 #endif /* not DEBUG */
4486 
4487         kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP);
4488 
4489         kmem_ready = 1;
4490 
4491         /*
4492          * Initialize the platform-specific aligned/DMA memory allocator.
4493          */
4494         ka_init();
4495 
4496         /*
4497          * Initialize 32-bit ID cache.
4498          */
4499         id32_init();
4500 
4501         /*
4502          * Initialize the networking stack so that modules loaded later
4503          * can register their callbacks.
4504          */
4505         netstack_init();
4506 }
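
/*
 * Note for context (not a change in behavior): kmem_flags is normally set
 * before this point via /etc/system, which mod_read_system_file() parses in
 * the chicken-and-egg dance described above.  A commonly documented setting
 * that enables the main debugging features (audit, deadbeef, redzone, and
 * contents logging) is:
 *
 *	set kmem_flags = 0xf
 *
 * The KMF_STICKY bit in the pre-existing flags (for example, a value set
 * from the debugger at boot) causes that value to survive whatever
 * /etc/system specifies, per the old_kmem_flags check above.
 */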
4507 
4508 static void
4509 kmem_move_init(void)
4510 {
4511         kmem_defrag_cache = kmem_cache_create("kmem_defrag_cache",
4512             sizeof (kmem_defrag_t), 0, NULL, NULL, NULL, NULL,
4513             kmem_msb_arena, KMC_NOHASH);
4514         kmem_move_cache = kmem_cache_create("kmem_move_cache",
4515             sizeof (kmem_move_t), 0, NULL, NULL, NULL, NULL,
4516             kmem_msb_arena, KMC_NOHASH);
4517 
4518         /*
4519          * kmem guarantees that move callbacks are sequential and that even
4520          * across multiple caches no two moves ever execute simultaneously.
4521          * Move callbacks are processed on a separate taskq so that client code
4522          * does not interfere with internal maintenance tasks.
4523          */
4524         kmem_move_taskq = taskq_create_instance("kmem_move_taskq", 0, 1,
4525             minclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE);
4526 }
4527 
4528 void
4529 kmem_thread_init(void)
4530 {
4531         kmem_move_init();
4532 
4533         /*
4534          * This taskq is used for various kmem maintenance functions, including
4535          * kmem_reap().   When maintenance is required on every cache,
4536          * kmem_cache_applyall() dispatches one task per cache onto this queue.
4537          *
4538          * In the case of kmem_reap(), the system may be under increasingly
4539          * dire memory pressure and may not be able to allocate a new task
4540          * entry.  The count of entries to prepopulate (below) should cover at
4541          * least as many caches as we generally expect to exist on the system
4542          * so that they may all be scheduled for reaping under those
4543          * conditions.
4544          */
4545         kmem_taskq = taskq_create_instance("kmem_taskq", 0, 1, minclsyspri,
4546             600, INT_MAX, TASKQ_PREPOPULATE);
4547 }
4548 
4549 void
4550 kmem_mp_init(void)
4551 {
4552         mutex_enter(&cpu_lock);
4553         register_cpu_setup_func(kmem_cpu_setup, NULL);
4554         mutex_exit(&cpu_lock);
4555 
4556         kmem_update_timeout(NULL);
4557 
4558         taskq_mp_init();
4559 }
4560 
4561 /*
4562  * Return the slab of the allocated buffer, or NULL if the buffer is not
4563  * allocated. This function may be called with a known slab address to
4564  * determine whether or not the buffer is allocated, or with a NULL slab
4565  * address to obtain an allocated buffer's slab.
4566  */
4567 static kmem_slab_t *
4568 kmem_slab_allocated(kmem_cache_t *cp, kmem_slab_t *sp, void *buf)
4569 {
4570         kmem_bufctl_t *bcp, *bufbcp;
4571 
4572         ASSERT(MUTEX_HELD(&cp->cache_lock));
4573         ASSERT(sp == NULL || KMEM_SLAB_MEMBER(sp, buf));
4574 
4575         if (cp->cache_flags & KMF_HASH) {
4576                 for (bcp = *KMEM_HASH(cp, buf);
4577                     (bcp != NULL) && (bcp->bc_addr != buf);
4578                     bcp = bcp->bc_next) {
4579                         continue;
4580                 }
4581                 ASSERT(sp != NULL && bcp != NULL ? sp == bcp->bc_slab : 1);
4582                 return (bcp == NULL ? NULL : bcp->bc_slab);
4583         }
4584 
4585         if (sp == NULL) {
4586                 sp = KMEM_SLAB(cp, buf);
4587         }
4588         bufbcp = KMEM_BUFCTL(cp, buf);
4589         for (bcp = sp->slab_head;
4590             (bcp != NULL) && (bcp != bufbcp);
4591             bcp = bcp->bc_next) {
4592                 continue;
4593         }
4594         return (bcp == NULL ? sp : NULL);
4595 }
4596 
4597 static boolean_t
4598 kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags)
4599 {
4600         long refcnt = sp->slab_refcnt;
4601 
4602         ASSERT(cp->cache_defrag != NULL);
4603 
4604         /*
4605          * For code coverage we want to be able to move an object within the
4606          * same slab (the only partial slab) even if allocating the destination
4607          * buffer resulted in a completely allocated slab.
4608          */
4609         if (flags & KMM_DEBUG) {
4610                 return ((flags & KMM_DESPERATE) ||
4611                     ((sp->slab_flags & KMEM_SLAB_NOMOVE) == 0));
4612         }
4613 
4614         /* If we're desperate, we don't care if the client said NO. */
4615         if (flags & KMM_DESPERATE) {
4616                 return (refcnt < sp->slab_chunks); /* any partial */
4617         }
4618 
4619         if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4620                 return (B_FALSE);
4621         }
4622 
4623         if ((refcnt == 1) || kmem_move_any_partial) {
4624                 return (refcnt < sp->slab_chunks);
4625         }
4626 
4627         /*
4628          * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4629          * slabs with a progressively higher percentage of used buffers can be
4630          * reclaimed until the cache as a whole is no longer fragmented.
4631          *
4632          *      sp->slab_refcnt   kmd_reclaim_numer
4633          *      --------------- < ------------------
4634          *      sp->slab_chunks   KMEM_VOID_FRACTION
4635          */
4636         return ((refcnt * KMEM_VOID_FRACTION) <
4637             (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4638 }
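
/*
 * Worked instance of the cross-multiplied test above (KMEM_VOID_FRACTION
 * assumed to be 8 for the arithmetic): with kmd_reclaim_numer == 2, a slab
 * with 9 of 40 chunks allocated gives 9 * 8 == 72 < 40 * 2 == 80, i.e.
 * 22.5% < 25%, so it is a candidate; at 10 of 40 chunks (exactly 25%) the
 * test fails and the slab is skipped until the threshold is raised.
 */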
4639 
4640 /*
4641  * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4642  * or when the buffer is freed.
4643  */
4644 static void
4645 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4646 {
4647         ASSERT(MUTEX_HELD(&cp->cache_lock));
4648         ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4649 
4650         if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4651                 return;
4652         }
4653 
4654         if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4655                 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4656                         avl_remove(&cp->cache_partial_slabs, sp);
4657                         sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4658                         sp->slab_stuck_offset = (uint32_t)-1;
4659                         avl_add(&cp->cache_partial_slabs, sp);
4660                 }
4661         } else {
4662                 sp->slab_later_count = 0;
4663                 sp->slab_stuck_offset = (uint32_t)-1;
4664         }
4665 }
4666 
4667 static void
4668 kmem_slab_move_no(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4669 {
4670         ASSERT(taskq_member(kmem_move_taskq, curthread));
4671         ASSERT(MUTEX_HELD(&cp->cache_lock));
4672         ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4673 
4674         if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4675                 return;
4676         }
4677 
4678         avl_remove(&cp->cache_partial_slabs, sp);
4679         sp->slab_later_count = 0;
4680         sp->slab_flags |= KMEM_SLAB_NOMOVE;
4681         sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, from_buf);
4682         avl_add(&cp->cache_partial_slabs, sp);
4683 }
4684 
4685 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4686 
4687 /*
4688  * The move callback takes two buffer addresses, the buffer to be moved, and a
4689  * newly allocated and constructed buffer selected by kmem as the destination.
4690  * It also takes the size of the buffer and an optional user argument specified
4691  * at cache creation time. kmem guarantees that the buffer to be moved has not
4692  * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4693  * guarantee the present whereabouts of the buffer to be moved, so it is up to
4694  * the client to safely determine whether or not it is still using the buffer.
4695  * The client must not free either of the buffers passed to the move callback,
4696  * since kmem wants to free them directly to the slab layer. The client response
4697  * tells kmem which of the two buffers to free:
4698  *
4699  * YES          kmem frees the old buffer (the move was successful)
4700  * NO           kmem frees the new buffer, marks the slab of the old buffer
4701  *              non-reclaimable to avoid bothering the client again
4702  * LATER        kmem frees the new buffer, increments slab_later_count
4703  * DONT_KNOW    kmem frees the new buffer
4704  * DONT_NEED    kmem frees both the old buffer and the new buffer
4705  *
4706  * The pending callback argument now being processed contains both of the
4707  * buffers (old and new) passed to the move callback function, the slab of the
4708  * old buffer, and flags related to the move request, such as whether or not the
4709  * system was desperate for memory.
4710  *
4711  * Slabs are not freed while there is a pending callback, but instead are kept
4712  * on a deadlist, which is drained after the last callback completes. This means
4713  * that slabs are safe to access until kmem_move_end(), no matter how many of
4714  * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4715  * zero for as long as the slab remains on the deadlist and until the slab is
4716  * freed.
4717  */
4718 static void
4719 kmem_move_buffer(kmem_move_t *callback)
4720 {
4721         kmem_cbrc_t response;
4722         kmem_slab_t *sp = callback->kmm_from_slab;
4723         kmem_cache_t *cp = sp->slab_cache;
4724         boolean_t free_on_slab;
4725 
4726         ASSERT(taskq_member(kmem_move_taskq, curthread));
4727         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4728         ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4729 
4730         /*
4731          * The number of allocated buffers on the slab may have changed since we
4732          * last checked the slab's reclaimability (when the pending move was
4733          * enqueued), or the client may have responded NO when asked to move
4734          * another buffer on the same slab.
4735          */
4736         if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4737                 kmem_slab_free(cp, callback->kmm_to_buf);
4738                 kmem_move_end(cp, callback);
4739                 return;
4740         }
4741 
4742         /*
4743          * Checking the slab layer is easy, so we might as well do that here
4744          * in case we can avoid bothering the client.
4745          */
4746         mutex_enter(&cp->cache_lock);
4747         free_on_slab = (kmem_slab_allocated(cp, sp,
4748             callback->kmm_from_buf) == NULL);
4749         mutex_exit(&cp->cache_lock);
4750 
4751         if (free_on_slab) {
4752                 kmem_slab_free(cp, callback->kmm_to_buf);
4753                 kmem_move_end(cp, callback);
4754                 return;
4755         }
4756 
4757         if (cp->cache_flags & KMF_BUFTAG) {
4758                 /*
4759                  * Make kmem_cache_alloc_debug() apply the constructor for us.
4760                  */
4761                 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4762                     KM_NOSLEEP, 1, caller()) != 0) {
4763                         kmem_move_end(cp, callback);
4764                         return;
4765                 }
4766         } else if (cp->cache_constructor != NULL &&
4767             cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4768             KM_NOSLEEP) != 0) {
4769                 atomic_inc_64(&cp->cache_alloc_fail);
4770                 kmem_slab_free(cp, callback->kmm_to_buf);
4771                 kmem_move_end(cp, callback);
4772                 return;
4773         }
4774 
4775         cp->cache_defrag->kmd_callbacks++;
4776         cp->cache_defrag->kmd_thread = curthread;
4777         cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4778         cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4779         DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4780             callback);
4781 
4782         response = cp->cache_move(callback->kmm_from_buf,
4783             callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4784 
4785         DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4786             callback, kmem_cbrc_t, response);
4787         cp->cache_defrag->kmd_thread = NULL;
4788         cp->cache_defrag->kmd_from_buf = NULL;
4789         cp->cache_defrag->kmd_to_buf = NULL;
4790 
4791         if (response == KMEM_CBRC_YES) {
4792                 cp->cache_defrag->kmd_yes++;
4793                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4794                 /* slab safe to access until kmem_move_end() */
4795                 if (sp->slab_refcnt == 0)
4796                         cp->cache_defrag->kmd_slabs_freed++;
4797                 mutex_enter(&cp->cache_lock);
4798                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4799                 mutex_exit(&cp->cache_lock);
4800                 kmem_move_end(cp, callback);
4801                 return;
4802         }
4803 
4804         switch (response) {
4805         case KMEM_CBRC_NO:
4806                 cp->cache_defrag->kmd_no++;
4807                 mutex_enter(&cp->cache_lock);
4808                 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4809                 mutex_exit(&cp->cache_lock);
4810                 break;
4811         case KMEM_CBRC_LATER:
4812                 cp->cache_defrag->kmd_later++;
4813                 mutex_enter(&cp->cache_lock);
4814                 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4815                         mutex_exit(&cp->cache_lock);
4816                         break;
4817                 }
4818 
4819                 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4820                         kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4821                 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4822                         sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4823                             callback->kmm_from_buf);
4824                 }
4825                 mutex_exit(&cp->cache_lock);
4826                 break;
4827         case KMEM_CBRC_DONT_NEED:
4828                 cp->cache_defrag->kmd_dont_need++;
4829                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4830                 if (sp->slab_refcnt == 0)
4831                         cp->cache_defrag->kmd_slabs_freed++;
4832                 mutex_enter(&cp->cache_lock);
4833                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4834                 mutex_exit(&cp->cache_lock);
4835                 break;
4836         case KMEM_CBRC_DONT_KNOW:
4837                 /*
4838                  * If we don't know if we can move this buffer or not, we'll
4839                  * just assume that we can't:  if the buffer is in fact free,
4840                  * then it is sitting in one of the per-CPU magazines or in
4841                  * a full magazine in the depot layer.  Either way, because
4842                  * defrag is induced in the same logic that reaps a cache,
4843                  * it's likely that full magazines will be returned to the
4844                  * system soon (thereby accomplishing what we're trying to
4845                  * accomplish here: return those magazines to their slabs).
4846                  * Given this, any work that we might do now to locate a buffer
4847                  * in a magazine is wasted (and expensive!) work; we bump
4848                  * a counter in this case and otherwise assume that we can't
4849                  * move it.
4850                  */
4851                 cp->cache_defrag->kmd_dont_know++;
4852                 break;
4853         default:
4854                 panic("'%s' (%p) unexpected move callback response %d\n",
4855                     cp->cache_name, (void *)cp, response);
4856         }
4857 
4858         kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
4859         kmem_move_end(cp, callback);
4860 }
4861 
4862 /* Return B_FALSE if there is insufficient memory for the move request. */
4863 static boolean_t
4864 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
4865 {
4866         void *to_buf;
4867         avl_index_t index;
4868         kmem_move_t *callback, *pending;
4869         ulong_t n;
4870 
4871         ASSERT(taskq_member(kmem_taskq, curthread));
4872         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4873         ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
4874 
4875         callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
4876 
4877         if (callback == NULL)
4878                 return (B_FALSE);
4879 
4880         callback->kmm_from_slab = sp;
4881         callback->kmm_from_buf = buf;
4882         callback->kmm_flags = flags;
4883 
4884         mutex_enter(&cp->cache_lock);
4885 
4886         n = avl_numnodes(&cp->cache_partial_slabs);
4887         if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
4888                 mutex_exit(&cp->cache_lock);
4889                 kmem_cache_free(kmem_move_cache, callback);
4890                 return (B_TRUE); /* there is no need for the move request */
4891         }
4892 
4893         pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
4894         if (pending != NULL) {
4895                 /*
4896                  * If the move is already pending and we're desperate now,
4897                  * update the move flags.
4898                  */
4899                 if (flags & KMM_DESPERATE) {
4900                         pending->kmm_flags |= KMM_DESPERATE;
4901                 }
4902                 mutex_exit(&cp->cache_lock);
4903                 kmem_cache_free(kmem_move_cache, callback);
4904                 return (B_TRUE);
4905         }
4906 
4907         to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
4908             B_FALSE);
4909         callback->kmm_to_buf = to_buf;
4910         avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
4911 
4912         mutex_exit(&cp->cache_lock);
4913 
4914         if (taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
4915             callback, TQ_NOSLEEP) == TASKQID_INVALID) {
4916                 mutex_enter(&cp->cache_lock);
4917                 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4918                 mutex_exit(&cp->cache_lock);
4919                 kmem_slab_free(cp, to_buf);
4920                 kmem_cache_free(kmem_move_cache, callback);
4921                 return (B_FALSE);
4922         }
4923 
4924         return (B_TRUE);
4925 }
4926 
4927 static void
4928 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
4929 {
4930         avl_index_t index;
4931 
4932         ASSERT(cp->cache_defrag != NULL);
4933         ASSERT(taskq_member(kmem_move_taskq, curthread));
4934         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4935 
4936         mutex_enter(&cp->cache_lock);
4937         VERIFY(avl_find(&cp->cache_defrag->kmd_moves_pending,
4938             callback->kmm_from_buf, &index) != NULL);
4939         avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4940         if (avl_is_empty(&cp->cache_defrag->kmd_moves_pending)) {
4941                 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
4942                 kmem_slab_t *sp;
4943 
4944                 /*
4945                  * The last pending move completed. Release all slabs from the
4946                  * front of the dead list except for any slab at the tail that
4947                  * needs to be released from the context of kmem_move_buffers().
4948                  * kmem deferred unmapping the buffers on these slabs in order
4949                  * to guarantee that buffers passed to the move callback have
4950                  * been touched only by kmem or by the client itself.
4951                  */
4952                 while ((sp = list_remove_head(deadlist)) != NULL) {
4953                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
4954                                 list_insert_tail(deadlist, sp);
4955                                 break;
4956                         }
4957                         cp->cache_defrag->kmd_deadcount--;
4958                         cp->cache_slab_destroy++;
4959                         mutex_exit(&cp->cache_lock);
4960                         kmem_slab_destroy(cp, sp);
4961                         mutex_enter(&cp->cache_lock);
4962                 }
4963         }
4964         mutex_exit(&cp->cache_lock);
4965         kmem_cache_free(kmem_move_cache, callback);
4966 }
4967 
4968 /*
4969  * Move buffers from least used slabs first by scanning backwards from the end
4970  * of the partial slab list. Scan at most max_scan candidate slabs and move
4971  * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
4972  * If desperate to reclaim memory, move buffers from any partial slab, otherwise
4973  * skip slabs with a ratio of allocated buffers at or above the current
4974  * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
4975  * scan is aborted) so that the caller can adjust the reclaimability threshold
4976  * depending on how many reclaimable slabs it finds.
4977  *
4978  * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
4979  * move request, since it is not valid for kmem_move_begin() to call
4980  * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
4981  */
4982 static int
4983 kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
4984     int flags)
4985 {
4986         kmem_slab_t *sp;
4987         void *buf;
4988         int i, j; /* slab index, buffer index */
4989         int s; /* reclaimable slabs */
4990         int b; /* allocated (movable) buffers on reclaimable slab */
4991         boolean_t success;
4992         int refcnt;
4993         int nomove;
4994 
4995         ASSERT(taskq_member(kmem_taskq, curthread));
4996         ASSERT(MUTEX_HELD(&cp->cache_lock));
4997         ASSERT(kmem_move_cache != NULL);
4998         ASSERT(cp->cache_move != NULL && cp->cache_defrag != NULL);
4999         ASSERT((flags & KMM_DEBUG) ? !avl_is_empty(&cp->cache_partial_slabs) :
5000             avl_numnodes(&cp->cache_partial_slabs) > 1);
5001 
5002         if (kmem_move_blocked) {
5003                 return (0);
5004         }
5005 
5006         if (kmem_move_fulltilt) {
5007                 flags |= KMM_DESPERATE;
5008         }
5009 
5010         if (max_scan == 0 || (flags & KMM_DESPERATE)) {
5011                 /*
5012                  * Scan as many slabs as needed to find the desired number of
5013                  * candidate slabs.
5014                  */
5015                 max_scan = (size_t)-1;
5016         }
5017 
5018         if (max_slabs == 0 || (flags & KMM_DESPERATE)) {
5019                 /* Find as many candidate slabs as possible. */
5020                 max_slabs = (size_t)-1;
5021         }
5022 
5023         sp = avl_last(&cp->cache_partial_slabs);
5024         ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
5025         for (i = 0, s = 0; (i < max_scan) && (s < max_slabs) && (sp != NULL) &&
5026             ((sp != avl_first(&cp->cache_partial_slabs)) ||
5027             (flags & KMM_DEBUG));
5028             sp = AVL_PREV(&cp->cache_partial_slabs, sp), i++) {
5029 
5030                 if (!kmem_slab_is_reclaimable(cp, sp, flags)) {
5031                         continue;
5032                 }
5033                 s++;
5034 
5035                 /* Look for allocated buffers to move. */
5036                 for (j = 0, b = 0, buf = sp->slab_base;
5037                     (j < sp->slab_chunks) && (b < sp->slab_refcnt);
5038                     buf = (((char *)buf) + cp->cache_chunksize), j++) {
5039 
5040                         if (kmem_slab_allocated(cp, sp, buf) == NULL) {
5041                                 continue;
5042                         }
5043 
5044                         b++;
5045 
5046                         /*
5047                          * Prevent the slab from being destroyed while we drop
5048                          * cache_lock and while the pending move is not yet
5049                          * registered. Flag the pending move while
5050                          * kmd_moves_pending may still be empty, since we can't
5051                          * yet rely on a non-zero pending move count to prevent
5052                          * the slab from being destroyed.
5053                          */
5054                         ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5055                         sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5056                         /*
5057                          * Remember refcnt and nomove so that we can recheck
5058                          * them after reacquiring the lock; they control the
5059                          * order of partial slabs, and we want to know whether
5060                          * we can pick up the scan where we left off.
5061                          */
5062                         refcnt = sp->slab_refcnt;
5063                         nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE);
5064                         mutex_exit(&cp->cache_lock);
5065 
5066                         success = kmem_move_begin(cp, sp, buf, flags);
5067 
5068                         /*
5069                          * Now, before the lock is reacquired, kmem could
5070                          * process all pending move requests and purge the
5071                          * deadlist, so that sp could be destroyed before we
5072                          * reacquire the lock. Or, the client may free all
5073                          * the objects on the slab while the pending moves
5074                          * are still on the taskq. Therefore, the
5075                          * KMEM_SLAB_MOVE_PENDING flag causes the slab to be
5076                          * put at the end of the deadlist and prevents it
5077                          * from being destroyed, since we plan to destroy it
5078                          * here after reacquiring the lock.
5079                          */
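                             /*
                              * For example (one possible interleaving): after
                              * cache_lock is dropped, the client may free every
                              * remaining buffer on sp, moving sp to the
                              * deadlist; the move taskq may then drain
                              * kmd_moves_pending and purge the deadlist. Only
                              * KMEM_SLAB_MOVE_PENDING keeps sp alive across
                              * that window, so that the refcnt == 0 case below
                              * can still find sp on the deadlist and destroy it
                              * itself.
                              */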
5080                         mutex_enter(&cp->cache_lock);
5081                         ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5082                         sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5083 
5084                         if (sp->slab_refcnt == 0) {
5085                                 list_t *deadlist =
5086                                     &cp->cache_defrag->kmd_deadlist;
5087                                 list_remove(deadlist, sp);
5088 
5089                                 if (!avl_is_empty(
5090                                     &cp->cache_defrag->kmd_moves_pending)) {
5091                                         /*
5092                                          * A pending move makes it unsafe to
5093                                          * destroy the slab, because even though
5094                                          * the move is no longer needed, the
5095                                          * context where that is determined
5096                                          * requires the slab to exist.
5097                                          * Fortunately, a pending move also
5098                                          * means we don't need to destroy the
5099                                          * slab here, since it will get
5100                                          * destroyed along with any other slabs
5101                                          * on the deadlist after the last
5102                                          * pending move completes.
5103                                          */
5104                                         list_insert_head(deadlist, sp);
5105                                         return (-1);
5106                                 }
5107 
5108                                 /*
5109                                  * Destroy the slab now if it was completely
5110                                  * freed while we dropped cache_lock and there
5111                                  * are no pending moves. Since slab_refcnt
5112                                  * cannot change once it reaches zero, no new
5113                                  * pending moves from that slab are possible.
5114                                  */
5115                                 cp->cache_defrag->kmd_deadcount--;
5116                                 cp->cache_slab_destroy++;
5117                                 mutex_exit(&cp->cache_lock);
5118                                 kmem_slab_destroy(cp, sp);
5119                                 mutex_enter(&cp->cache_lock);
5120                                 /*
5121                                  * Since we can't pick up the scan where we left
5122                                  * off, abort the scan and say nothing about the
5123                                  * number of reclaimable slabs.
5124                                  */
5125                                 return (-1);
5126                         }
5127 
5128                         if (!success) {
5129                                 /*
5130                                  * Abort the scan if there is not enough memory
5131                                  * for the request and say nothing about the
5132                                  * number of reclaimable slabs.
5133                                  */
5134                                 return (-1);
5135                         }
5136 
5137                         /*
5138                          * If the slab's position among the partial slabs
5139                          * changed while the lock was dropped, we no longer
5140                          * know where we are in the sequence, so abort.
5141                          */
5142                         if (sp->slab_refcnt != refcnt) {
5143                                 /*
5144                                  * If this is a KMM_DEBUG move, the slab_refcnt
5145                                  * may have changed because we allocated a
5146                                  * destination buffer on the same slab. In that
5147                                  * case, we're not interested in counting it.
5148                                  */
5149                                 return (-1);
5150                         }
5151                         if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)
5152                                 return (-1);
5153 
5154                         /*
5155                          * Generating a move request allocates a destination
5156                          * buffer from the slab layer, bumping the first partial
5157                          * slab if it is completely allocated. If the current
5158                          * slab becomes the first partial slab as a result, we
5159                          * can't continue to scan backwards.
5160                          *
5161                          * If this is a KMM_DEBUG move and we allocated the
5162                          * destination buffer from the last partial slab, then
5163                          * the buffer we're moving is on the same slab and our
5164                          * slab_refcnt has changed, causing us to return before
5165                          * reaching here if there are no partial slabs left.
5166                          */
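                             /*
                              * Concretely (illustrative): destination buffers
                              * are allocated from the first (most nearly full)
                              * partial slab, and completing that slab removes
                              * it from the partial list, so sp can drift toward
                              * the front even though we never move sp
                              * ourselves. Once sp is the first partial slab
                              * there is nothing left to scan backwards to, so
                              * the scan ends.
                              */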
5167                         ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5168                         if (sp == avl_first(&cp->cache_partial_slabs)) {
5169                                 /*
5170                                  * We're not interested in a second KMM_DEBUG
5171                                  * move.
5172                                  */
5173                                 goto end_scan;
5174                         }
5175                 }
5176         }
5177 end_scan:
5178 
5179         return (s);
5180 }
5181 
5182 typedef struct kmem_move_notify_args {
5183         kmem_cache_t *kmna_cache;
5184         void *kmna_buf;
5185 } kmem_move_notify_args_t;
5186 
5187 static void
5188 kmem_cache_move_notify_task(void *arg)
5189 {
5190         kmem_move_notify_args_t *args = arg;
5191         kmem_cache_t *cp = args->kmna_cache;
5192         void *buf = args->kmna_buf;
5193         kmem_slab_t *sp;
5194 
5195         ASSERT(taskq_member(kmem_taskq, curthread));
5196         ASSERT(list_link_active(&cp->cache_link));
5197 
5198         kmem_free(args, sizeof (kmem_move_notify_args_t));
5199         mutex_enter(&cp->cache_lock);
5200         sp = kmem_slab_allocated(cp, NULL, buf);
5201 
5202         /* Ignore the notification if the buffer is no longer allocated. */
5203         if (sp == NULL) {
5204                 mutex_exit(&cp->cache_lock);
5205                 return;
5206         }
5207 
5208         /* Ignore the notification if there's no reason to move the buffer. */
5209         if (avl_numnodes(&cp->cache_partial_slabs) > 1) {
5210                 /*
5211                  * More than one partial slab means moving the buffer could
5212                  * help; still ignore the notification unless the slab was
5213                  * marked by an earlier refusal to move a buffer.
5214                  */
5215                 if (!(sp->slab_flags & KMEM_SLAB_NOMOVE) &&
5216                     (sp->slab_later_count == 0)) {
5217                         mutex_exit(&cp->cache_lock);
5218                         return;
5219                 }
5220 
5221                 kmem_slab_move_yes(cp, sp, buf);
5222                 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5223                 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5224                 mutex_exit(&cp->cache_lock);
5225                 /* see kmem_move_buffers() about dropping the lock */
5226                 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5227                 mutex_enter(&cp->cache_lock);
5228                 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5229                 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5230                 if (sp->slab_refcnt == 0) {
5231                         list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5232                         list_remove(deadlist, sp);
5233 
5234                         if (!avl_is_empty(
5235                             &cp->cache_defrag->kmd_moves_pending)) {
5236                                 list_insert_head(deadlist, sp);
5237                                 mutex_exit(&cp->cache_lock);
5238                                 return;
5239                         }
5240 
5241                         cp->cache_defrag->kmd_deadcount--;
5242                         cp->cache_slab_destroy++;
5243                         mutex_exit(&cp->cache_lock);
5244                         kmem_slab_destroy(cp, sp);
5245                         return;
5246                 }
5247         } else {
5248                 kmem_slab_move_yes(cp, sp, buf);
5249         }
5250         mutex_exit(&cp->cache_lock);
5251 }
5252 
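     /*
      * Called by a client to report that a buffer it earlier declined to move
      * (leaving the slab marked via slab_later_count or KMEM_SLAB_NOMOVE) has
      * since become movable. Hypothetical client sketch (object_cache, op, and
      * o_pinned are illustrative names only):
      *
      *      op->o_pinned = B_FALSE;
      *      kmem_cache_move_notify(object_cache, op);
      *
      * The notification is handled asynchronously on kmem_taskq and is dropped
      * if the bookkeeping allocation or the TQ_NOSLEEP dispatch fails; the
      * notification is best-effort.
      */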
5253 void
5254 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5255 {
5256         kmem_move_notify_args_t *args;
5257 
5258         args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5259         if (args != NULL) {
5260                 args->kmna_cache = cp;
5261                 args->kmna_buf = buf;
5262                 if (taskq_dispatch(kmem_taskq,
5263                     (task_func_t *)kmem_cache_move_notify_task, args,
5264                     TQ_NOSLEEP) == TASKQID_INVALID)
5265                         kmem_free(args, sizeof (kmem_move_notify_args_t));
5266         }
5267 }
5268 
5269 static void
5270 kmem_cache_defrag(kmem_cache_t *cp)
5271 {
5272         size_t n;
5273 
5274         ASSERT(cp->cache_defrag != NULL);
5275 
5276         mutex_enter(&cp->cache_lock);
5277         n = avl_numnodes(&cp->cache_partial_slabs);
5278         if (n > 1) {
5279                 /* kmem_move_buffers() drops and reacquires cache_lock */
5280                 cp->cache_defrag->kmd_defrags++;
5281                 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5282         }
5283         mutex_exit(&cp->cache_lock);
5284 }
5285 
5286 /* Is this cache above the fragmentation threshold? */
5287 static boolean_t
5288 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5289 {
5290         /*
5291          *       nfree          kmem_frag_numer
5292          * ------------------ > ---------------
5293          * cp->cache_buftotal   kmem_frag_denom
5294          */
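             /*
              * Worked example with hypothetical tunable values (see the
              * definitions of kmem_frag_numer and kmem_frag_denom earlier in
              * this file for the actual defaults): if the ratio were 1/8 and
              * cp->cache_buftotal were 1000, the cache would be considered
              * fragmented once nfree exceeded 125, since
              *
              *      nfree * 8 > 1000 * 1  iff  nfree > 125.
              *
              * The cross-multiplied comparison below avoids a truncating
              * integer division of the two ratios.
              */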
5295         return ((nfree * kmem_frag_denom) >
5296             (cp->cache_buftotal * kmem_frag_numer));
5297 }
5298 
5299 static boolean_t
5300 kmem_cache_is_fragmented(kmem_cache_t *cp, boolean_t *doreap)
5301 {
5302         boolean_t fragmented;
5303         uint64_t nfree;
5304 
5305         ASSERT(MUTEX_HELD(&cp->cache_lock));
5306         *doreap = B_FALSE;
5307 
5308         if (kmem_move_fulltilt) {
5309                 if (avl_numnodes(&cp->cache_partial_slabs) > 1) {
5310                         return (B_TRUE);
5311                 }
5312         } else {
5313                 if ((cp->cache_complete_slab_count + avl_numnodes(
5314                     &cp->cache_partial_slabs)) < kmem_frag_minslabs) {
5315                         return (B_FALSE);
5316                 }
5317         }
5318 
5319         nfree = cp->cache_bufslab;
5320         fragmented = ((avl_numnodes(&cp->cache_partial_slabs) > 1) &&
5321             kmem_cache_frag_threshold(cp, nfree));
5322 
5323         /*
5324          * Free buffers in the magazine layer appear allocated from the point of
5325          * view of the slab layer. We want to know if the slab layer would
5326          * appear fragmented if we included free buffers from magazines that
5327          * have fallen out of the working set.
5328          */
5329         if (!fragmented) {
5330                 long reap;
5331 
5332                 mutex_enter(&cp->cache_depot_lock);
5333                 reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
5334                 reap = MIN(reap, cp->cache_full.ml_total);
5335                 mutex_exit(&cp->cache_depot_lock);
5336 
5337                 nfree += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
5338                 if (kmem_cache_frag_threshold(cp, nfree)) {
5339                         *doreap = B_TRUE;
5340                 }
5341         }
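             /*
              * Illustrative numbers (hypothetical): if the depot's full
              * magazine list has ml_total == 10, its recent working-set
              * minimum is ml_min == 4, ml_reaplimit exceeds both, and each
              * magazine holds mt_magsize == 15 rounds, then reap == 4 above
              * and 4 * 15 == 60 free buffers are credited to nfree before
              * the threshold is re-tested, possibly setting *doreap.
              */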
5342 
5343         return (fragmented);
5344 }
5345 
5346 /* Called periodically from kmem_taskq */
5347 static void
5348 kmem_cache_scan(kmem_cache_t *cp)
5349 {
5350         boolean_t reap = B_FALSE;
5351         kmem_defrag_t *kmd;
5352 
5353         ASSERT(taskq_member(kmem_taskq, curthread));
5354 
5355         mutex_enter(&cp->cache_lock);
5356 
5357         kmd = cp->cache_defrag;
5358         if (kmd->kmd_consolidate > 0) {
5359                 kmd->kmd_consolidate--;
5360                 mutex_exit(&cp->cache_lock);
5361                 kmem_cache_reap(cp);
5362                 return;
5363         }
5364 
5365         if (kmem_cache_is_fragmented(cp, &reap)) {
5366                 int slabs_found;
5367 
5368                 /*
5369                  * Consolidate reclaimable slabs from the end of the partial
5370                  * slab list (scan at most kmem_reclaim_scan_range slabs to find
5371                  * reclaimable slabs). Keep track of how many candidate slabs we
5372                  * looked for and how many we actually found so we can adjust
5373                  * the definition of a candidate slab if we're having trouble
5374                  * finding them.
5375                  *
5376                  * kmem_move_buffers() drops and reacquires cache_lock.
5377                  */
5378                 kmd->kmd_scans++;
5379                 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5380                     kmem_reclaim_max_slabs, 0);
5381                 if (slabs_found >= 0) {
5382                         kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5383                         kmd->kmd_slabs_found += slabs_found;
5384                 }
5385 
5386                 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5387                         kmd->kmd_tries = 0;
5388 
5389                         /*
5390                          * If recent scans easily found candidate slabs,
5391                          * tighten the threshold; if they had difficulty,
5392                          * relax it so that candidates are easier to find.
5393                          */
5394                         if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5395                                 kmem_adjust_reclaim_threshold(kmd, -1);
5396                         } else if ((kmd->kmd_slabs_found * 2) <
5397                             kmd->kmd_slabs_sought) {
5398                                 kmem_adjust_reclaim_threshold(kmd, 1);
5399                         }
5400                         kmd->kmd_slabs_sought = 0;
5401                         kmd->kmd_slabs_found = 0;
5402                 }
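                     /*
                      * Illustrative bookkeeping (hypothetical values): with
                      * kmem_reclaim_max_slabs == 4 and kmem_reclaim_scan_range
                      * == 12, each scan that is not aborted adds 4 to
                      * kmd_slabs_sought; after 12 scans (48 sought), finding
                      * only 20 slabs (less than half) triggers
                      * kmem_adjust_reclaim_threshold() with a delta of 1,
                      * making candidate slabs easier to find, after which
                      * both counters are reset.
                      */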
5403         } else {
5404                 kmem_reset_reclaim_threshold(cp->cache_defrag);
5405 #ifdef  DEBUG
5406                 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5407                         /*
5408                          * In a debug kernel we want the consolidator to
5409                          * run occasionally even when there is plenty of
5410                          * memory.
5411                          */
5412                         uint16_t debug_rand;
5413 
5414                         (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5415                         if (!kmem_move_noreap &&
5416                             ((debug_rand % kmem_mtb_reap) == 0)) {
5417                                 mutex_exit(&cp->cache_lock);
5418                                 kmem_cache_reap(cp);
5419                                 return;
5420                         } else if ((debug_rand % kmem_mtb_move) == 0) {
5421                                 kmd->kmd_scans++;
5422                                 (void) kmem_move_buffers(cp,
5423                                     kmem_reclaim_scan_range, 1, KMM_DEBUG);
5424                         }
5425                 }
5426 #endif  /* DEBUG */
5427         }
5428 
5429         mutex_exit(&cp->cache_lock);
5430 
5431         if (reap)
5432                 kmem_depot_ws_reap(cp);
5433 }