3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2015 Joyent, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27 
  28 /*
  29  * Kernel memory allocator, as described in the following two papers and a
  30  * statement about the consolidator:
  31  *
  32  * Jeff Bonwick,
  33  * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
  34  * Proceedings of the Summer 1994 Usenix Conference.
  35  * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
  36  *
  37  * Jeff Bonwick and Jonathan Adams,
  38  * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
  39  * Arbitrary Resources.
  40  * Proceedings of the 2001 Usenix Conference.
  41  * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
  42  *
  43  * kmem Slab Consolidator Big Theory Statement:
 
 
 143  *            (The system won't be getting the slab back as long as the
 144  *            immovable object holds it hostage, so there's no point in moving
 145  *            any of its objects.)
 146  *     LATER: The client is using the object and cannot move it now, so kmem
 147  *            frees the new object (the unused copy destination). kmem still
 148  *            attempts to move other objects off the slab, since it expects to
 149  *            succeed in clearing the slab in a later callback. The client
 150  *            should use LATER instead of NO if the object is likely to become
 151  *            movable very soon.
 152  * DONT_NEED: The client no longer needs the object, so kmem frees the old along
 153  *            with the new object (the unused copy destination). This response
 154  *            is the client's opportunity to be a model citizen and give back as
 155  *            much as it can.
 156  * DONT_KNOW: The client does not know about the object because
 157  *            a) the client has just allocated the object and not yet put it
 158  *               wherever it expects to find known objects
 159  *            b) the client has removed the object from wherever it expects to
 160  *               find known objects and is about to free it, or
 161  *            c) the client has freed the object.
 162  *            In all these cases (a, b, and c) kmem frees the new object (the
 163  *            unused copy destination).  In the first case, the object is in
 164  *            use and the correct action is that for LATER; in the latter two
 165  *            cases, we know that the object is either freed or about to be
 166  *            freed, in which case it is either already in a magazine or about
 167  *            to be in one.  In these cases, we know that the object will either
 168  *            be reallocated and reused, or it will end up in a full magazine
 169  *            that will be reaped (thereby liberating the slab).  Because it
 170  *            is prohibitively expensive to differentiate these cases, and
 171  *            because the defrag code is executed when we're low on memory
 172  *            (thereby biasing the system to reclaim full magazines) we treat
 173  *            all DONT_KNOW cases as LATER and rely on cache reaping to
 174  *            generally clean up full magazines.  While we take the same action
 175  *            for these cases, we maintain their semantic distinction:  if
 176  *            defragmentation is not occurring, it is useful to know if this
 177  *            is due to objects in use (LATER) or objects in an unknown state
 178  *            of transition (DONT_KNOW).
 179  *
 180  * 2.3 Object States
 181  *
 182  * Neither kmem nor the client can be assumed to know the object's whereabouts
 183  * at the time of the callback. An object belonging to a kmem cache may be in
 184  * any of the following states:
 185  *
 186  * 1. Uninitialized on the slab
 187  * 2. Allocated from the slab but not constructed (still uninitialized)
 188  * 3. Allocated from the slab, constructed, but not yet ready for business
 189  *    (not in a valid state for the move callback)
 190  * 4. In use (valid and known to the client)
 191  * 5. About to be freed (no longer in a valid state for the move callback)
 192  * 6. Freed to a magazine (still constructed)
 193  * 7. Allocated from a magazine, not yet ready for business (not in a valid
 194  *    state for the move callback), and about to return to state #4
 195  * 8. Deconstructed on a magazine that is about to be freed
 196  * 9. Freed to the slab
 197  *
 198  * Since the move callback may be called at any time while the object is in any
 
 
 281  * c_objects_lock is held. Note that after acquiring the lock, the client must
 282  * recheck the o_container pointer in case the object was removed just before
 283  * acquiring the lock.
 284  *
 285  * When the client is about to free an object, it must first remove that object
 286  * from the list, hash, or other structure where it is kept. At that time, to
 287  * mark the object so it can be distinguished from the remaining, known objects,
 288  * the client sets the designated low order bit:
 289  *
 290  *      mutex_enter(&container->c_objects_lock);
 291  *      object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
 292  *      list_remove(&container->c_objects, object);
 293  *      mutex_exit(&container->c_objects_lock);
 294  *
 295  * In the common case, the object is freed to the magazine layer, where it may
 296  * be reused on a subsequent allocation without the overhead of calling the
 297  * constructor. While in the magazine it appears allocated from the point of
 298  * view of the slab layer, making it a candidate for the move callback. Most
 299  * objects unrecognized by the client in the move callback fall into this
 300  * category and are cheaply distinguished from known objects by the test
 301  * described earlier. Because searching magazines is prohibitively expensive
 302  * for kmem, clients that do not mark freed objects (and therefore return
 303  * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
 304  * efficacy reduced.
 305  *
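      * As an illustration only (a hedged sketch reusing the hypothetical
      * object/container types from the snippets above), the recognition test
      * in a client's move callback might look like this: check the designated
      * low order bit without any lock, then take c_objects_lock and recheck
      * o_container, since the object may have been removed in the meantime.
      * (The sketch assumes the container itself cannot disappear; a real
      * client must also guarantee that.  membar_consumer() pairs with the
      * membar_producer() shown in the insertion example below.)
      *
      *      object_t *op = buf;     /* the buffer kmem asked us to move */
      *      container_t *container = op->o_container;
      *
      *      if (container == NULL || ((uintptr_t)container & 0x1))
      *              return (KMEM_CBRC_DONT_KNOW);   /* freed or in flux */
      *      membar_consumer();
      *      mutex_enter(&container->c_objects_lock);
      *      if (op->o_container != container) {
      *              mutex_exit(&container->c_objects_lock);
      *              return (KMEM_CBRC_DONT_KNOW);   /* removed meanwhile */
      *      }
      *      ... copy the object to the new buffer, fix up references ...
      *      mutex_exit(&container->c_objects_lock);
      *      return (KMEM_CBRC_YES);
      *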
 306  * Invalidating the designated pointer member before freeing the object marks
 307  * the object to be avoided in the callback, and conversely, assigning a valid
 308  * value to the designated pointer member after allocating the object makes the
 309  * object fair game for the callback:
 310  *
 311  *      ... allocate object ...
 312  *      ... set any initial state not set by the constructor ...
 313  *
 314  *      mutex_enter(&container->c_objects_lock);
 315  *      list_insert_tail(&container->c_objects, object);
 316  *      membar_producer();
 317  *      object->o_container = container;
 318  *      mutex_exit(&container->c_objects_lock);
 319  *
 320  * Note that everything else must be valid before setting o_container makes the
 321  * object fair game for the move callback. The membar_producer() call ensures
 322  * that all the object's state is written to memory before setting the pointer
 323  * that transitions the object from state #3 or #7 (allocated, constructed, not
 324  * yet in use) to state #4 (in use, valid). That's important because the move
 
 
 994         { 95,   64,     0,      512     },
 995         { 143,  64,     0,      0       },
 996 };
 997 
 998 static uint32_t kmem_reaping;
 999 static uint32_t kmem_reaping_idspace;
1000 
1001 /*
1002  * kmem tunables
1003  */
1004 clock_t kmem_reap_interval;     /* cache reaping rate [15 * HZ ticks] */
1005 int kmem_depot_contention = 3;  /* max failed tryenters per real interval */
1006 pgcnt_t kmem_reapahead = 0;     /* start reaping N pages before pageout */
1007 int kmem_panic = 1;             /* whether to panic on error */
1008 int kmem_logging = 1;           /* kmem_log_enter() override */
1009 uint32_t kmem_mtbf = 0;         /* mean time between failures [default: off] */
1010 size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
1011 size_t kmem_content_log_size;   /* content log size [2% of memory] */
1012 size_t kmem_failure_log_size;   /* failure log [4 pages per CPU] */
1013 size_t kmem_slab_log_size;      /* slab create log [4 pages per CPU] */
1014 size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */
1015 size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
1016 size_t kmem_lite_minsize = 0;   /* minimum buffer size for KMF_LITE */
1017 size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
1018 int kmem_lite_pcs = 4;          /* number of PCs to store in KMF_LITE mode */
1019 size_t kmem_maxverify;          /* maximum bytes to inspect in debug routines */
1020 size_t kmem_minfirewall;        /* hardware-enforced redzone threshold */
1021 
1022 #ifdef DEBUG
1023 int kmem_warn_zerosized = 1;    /* whether to warn on zero-sized KM_SLEEP */
1024 #else
1025 int kmem_warn_zerosized = 0;    /* whether to warn on zero-sized KM_SLEEP */
1026 #endif
1027 
1028 int kmem_panic_zerosized = 0;   /* whether to panic on zero-sized KM_SLEEP */
1029 
1030 #ifdef _LP64
1031 size_t  kmem_max_cached = KMEM_BIG_MAXBUF;      /* maximum kmem_alloc cache */
1032 #else
1033 size_t  kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */
1034 #endif
1035 
1036 #ifdef DEBUG
1037 int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
1038 #else
1039 int kmem_flags = 0;
1040 #endif
1041 int kmem_ready;
1042 
1043 static kmem_cache_t     *kmem_slab_cache;
1044 static kmem_cache_t     *kmem_bufctl_cache;
1045 static kmem_cache_t     *kmem_bufctl_audit_cache;
1046 
1047 static kmutex_t         kmem_cache_lock;        /* inter-cache linkage only */
1048 static list_t           kmem_caches;
1049 
1050 static taskq_t          *kmem_taskq;
1051 static kmutex_t         kmem_flags_lock;
1052 static vmem_t           *kmem_metadata_arena;
1053 static vmem_t           *kmem_msb_arena;        /* arena for metadata caches */
1054 static vmem_t           *kmem_cache_arena;
1055 static vmem_t           *kmem_hash_arena;
1056 static vmem_t           *kmem_log_arena;
1057 static vmem_t           *kmem_oversize_arena;
1058 static vmem_t           *kmem_va_arena;
1059 static vmem_t           *kmem_default_arena;
1060 static vmem_t           *kmem_firewall_va_arena;
1061 static vmem_t           *kmem_firewall_arena;
1062 
1063 static int              kmem_zerosized;         /* # of zero-sized allocs */
1064 
1065 /*
1066  * kmem slab consolidator thresholds (tunables)
1067  */
1068 size_t kmem_frag_minslabs = 101;        /* minimum total slabs */
1069 size_t kmem_frag_numer = 1;             /* free buffers (numerator) */
1070 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1071 /*
1072  * Maximum number of slabs from which to move buffers during a single
1073  * maintenance interval while the system is not low on memory.
1074  */
1075 size_t kmem_reclaim_max_slabs = 1;
1076 /*
1077  * Number of slabs to scan backwards from the end of the partial slab list
1078  * when searching for buffers to relocate.
1079  */
1080 size_t kmem_reclaim_scan_range = 12;
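
/*
 * Like the other kmem tunables above, these thresholds are ordinary kernel
 * variables and can be overridden at boot from /etc/system, for example
 * (illustrative values only):
 *
 *      set kmem_reclaim_max_slabs = 4
 *      set kmem_reclaim_scan_range = 24
 */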
1081 
1082 /* consolidator knobs */
1083 static boolean_t kmem_move_noreap;
1084 static boolean_t kmem_move_blocked;
1085 static boolean_t kmem_move_fulltilt;
1086 static boolean_t kmem_move_any_partial;
1087 
1088 #ifdef  DEBUG
1089 /*
1090  * kmem consolidator debug tunables:
1091  * Ensure code coverage by occasionally running the consolidator even when the
1092  * caches are not fragmented (they may never be). These intervals are mean time
1093  * in cache maintenance intervals (kmem_cache_update).
1094  */
1095 uint32_t kmem_mtb_move = 60;    /* defrag 1 slab (~15min) */
1096 uint32_t kmem_mtb_reap = 1800;  /* defrag all slabs (~7.5hrs) */
1097 #endif  /* DEBUG */
1098 
1099 static kmem_cache_t     *kmem_defrag_cache;
1100 static kmem_cache_t     *kmem_move_cache;
1101 static taskq_t          *kmem_move_taskq;
1102 
1103 static void kmem_cache_scan(kmem_cache_t *);
1104 static void kmem_cache_defrag(kmem_cache_t *);
1105 static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
1106 
1107 
1108 kmem_log_header_t       *kmem_transaction_log;
1109 kmem_log_header_t       *kmem_content_log;
1110 kmem_log_header_t       *kmem_failure_log;
1111 kmem_log_header_t       *kmem_slab_log;
1112 kmem_log_header_t       *kmem_zerosized_log;
1113 
1114 static int              kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
1115 
1116 #define KMEM_BUFTAG_LITE_ENTER(bt, count, caller)                       \
1117         if ((count) > 0) {                                           \
1118                 pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \
1119                 pc_t *_e;                                               \
1120                 /* memmove() the old entries down one notch */          \
1121                 for (_e = &_s[(count) - 1]; _e > _s; _e--)               \
1122                         *_e = *(_e - 1);                                \
1123                 *_s = (uintptr_t)(caller);                              \
1124         }
1125 
1126 #define KMERR_MODIFIED  0       /* buffer modified while on freelist */
1127 #define KMERR_REDZONE   1       /* redzone violation (write past end of buf) */
1128 #define KMERR_DUPFREE   2       /* freed a buffer twice */
1129 #define KMERR_BADADDR   3       /* freed a bad (unallocated) address */
1130 #define KMERR_BADBUFTAG 4       /* buftag corrupted */
1131 #define KMERR_BADBUFCTL 5       /* bufctl corrupted */
1132 #define KMERR_BADCACHE  6       /* freed a buffer to the wrong cache */
 
1873                          */
1874                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1875                                 list_insert_tail(deadlist, sp);
1876                         } else {
1877                                 list_insert_head(deadlist, sp);
1878                         }
1879                         cp->cache_defrag->kmd_deadcount++;
1880                         mutex_exit(&cp->cache_lock);
1881                 }
1882                 return;
1883         }
1884 
1885         if (bcp->bc_next == NULL) {
1886                 /* Transition the slab from completely allocated to partial. */
1887                 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1888                 ASSERT(sp->slab_chunks > 1);
1889                 list_remove(&cp->cache_complete_slabs, sp);
1890                 cp->cache_complete_slab_count--;
1891                 avl_add(&cp->cache_partial_slabs, sp);
1892         } else {
1893                 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1894         }
1895 
1896         ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1897             (cp->cache_complete_slab_count +
1898             avl_numnodes(&cp->cache_partial_slabs) +
1899             (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1900         mutex_exit(&cp->cache_lock);
1901 }
1902 
1903 /*
1904  * Return -1 if kmem_error, 1 if constructor fails, 0 if successful.
1905  */
1906 static int
1907 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1908     caddr_t caller)
1909 {
1910         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1911         kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1912         uint32_t mtbf;
1913 
 
2907         return (buf);
2908 }
2909 
2910 void *
2911 kmem_alloc(size_t size, int kmflag)
2912 {
2913         size_t index;
2914         kmem_cache_t *cp;
2915         void *buf;
2916 
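        /*
         * Small requests map onto one of the fixed-size kmem_alloc caches:
         * kmem_alloc_table is indexed by (size - 1) >> KMEM_ALIGN_SHIFT and
         * kmem_big_alloc_table by (size - 1) >> KMEM_BIG_SHIFT.  Requests
         * larger than the biggest cached size, along with the special
         * zero-sized case, are handled by the else clause below via the
         * oversize arena.
         */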
2917         if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2918                 cp = kmem_alloc_table[index];
2919                 /* fall through to kmem_cache_alloc() */
2920 
2921         } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2922             kmem_big_alloc_table_max) {
2923                 cp = kmem_big_alloc_table[index];
2924                 /* fall through to kmem_cache_alloc() */
2925 
2926         } else {
2927                 if (size == 0) {
2928                         if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
2929                                 return (NULL);
2930 
2931                         /*
2932                          * If this is a sleeping allocation or one that has
2933                          * been specified to panic on allocation failure, we
2934                          * consider it to be deprecated behavior to allocate
2935                          * 0 bytes.  If we have been configured to panic under
2936                          * this condition, we panic; if to warn, we warn -- and
2937                          * regardless, we log to the kmem_zerosized_log that
2938                          * this condition has occurred (which gives us
2939                          * enough information to be able to debug it).
2940                          */
2941                         if (kmem_panic && kmem_panic_zerosized)
2942                                 panic("attempted to kmem_alloc() size of 0");
2943 
2944                         if (kmem_warn_zerosized) {
2945                                 cmn_err(CE_WARN, "kmem_alloc(): sleeping "
2946                                     "allocation with size of 0; "
2947                                     "see kmem_zerosized_log for details");
2948                         }
2949 
2950                         kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
2951 
2952                         return (NULL);
2953                 }
2954 
2955                 buf = vmem_alloc(kmem_oversize_arena, size,
2956                     kmflag & KM_VMFLAGS);
2957                 if (buf == NULL)
2958                         kmem_log_event(kmem_failure_log, NULL, NULL,
2959                             (void *)size);
2960                 else if (KMEM_DUMP(kmem_slab_cache)) {
2961                         /* stats for dump intercept */
2962                         kmem_dump_oversize_allocs++;
2963                         if (size > kmem_dump_oversize_max)
2964                                 kmem_dump_oversize_max = size;
2965                 }
2966                 return (buf);
2967         }
2968 
2969         buf = kmem_cache_alloc(cp, kmflag);
2970         if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) {
2971                 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2972                 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2973                 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2974 
 
3547                 kmcp->kmc_move_yes.value.ui64           = 0;
3548                 kmcp->kmc_move_no.value.ui64            = 0;
3549                 kmcp->kmc_move_later.value.ui64         = 0;
3550                 kmcp->kmc_move_dont_need.value.ui64     = 0;
3551                 kmcp->kmc_move_dont_know.value.ui64     = 0;
3552                 kmcp->kmc_move_hunt_found.value.ui64    = 0;
3553                 kmcp->kmc_move_slabs_freed.value.ui64   = 0;
3554                 kmcp->kmc_defrag.value.ui64             = 0;
3555                 kmcp->kmc_scan.value.ui64               = 0;
3556                 kmcp->kmc_move_reclaimable.value.ui64   = 0;
3557         } else {
3558                 int64_t reclaimable;
3559 
3560                 kmem_defrag_t *kd = cp->cache_defrag;
3561                 kmcp->kmc_move_callbacks.value.ui64     = kd->kmd_callbacks;
3562                 kmcp->kmc_move_yes.value.ui64           = kd->kmd_yes;
3563                 kmcp->kmc_move_no.value.ui64            = kd->kmd_no;
3564                 kmcp->kmc_move_later.value.ui64         = kd->kmd_later;
3565                 kmcp->kmc_move_dont_need.value.ui64     = kd->kmd_dont_need;
3566                 kmcp->kmc_move_dont_know.value.ui64     = kd->kmd_dont_know;
3567                 kmcp->kmc_move_hunt_found.value.ui64    = 0;
3568                 kmcp->kmc_move_slabs_freed.value.ui64   = kd->kmd_slabs_freed;
3569                 kmcp->kmc_defrag.value.ui64             = kd->kmd_defrags;
3570                 kmcp->kmc_scan.value.ui64               = kd->kmd_scans;
3571 
3572                 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3573                 reclaimable = MAX(reclaimable, 0);
3574                 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3575                 kmcp->kmc_move_reclaimable.value.ui64   = reclaimable;
3576         }
3577 
3578         mutex_exit(&cp->cache_lock);
3579         return (0);
3580 }
3581 
3582 /*
3583  * Return a named statistic about a particular cache.
3584  * This shouldn't be called very often, so it's currently designed for
3585  * simplicity (leverages existing kstat support) rather than efficiency.
3586  */
3587 uint64_t
 
4457                     segkmem_alloc, segkmem_free, kmem_minfirewall < ULONG_MAX?
4458                     kmem_firewall_va_arena : heap_arena, 0, VMC_DUMPSAFE |
4459                     VM_SLEEP);
4460         }
4461 
4462         kmem_cache_init(2, use_large_pages);
4463 
4464         if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) {
4465                 if (kmem_transaction_log_size == 0)
4466                         kmem_transaction_log_size = kmem_maxavail() / 50;
4467                 kmem_transaction_log = kmem_log_init(kmem_transaction_log_size);
4468         }
4469 
4470         if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) {
4471                 if (kmem_content_log_size == 0)
4472                         kmem_content_log_size = kmem_maxavail() / 50;
4473                 kmem_content_log = kmem_log_init(kmem_content_log_size);
4474         }
4475 
4476         kmem_failure_log = kmem_log_init(kmem_failure_log_size);
4477         kmem_slab_log = kmem_log_init(kmem_slab_log_size);
4478         kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
4479 
4480         /*
4481          * Initialize STREAMS message caches so allocb() is available.
4482          * This allows us to initialize the logging framework (cmn_err(9F),
4483          * strlog(9F), etc) so we can start recording messages.
4484          */
4485         streams_msg_init();
4486 
4487         /*
4488          * Initialize the ZSD framework in Zones so modules loaded henceforth
4489          * can register their callbacks.
4490          */
4491         zone_zsd_init();
4492 
4493         log_init();
4494         taskq_init();
4495 
4496         /*
4497          * Warn about invalid or dangerous values of kmem_flags.
4498          * Always warn about unsupported values.
 
4646                 return (B_FALSE);
4647         }
4648 
4649         if ((refcnt == 1) || kmem_move_any_partial) {
4650                 return (refcnt < sp->slab_chunks);
4651         }
4652 
4653         /*
4654          * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4655          * slabs with a progressively higher percentage of used buffers can be
4656          * reclaimed until the cache as a whole is no longer fragmented.
4657          *
4658          *      sp->slab_refcnt   kmd_reclaim_numer
4659          *      --------------- < ------------------
4660          *      sp->slab_chunks   KMEM_VOID_FRACTION
4661          */
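        /*
         * For example, while kmd_reclaim_numer is 1, only slabs that are less
         * than 1/KMEM_VOID_FRACTION allocated satisfy the test below; raising
         * the numerator admits progressively fuller slabs.
         */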
4662         return ((refcnt * KMEM_VOID_FRACTION) <
4663             (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4664 }
4665 
4666 /*
4667  * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4668  * or when the buffer is freed.
4669  */
4670 static void
4671 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4672 {
4673         ASSERT(MUTEX_HELD(&cp->cache_lock));
4674         ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4675 
4676         if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4677                 return;
4678         }
4679 
4680         if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4681                 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4682                         avl_remove(&cp->cache_partial_slabs, sp);
4683                         sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4684                         sp->slab_stuck_offset = (uint32_t)-1;
4685                         avl_add(&cp->cache_partial_slabs, sp);
4686                 }
 
4709 }
4710 
4711 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4712 
4713 /*
4714  * The move callback takes two buffer addresses, the buffer to be moved, and a
4715  * newly allocated and constructed buffer selected by kmem as the destination.
4716  * It also takes the size of the buffer and an optional user argument specified
4717  * at cache creation time. kmem guarantees that the buffer to be moved has not
4718  * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4719  * guarantee the present whereabouts of the buffer to be moved, so it is up to
4720  * the client to safely determine whether or not it is still using the buffer.
4721  * The client must not free either of the buffers passed to the move callback,
4722  * since kmem wants to free them directly to the slab layer. The client response
4723  * tells kmem which of the two buffers to free:
4724  *
4725  * YES          kmem frees the old buffer (the move was successful)
4726  * NO           kmem frees the new buffer, marks the slab of the old buffer
4727  *              non-reclaimable to avoid bothering the client again
4728  * LATER        kmem frees the new buffer, increments slab_later_count
4729  * DONT_KNOW    kmem frees the new buffer
4730  * DONT_NEED    kmem frees both the old buffer and the new buffer
4731  *
4732  * The pending callback argument now being processed contains both of the
4733  * buffers (old and new) passed to the move callback function, the slab of the
4734  * old buffer, and flags related to the move request, such as whether or not the
4735  * system was desperate for memory.
4736  *
4737  * Slabs are not freed while there is a pending callback, but instead are kept
4738  * on a deadlist, which is drained after the last callback completes. This means
4739  * that slabs are safe to access until kmem_move_end(), no matter how many of
4740  * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4741  * zero for as long as the slab remains on the deadlist and until the slab is
4742  * freed.
4743  */
4744 static void
4745 kmem_move_buffer(kmem_move_t *callback)
4746 {
4747         kmem_cbrc_t response;
4748         kmem_slab_t *sp = callback->kmm_from_slab;
4749         kmem_cache_t *cp = sp->slab_cache;
4750         boolean_t free_on_slab;
4751 
4752         ASSERT(taskq_member(kmem_move_taskq, curthread));
4753         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4754         ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4755 
4756         /*
4757          * The number of allocated buffers on the slab may have changed since we
4758          * last checked the slab's reclaimability (when the pending move was
4759          * enqueued), or the client may have responded NO when asked to move
4760          * another buffer on the same slab.
4761          */
4762         if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4763                 kmem_slab_free(cp, callback->kmm_to_buf);
4764                 kmem_move_end(cp, callback);
4765                 return;
4766         }
4767 
4768         /*
4769          * Checking the slab layer is easy, so we might as well do that here
4770          * in case we can avoid bothering the client.
4771          */
4772         mutex_enter(&cp->cache_lock);
4773         free_on_slab = (kmem_slab_allocated(cp, sp,
4774             callback->kmm_from_buf) == NULL);
4775         mutex_exit(&cp->cache_lock);
4776 
4777         if (free_on_slab) {
4778                 kmem_slab_free(cp, callback->kmm_to_buf);
4779                 kmem_move_end(cp, callback);
4780                 return;
4781         }
4782 
4783         if (cp->cache_flags & KMF_BUFTAG) {
4784                 /*
4785                  * Make kmem_cache_alloc_debug() apply the constructor for us.
4786                  */
4787                 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4788                     KM_NOSLEEP, 1, caller()) != 0) {
4789                         kmem_move_end(cp, callback);
4790                         return;
4791                 }
4792         } else if (cp->cache_constructor != NULL &&
4793             cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4794             KM_NOSLEEP) != 0) {
4795                 atomic_inc_64(&cp->cache_alloc_fail);
4796                 kmem_slab_free(cp, callback->kmm_to_buf);
4797                 kmem_move_end(cp, callback);
4798                 return;
4799         }
4800 
4801         cp->cache_defrag->kmd_callbacks++;
4802         cp->cache_defrag->kmd_thread = curthread;
4803         cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4804         cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4805         DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4806             callback);
4807 
4808         response = cp->cache_move(callback->kmm_from_buf,
4809             callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4810 
4811         DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4812             callback, kmem_cbrc_t, response);
4813         cp->cache_defrag->kmd_thread = NULL;
4814         cp->cache_defrag->kmd_from_buf = NULL;
4815         cp->cache_defrag->kmd_to_buf = NULL;
4816 
4817         if (response == KMEM_CBRC_YES) {
4818                 cp->cache_defrag->kmd_yes++;
4819                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4820                 /* slab safe to access until kmem_move_end() */
4821                 if (sp->slab_refcnt == 0)
4822                         cp->cache_defrag->kmd_slabs_freed++;
4823                 mutex_enter(&cp->cache_lock);
4824                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4825                 mutex_exit(&cp->cache_lock);
4826                 kmem_move_end(cp, callback);
4827                 return;
4828         }
4829 
4830         switch (response) {
4831         case KMEM_CBRC_NO:
4832                 cp->cache_defrag->kmd_no++;
4833                 mutex_enter(&cp->cache_lock);
4834                 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4835                 mutex_exit(&cp->cache_lock);
4836                 break;
4837         case KMEM_CBRC_LATER:
4838                 cp->cache_defrag->kmd_later++;
4839                 mutex_enter(&cp->cache_lock);
4840                 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4841                         mutex_exit(&cp->cache_lock);
4842                         break;
4843                 }
4844 
4845                 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4846                         kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4847                 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4848                         sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4849                             callback->kmm_from_buf);
4850                 }
4851                 mutex_exit(&cp->cache_lock);
4852                 break;
4853         case KMEM_CBRC_DONT_NEED:
4854                 cp->cache_defrag->kmd_dont_need++;
4855                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4856                 if (sp->slab_refcnt == 0)
4857                         cp->cache_defrag->kmd_slabs_freed++;
4858                 mutex_enter(&cp->cache_lock);
4859                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4860                 mutex_exit(&cp->cache_lock);
4861                 break;
4862         case KMEM_CBRC_DONT_KNOW:
4863                 /*
4864                  * If we don't know if we can move this buffer or not, we'll
4865                  * just assume that we can't:  if the buffer is in fact free,
4866                  * then it is sitting in one of the per-CPU magazines or in
4867                  * a full magazine in the depot layer.  Either way, because
4868                  * defrag is induced in the same logic that reaps a cache,
4869                  * it's likely that full magazines will be returned to the
4870                  * system soon (thereby accomplishing what we're trying to
4871                  * accomplish here: return those magazines to their slabs).
4872                  * Given this, any work that we might do now to locate a buffer
4873                  * in a magazine is wasted (and expensive!) work; we bump
4874                  * a counter in this case and otherwise assume that we can't
4875                  * move it.
4876                  */
4877                 cp->cache_defrag->kmd_dont_know++;
4878                 break;
4879         default:
4880                 panic("'%s' (%p) unexpected move callback response %d\n",
4881                     cp->cache_name, (void *)cp, response);
4882         }
4883 
4884         kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
4885         kmem_move_end(cp, callback);
4886 }
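
/*
 * Illustrative only: a hedged sketch of the client side of the contract
 * described in the block comment above kmem_move_buffer().  The foo_* names
 * are hypothetical; kmem_cache_create() and kmem_cache_set_move() are the
 * actual interfaces.
 *
 *      static kmem_cbrc_t
 *      foo_move(void *old, void *new, size_t size, void *private)
 *      {
 *              if (!foo_is_known(old))         /* freed or in transition */
 *                      return (KMEM_CBRC_DONT_KNOW);
 *              if (foo_is_pinned(old))         /* in use; retry later */
 *                      return (KMEM_CBRC_LATER);
 *              bcopy(old, new, size);
 *              foo_switch_references(old, new);
 *              return (KMEM_CBRC_YES);         /* kmem frees 'old' */
 *      }
 *
 *      foo_cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
 *          foo_constructor, foo_destructor, NULL, NULL, NULL, 0);
 *      kmem_cache_set_move(foo_cache, foo_move);
 */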
4887 
4888 /* Return B_FALSE if there is insufficient memory for the move request. */
4889 static boolean_t
4890 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
4891 {
4892         void *to_buf;
4893         avl_index_t index;
4894         kmem_move_t *callback, *pending;
4895         ulong_t n;
4896 
4897         ASSERT(taskq_member(kmem_taskq, curthread));
4898         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4899         ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
4900 
4901         callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
4902 
4903         if (callback == NULL)
4904                 return (B_FALSE);
4905 
4906         callback->kmm_from_slab = sp;
4907         callback->kmm_from_buf = buf;
4908         callback->kmm_flags = flags;
4909 
4910         mutex_enter(&cp->cache_lock);
4911 
4912         n = avl_numnodes(&cp->cache_partial_slabs);
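        /*
         * With no partial slabs there is nothing to consolidate, and with
         * only one, the destination buffer would come from the same slab we
         * are trying to empty, so the move would accomplish nothing.
         * KMM_DEBUG moves proceed anyway to exercise the callback path.
         */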
4913         if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
4914                 mutex_exit(&cp->cache_lock);
4915                 kmem_cache_free(kmem_move_cache, callback);
4916                 return (B_TRUE); /* there is no need for the move request */
4917         }
4918 
4919         pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
4920         if (pending != NULL) {
4921                 /*
4922                  * If the move is already pending and we're desperate now,
4923                  * update the move flags.
4924                  */
4925                 if (flags & KMM_DESPERATE) {
4926                         pending->kmm_flags |= KMM_DESPERATE;
4927                 }
4928                 mutex_exit(&cp->cache_lock);
4929                 kmem_cache_free(kmem_move_cache, callback);
4930                 return (B_TRUE);
4931         }
4932 
4933         to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
4934             B_FALSE);
4935         callback->kmm_to_buf = to_buf;
4936         avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
4937 
4938         mutex_exit(&cp->cache_lock);
4939 
4940         if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
4941             callback, TQ_NOSLEEP)) {
4942                 mutex_enter(&cp->cache_lock);
4943                 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4944                 mutex_exit(&cp->cache_lock);
4945                 kmem_slab_free(cp, to_buf);
4946                 kmem_cache_free(kmem_move_cache, callback);
4947                 return (B_FALSE);
4948         }
4949 
4950         return (B_TRUE);
4951 }
4952 
4953 static void
4954 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
4955 {
4956         avl_index_t index;
4957 
4958         ASSERT(cp->cache_defrag != NULL);
4959         ASSERT(taskq_member(kmem_move_taskq, curthread));
4960         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4961 
 
4967                 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
4968                 kmem_slab_t *sp;
4969 
4970                 /*
4971                  * The last pending move completed. Release all slabs from the
4972                  * front of the dead list except for any slab at the tail that
4973                  * needs to be released from the context of kmem_move_buffers().
4974                  * kmem deferred unmapping the buffers on these slabs in order
4975                  * to guarantee that buffers passed to the move callback have
4976                  * been touched only by kmem or by the client itself.
4977                  */
4978                 while ((sp = list_remove_head(deadlist)) != NULL) {
4979                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
4980                                 list_insert_tail(deadlist, sp);
4981                                 break;
4982                         }
4983                         cp->cache_defrag->kmd_deadcount--;
4984                         cp->cache_slab_destroy++;
4985                         mutex_exit(&cp->cache_lock);
4986                         kmem_slab_destroy(cp, sp);
4987                         mutex_enter(&cp->cache_lock);
4988                 }
4989         }
4990         mutex_exit(&cp->cache_lock);
4991         kmem_cache_free(kmem_move_cache, callback);
4992 }
4993 
4994 /*
4995  * Move buffers from least used slabs first by scanning backwards from the end
4996  * of the partial slab list. Scan at most max_scan candidate slabs and move
4997  * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
4998  * If desperate to reclaim memory, move buffers from any partial slab, otherwise
4999  * skip slabs with a ratio of allocated buffers at or above the current
5000  * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
5001  * scan is aborted) so that the caller can adjust the reclaimability threshold
5002  * depending on how many reclaimable slabs it finds.
5003  *
5004  * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
5005  * move request, since it is not valid for kmem_move_begin() to call
5006  * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
 
5111                                 list_t *deadlist =
5112                                     &cp->cache_defrag->kmd_deadlist;
5113                                 list_remove(deadlist, sp);
5114 
5115                                 if (!avl_is_empty(
5116                                     &cp->cache_defrag->kmd_moves_pending)) {
5117                                         /*
5118                                          * A pending move makes it unsafe to
5119                                          * destroy the slab, because even though
5120                                          * the move is no longer needed, the
5121                                          * context where that is determined
5122                                          * requires the slab to exist.
5123                                          * Fortunately, a pending move also
5124                                          * means we don't need to destroy the
5125                                          * slab here, since it will get
5126                                          * destroyed along with any other slabs
5127                                          * on the deadlist after the last
5128                                          * pending move completes.
5129                                          */
5130                                         list_insert_head(deadlist, sp);
5131                                         return (-1);
5132                                 }
5133 
5134                                 /*
5135                                  * Destroy the slab now if it was completely
5136                                  * freed while we dropped cache_lock and there
5137                                  * are no pending moves. Since slab_refcnt
5138                                  * cannot change once it reaches zero, no new
5139                                  * pending moves from that slab are possible.
5140                                  */
5141                                 cp->cache_defrag->kmd_deadcount--;
5142                                 cp->cache_slab_destroy++;
5143                                 mutex_exit(&cp->cache_lock);
5144                                 kmem_slab_destroy(cp, sp);
5145                                 mutex_enter(&cp->cache_lock);
5146                                 /*
5147                                  * Since we can't pick up the scan where we left
5148                                  * off, abort the scan and say nothing about the
5149                                  * number of reclaimable slabs.
5150                                  */
5151                                 return (-1);
5152                         }
5153 
5154                         if (!success) {
5155                                 /*
5156                                  * Abort the scan if there is not enough memory
5157                                  * for the request and say nothing about the
5158                                  * number of reclaimable slabs.
5159                                  */
5160                                 return (-1);
5161                         }
5162 
5163                         /*
5164                          * The slab's position changed while the lock was
5165                          * dropped, so we don't know where we are in the
5166                          * sequence any more.
5167                          */
5168                         if (sp->slab_refcnt != refcnt) {
5169                                 /*
5170                                  * If this is a KMM_DEBUG move, the slab_refcnt
5171                                  * may have changed because we allocated a
5172                                  * destination buffer on the same slab. In that
5173                                  * case, we're not interested in counting it.
5174                                  */
5175                                 return (-1);
5176                         }
5177                         if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)
5178                                 return (-1);
5179 
5180                         /*
5181                          * Generating a move request allocates a destination
5182                          * buffer from the slab layer, bumping the first partial
5183                          * slab if it is completely allocated. If the current
5184                          * slab becomes the first partial slab as a result, we
5185                          * can't continue to scan backwards.
5186                          *
5187                          * If this is a KMM_DEBUG move and we allocated the
5188                          * destination buffer from the last partial slab, then
5189                          * the buffer we're moving is on the same slab and our
5190                          * slab_refcnt has changed, causing us to return before
5191                          * reaching here if there are no partial slabs left.
5192                          */
5193                         ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5194                         if (sp == avl_first(&cp->cache_partial_slabs)) {
5195                                 /*
5196                                  * We're not interested in a second KMM_DEBUG
5197                                  * move.
5198                                  */
5199                                 goto end_scan;
5200                         }
5201                 }
5202         }
5203 end_scan:
5204 
5205         return (s);
5206 }
5207 
5208 typedef struct kmem_move_notify_args {
5209         kmem_cache_t *kmna_cache;
5210         void *kmna_buf;
5211 } kmem_move_notify_args_t;
5212 
5213 static void
5214 kmem_cache_move_notify_task(void *arg)
5215 {
5216         kmem_move_notify_args_t *args = arg;
5217         kmem_cache_t *cp = args->kmna_cache;
5218         void *buf = args->kmna_buf;
5219         kmem_slab_t *sp;
5220 
5221         ASSERT(taskq_member(kmem_taskq, curthread));
5222         ASSERT(list_link_active(&cp->cache_link));
5223 
5224         kmem_free(args, sizeof (kmem_move_notify_args_t));
 
5244                         return;
5245                 }
5246 
5247                 kmem_slab_move_yes(cp, sp, buf);
5248                 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5249                 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5250                 mutex_exit(&cp->cache_lock);
5251                 /* see kmem_move_buffers() about dropping the lock */
5252                 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5253                 mutex_enter(&cp->cache_lock);
5254                 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5255                 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5256                 if (sp->slab_refcnt == 0) {
5257                         list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5258                         list_remove(deadlist, sp);
5259 
5260                         if (!avl_is_empty(
5261                             &cp->cache_defrag->kmd_moves_pending)) {
5262                                 list_insert_head(deadlist, sp);
5263                                 mutex_exit(&cp->cache_lock);
5264                                 return;
5265                         }
5266 
5267                         cp->cache_defrag->kmd_deadcount--;
5268                         cp->cache_slab_destroy++;
5269                         mutex_exit(&cp->cache_lock);
5270                         kmem_slab_destroy(cp, sp);
5271                         return;
5272                 }
5273         } else {
5274                 kmem_slab_move_yes(cp, sp, buf);
5275         }
5276         mutex_exit(&cp->cache_lock);
5277 }
5278 
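/*
 * Client interface: notify kmem that a previously immovable object (for
 * example, one for which the move callback responded LATER or NO) has become
 * movable.  The notification is handled asynchronously on kmem_taskq; if the
 * KM_NOSLEEP allocation or the TQ_NOSLEEP dispatch fails, it is simply
 * dropped, which costs nothing more than a missed defragmentation
 * opportunity.
 */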
5279 void
5280 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5281 {
5282         kmem_move_notify_args_t *args;
5283 
5284         args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5285         if (args != NULL) {
5286                 args->kmna_cache = cp;
5287                 args->kmna_buf = buf;
5288                 if (!taskq_dispatch(kmem_taskq,
5289                     (task_func_t *)kmem_cache_move_notify_task, args,
5290                     TQ_NOSLEEP))
5291                         kmem_free(args, sizeof (kmem_move_notify_args_t));
5292         }
5293 }
5294 
5295 static void
5296 kmem_cache_defrag(kmem_cache_t *cp)
5297 {
5298         size_t n;
5299 
5300         ASSERT(cp->cache_defrag != NULL);
5301 
5302         mutex_enter(&cp->cache_lock);
5303         n = avl_numnodes(&cp->cache_partial_slabs);
5304         if (n > 1) {
5305                 /* kmem_move_buffers() drops and reacquires cache_lock */
5306                 cp->cache_defrag->kmd_defrags++;
5307                 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5308         }
5309         mutex_exit(&cp->cache_lock);
5310 }
5311 
5312 /* Is this cache above the fragmentation threshold? */
5313 static boolean_t
5314 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5315 {
5316         /*
5317          *      nfree          kmem_frag_numer
5318          * ------------------ > ---------------
5319          * cp->cache_buftotal   kmem_frag_denom
5320          */
5321         return ((nfree * kmem_frag_denom) >
5322             (cp->cache_buftotal * kmem_frag_numer));
5323 }
5324 
5325 static boolean_t
 
5384         if (kmd->kmd_consolidate > 0) {
5385                 kmd->kmd_consolidate--;
5386                 mutex_exit(&cp->cache_lock);
5387                 kmem_cache_reap(cp);
5388                 return;
5389         }
5390 
5391         if (kmem_cache_is_fragmented(cp, &reap)) {
5392                 size_t slabs_found;
5393 
5394                 /*
5395                  * Consolidate reclaimable slabs from the end of the partial
5396                  * slab list (scan at most kmem_reclaim_scan_range slabs to find
5397                  * reclaimable slabs). Keep track of how many candidate slabs we
5398                  * looked for and how many we actually found so we can adjust
5399                  * the definition of a candidate slab if we're having trouble
5400                  * finding them.
5401                  *
5402                  * kmem_move_buffers() drops and reacquires cache_lock.
5403                  */
5404                 kmd->kmd_scans++;
5405                 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5406                     kmem_reclaim_max_slabs, 0);
5407                 if (slabs_found >= 0) {
5408                         kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5409                         kmd->kmd_slabs_found += slabs_found;
5410                 }
5411 
5412                 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5413                         kmd->kmd_tries = 0;
5414 
5415                         /*
5416                          * If we had difficulty finding candidate slabs in
5417                          * previous scans, adjust the threshold so that
5418                          * candidates are easier to find.
5419                          */
5420                         if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5421                                 kmem_adjust_reclaim_threshold(kmd, -1);
5422                         } else if ((kmd->kmd_slabs_found * 2) <
5423                             kmd->kmd_slabs_sought) {
5424                                 kmem_adjust_reclaim_threshold(kmd, 1);
5425                         }
5426                         kmd->kmd_slabs_sought = 0;
5427                         kmd->kmd_slabs_found = 0;
5428                 }
5429         } else {
5430                 kmem_reset_reclaim_threshold(cp->cache_defrag);
5431 #ifdef  DEBUG
5432                 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5433                         /*
5434                          * In a debug kernel we want the consolidator to
5435                          * run occasionally even when there is plenty of
5436                          * memory.
5437                          */
5438                         uint16_t debug_rand;
5439 
5440                         (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5441                         if (!kmem_move_noreap &&
5442                             ((debug_rand % kmem_mtb_reap) == 0)) {
5443                                 mutex_exit(&cp->cache_lock);
5444                                 kmem_cache_reap(cp);
5445                                 return;
5446                         } else if ((debug_rand % kmem_mtb_move) == 0) {
5447                                 kmd->kmd_scans++;
5448                                 (void) kmem_move_buffers(cp,
5449                                     kmem_reclaim_scan_range, 1, KMM_DEBUG);
5450                         }
5451                 }
5452 #endif  /* DEBUG */
5453         }
5454 
5455         mutex_exit(&cp->cache_lock);
5456 
5457         if (reap)
5458                 kmem_depot_ws_reap(cp);
5459 }
 | 
 
 
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  24  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25  */
  26 
  27 /*
  28  * Kernel memory allocator, as described in the following two papers and a
  29  * statement about the consolidator:
  30  *
  31  * Jeff Bonwick,
  32  * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
  33  * Proceedings of the Summer 1994 Usenix Conference.
  34  * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
  35  *
  36  * Jeff Bonwick and Jonathan Adams,
  37  * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
  38  * Arbitrary Resources.
  39  * Proceedings of the 2001 Usenix Conference.
  40  * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
  41  *
  42  * kmem Slab Consolidator Big Theory Statement:
 
 
 142  *            (The system won't be getting the slab back as long as the
 143  *            immovable object holds it hostage, so there's no point in moving
 144  *            any of its objects.)
 145  *     LATER: The client is using the object and cannot move it now, so kmem
 146  *            frees the new object (the unused copy destination). kmem still
 147  *            attempts to move other objects off the slab, since it expects to
 148  *            succeed in clearing the slab in a later callback. The client
 149  *            should use LATER instead of NO if the object is likely to become
 150  *            movable very soon.
 151  * DONT_NEED: The client no longer needs the object, so kmem frees the old along
 152  *            with the new object (the unused copy destination). This response
 153  *            is the client's opportunity to be a model citizen and give back as
 154  *            much as it can.
 155  * DONT_KNOW: The client does not know about the object because
 156  *            a) the client has just allocated the object and not yet put it
 157  *               wherever it expects to find known objects
 158  *            b) the client has removed the object from wherever it expects to
 159  *               find known objects and is about to free it, or
 160  *            c) the client has freed the object.
 161  *            In all these cases (a, b, and c) kmem frees the new object (the
 162  *            unused copy destination) and searches for the old object in the
 163  *            magazine layer. If found, the object is removed from the magazine
 164  *            layer and freed to the slab layer so it will no longer hold the
 165  *            slab hostage.
 166  *
 167  * 2.3 Object States
 168  *
 169  * Neither kmem nor the client can be assumed to know the object's whereabouts
 170  * at the time of the callback. An object belonging to a kmem cache may be in
 171  * any of the following states:
 172  *
 173  * 1. Uninitialized on the slab
 174  * 2. Allocated from the slab but not constructed (still uninitialized)
 175  * 3. Allocated from the slab, constructed, but not yet ready for business
 176  *    (not in a valid state for the move callback)
 177  * 4. In use (valid and known to the client)
 178  * 5. About to be freed (no longer in a valid state for the move callback)
 179  * 6. Freed to a magazine (still constructed)
 180  * 7. Allocated from a magazine, not yet ready for business (not in a valid
 181  *    state for the move callback), and about to return to state #4
 182  * 8. Deconstructed on a magazine that is about to be freed
 183  * 9. Freed to the slab
 184  *
 185  * Since the move callback may be called at any time while the object is in any
 
 
 268  * c_objects_lock is held. Note that after acquiring the lock, the client must
 269  * recheck the o_container pointer in case the object was removed just before
 270  * acquiring the lock.
 271  *
 272  * When the client is about to free an object, it must first remove that object
 273  * from the list, hash, or other structure where it is kept. At that time, to
 274  * mark the object so it can be distinguished from the remaining, known objects,
 275  * the client sets the designated low order bit:
 276  *
 277  *      mutex_enter(&container->c_objects_lock);
 278  *      object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
 279  *      list_remove(&container->c_objects, object);
 280  *      mutex_exit(&container->c_objects_lock);
 281  *
 282  * In the common case, the object is freed to the magazine layer, where it may
 283  * be reused on a subsequent allocation without the overhead of calling the
 284  * constructor. While in the magazine it appears allocated from the point of
 285  * view of the slab layer, making it a candidate for the move callback. Most
 286  * objects unrecognized by the client in the move callback fall into this
 287  * category and are cheaply distinguished from known objects by the test
 288  * described earlier. Since recognition is cheap for the client, and searching
 289  * magazines is expensive for kmem, kmem defers searching until the client first
 290  * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem
 291  * elsewhere does what it can to avoid bothering the client unnecessarily.
 292  *
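      * One possible shape of that cheap test, continuing the hypothetical
      * object_move() sketch earlier in this comment (this only illustrates the
      * fast-path rejection of buffers marked by the idiom above):
      *
      *      object_t *op = old;
      *
      *      if ((uintptr_t)op->o_container & 0x1)
      *              return (KMEM_CBRC_DONT_KNOW);   /* marked for free */
      *
      *      ... otherwise validate o_container, take c_objects_lock, and
      *      recheck o_container under the lock as described above ...
      *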
 293  * Invalidating the designated pointer member before freeing the object marks
 294  * the object to be avoided in the callback, and conversely, assigning a valid
 295  * value to the designated pointer member after allocating the object makes the
 296  * object fair game for the callback:
 297  *
 298  *      ... allocate object ...
 299  *      ... set any initial state not set by the constructor ...
 300  *
 301  *      mutex_enter(&container->c_objects_lock);
 302  *      list_insert_tail(&container->c_objects, object);
 303  *      membar_producer();
 304  *      object->o_container = container;
 305  *      mutex_exit(&container->c_objects_lock);
 306  *
 307  * Note that it is the assignment to o_container that makes the object fair
 308  * game for the move callback, so everything else must be valid beforehand.
 309  * The membar_producer() ensures the object's state is visible before the store
 310  * that transitions the object from state #3 or #7 (allocated, constructed, not
 311  * yet in use) to state #4 (in use, valid). That's important because the move
 
 
 981         { 95,   64,     0,      512     },
 982         { 143,  64,     0,      0       },
 983 };
 984 
 985 static uint32_t kmem_reaping;
 986 static uint32_t kmem_reaping_idspace;
 987 
 988 /*
 989  * kmem tunables
 990  */
 991 clock_t kmem_reap_interval;     /* cache reaping rate [15 * HZ ticks] */
 992 int kmem_depot_contention = 3;  /* max failed tryenters per real interval */
 993 pgcnt_t kmem_reapahead = 0;     /* start reaping N pages before pageout */
 994 int kmem_panic = 1;             /* whether to panic on error */
 995 int kmem_logging = 1;           /* kmem_log_enter() override */
 996 uint32_t kmem_mtbf = 0;         /* mean time between failures [default: off] */
 997 size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
 998 size_t kmem_content_log_size;   /* content log size [2% of memory] */
 999 size_t kmem_failure_log_size;   /* failure log [4 pages per CPU] */
1000 size_t kmem_slab_log_size;      /* slab create log [4 pages per CPU] */
1001 size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
1002 size_t kmem_lite_minsize = 0;   /* minimum buffer size for KMF_LITE */
1003 size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
1004 int kmem_lite_pcs = 4;          /* number of PCs to store in KMF_LITE mode */
1005 size_t kmem_maxverify;          /* maximum bytes to inspect in debug routines */
1006 size_t kmem_minfirewall;        /* hardware-enforced redzone threshold */
1007 
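     /*
      * None of the tunables above need to be set on a typical system. When one
      * does need to be changed, the usual mechanism is an /etc/system entry
      * applied at boot, for example (illustrative values only):
      *
      *      set kmem_flags=0xf
      *      set kmem_logging=0
      */
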
1008 #ifdef _LP64
1009 size_t  kmem_max_cached = KMEM_BIG_MAXBUF;      /* maximum kmem_alloc cache */
1010 #else
1011 size_t  kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */
1012 #endif
1013 
1014 #ifdef DEBUG
1015 int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
1016 #else
1017 int kmem_flags = 0;
1018 #endif
1019 int kmem_ready;
1020 
1021 static kmem_cache_t     *kmem_slab_cache;
1022 static kmem_cache_t     *kmem_bufctl_cache;
1023 static kmem_cache_t     *kmem_bufctl_audit_cache;
1024 
1025 static kmutex_t         kmem_cache_lock;        /* inter-cache linkage only */
1026 static list_t           kmem_caches;
1027 
1028 static taskq_t          *kmem_taskq;
1029 static kmutex_t         kmem_flags_lock;
1030 static vmem_t           *kmem_metadata_arena;
1031 static vmem_t           *kmem_msb_arena;        /* arena for metadata caches */
1032 static vmem_t           *kmem_cache_arena;
1033 static vmem_t           *kmem_hash_arena;
1034 static vmem_t           *kmem_log_arena;
1035 static vmem_t           *kmem_oversize_arena;
1036 static vmem_t           *kmem_va_arena;
1037 static vmem_t           *kmem_default_arena;
1038 static vmem_t           *kmem_firewall_va_arena;
1039 static vmem_t           *kmem_firewall_arena;
1040 
1041 /*
1042  * Define KMEM_STATS to turn on statistic gathering. By default, it is only
1043  * turned on when DEBUG is also defined.
1044  */
1045 #ifdef  DEBUG
1046 #define KMEM_STATS
1047 #endif  /* DEBUG */
1048 
1049 #ifdef  KMEM_STATS
1050 #define KMEM_STAT_ADD(stat)                     ((stat)++)
1051 #define KMEM_STAT_COND_ADD(cond, stat)          ((void) (!(cond) || (stat)++))
1052 #else
1053 #define KMEM_STAT_ADD(stat)                     /* nothing */
1054 #define KMEM_STAT_COND_ADD(cond, stat)          /* nothing */
1055 #endif  /* KMEM_STATS */
1056 
1057 /*
1058  * kmem slab consolidator thresholds (tunables)
1059  */
1060 size_t kmem_frag_minslabs = 101;        /* minimum total slabs */
1061 size_t kmem_frag_numer = 1;             /* free buffers (numerator) */
1062 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1063 /*
1064  * Maximum number of slabs from which to move buffers during a single
1065  * maintenance interval while the system is not low on memory.
1066  */
1067 size_t kmem_reclaim_max_slabs = 1;
1068 /*
1069  * Number of slabs to scan backwards from the end of the partial slab list
1070  * when searching for buffers to relocate.
1071  */
1072 size_t kmem_reclaim_scan_range = 12;
1073 
1074 #ifdef  KMEM_STATS
1075 static struct {
1076         uint64_t kms_callbacks;
1077         uint64_t kms_yes;
1078         uint64_t kms_no;
1079         uint64_t kms_later;
1080         uint64_t kms_dont_need;
1081         uint64_t kms_dont_know;
1082         uint64_t kms_hunt_found_mag;
1083         uint64_t kms_hunt_found_slab;
1084         uint64_t kms_hunt_alloc_fail;
1085         uint64_t kms_hunt_lucky;
1086         uint64_t kms_notify;
1087         uint64_t kms_notify_callbacks;
1088         uint64_t kms_disbelief;
1089         uint64_t kms_already_pending;
1090         uint64_t kms_callback_alloc_fail;
1091         uint64_t kms_callback_taskq_fail;
1092         uint64_t kms_endscan_slab_dead;
1093         uint64_t kms_endscan_slab_destroyed;
1094         uint64_t kms_endscan_nomem;
1095         uint64_t kms_endscan_refcnt_changed;
1096         uint64_t kms_endscan_nomove_changed;
1097         uint64_t kms_endscan_freelist;
1098         uint64_t kms_avl_update;
1099         uint64_t kms_avl_noupdate;
1100         uint64_t kms_no_longer_reclaimable;
1101         uint64_t kms_notify_no_longer_reclaimable;
1102         uint64_t kms_notify_slab_dead;
1103         uint64_t kms_notify_slab_destroyed;
1104         uint64_t kms_alloc_fail;
1105         uint64_t kms_constructor_fail;
1106         uint64_t kms_dead_slabs_freed;
1107         uint64_t kms_defrags;
1108         uint64_t kms_scans;
1109         uint64_t kms_scan_depot_ws_reaps;
1110         uint64_t kms_debug_reaps;
1111         uint64_t kms_debug_scans;
1112 } kmem_move_stats;
1113 #endif  /* KMEM_STATS */
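
     /*
      * When KMEM_STATS is defined, the counters above are cumulative across
      * all caches and are most easily inspected from the debugger, for example
      * (illustrative):
      *
      *      echo "kmem_move_stats::print" | mdb -k
      */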
1114 
1115 /* consolidator knobs */
1116 static boolean_t kmem_move_noreap;
1117 static boolean_t kmem_move_blocked;
1118 static boolean_t kmem_move_fulltilt;
1119 static boolean_t kmem_move_any_partial;
1120 
1121 #ifdef  DEBUG
1122 /*
1123  * kmem consolidator debug tunables:
1124  * Ensure code coverage by occasionally running the consolidator even when the
1125  * caches are not fragmented (they may never be). These values are mean times
1126  * expressed in units of cache maintenance intervals (kmem_cache_update).
1127  */
1128 uint32_t kmem_mtb_move = 60;    /* defrag 1 slab (~15min) */
1129 uint32_t kmem_mtb_reap = 1800;  /* defrag all slabs (~7.5hrs) */
1130 #endif  /* DEBUG */
1131 
1132 static kmem_cache_t     *kmem_defrag_cache;
1133 static kmem_cache_t     *kmem_move_cache;
1134 static taskq_t          *kmem_move_taskq;
1135 
1136 static void kmem_cache_scan(kmem_cache_t *);
1137 static void kmem_cache_defrag(kmem_cache_t *);
1138 static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
1139 
1140 
1141 kmem_log_header_t       *kmem_transaction_log;
1142 kmem_log_header_t       *kmem_content_log;
1143 kmem_log_header_t       *kmem_failure_log;
1144 kmem_log_header_t       *kmem_slab_log;
1145 
1146 static int              kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
1147 
1148 #define KMEM_BUFTAG_LITE_ENTER(bt, count, caller)                       \
1149         if ((count) > 0) {                                           \
1150                 pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \
1151                 pc_t *_e;                                               \
1152                 /* memmove() the old entries down one notch */          \
1153                 for (_e = &_s[(count) - 1]; _e > _s; _e--)               \
1154                         *_e = *(_e - 1);                                \
1155                 *_s = (uintptr_t)(caller);                              \
1156         }
1157 
1158 #define KMERR_MODIFIED  0       /* buffer modified while on freelist */
1159 #define KMERR_REDZONE   1       /* redzone violation (write past end of buf) */
1160 #define KMERR_DUPFREE   2       /* freed a buffer twice */
1161 #define KMERR_BADADDR   3       /* freed a bad (unallocated) address */
1162 #define KMERR_BADBUFTAG 4       /* buftag corrupted */
1163 #define KMERR_BADBUFCTL 5       /* bufctl corrupted */
1164 #define KMERR_BADCACHE  6       /* freed a buffer to the wrong cache */
 
1905                          */
1906                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1907                                 list_insert_tail(deadlist, sp);
1908                         } else {
1909                                 list_insert_head(deadlist, sp);
1910                         }
1911                         cp->cache_defrag->kmd_deadcount++;
1912                         mutex_exit(&cp->cache_lock);
1913                 }
1914                 return;
1915         }
1916 
1917         if (bcp->bc_next == NULL) {
1918                 /* Transition the slab from completely allocated to partial. */
1919                 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1920                 ASSERT(sp->slab_chunks > 1);
1921                 list_remove(&cp->cache_complete_slabs, sp);
1922                 cp->cache_complete_slab_count--;
1923                 avl_add(&cp->cache_partial_slabs, sp);
1924         } else {
1925 #ifdef  DEBUG
1926                 if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
1927                         KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
1928                 } else {
1929                         KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
1930                 }
1931 #else
1932                 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1933 #endif
1934         }
1935 
1936         ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1937             (cp->cache_complete_slab_count +
1938             avl_numnodes(&cp->cache_partial_slabs) +
1939             (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1940         mutex_exit(&cp->cache_lock);
1941 }
1942 
1943 /*
1944  * Return -1 if kmem_error() was called, 1 if constructor fails, 0 if successful.
1945  */
1946 static int
1947 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1948     caddr_t caller)
1949 {
1950         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1951         kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1952         uint32_t mtbf;
1953 
 
2947         return (buf);
2948 }
2949 
2950 void *
2951 kmem_alloc(size_t size, int kmflag)
2952 {
2953         size_t index;
2954         kmem_cache_t *cp;
2955         void *buf;
2956 
2957         if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2958                 cp = kmem_alloc_table[index];
2959                 /* fall through to kmem_cache_alloc() */
2960 
2961         } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2962             kmem_big_alloc_table_max) {
2963                 cp = kmem_big_alloc_table[index];
2964                 /* fall through to kmem_cache_alloc() */
2965 
2966         } else {
2967                 if (size == 0)
2968                         return (NULL);
2969 
2970                 buf = vmem_alloc(kmem_oversize_arena, size,
2971                     kmflag & KM_VMFLAGS);
2972                 if (buf == NULL)
2973                         kmem_log_event(kmem_failure_log, NULL, NULL,
2974                             (void *)size);
2975                 else if (KMEM_DUMP(kmem_slab_cache)) {
2976                         /* stats for dump intercept */
2977                         kmem_dump_oversize_allocs++;
2978                         if (size > kmem_dump_oversize_max)
2979                                 kmem_dump_oversize_max = size;
2980                 }
2981                 return (buf);
2982         }
2983 
2984         buf = kmem_cache_alloc(cp, kmflag);
2985         if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) {
2986                 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2987                 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2988                 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2989 
 
3562                 kmcp->kmc_move_yes.value.ui64          = 0;
3563                 kmcp->kmc_move_no.value.ui64           = 0;
3564                 kmcp->kmc_move_later.value.ui64        = 0;
3565                 kmcp->kmc_move_dont_need.value.ui64    = 0;
3566                 kmcp->kmc_move_dont_know.value.ui64    = 0;
3567                 kmcp->kmc_move_hunt_found.value.ui64   = 0;
3568                 kmcp->kmc_move_slabs_freed.value.ui64  = 0;
3569                 kmcp->kmc_defrag.value.ui64            = 0;
3570                 kmcp->kmc_scan.value.ui64              = 0;
3571                 kmcp->kmc_move_reclaimable.value.ui64  = 0;
3572         } else {
3573                 int64_t reclaimable;
3574 
3575                 kmem_defrag_t *kd = cp->cache_defrag;
3576                 kmcp->kmc_move_callbacks.value.ui64    = kd->kmd_callbacks;
3577                 kmcp->kmc_move_yes.value.ui64          = kd->kmd_yes;
3578                 kmcp->kmc_move_no.value.ui64           = kd->kmd_no;
3579                 kmcp->kmc_move_later.value.ui64        = kd->kmd_later;
3580                 kmcp->kmc_move_dont_need.value.ui64    = kd->kmd_dont_need;
3581                 kmcp->kmc_move_dont_know.value.ui64    = kd->kmd_dont_know;
3582                 kmcp->kmc_move_hunt_found.value.ui64   = kd->kmd_hunt_found;
3583                 kmcp->kmc_move_slabs_freed.value.ui64  = kd->kmd_slabs_freed;
3584                 kmcp->kmc_defrag.value.ui64            = kd->kmd_defrags;
3585                 kmcp->kmc_scan.value.ui64              = kd->kmd_scans;
3586 
3587                 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3588                 reclaimable = MAX(reclaimable, 0);
3589                 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3590                 kmcp->kmc_move_reclaimable.value.ui64  = reclaimable;
3591         }
3592 
3593         mutex_exit(&cp->cache_lock);
3594         return (0);
3595 }
3596 
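     /*
      * The per-cache values filled in above are ordinary named kstats, so they
      * can be read from userland with the usual tools; for example
      * (illustrative invocations; "my_cache" is a placeholder cache name):
      *
      *      kstat -c kmem_cache -n my_cache
      *      echo "::kmastat" | mdb -k
      */
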
3597 /*
3598  * Return a named statistic about a particular cache.
3599  * This shouldn't be called very often, so it's currently designed for
3600  * simplicity (leverages existing kstat support) rather than efficiency.
3601  */
3602 uint64_t
 
4472                     segkmem_alloc, segkmem_free, kmem_minfirewall < ULONG_MAX?
4473                     kmem_firewall_va_arena : heap_arena, 0, VMC_DUMPSAFE |
4474                     VM_SLEEP);
4475         }
4476 
4477         kmem_cache_init(2, use_large_pages);
4478 
4479         if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) {
4480                 if (kmem_transaction_log_size == 0)
4481                         kmem_transaction_log_size = kmem_maxavail() / 50;
4482                 kmem_transaction_log = kmem_log_init(kmem_transaction_log_size);
4483         }
4484 
4485         if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) {
4486                 if (kmem_content_log_size == 0)
4487                         kmem_content_log_size = kmem_maxavail() / 50;
4488                 kmem_content_log = kmem_log_init(kmem_content_log_size);
4489         }
4490 
4491         kmem_failure_log = kmem_log_init(kmem_failure_log_size);
4492 
4493         kmem_slab_log = kmem_log_init(kmem_slab_log_size);
4494 
4495         /*
4496          * Initialize STREAMS message caches so allocb() is available.
4497          * This allows us to initialize the logging framework (cmn_err(9F),
4498          * strlog(9F), etc) so we can start recording messages.
4499          */
4500         streams_msg_init();
4501 
4502         /*
4503          * Initialize the ZSD framework in Zones so modules loaded henceforth
4504          * can register their callbacks.
4505          */
4506         zone_zsd_init();
4507 
4508         log_init();
4509         taskq_init();
4510 
4511         /*
4512          * Warn about invalid or dangerous values of kmem_flags.
4513          * Always warn about unsupported values.
 
4661                 return (B_FALSE);
4662         }
4663 
4664         if ((refcnt == 1) || kmem_move_any_partial) {
4665                 return (refcnt < sp->slab_chunks);
4666         }
4667 
4668         /*
4669          * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4670          * slabs with a progressively higher percentage of used buffers can be
4671          * reclaimed until the cache as a whole is no longer fragmented.
4672          *
4673          *      sp->slab_refcnt   kmd_reclaim_numer
4674          *      --------------- < ------------------
4675          *      sp->slab_chunks   KMEM_VOID_FRACTION
4676          */
4677         return ((refcnt * KMEM_VOID_FRACTION) <
4678             (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4679 }
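
     /*
      * To make the fraction above concrete: if KMEM_VOID_FRACTION were 64 and
      * kmd_reclaim_numer were currently 16 (illustrative values only), a slab
      * with 3 of its 16 chunks allocated would satisfy 3 * 64 < 16 * 16 and be
      * considered reclaimable, while one with 5 of 16 chunks allocated would
      * not (5 * 64 = 320 > 256) until the threshold is relaxed further.
      */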
4680 
4681 static void *
4682 kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf,
4683     void *tbuf)
4684 {
4685         int i;          /* magazine round index */
4686 
4687         for (i = 0; i < n; i++) {
4688                 if (buf == m->mag_round[i]) {
4689                         if (cp->cache_flags & KMF_BUFTAG) {
4690                                 (void) kmem_cache_free_debug(cp, tbuf,
4691                                     caller());
4692                         }
4693                         m->mag_round[i] = tbuf;
4694                         return (buf);
4695                 }
4696         }
4697 
4698         return (NULL);
4699 }
4700 
4701 /*
4702  * Hunt the magazine layer for the given buffer. If found, the buffer is
4703  * removed from the magazine layer and returned; otherwise NULL is returned.
4704  * A buffer returned this way is in the freed, constructed state.
4705  */
4706 static void *
4707 kmem_hunt_mags(kmem_cache_t *cp, void *buf)
4708 {
4709         kmem_cpu_cache_t *ccp;
4710         kmem_magazine_t *m;
4711         int cpu_seqid;
4712         int n;          /* magazine rounds */
4713         void *tbuf;     /* temporary swap buffer */
4714 
4715         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4716 
4717         /*
4718          * Allocate a buffer to swap with the one we hope to pull out of a
4719          * magazine when found.
4720          */
4721         tbuf = kmem_cache_alloc(cp, KM_NOSLEEP);
4722         if (tbuf == NULL) {
4723                 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail);
4724                 return (NULL);
4725         }
4726         if (tbuf == buf) {
4727                 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky);
4728                 if (cp->cache_flags & KMF_BUFTAG) {
4729                         (void) kmem_cache_free_debug(cp, buf, caller());
4730                 }
4731                 return (buf);
4732         }
4733 
4734         /* Hunt the depot. */
4735         mutex_enter(&cp->cache_depot_lock);
4736         n = cp->cache_magtype->mt_magsize;
4737         for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) {
4738                 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4739                         mutex_exit(&cp->cache_depot_lock);
4740                         return (buf);
4741                 }
4742         }
4743         mutex_exit(&cp->cache_depot_lock);
4744 
4745         /* Hunt the per-CPU magazines. */
4746         for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
4747                 ccp = &cp->cache_cpu[cpu_seqid];
4748 
4749                 mutex_enter(&ccp->cc_lock);
4750                 m = ccp->cc_loaded;
4751                 n = ccp->cc_rounds;
4752                 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4753                         mutex_exit(&ccp->cc_lock);
4754                         return (buf);
4755                 }
4756                 m = ccp->cc_ploaded;
4757                 n = ccp->cc_prounds;
4758                 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4759                         mutex_exit(&ccp->cc_lock);
4760                         return (buf);
4761                 }
4762                 mutex_exit(&ccp->cc_lock);
4763         }
4764 
4765         kmem_cache_free(cp, tbuf);
4766         return (NULL);
4767 }
4768 
4769 /*
4770  * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4771  * or when the buffer is freed.
4772  */
4773 static void
4774 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4775 {
4776         ASSERT(MUTEX_HELD(&cp->cache_lock));
4777         ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4778 
4779         if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4780                 return;
4781         }
4782 
4783         if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4784                 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4785                         avl_remove(&cp->cache_partial_slabs, sp);
4786                         sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4787                         sp->slab_stuck_offset = (uint32_t)-1;
4788                         avl_add(&cp->cache_partial_slabs, sp);
4789                 }
 
4812 }
4813 
4814 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4815 
4816 /*
4817  * The move callback takes two buffer addresses, the buffer to be moved, and a
4818  * newly allocated and constructed buffer selected by kmem as the destination.
4819  * It also takes the size of the buffer and an optional user argument specified
4820  * at cache creation time. kmem guarantees that the buffer to be moved has not
4821  * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4822  * guarantee the present whereabouts of the buffer to be moved, so it is up to
4823  * the client to safely determine whether or not it is still using the buffer.
4824  * The client must not free either of the buffers passed to the move callback,
4825  * since kmem wants to free them directly to the slab layer. The client response
4826  * tells kmem which of the two buffers to free:
4827  *
4828  * YES          kmem frees the old buffer (the move was successful)
4829  * NO           kmem frees the new buffer, marks the slab of the old buffer
4830  *              non-reclaimable to avoid bothering the client again
4831  * LATER        kmem frees the new buffer, increments slab_later_count
4832  * DONT_KNOW    kmem frees the new buffer, searches mags for the old buffer
4833  * DONT_NEED    kmem frees both the old buffer and the new buffer
4834  *
4835  * The pending callback argument now being processed contains both of the
4836  * buffers (old and new) passed to the move callback function, the slab of the
4837  * old buffer, and flags related to the move request, such as whether or not the
4838  * system was desperate for memory.
4839  *
4840  * Slabs are not freed while there is a pending callback, but instead are kept
4841  * on a deadlist, which is drained after the last callback completes. This means
4842  * that slabs are safe to access until kmem_move_end(), no matter how many of
4843  * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4844  * zero for as long as the slab remains on the deadlist and until the slab is
4845  * freed.
4846  */
4847 static void
4848 kmem_move_buffer(kmem_move_t *callback)
4849 {
4850         kmem_cbrc_t response;
4851         kmem_slab_t *sp = callback->kmm_from_slab;
4852         kmem_cache_t *cp = sp->slab_cache;
4853         boolean_t free_on_slab;
4854 
4855         ASSERT(taskq_member(kmem_move_taskq, curthread));
4856         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4857         ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4858 
4859         /*
4860          * The number of allocated buffers on the slab may have changed since we
4861          * last checked the slab's reclaimability (when the pending move was
4862          * enqueued), or the client may have responded NO when asked to move
4863          * another buffer on the same slab.
4864          */
4865         if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4866                 KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable);
4867                 KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4868                     kmem_move_stats.kms_notify_no_longer_reclaimable);
4869                 kmem_slab_free(cp, callback->kmm_to_buf);
4870                 kmem_move_end(cp, callback);
4871                 return;
4872         }
4873 
4874         /*
4875          * Hunting magazines is expensive, so we'll wait to do that until the
4876          * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer
4877          * is cheap, so we might as well do that here in case we can avoid
4878          * bothering the client.
4879          */
4880         mutex_enter(&cp->cache_lock);
4881         free_on_slab = (kmem_slab_allocated(cp, sp,
4882             callback->kmm_from_buf) == NULL);
4883         mutex_exit(&cp->cache_lock);
4884 
4885         if (free_on_slab) {
4886                 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab);
4887                 kmem_slab_free(cp, callback->kmm_to_buf);
4888                 kmem_move_end(cp, callback);
4889                 return;
4890         }
4891 
4892         if (cp->cache_flags & KMF_BUFTAG) {
4893                 /*
4894                  * Make kmem_cache_alloc_debug() apply the constructor for us.
4895                  */
4896                 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4897                     KM_NOSLEEP, 1, caller()) != 0) {
4898                         KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail);
4899                         kmem_move_end(cp, callback);
4900                         return;
4901                 }
4902         } else if (cp->cache_constructor != NULL &&
4903             cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4904             KM_NOSLEEP) != 0) {
4905                 atomic_inc_64(&cp->cache_alloc_fail);
4906                 KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail);
4907                 kmem_slab_free(cp, callback->kmm_to_buf);
4908                 kmem_move_end(cp, callback);
4909                 return;
4910         }
4911 
4912         KMEM_STAT_ADD(kmem_move_stats.kms_callbacks);
4913         KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4914             kmem_move_stats.kms_notify_callbacks);
4915         cp->cache_defrag->kmd_callbacks++;
4916         cp->cache_defrag->kmd_thread = curthread;
4917         cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4918         cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4919         DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4920             callback);
4921 
4922         response = cp->cache_move(callback->kmm_from_buf,
4923             callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4924 
4925         DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4926             callback, kmem_cbrc_t, response);
4927         cp->cache_defrag->kmd_thread = NULL;
4928         cp->cache_defrag->kmd_from_buf = NULL;
4929         cp->cache_defrag->kmd_to_buf = NULL;
4930 
4931         if (response == KMEM_CBRC_YES) {
4932                 KMEM_STAT_ADD(kmem_move_stats.kms_yes);
4933                 cp->cache_defrag->kmd_yes++;
4934                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4935                 /* slab safe to access until kmem_move_end() */
4936                 if (sp->slab_refcnt == 0)
4937                         cp->cache_defrag->kmd_slabs_freed++;
4938                 mutex_enter(&cp->cache_lock);
4939                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4940                 mutex_exit(&cp->cache_lock);
4941                 kmem_move_end(cp, callback);
4942                 return;
4943         }
4944 
4945         switch (response) {
4946         case KMEM_CBRC_NO:
4947                 KMEM_STAT_ADD(kmem_move_stats.kms_no);
4948                 cp->cache_defrag->kmd_no++;
4949                 mutex_enter(&cp->cache_lock);
4950                 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4951                 mutex_exit(&cp->cache_lock);
4952                 break;
4953         case KMEM_CBRC_LATER:
4954                 KMEM_STAT_ADD(kmem_move_stats.kms_later);
4955                 cp->cache_defrag->kmd_later++;
4956                 mutex_enter(&cp->cache_lock);
4957                 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4958                         mutex_exit(&cp->cache_lock);
4959                         break;
4960                 }
4961 
4962                 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4963                         KMEM_STAT_ADD(kmem_move_stats.kms_disbelief);
4964                         kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4965                 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4966                         sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4967                             callback->kmm_from_buf);
4968                 }
4969                 mutex_exit(&cp->cache_lock);
4970                 break;
4971         case KMEM_CBRC_DONT_NEED:
4972                 KMEM_STAT_ADD(kmem_move_stats.kms_dont_need);
4973                 cp->cache_defrag->kmd_dont_need++;
4974                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4975                 if (sp->slab_refcnt == 0)
4976                         cp->cache_defrag->kmd_slabs_freed++;
4977                 mutex_enter(&cp->cache_lock);
4978                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4979                 mutex_exit(&cp->cache_lock);
4980                 break;
4981         case KMEM_CBRC_DONT_KNOW:
4982                 KMEM_STAT_ADD(kmem_move_stats.kms_dont_know);
4983                 cp->cache_defrag->kmd_dont_know++;
4984                 if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) {
4985                         KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag);
4986                         cp->cache_defrag->kmd_hunt_found++;
4987                         kmem_slab_free_constructed(cp, callback->kmm_from_buf,
4988                             B_TRUE);
4989                         if (sp->slab_refcnt == 0)
4990                                 cp->cache_defrag->kmd_slabs_freed++;
4991                         mutex_enter(&cp->cache_lock);
4992                         kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4993                         mutex_exit(&cp->cache_lock);
4994                 }
4995                 break;
4996         default:
4997                 panic("'%s' (%p) unexpected move callback response %d\n",
4998                     cp->cache_name, (void *)cp, response);
4999         }
5000 
5001         kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
5002         kmem_move_end(cp, callback);
5003 }
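
     /*
      * The kmem__move__start and kmem__move__end DTrace probes fired above are
      * one way to watch callback activity without KMEM_STATS; for example
      * (illustrative), counting responses by numeric kmem_cbrc_t value:
      *
      *      dtrace -n 'sdt:::kmem-move-end { @[arg2] = count(); }'
      */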
5004 
5005 /* Return B_FALSE if there is insufficient memory for the move request. */
5006 static boolean_t
5007 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
5008 {
5009         void *to_buf;
5010         avl_index_t index;
5011         kmem_move_t *callback, *pending;
5012         ulong_t n;
5013 
5014         ASSERT(taskq_member(kmem_taskq, curthread));
5015         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5016         ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5017 
5018         callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
5019         if (callback == NULL) {
5020                 KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail);
5021                 return (B_FALSE);
5022         }
5023 
5024         callback->kmm_from_slab = sp;
5025         callback->kmm_from_buf = buf;
5026         callback->kmm_flags = flags;
5027 
5028         mutex_enter(&cp->cache_lock);
5029 
5030         n = avl_numnodes(&cp->cache_partial_slabs);
5031         if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
5032                 mutex_exit(&cp->cache_lock);
5033                 kmem_cache_free(kmem_move_cache, callback);
5034                 return (B_TRUE); /* there is no need for the move request */
5035         }
5036 
5037         pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
5038         if (pending != NULL) {
5039                 /*
5040                  * If the move is already pending and we're desperate now,
5041                  * update the move flags.
5042                  */
5043                 if (flags & KMM_DESPERATE) {
5044                         pending->kmm_flags |= KMM_DESPERATE;
5045                 }
5046                 mutex_exit(&cp->cache_lock);
5047                 KMEM_STAT_ADD(kmem_move_stats.kms_already_pending);
5048                 kmem_cache_free(kmem_move_cache, callback);
5049                 return (B_TRUE);
5050         }
5051 
5052         to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
5053             B_FALSE);
5054         callback->kmm_to_buf = to_buf;
5055         avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
5056 
5057         mutex_exit(&cp->cache_lock);
5058 
5059         if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
5060             callback, TQ_NOSLEEP)) {
5061                 KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail);
5062                 mutex_enter(&cp->cache_lock);
5063                 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
5064                 mutex_exit(&cp->cache_lock);
5065                 kmem_slab_free(cp, to_buf);
5066                 kmem_cache_free(kmem_move_cache, callback);
5067                 return (B_FALSE);
5068         }
5069 
5070         return (B_TRUE);
5071 }
5072 
5073 static void
5074 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
5075 {
5076         avl_index_t index;
5077 
5078         ASSERT(cp->cache_defrag != NULL);
5079         ASSERT(taskq_member(kmem_move_taskq, curthread));
5080         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5081 
 
5087                 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5088                 kmem_slab_t *sp;
5089 
5090                 /*
5091                  * The last pending move completed. Release all slabs from the
5092                  * front of the dead list except for any slab at the tail that
5093                  * needs to be released from the context of kmem_move_buffers().
5094                  * kmem deferred unmapping the buffers on these slabs in order
5095                  * to guarantee that buffers passed to the move callback have
5096                  * been touched only by kmem or by the client itself.
5097                  */
5098                 while ((sp = list_remove_head(deadlist)) != NULL) {
5099                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
5100                                 list_insert_tail(deadlist, sp);
5101                                 break;
5102                         }
5103                         cp->cache_defrag->kmd_deadcount--;
5104                         cp->cache_slab_destroy++;
5105                         mutex_exit(&cp->cache_lock);
5106                         kmem_slab_destroy(cp, sp);
5107                         KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5108                         mutex_enter(&cp->cache_lock);
5109                 }
5110         }
5111         mutex_exit(&cp->cache_lock);
5112         kmem_cache_free(kmem_move_cache, callback);
5113 }
5114 
5115 /*
5116  * Move buffers from least used slabs first by scanning backwards from the end
5117  * of the partial slab list. Scan at most max_scan candidate slabs and move
5118  * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
5119  * If desperate to reclaim memory, move buffers from any partial slab, otherwise
5120  * skip slabs with a ratio of allocated buffers at or above the current
5121  * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
5122  * scan is aborted) so that the caller can adjust the reclaimability threshold
5123  * depending on how many reclaimable slabs it finds.
5124  *
5125  * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
5126  * move request, since it is not valid for kmem_move_begin() to call
5127  * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
 
5232                                 list_t *deadlist =
5233                                     &cp->cache_defrag->kmd_deadlist;
5234                                 list_remove(deadlist, sp);
5235 
5236                                 if (!avl_is_empty(
5237                                     &cp->cache_defrag->kmd_moves_pending)) {
5238                                         /*
5239                                          * A pending move makes it unsafe to
5240                                          * destroy the slab, because even though
5241                                          * the move is no longer needed, the
5242                                          * context where that is determined
5243                                          * requires the slab to exist.
5244                                          * Fortunately, a pending move also
5245                                          * means we don't need to destroy the
5246                                          * slab here, since it will get
5247                                          * destroyed along with any other slabs
5248                                          * on the deadlist after the last
5249                                          * pending move completes.
5250                                          */
5251                                         list_insert_head(deadlist, sp);
5252                                         KMEM_STAT_ADD(kmem_move_stats.
5253                                             kms_endscan_slab_dead);
5254                                         return (-1);
5255                                 }
5256 
5257                                 /*
5258                                  * Destroy the slab now if it was completely
5259                                  * freed while we dropped cache_lock and there
5260                                  * are no pending moves. Since slab_refcnt
5261                                  * cannot change once it reaches zero, no new
5262                                  * pending moves from that slab are possible.
5263                                  */
5264                                 cp->cache_defrag->kmd_deadcount--;
5265                                 cp->cache_slab_destroy++;
5266                                 mutex_exit(&cp->cache_lock);
5267                                 kmem_slab_destroy(cp, sp);
5268                                 KMEM_STAT_ADD(kmem_move_stats.
5269                                     kms_dead_slabs_freed);
5270                                 KMEM_STAT_ADD(kmem_move_stats.
5271                                     kms_endscan_slab_destroyed);
5272                                 mutex_enter(&cp->cache_lock);
5273                                 /*
5274                                  * Since we can't pick up the scan where we left
5275                                  * off, abort the scan and say nothing about the
5276                                  * number of reclaimable slabs.
5277                                  */
5278                                 return (-1);
5279                         }
5280 
5281                         if (!success) {
5282                                 /*
5283                                  * Abort the scan if there is not enough memory
5284                                  * for the request and say nothing about the
5285                                  * number of reclaimable slabs.
5286                                  */
5287                                 KMEM_STAT_COND_ADD(s < max_slabs,
5288                                     kmem_move_stats.kms_endscan_nomem);
5289                                 return (-1);
5290                         }
5291 
5292                         /*
5293                          * The slab's position changed while the lock was
5294                          * dropped, so we don't know where we are in the
5295                          * sequence any more.
5296                          */
5297                         if (sp->slab_refcnt != refcnt) {
5298                                 /*
5299                                  * If this is a KMM_DEBUG move, the slab_refcnt
5300                                  * may have changed because we allocated a
5301                                  * destination buffer on the same slab. In that
5302                                  * case, we're not interested in counting it.
5303                                  */
5304                                 KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5305                                     (s < max_slabs),
5306                                     kmem_move_stats.kms_endscan_refcnt_changed);
5307                                 return (-1);
5308                         }
5309                         if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) {
5310                                 KMEM_STAT_COND_ADD(s < max_slabs,
5311                                     kmem_move_stats.kms_endscan_nomove_changed);
5312                                 return (-1);
5313                         }
5314 
5315                         /*
5316                          * Generating a move request allocates a destination
5317                          * buffer from the slab layer, bumping the first partial
5318                          * slab if it is completely allocated. If the current
5319                          * slab becomes the first partial slab as a result, we
5320                          * can't continue to scan backwards.
5321                          *
5322                          * If this is a KMM_DEBUG move and we allocated the
5323                          * destination buffer from the last partial slab, then
5324                          * the buffer we're moving is on the same slab and our
5325                          * slab_refcnt has changed, causing us to return before
5326                          * reaching here if there are no partial slabs left.
5327                          */
5328                         ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5329                         if (sp == avl_first(&cp->cache_partial_slabs)) {
5330                                 /*
5331                                  * We're not interested in a second KMM_DEBUG
5332                                  * move.
5333                                  */
5334                                 goto end_scan;
5335                         }
5336                 }
5337         }
5338 end_scan:
5339 
5340         KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5341             (s < max_slabs) &&
5342             (sp == avl_first(&cp->cache_partial_slabs)),
5343             kmem_move_stats.kms_endscan_freelist);
5344 
5345         return (s);
5346 }
5347 
5348 typedef struct kmem_move_notify_args {
5349         kmem_cache_t *kmna_cache;
5350         void *kmna_buf;
5351 } kmem_move_notify_args_t;
5352 
5353 static void
5354 kmem_cache_move_notify_task(void *arg)
5355 {
5356         kmem_move_notify_args_t *args = arg;
5357         kmem_cache_t *cp = args->kmna_cache;
5358         void *buf = args->kmna_buf;
5359         kmem_slab_t *sp;
5360 
5361         ASSERT(taskq_member(kmem_taskq, curthread));
5362         ASSERT(list_link_active(&cp->cache_link));
5363 
5364         kmem_free(args, sizeof (kmem_move_notify_args_t));
 
5384                         return;
5385                 }
5386 
5387                 kmem_slab_move_yes(cp, sp, buf);
5388                 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5389                 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5390                 mutex_exit(&cp->cache_lock);
5391                 /* see kmem_move_buffers() about dropping the lock */
5392                 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5393                 mutex_enter(&cp->cache_lock);
5394                 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5395                 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5396                 if (sp->slab_refcnt == 0) {
5397                         list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5398                         list_remove(deadlist, sp);
5399 
5400                         if (!avl_is_empty(
5401                             &cp->cache_defrag->kmd_moves_pending)) {
5402                                 list_insert_head(deadlist, sp);
5403                                 mutex_exit(&cp->cache_lock);
5404                                 KMEM_STAT_ADD(kmem_move_stats.
5405                                     kms_notify_slab_dead);
5406                                 return;
5407                         }
5408 
5409                         cp->cache_defrag->kmd_deadcount--;
5410                         cp->cache_slab_destroy++;
5411                         mutex_exit(&cp->cache_lock);
5412                         kmem_slab_destroy(cp, sp);
5413                         KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5414                         KMEM_STAT_ADD(kmem_move_stats.
5415                             kms_notify_slab_destroyed);
5416                         return;
5417                 }
5418         } else {
5419                 kmem_slab_move_yes(cp, sp, buf);
5420         }
5421         mutex_exit(&cp->cache_lock);
5422 }
5423 
5424 void
5425 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5426 {
5427         kmem_move_notify_args_t *args;
5428 
5429         KMEM_STAT_ADD(kmem_move_stats.kms_notify);
5430         args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5431         if (args != NULL) {
5432                 args->kmna_cache = cp;
5433                 args->kmna_buf = buf;
5434                 if (!taskq_dispatch(kmem_taskq,
5435                     (task_func_t *)kmem_cache_move_notify_task, args,
5436                     TQ_NOSLEEP))
5437                         kmem_free(args, sizeof (kmem_move_notify_args_t));
5438         }
5439 }
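
     /*
      * A client that had to answer KMEM_CBRC_LATER or KMEM_CBRC_NO for an
      * object can use this entry point to tell kmem when that object finally
      * becomes movable, rather than waiting to be asked again. A sketch,
      * continuing the hypothetical object_cache example from the big theory
      * statement (object_unpin() and o_pinned are likewise made up):
      *
      *      static void
      *      object_unpin(object_t *op)
      *      {
      *              op->o_pinned = 0;
      *              kmem_cache_move_notify(object_cache, op);
      *      }
      */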
5440 
5441 static void
5442 kmem_cache_defrag(kmem_cache_t *cp)
5443 {
5444         size_t n;
5445 
5446         ASSERT(cp->cache_defrag != NULL);
5447 
5448         mutex_enter(&cp->cache_lock);
5449         n = avl_numnodes(&cp->cache_partial_slabs);
5450         if (n > 1) {
5451                 /* kmem_move_buffers() drops and reacquires cache_lock */
5452                 KMEM_STAT_ADD(kmem_move_stats.kms_defrags);
5453                 cp->cache_defrag->kmd_defrags++;
5454                 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5455         }
5456         mutex_exit(&cp->cache_lock);
5457 }
5458 
5459 /* Is this cache above the fragmentation threshold? */
5460 static boolean_t
5461 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5462 {
5463         /*
5464          *      nfree          kmem_frag_numer
5465          * ------------------ > ---------------
5466          * cp->cache_buftotal   kmem_frag_denom
5467          */
5468         return ((nfree * kmem_frag_denom) >
5469             (cp->cache_buftotal * kmem_frag_numer));
5470 }
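
     /*
      * For example, with kmem_frag_numer = 1 and kmem_frag_denom =
      * KMEM_VOID_FRACTION (taking 64 as an illustrative value), a cache with
      * 10000 buffers in total is considered fragmented once nfree exceeds 156
      * (10000 / 64), i.e. once more than ~1.5% of its buffers sit free in the
      * slab layer.
      */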
5471 
5472 static boolean_t
 
5531         if (kmd->kmd_consolidate > 0) {
5532                 kmd->kmd_consolidate--;
5533                 mutex_exit(&cp->cache_lock);
5534                 kmem_cache_reap(cp);
5535                 return;
5536         }
5537 
5538         if (kmem_cache_is_fragmented(cp, &reap)) {
5539                 size_t slabs_found;
5540 
5541                 /*
5542                  * Consolidate reclaimable slabs from the end of the partial
5543                  * slab list (scan at most kmem_reclaim_scan_range slabs to find
5544                  * reclaimable slabs). Keep track of how many candidate slabs we
5545                  * looked for and how many we actually found so we can adjust
5546                  * the definition of a candidate slab if we're having trouble
5547                  * finding them.
5548                  *
5549                  * kmem_move_buffers() drops and reacquires cache_lock.
5550                  */
5551                 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5552                 kmd->kmd_scans++;
5553                 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5554                     kmem_reclaim_max_slabs, 0);
5555                 if (slabs_found >= 0) {
5556                         kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5557                         kmd->kmd_slabs_found += slabs_found;
5558                 }
5559 
5560                 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5561                         kmd->kmd_tries = 0;
5562 
5563                         /*
5564                          * If we had difficulty finding candidate slabs in
5565                          * previous scans, adjust the threshold so that
5566                          * candidates are easier to find.
5567                          */
5568                         if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5569                                 kmem_adjust_reclaim_threshold(kmd, -1);
5570                         } else if ((kmd->kmd_slabs_found * 2) <
5571                             kmd->kmd_slabs_sought) {
5572                                 kmem_adjust_reclaim_threshold(kmd, 1);
5573                         }
5574                         kmd->kmd_slabs_sought = 0;
5575                         kmd->kmd_slabs_found = 0;
5576                 }
5577         } else {
5578                 kmem_reset_reclaim_threshold(cp->cache_defrag);
5579 #ifdef  DEBUG
5580                 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5581                         /*
5582                          * In a debug kernel we want the consolidator to
5583                          * run occasionally even when there is plenty of
5584                          * memory.
5585                          */
5586                         uint16_t debug_rand;
5587 
5588                         (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5589                         if (!kmem_move_noreap &&
5590                             ((debug_rand % kmem_mtb_reap) == 0)) {
5591                                 mutex_exit(&cp->cache_lock);
5592                                 KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps);
5593                                 kmem_cache_reap(cp);
5594                                 return;
5595                         } else if ((debug_rand % kmem_mtb_move) == 0) {
5596                                 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5597                                 KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans);
5598                                 kmd->kmd_scans++;
5599                                 (void) kmem_move_buffers(cp,
5600                                     kmem_reclaim_scan_range, 1, KMM_DEBUG);
5601                         }
5602                 }
5603 #endif  /* DEBUG */
5604         }
5605 
5606         mutex_exit(&cp->cache_lock);
5607 
5608         if (reap) {
5609                 KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps);
5610                 kmem_depot_ws_reap(cp);
5611         }
5612 }