8493 kmem_move taskq appears to be inducing significant system latency
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>


 142  *            (The system won't be getting the slab back as long as the
 143  *            immovable object holds it hostage, so there's no point in moving
 144  *            any of its objects.)
 145  *     LATER: The client is using the object and cannot move it now, so kmem
 146  *            frees the new object (the unused copy destination). kmem still
 147  *            attempts to move other objects off the slab, since it expects to
 148  *            succeed in clearing the slab in a later callback. The client
 149  *            should use LATER instead of NO if the object is likely to become
 150  *            movable very soon.
 151  * DONT_NEED: The client no longer needs the object, so kmem frees the old along
 152  *            with the new object (the unused copy destination). This response
 153  *            is the client's opportunity to be a model citizen and give back as
 154  *            much as it can.
 155  * DONT_KNOW: The client does not know about the object because
 156  *            a) the client has just allocated the object and not yet put it
 157  *               wherever it expects to find known objects,
 158  *            b) the client has removed the object from wherever it expects to
 159  *               find known objects and is about to free it, or
 160  *            c) the client has freed the object.
 161  *            In all these cases (a, b, and c) kmem frees the new object (the
 162  *            unused copy destination) and searches for the old object in the
 163  *            magazine layer. If found, the object is removed from the magazine
 164  *            layer and freed to the slab layer so it will no longer hold the
 165  *            slab hostage.












 166  *
 167  * 2.3 Object States
 168  *
 169  * Neither kmem nor the client can be assumed to know the object's whereabouts
 170  * at the time of the callback. An object belonging to a kmem cache may be in
 171  * any of the following states:
 172  *
 173  * 1. Uninitialized on the slab
 174  * 2. Allocated from the slab but not constructed (still uninitialized)
 175  * 3. Allocated from the slab, constructed, but not yet ready for business
 176  *    (not in a valid state for the move callback)
 177  * 4. In use (valid and known to the client)
 178  * 5. About to be freed (no longer in a valid state for the move callback)
 179  * 6. Freed to a magazine (still constructed)
 180  * 7. Allocated from a magazine, not yet ready for business (not in a valid
 181  *    state for the move callback), and about to return to state #4
 182  * 8. Deconstructed on a magazine that is about to be freed
 183  * 9. Freed to the slab
 184  *
 185  * Since the move callback may be called at any time while the object is in any


 268  * c_objects_lock is held. Note that after acquiring the lock, the client must
 269  * recheck the o_container pointer in case the object was removed just before
 270  * acquiring the lock.
 271  *
 272  * When the client is about to free an object, it must first remove that object
 273  * from the list, hash, or other structure where it is kept. At that time, to
 274  * mark the object so it can be distinguished from the remaining, known objects,
 275  * the client sets the designated low order bit:
 276  *
 277  *      mutex_enter(&container->c_objects_lock);
 278  *      object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
 279  *      list_remove(&container->c_objects, object);
 280  *      mutex_exit(&container->c_objects_lock);
 281  *
 282  * In the common case, the object is freed to the magazine layer, where it may
 283  * be reused on a subsequent allocation without the overhead of calling the
 284  * constructor. While in the magazine it appears allocated from the point of
 285  * view of the slab layer, making it a candidate for the move callback. Most
 286  * objects unrecognized by the client in the move callback fall into this
 287  * category and are cheaply distinguished from known objects by the test
 288  * described earlier. Since recognition is cheap for the client, and searching
 289  * magazines is expensive for kmem, kmem defers searching until the client first
 290  * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem
 291  * elsewhere does what it can to avoid bothering the client unnecessarily.
 292  *
 293  * Invalidating the designated pointer member before freeing the object marks
 294  * the object to be avoided in the callback, and conversely, assigning a valid
 295  * value to the designated pointer member after allocating the object makes the
 296  * object fair game for the callback:
 297  *
 298  *      ... allocate object ...
 299  *      ... set any initial state not set by the constructor ...
 300  *
 301  *      mutex_enter(&container->c_objects_lock);
 302  *      list_insert_tail(&container->c_objects, object);
 303  *      membar_producer();
 304  *      object->o_container = container;
 305  *      mutex_exit(&container->c_objects_lock);
 306  *
 307  * Note that everything else must be valid before setting o_container makes the
 308  * object fair game for the move callback. The membar_producer() call ensures
 309  * that all the object's state is written to memory before setting the pointer
 310  * that transitions the object from state #3 or #7 (allocated, constructed, not
 311  * yet in use) to state #4 (in use, valid). That's important because the move

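The pattern described in the comment above comes together in the client's move callback. The sketch below is illustrative only, not code from this change: object_t, container_t, o_link_node and object_cache are hypothetical names following the comment's example, and the callback would typically be registered once after cache creation with kmem_cache_set_move(object_cache, object_move).

/* ARGSUSED */
static kmem_cbrc_t
object_move(void *buf, void *newbuf, size_t size, void *arg)
{
        object_t *op = buf, *np = newbuf;
        container_t *container;

        /*
         * A freed object has the designated low bit set in o_container,
         * and a newly allocated object has not had o_container assigned
         * yet, so both are recognized without taking any locks.
         */
        container = op->o_container;
        if (container == NULL || ((uintptr_t)container & 0x1))
                return (KMEM_CBRC_DONT_KNOW);

        if (!mutex_tryenter(&container->c_objects_lock))
                return (KMEM_CBRC_LATER);

        /* Recheck under the lock in case the object was just freed. */
        if (op->o_container != container) {
                mutex_exit(&container->c_objects_lock);
                return (KMEM_CBRC_DONT_KNOW);
        }

        /*
         * Copy the object and swap the new copy into the container's
         * list; kmem frees the old buffer once we answer YES.
         */
        *np = *op;
        list_link_replace(&op->o_link_node, &np->o_link_node);
        mutex_exit(&container->c_objects_lock);
        return (KMEM_CBRC_YES);
}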

1022 static kmem_cache_t     *kmem_bufctl_cache;
1023 static kmem_cache_t     *kmem_bufctl_audit_cache;
1024 
1025 static kmutex_t         kmem_cache_lock;        /* inter-cache linkage only */
1026 static list_t           kmem_caches;
1027 
1028 static taskq_t          *kmem_taskq;
1029 static kmutex_t         kmem_flags_lock;
1030 static vmem_t           *kmem_metadata_arena;
1031 static vmem_t           *kmem_msb_arena;        /* arena for metadata caches */
1032 static vmem_t           *kmem_cache_arena;
1033 static vmem_t           *kmem_hash_arena;
1034 static vmem_t           *kmem_log_arena;
1035 static vmem_t           *kmem_oversize_arena;
1036 static vmem_t           *kmem_va_arena;
1037 static vmem_t           *kmem_default_arena;
1038 static vmem_t           *kmem_firewall_va_arena;
1039 static vmem_t           *kmem_firewall_arena;
1040 
1041 /*
1042  * Define KMEM_STATS to turn on statistic gathering. By default, it is only
1043  * turned on when DEBUG is also defined.
1044  */
1045 #ifdef  DEBUG
1046 #define KMEM_STATS
1047 #endif  /* DEBUG */
1048 
1049 #ifdef  KMEM_STATS
1050 #define KMEM_STAT_ADD(stat)                     ((stat)++)
1051 #define KMEM_STAT_COND_ADD(cond, stat)          ((void) (!(cond) || (stat)++))
1052 #else
1053 #define KMEM_STAT_ADD(stat)                     /* nothing */
1054 #define KMEM_STAT_COND_ADD(cond, stat)          /* nothing */
1055 #endif  /* KMEM_STATS */
1056 
1057 /*
1058  * kmem slab consolidator thresholds (tunables)
1059  */
1060 size_t kmem_frag_minslabs = 101;        /* minimum total slabs */
1061 size_t kmem_frag_numer = 1;             /* free buffers (numerator) */
1062 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1063 /*
1064  * Maximum number of slabs from which to move buffers during a single
1065  * maintenance interval while the system is not low on memory.
1066  */
1067 size_t kmem_reclaim_max_slabs = 1;
1068 /*
1069  * Number of slabs to scan backwards from the end of the partial slab list
1070  * when searching for buffers to relocate.
1071  */
1072 size_t kmem_reclaim_scan_range = 12;
1073 
1074 #ifdef  KMEM_STATS
1075 static struct {
1076         uint64_t kms_callbacks;
1077         uint64_t kms_yes;
1078         uint64_t kms_no;
1079         uint64_t kms_later;
1080         uint64_t kms_dont_need;
1081         uint64_t kms_dont_know;
1082         uint64_t kms_hunt_found_mag;
1083         uint64_t kms_hunt_found_slab;
1084         uint64_t kms_hunt_alloc_fail;
1085         uint64_t kms_hunt_lucky;
1086         uint64_t kms_notify;
1087         uint64_t kms_notify_callbacks;
1088         uint64_t kms_disbelief;
1089         uint64_t kms_already_pending;
1090         uint64_t kms_callback_alloc_fail;
1091         uint64_t kms_callback_taskq_fail;
1092         uint64_t kms_endscan_slab_dead;
1093         uint64_t kms_endscan_slab_destroyed;
1094         uint64_t kms_endscan_nomem;
1095         uint64_t kms_endscan_refcnt_changed;
1096         uint64_t kms_endscan_nomove_changed;
1097         uint64_t kms_endscan_freelist;
1098         uint64_t kms_avl_update;
1099         uint64_t kms_avl_noupdate;
1100         uint64_t kms_no_longer_reclaimable;
1101         uint64_t kms_notify_no_longer_reclaimable;
1102         uint64_t kms_notify_slab_dead;
1103         uint64_t kms_notify_slab_destroyed;
1104         uint64_t kms_alloc_fail;
1105         uint64_t kms_constructor_fail;
1106         uint64_t kms_dead_slabs_freed;
1107         uint64_t kms_defrags;
1108         uint64_t kms_scans;
1109         uint64_t kms_scan_depot_ws_reaps;
1110         uint64_t kms_debug_reaps;
1111         uint64_t kms_debug_scans;
1112 } kmem_move_stats;
1113 #endif  /* KMEM_STATS */
1114 
1115 /* consolidator knobs */
1116 boolean_t kmem_move_noreap;
1117 boolean_t kmem_move_blocked;
1118 boolean_t kmem_move_fulltilt;
1119 boolean_t kmem_move_any_partial;
1120 
1121 #ifdef  DEBUG
1122 /*
1123  * kmem consolidator debug tunables:
1124  * Ensure code coverage by occasionally running the consolidator even when the
1125  * caches are not fragmented (they may never be). These intervals are mean time
1126  * in cache maintenance intervals (kmem_cache_update).
1127  */
1128 uint32_t kmem_mtb_move = 60;    /* defrag 1 slab (~15min) */
1129 uint32_t kmem_mtb_reap = 1800;  /* defrag all slabs (~7.5hrs) */
1130 #endif  /* DEBUG */
1131 
1132 static kmem_cache_t     *kmem_defrag_cache;
1133 static kmem_cache_t     *kmem_move_cache;
1134 static taskq_t          *kmem_move_taskq;


1905                          */
1906                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1907                                 list_insert_tail(deadlist, sp);
1908                         } else {
1909                                 list_insert_head(deadlist, sp);
1910                         }
1911                         cp->cache_defrag->kmd_deadcount++;
1912                         mutex_exit(&cp->cache_lock);
1913                 }
1914                 return;
1915         }
1916 
1917         if (bcp->bc_next == NULL) {
1918                 /* Transition the slab from completely allocated to partial. */
1919                 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1920                 ASSERT(sp->slab_chunks > 1);
1921                 list_remove(&cp->cache_complete_slabs, sp);
1922                 cp->cache_complete_slab_count--;
1923                 avl_add(&cp->cache_partial_slabs, sp);
1924         } else {
1925 #ifdef  DEBUG
1926                 if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
1927                         KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
1928                 } else {
1929                         KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
1930                 }
1931 #else
1932                 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1933 #endif
1934         }
1935 
1936         ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1937             (cp->cache_complete_slab_count +
1938             avl_numnodes(&cp->cache_partial_slabs) +
1939             (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1940         mutex_exit(&cp->cache_lock);
1941 }
1942 
1943 /*
 1944  * Return -1 if kmem_error() was called, 1 if the constructor fails, 0 on success.
1945  */
1946 static int
1947 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1948     caddr_t caller)
1949 {
1950         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1951         kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1952         uint32_t mtbf;
1953 


 3562                 kmcp->kmc_move_yes.value.ui64         = 0;
 3563                 kmcp->kmc_move_no.value.ui64          = 0;
 3564                 kmcp->kmc_move_later.value.ui64       = 0;
 3565                 kmcp->kmc_move_dont_need.value.ui64   = 0;
 3566                 kmcp->kmc_move_dont_know.value.ui64   = 0;
 3567                 kmcp->kmc_move_hunt_found.value.ui64  = 0;
 3568                 kmcp->kmc_move_slabs_freed.value.ui64 = 0;
 3569                 kmcp->kmc_defrag.value.ui64           = 0;
 3570                 kmcp->kmc_scan.value.ui64             = 0;
 3571                 kmcp->kmc_move_reclaimable.value.ui64 = 0;
3572         } else {
3573                 int64_t reclaimable;
3574 
3575                 kmem_defrag_t *kd = cp->cache_defrag;
 3576                 kmcp->kmc_move_callbacks.value.ui64   = kd->kmd_callbacks;
 3577                 kmcp->kmc_move_yes.value.ui64         = kd->kmd_yes;
 3578                 kmcp->kmc_move_no.value.ui64          = kd->kmd_no;
 3579                 kmcp->kmc_move_later.value.ui64       = kd->kmd_later;
 3580                 kmcp->kmc_move_dont_need.value.ui64   = kd->kmd_dont_need;
 3581                 kmcp->kmc_move_dont_know.value.ui64   = kd->kmd_dont_know;
 3582                 kmcp->kmc_move_hunt_found.value.ui64  = kd->kmd_hunt_found;
 3583                 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
 3584                 kmcp->kmc_defrag.value.ui64           = kd->kmd_defrags;
 3585                 kmcp->kmc_scan.value.ui64             = kd->kmd_scans;
3586 
3587                 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3588                 reclaimable = MAX(reclaimable, 0);
3589                 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
 3590                 kmcp->kmc_move_reclaimable.value.ui64 = reclaimable;
3591         }
3592 
3593         mutex_exit(&cp->cache_lock);
3594         return (0);
3595 }
3596 
3597 /*
3598  * Return a named statistic about a particular cache.
3599  * This shouldn't be called very often, so it's currently designed for
3600  * simplicity (leverages existing kstat support) rather than efficiency.
3601  */
3602 uint64_t


4133                 kmem_cache_free(kmem_defrag_cache, defrag); /* unused */
4134         }
4135 }
4136 
4137 void
4138 kmem_cache_destroy(kmem_cache_t *cp)
4139 {
4140         int cpu_seqid;
4141 
4142         /*
4143          * Remove the cache from the global cache list so that no one else
4144          * can schedule tasks on its behalf, wait for any pending tasks to
4145          * complete, purge the cache, and then destroy it.
4146          */
4147         mutex_enter(&kmem_cache_lock);
4148         list_remove(&kmem_caches, cp);
4149         mutex_exit(&kmem_cache_lock);
4150 
4151         if (kmem_taskq != NULL)
4152                 taskq_wait(kmem_taskq);
4153         if (kmem_move_taskq != NULL)

4154                 taskq_wait(kmem_move_taskq);
4155 
4156         kmem_cache_magazine_purge(cp);
4157 
4158         mutex_enter(&cp->cache_lock);
4159         if (cp->cache_buftotal != 0)
4160                 cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty",
4161                     cp->cache_name, (void *)cp);
4162         if (cp->cache_defrag != NULL) {
4163                 avl_destroy(&cp->cache_defrag->kmd_moves_pending);
4164                 list_destroy(&cp->cache_defrag->kmd_deadlist);
4165                 kmem_cache_free(kmem_defrag_cache, cp->cache_defrag);
4166                 cp->cache_defrag = NULL;
4167         }
4168         /*
4169          * The cache is now dead.  There should be no further activity.  We
4170          * enforce this by setting land mines in the constructor, destructor,
4171          * reclaim, and move routines that induce a kernel text fault if
4172          * invoked.
4173          */


4660                 return (B_FALSE);
4661         }
4662 
4663         if ((refcnt == 1) || kmem_move_any_partial) {
4664                 return (refcnt < sp->slab_chunks);
4665         }
4666 
4667         /*
4668          * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4669          * slabs with a progressively higher percentage of used buffers can be
4670          * reclaimed until the cache as a whole is no longer fragmented.
4671          *
4672          *      sp->slab_refcnt   kmd_reclaim_numer
4673          *      --------------- < ------------------
4674          *      sp->slab_chunks   KMEM_VOID_FRACTION
4675          */
4676         return ((refcnt * KMEM_VOID_FRACTION) <
4677             (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4678 }
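As a worked reading of the inequality above (an illustration, not part of this change): when kmd_reclaim_numer reaches KMEM_VOID_FRACTION the test reduces to refcnt < slab_chunks, so every partial slab qualifies, which is the same condition the kmem_move_any_partial override forces; smaller numerators restrict candidates to slabs whose allocated fraction stays below kmd_reclaim_numer / KMEM_VOID_FRACTION. kmem_cache_scan() nudges the numerator up when recent scans found too few candidates and back down when candidates were plentiful, so progressively fuller slabs become eligible only while the cache remains fragmented.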
4679 
4680 static void *
4681 kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf,
4682     void *tbuf)
4683 {
4684         int i;          /* magazine round index */
4685 
4686         for (i = 0; i < n; i++) {
4687                 if (buf == m->mag_round[i]) {
4688                         if (cp->cache_flags & KMF_BUFTAG) {
4689                                 (void) kmem_cache_free_debug(cp, tbuf,
4690                                     caller());
4691                         }
4692                         m->mag_round[i] = tbuf;
4693                         return (buf);
4694                 }
4695         }
4696 
4697         return (NULL);
4698 }
4699 
4700 /*
4701  * Hunt the magazine layer for the given buffer. If found, the buffer is
 4702  * removed from the magazine layer and returned; otherwise NULL is returned.
 4703  * A returned buffer is in the freed, constructed state.
4704  */
4705 static void *
4706 kmem_hunt_mags(kmem_cache_t *cp, void *buf)
4707 {
4708         kmem_cpu_cache_t *ccp;
4709         kmem_magazine_t *m;
4710         int cpu_seqid;
4711         int n;          /* magazine rounds */
4712         void *tbuf;     /* temporary swap buffer */
4713 
4714         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4715 
4716         /*
 4717          * Allocate a buffer to swap with the one we hope to pull out of a
4718          * magazine when found.
4719          */
4720         tbuf = kmem_cache_alloc(cp, KM_NOSLEEP);
4721         if (tbuf == NULL) {
4722                 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail);
4723                 return (NULL);
4724         }
4725         if (tbuf == buf) {
4726                 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky);
4727                 if (cp->cache_flags & KMF_BUFTAG) {
4728                         (void) kmem_cache_free_debug(cp, buf, caller());
4729                 }
4730                 return (buf);
4731         }
4732 
4733         /* Hunt the depot. */
4734         mutex_enter(&cp->cache_depot_lock);
4735         n = cp->cache_magtype->mt_magsize;
4736         for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) {
4737                 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4738                         mutex_exit(&cp->cache_depot_lock);
4739                         return (buf);
4740                 }
4741         }
4742         mutex_exit(&cp->cache_depot_lock);
4743 
4744         /* Hunt the per-CPU magazines. */
4745         for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
4746                 ccp = &cp->cache_cpu[cpu_seqid];
4747 
4748                 mutex_enter(&ccp->cc_lock);
4749                 m = ccp->cc_loaded;
4750                 n = ccp->cc_rounds;
4751                 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4752                         mutex_exit(&ccp->cc_lock);
4753                         return (buf);
4754                 }
4755                 m = ccp->cc_ploaded;
4756                 n = ccp->cc_prounds;
4757                 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4758                         mutex_exit(&ccp->cc_lock);
4759                         return (buf);
4760                 }
4761                 mutex_exit(&ccp->cc_lock);
4762         }
4763 
4764         kmem_cache_free(cp, tbuf);
4765         return (NULL);
4766 }
4767 
4768 /*
4769  * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4770  * or when the buffer is freed.
4771  */
4772 static void
4773 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4774 {
4775         ASSERT(MUTEX_HELD(&cp->cache_lock));
4776         ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4777 
4778         if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4779                 return;
4780         }
4781 
4782         if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4783                 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4784                         avl_remove(&cp->cache_partial_slabs, sp);
4785                         sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4786                         sp->slab_stuck_offset = (uint32_t)-1;
4787                         avl_add(&cp->cache_partial_slabs, sp);
4788                 }


4811 }
4812 
4813 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4814 
4815 /*
4816  * The move callback takes two buffer addresses, the buffer to be moved, and a
4817  * newly allocated and constructed buffer selected by kmem as the destination.
4818  * It also takes the size of the buffer and an optional user argument specified
4819  * at cache creation time. kmem guarantees that the buffer to be moved has not
4820  * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4821  * guarantee the present whereabouts of the buffer to be moved, so it is up to
4822  * the client to safely determine whether or not it is still using the buffer.
4823  * The client must not free either of the buffers passed to the move callback,
4824  * since kmem wants to free them directly to the slab layer. The client response
4825  * tells kmem which of the two buffers to free:
4826  *
4827  * YES          kmem frees the old buffer (the move was successful)
4828  * NO           kmem frees the new buffer, marks the slab of the old buffer
4829  *              non-reclaimable to avoid bothering the client again
4830  * LATER        kmem frees the new buffer, increments slab_later_count
4831  * DONT_KNOW    kmem frees the new buffer, searches mags for the old buffer
4832  * DONT_NEED    kmem frees both the old buffer and the new buffer
4833  *
4834  * The pending callback argument now being processed contains both of the
4835  * buffers (old and new) passed to the move callback function, the slab of the
4836  * old buffer, and flags related to the move request, such as whether or not the
4837  * system was desperate for memory.
4838  *
4839  * Slabs are not freed while there is a pending callback, but instead are kept
4840  * on a deadlist, which is drained after the last callback completes. This means
4841  * that slabs are safe to access until kmem_move_end(), no matter how many of
4842  * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4843  * zero for as long as the slab remains on the deadlist and until the slab is
4844  * freed.
4845  */
4846 static void
4847 kmem_move_buffer(kmem_move_t *callback)
4848 {
4849         kmem_cbrc_t response;
4850         kmem_slab_t *sp = callback->kmm_from_slab;
4851         kmem_cache_t *cp = sp->slab_cache;
4852         boolean_t free_on_slab;
4853 
4854         ASSERT(taskq_member(kmem_move_taskq, curthread));
4855         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4856         ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4857 
4858         /*
4859          * The number of allocated buffers on the slab may have changed since we
4860          * last checked the slab's reclaimability (when the pending move was
4861          * enqueued), or the client may have responded NO when asked to move
4862          * another buffer on the same slab.
4863          */
4864         if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4865                 KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable);
4866                 KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4867                     kmem_move_stats.kms_notify_no_longer_reclaimable);
4868                 kmem_slab_free(cp, callback->kmm_to_buf);
4869                 kmem_move_end(cp, callback);
4870                 return;
4871         }
4872 
4873         /*
4874          * Hunting magazines is expensive, so we'll wait to do that until the
4875          * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer
4876          * is cheap, so we might as well do that here in case we can avoid
4877          * bothering the client.
4878          */
4879         mutex_enter(&cp->cache_lock);
4880         free_on_slab = (kmem_slab_allocated(cp, sp,
4881             callback->kmm_from_buf) == NULL);
4882         mutex_exit(&cp->cache_lock);
4883 
4884         if (free_on_slab) {
4885                 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab);
4886                 kmem_slab_free(cp, callback->kmm_to_buf);
4887                 kmem_move_end(cp, callback);
4888                 return;
4889         }
4890 
4891         if (cp->cache_flags & KMF_BUFTAG) {
4892                 /*
4893                  * Make kmem_cache_alloc_debug() apply the constructor for us.
4894                  */
4895                 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4896                     KM_NOSLEEP, 1, caller()) != 0) {
4897                         KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail);
4898                         kmem_move_end(cp, callback);
4899                         return;
4900                 }
4901         } else if (cp->cache_constructor != NULL &&
4902             cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4903             KM_NOSLEEP) != 0) {
4904                 atomic_inc_64(&cp->cache_alloc_fail);
4905                 KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail);
4906                 kmem_slab_free(cp, callback->kmm_to_buf);
4907                 kmem_move_end(cp, callback);
4908                 return;
4909         }
4910 
4911         KMEM_STAT_ADD(kmem_move_stats.kms_callbacks);
4912         KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4913             kmem_move_stats.kms_notify_callbacks);
4914         cp->cache_defrag->kmd_callbacks++;
4915         cp->cache_defrag->kmd_thread = curthread;
4916         cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4917         cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4918         DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4919             callback);
4920 
4921         response = cp->cache_move(callback->kmm_from_buf,
4922             callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4923 
4924         DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4925             callback, kmem_cbrc_t, response);
4926         cp->cache_defrag->kmd_thread = NULL;
4927         cp->cache_defrag->kmd_from_buf = NULL;
4928         cp->cache_defrag->kmd_to_buf = NULL;
4929 
4930         if (response == KMEM_CBRC_YES) {
4931                 KMEM_STAT_ADD(kmem_move_stats.kms_yes);
4932                 cp->cache_defrag->kmd_yes++;
4933                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4934                 /* slab safe to access until kmem_move_end() */
4935                 if (sp->slab_refcnt == 0)
4936                         cp->cache_defrag->kmd_slabs_freed++;
4937                 mutex_enter(&cp->cache_lock);
4938                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4939                 mutex_exit(&cp->cache_lock);
4940                 kmem_move_end(cp, callback);
4941                 return;
4942         }
4943 
4944         switch (response) {
4945         case KMEM_CBRC_NO:
4946                 KMEM_STAT_ADD(kmem_move_stats.kms_no);
4947                 cp->cache_defrag->kmd_no++;
4948                 mutex_enter(&cp->cache_lock);
4949                 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4950                 mutex_exit(&cp->cache_lock);
4951                 break;
4952         case KMEM_CBRC_LATER:
4953                 KMEM_STAT_ADD(kmem_move_stats.kms_later);
4954                 cp->cache_defrag->kmd_later++;
4955                 mutex_enter(&cp->cache_lock);
4956                 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4957                         mutex_exit(&cp->cache_lock);
4958                         break;
4959                 }
4960 
4961                 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4962                         KMEM_STAT_ADD(kmem_move_stats.kms_disbelief);
4963                         kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4964                 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4965                         sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4966                             callback->kmm_from_buf);
4967                 }
4968                 mutex_exit(&cp->cache_lock);
4969                 break;
4970         case KMEM_CBRC_DONT_NEED:
4971                 KMEM_STAT_ADD(kmem_move_stats.kms_dont_need);
4972                 cp->cache_defrag->kmd_dont_need++;
4973                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4974                 if (sp->slab_refcnt == 0)
4975                         cp->cache_defrag->kmd_slabs_freed++;
4976                 mutex_enter(&cp->cache_lock);
4977                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4978                 mutex_exit(&cp->cache_lock);
4979                 break;
4980         case KMEM_CBRC_DONT_KNOW:
4981                 KMEM_STAT_ADD(kmem_move_stats.kms_dont_know);













4982                 cp->cache_defrag->kmd_dont_know++;
4983                 if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) {
4984                         KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag);
4985                         cp->cache_defrag->kmd_hunt_found++;
4986                         kmem_slab_free_constructed(cp, callback->kmm_from_buf,
4987                             B_TRUE);
4988                         if (sp->slab_refcnt == 0)
4989                                 cp->cache_defrag->kmd_slabs_freed++;
4990                         mutex_enter(&cp->cache_lock);
4991                         kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4992                         mutex_exit(&cp->cache_lock);
4993                 }
4994                 break;
4995         default:
4996                 panic("'%s' (%p) unexpected move callback response %d\n",
4997                     cp->cache_name, (void *)cp, response);
4998         }
4999 
5000         kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
5001         kmem_move_end(cp, callback);
5002 }
5003 
5004 /* Return B_FALSE if there is insufficient memory for the move request. */
5005 static boolean_t
5006 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
5007 {
5008         void *to_buf;
5009         avl_index_t index;
5010         kmem_move_t *callback, *pending;
5011         ulong_t n;
5012 
5013         ASSERT(taskq_member(kmem_taskq, curthread));
5014         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5015         ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5016 
5017         callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
5018         if (callback == NULL) {
5019                 KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail);
5020                 return (B_FALSE);
5021         }
5022 
5023         callback->kmm_from_slab = sp;
5024         callback->kmm_from_buf = buf;
5025         callback->kmm_flags = flags;
5026 
5027         mutex_enter(&cp->cache_lock);
5028 
5029         n = avl_numnodes(&cp->cache_partial_slabs);
5030         if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
5031                 mutex_exit(&cp->cache_lock);
5032                 kmem_cache_free(kmem_move_cache, callback);
5033                 return (B_TRUE); /* there is no need for the move request */
5034         }
5035 
5036         pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
5037         if (pending != NULL) {
5038                 /*
5039                  * If the move is already pending and we're desperate now,
5040                  * update the move flags.
5041                  */
5042                 if (flags & KMM_DESPERATE) {
5043                         pending->kmm_flags |= KMM_DESPERATE;
5044                 }
5045                 mutex_exit(&cp->cache_lock);
5046                 KMEM_STAT_ADD(kmem_move_stats.kms_already_pending);
5047                 kmem_cache_free(kmem_move_cache, callback);
5048                 return (B_TRUE);
5049         }
5050 
5051         to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
5052             B_FALSE);
5053         callback->kmm_to_buf = to_buf;
5054         avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
5055 
5056         mutex_exit(&cp->cache_lock);
5057 
5058         if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
5059             callback, TQ_NOSLEEP)) {
5060                 KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail);
5061                 mutex_enter(&cp->cache_lock);
5062                 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
5063                 mutex_exit(&cp->cache_lock);
5064                 kmem_slab_free(cp, to_buf);
5065                 kmem_cache_free(kmem_move_cache, callback);
5066                 return (B_FALSE);
5067         }
5068 
5069         return (B_TRUE);
5070 }
5071 
5072 static void
5073 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
5074 {
5075         avl_index_t index;
5076 
5077         ASSERT(cp->cache_defrag != NULL);
5078         ASSERT(taskq_member(kmem_move_taskq, curthread));
5079         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5080 


5086                 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5087                 kmem_slab_t *sp;
5088 
5089                 /*
5090                  * The last pending move completed. Release all slabs from the
5091                  * front of the dead list except for any slab at the tail that
5092                  * needs to be released from the context of kmem_move_buffers().
5093                  * kmem deferred unmapping the buffers on these slabs in order
5094                  * to guarantee that buffers passed to the move callback have
5095                  * been touched only by kmem or by the client itself.
5096                  */
5097                 while ((sp = list_remove_head(deadlist)) != NULL) {
5098                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
5099                                 list_insert_tail(deadlist, sp);
5100                                 break;
5101                         }
5102                         cp->cache_defrag->kmd_deadcount--;
5103                         cp->cache_slab_destroy++;
5104                         mutex_exit(&cp->cache_lock);
5105                         kmem_slab_destroy(cp, sp);
5106                         KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5107                         mutex_enter(&cp->cache_lock);
5108                 }
5109         }
5110         mutex_exit(&cp->cache_lock);
5111         kmem_cache_free(kmem_move_cache, callback);
5112 }
5113 
5114 /*
5115  * Move buffers from least used slabs first by scanning backwards from the end
5116  * of the partial slab list. Scan at most max_scan candidate slabs and move
5117  * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
5118  * If desperate to reclaim memory, move buffers from any partial slab, otherwise
5119  * skip slabs with a ratio of allocated buffers at or above the current
5120  * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
5121  * scan is aborted) so that the caller can adjust the reclaimability threshold
5122  * depending on how many reclaimable slabs it finds.
5123  *
5124  * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
5125  * move request, since it is not valid for kmem_move_begin() to call
5126  * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.


5231                                 list_t *deadlist =
5232                                     &cp->cache_defrag->kmd_deadlist;
5233                                 list_remove(deadlist, sp);
5234 
5235                                 if (!avl_is_empty(
5236                                     &cp->cache_defrag->kmd_moves_pending)) {
5237                                         /*
5238                                          * A pending move makes it unsafe to
5239                                          * destroy the slab, because even though
5240                                          * the move is no longer needed, the
5241                                          * context where that is determined
5242                                          * requires the slab to exist.
5243                                          * Fortunately, a pending move also
5244                                          * means we don't need to destroy the
5245                                          * slab here, since it will get
5246                                          * destroyed along with any other slabs
5247                                          * on the deadlist after the last
5248                                          * pending move completes.
5249                                          */
5250                                         list_insert_head(deadlist, sp);
5251                                         KMEM_STAT_ADD(kmem_move_stats.
5252                                             kms_endscan_slab_dead);
5253                                         return (-1);
5254                                 }
5255 
5256                                 /*
5257                                  * Destroy the slab now if it was completely
5258                                  * freed while we dropped cache_lock and there
5259                                  * are no pending moves. Since slab_refcnt
5260                                  * cannot change once it reaches zero, no new
5261                                  * pending moves from that slab are possible.
5262                                  */
5263                                 cp->cache_defrag->kmd_deadcount--;
5264                                 cp->cache_slab_destroy++;
5265                                 mutex_exit(&cp->cache_lock);
5266                                 kmem_slab_destroy(cp, sp);
5267                                 KMEM_STAT_ADD(kmem_move_stats.
5268                                     kms_dead_slabs_freed);
5269                                 KMEM_STAT_ADD(kmem_move_stats.
5270                                     kms_endscan_slab_destroyed);
5271                                 mutex_enter(&cp->cache_lock);
5272                                 /*
5273                                  * Since we can't pick up the scan where we left
5274                                  * off, abort the scan and say nothing about the
5275                                  * number of reclaimable slabs.
5276                                  */
5277                                 return (-1);
5278                         }
5279 
5280                         if (!success) {
5281                                 /*
5282                                  * Abort the scan if there is not enough memory
5283                                  * for the request and say nothing about the
5284                                  * number of reclaimable slabs.
5285                                  */
5286                                 KMEM_STAT_COND_ADD(s < max_slabs,
5287                                     kmem_move_stats.kms_endscan_nomem);
5288                                 return (-1);
5289                         }
5290 
5291                         /*
5292                          * The slab's position changed while the lock was
5293                          * dropped, so we don't know where we are in the
5294                          * sequence any more.
5295                          */
5296                         if (sp->slab_refcnt != refcnt) {
5297                                 /*
5298                                  * If this is a KMM_DEBUG move, the slab_refcnt
5299                                  * may have changed because we allocated a
5300                                  * destination buffer on the same slab. In that
5301                                  * case, we're not interested in counting it.
5302                                  */
5303                                 KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5304                                     (s < max_slabs),
5305                                     kmem_move_stats.kms_endscan_refcnt_changed);
5306                                 return (-1);
5307                         }
5308                         if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) {
5309                                 KMEM_STAT_COND_ADD(s < max_slabs,
5310                                     kmem_move_stats.kms_endscan_nomove_changed);
5311                                 return (-1);
5312                         }
5313 
5314                         /*
5315                          * Generating a move request allocates a destination
5316                          * buffer from the slab layer, bumping the first partial
5317                          * slab if it is completely allocated. If the current
5318                          * slab becomes the first partial slab as a result, we
5319                          * can't continue to scan backwards.
5320                          *
5321                          * If this is a KMM_DEBUG move and we allocated the
5322                          * destination buffer from the last partial slab, then
5323                          * the buffer we're moving is on the same slab and our
5324                          * slab_refcnt has changed, causing us to return before
5325                          * reaching here if there are no partial slabs left.
5326                          */
5327                         ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5328                         if (sp == avl_first(&cp->cache_partial_slabs)) {
5329                                 /*
5330                                  * We're not interested in a second KMM_DEBUG
5331                                  * move.
5332                                  */
5333                                 goto end_scan;
5334                         }
5335                 }
5336         }
5337 end_scan:
5338 
5339         KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5340             (s < max_slabs) &&
5341             (sp == avl_first(&cp->cache_partial_slabs)),
5342             kmem_move_stats.kms_endscan_freelist);
5343 
5344         return (s);
5345 }
5346 
5347 typedef struct kmem_move_notify_args {
5348         kmem_cache_t *kmna_cache;
5349         void *kmna_buf;
5350 } kmem_move_notify_args_t;
5351 
5352 static void
5353 kmem_cache_move_notify_task(void *arg)
5354 {
5355         kmem_move_notify_args_t *args = arg;
5356         kmem_cache_t *cp = args->kmna_cache;
5357         void *buf = args->kmna_buf;
5358         kmem_slab_t *sp;
5359 
5360         ASSERT(taskq_member(kmem_taskq, curthread));
5361         ASSERT(list_link_active(&cp->cache_link));
5362 
5363         kmem_free(args, sizeof (kmem_move_notify_args_t));


5383                         return;
5384                 }
5385 
5386                 kmem_slab_move_yes(cp, sp, buf);
5387                 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5388                 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5389                 mutex_exit(&cp->cache_lock);
5390                 /* see kmem_move_buffers() about dropping the lock */
5391                 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5392                 mutex_enter(&cp->cache_lock);
5393                 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5394                 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5395                 if (sp->slab_refcnt == 0) {
5396                         list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5397                         list_remove(deadlist, sp);
5398 
5399                         if (!avl_is_empty(
5400                             &cp->cache_defrag->kmd_moves_pending)) {
5401                                 list_insert_head(deadlist, sp);
5402                                 mutex_exit(&cp->cache_lock);
5403                                 KMEM_STAT_ADD(kmem_move_stats.
5404                                     kms_notify_slab_dead);
5405                                 return;
5406                         }
5407 
5408                         cp->cache_defrag->kmd_deadcount--;
5409                         cp->cache_slab_destroy++;
5410                         mutex_exit(&cp->cache_lock);
5411                         kmem_slab_destroy(cp, sp);
5412                         KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5413                         KMEM_STAT_ADD(kmem_move_stats.
5414                             kms_notify_slab_destroyed);
5415                         return;
5416                 }
5417         } else {
5418                 kmem_slab_move_yes(cp, sp, buf);
5419         }
5420         mutex_exit(&cp->cache_lock);
5421 }
5422 
5423 void
5424 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5425 {
5426         kmem_move_notify_args_t *args;
5427 
5428         KMEM_STAT_ADD(kmem_move_stats.kms_notify);
5429         args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5430         if (args != NULL) {
5431                 args->kmna_cache = cp;
5432                 args->kmna_buf = buf;
5433                 if (!taskq_dispatch(kmem_taskq,
5434                     (task_func_t *)kmem_cache_move_notify_task, args,
5435                     TQ_NOSLEEP))
5436                         kmem_free(args, sizeof (kmem_move_notify_args_t));
5437         }
5438 }
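kmem_cache_move_notify() above is the client-facing half of the LATER protocol: when whatever pinned an object clears, the client may notify kmem so that a move of that object is retried without waiting for the next maintenance scan. The request is handed to kmem_taskq and is best-effort; it is silently dropped if the KM_NOSLEEP allocation or the dispatch fails. A hypothetical caller (object_cache, o_busy and object_t are illustrative names, not from this change) might look like:

/* Hypothetical client code, for illustration only. */
static void
object_release(object_t *op)
{
        op->o_busy = B_FALSE;
        /*
         * The move callback answered KMEM_CBRC_LATER while o_busy was
         * set; let kmem know the object has become movable.
         */
        kmem_cache_move_notify(object_cache, op);
}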
5439 
5440 static void
5441 kmem_cache_defrag(kmem_cache_t *cp)
5442 {
5443         size_t n;
5444 
5445         ASSERT(cp->cache_defrag != NULL);
5446 
5447         mutex_enter(&cp->cache_lock);
5448         n = avl_numnodes(&cp->cache_partial_slabs);
5449         if (n > 1) {
5450                 /* kmem_move_buffers() drops and reacquires cache_lock */
5451                 KMEM_STAT_ADD(kmem_move_stats.kms_defrags);
5452                 cp->cache_defrag->kmd_defrags++;
5453                 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5454         }
5455         mutex_exit(&cp->cache_lock);
5456 }
5457 
5458 /* Is this cache above the fragmentation threshold? */
5459 static boolean_t
5460 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5461 {
5462         /*
 5463          *      nfree            kmem_frag_numer
 5464          * ------------------  >  ---------------
 5465          * cp->cache_buftotal     kmem_frag_denom
5466          */
5467         return ((nfree * kmem_frag_denom) >
5468             (cp->cache_buftotal * kmem_frag_numer));
5469 }
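Read concretely (an illustration, not part of this change): with the defaults shown earlier, kmem_frag_numer is 1 and kmem_frag_denom is KMEM_VOID_FRACTION, so the cache is reported as fragmented once nfree * KMEM_VOID_FRACTION exceeds cp->cache_buftotal, that is, once the free fraction nfree / cache_buftotal rises above 1 / KMEM_VOID_FRACTION.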
5470 
5471 static boolean_t


5530         if (kmd->kmd_consolidate > 0) {
5531                 kmd->kmd_consolidate--;
5532                 mutex_exit(&cp->cache_lock);
5533                 kmem_cache_reap(cp);
5534                 return;
5535         }
5536 
5537         if (kmem_cache_is_fragmented(cp, &reap)) {
 5538                 long slabs_found;
5539 
5540                 /*
5541                  * Consolidate reclaimable slabs from the end of the partial
5542                  * slab list (scan at most kmem_reclaim_scan_range slabs to find
5543                  * reclaimable slabs). Keep track of how many candidate slabs we
5544                  * looked for and how many we actually found so we can adjust
5545                  * the definition of a candidate slab if we're having trouble
5546                  * finding them.
5547                  *
5548                  * kmem_move_buffers() drops and reacquires cache_lock.
5549                  */
5550                 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5551                 kmd->kmd_scans++;
5552                 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5553                     kmem_reclaim_max_slabs, 0);
5554                 if (slabs_found >= 0) {
5555                         kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5556                         kmd->kmd_slabs_found += slabs_found;
5557                 }
5558 
5559                 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5560                         kmd->kmd_tries = 0;
5561 
5562                         /*
5563                          * If we had difficulty finding candidate slabs in
5564                          * previous scans, adjust the threshold so that
5565                          * candidates are easier to find.
5566                          */
5567                         if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5568                                 kmem_adjust_reclaim_threshold(kmd, -1);
5569                         } else if ((kmd->kmd_slabs_found * 2) <
5570                             kmd->kmd_slabs_sought) {
5571                                 kmem_adjust_reclaim_threshold(kmd, 1);
5572                         }
5573                         kmd->kmd_slabs_sought = 0;
5574                         kmd->kmd_slabs_found = 0;
5575                 }
5576         } else {
5577                 kmem_reset_reclaim_threshold(cp->cache_defrag);
5578 #ifdef  DEBUG
5579                 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5580                         /*
5581                          * In a debug kernel we want the consolidator to
5582                          * run occasionally even when there is plenty of
5583                          * memory.
5584                          */
5585                         uint16_t debug_rand;
5586 
5587                         (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5588                         if (!kmem_move_noreap &&
5589                             ((debug_rand % kmem_mtb_reap) == 0)) {
5590                                 mutex_exit(&cp->cache_lock);
5591                                 KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps);
5592                                 kmem_cache_reap(cp);
5593                                 return;
5594                         } else if ((debug_rand % kmem_mtb_move) == 0) {
5595                                 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5596                                 KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans);
5597                                 kmd->kmd_scans++;
5598                                 (void) kmem_move_buffers(cp,
5599                                     kmem_reclaim_scan_range, 1, KMM_DEBUG);
5600                         }
5601                 }
5602 #endif  /* DEBUG */
5603         }
5604 
5605         mutex_exit(&cp->cache_lock);
5606 
5607         if (reap) {
5608                 KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps);
5609                 kmem_depot_ws_reap(cp);
5610         }
5611 }


 142  *            (The system won't be getting the slab back as long as the
 143  *            immovable object holds it hostage, so there's no point in moving
 144  *            any of its objects.)
 145  *     LATER: The client is using the object and cannot move it now, so kmem
 146  *            frees the new object (the unused copy destination). kmem still
 147  *            attempts to move other objects off the slab, since it expects to
 148  *            succeed in clearing the slab in a later callback. The client
 149  *            should use LATER instead of NO if the object is likely to become
 150  *            movable very soon.
 151  * DONT_NEED: The client no longer needs the object, so kmem frees the old along
 152  *            with the new object (the unused copy destination). This response
 153  *            is the client's opportunity to be a model citizen and give back as
 154  *            much as it can.
 155  * DONT_KNOW: The client does not know about the object because
 156  *            a) the client has just allocated the object and not yet put it
 157  *               wherever it expects to find known objects,
 158  *            b) the client has removed the object from wherever it expects to
 159  *               find known objects and is about to free it, or
 160  *            c) the client has freed the object.
 161  *            In all these cases (a, b, and c) kmem frees the new object (the
 162  *            unused copy destination).  In the first case, the object is in
 163  *            use and the correct action is that for LATER; in the latter two
 164  *            cases, we know that the object is either freed or about to be
 165  *            freed, in which case it is either already in a magazine or about
 166  *            to be in one.  In these cases, we know that the object will either
 167  *            be reallocated and reused, or it will end up in a full magazine
 168  *            that will be reaped (thereby liberating the slab).  Because it
 169  *            is prohibitively expensive to differentiate these cases, and
 170  *            because the defrag code is executed when we're low on memory
 171  *            (thereby biasing the system to reclaim full magazines) we treat
 172  *            (thereby biasing the system to reclaim full magazines), we treat
 173  *            generally clean up full magazines.  While we take the same action
 174  *            for these cases, we maintain their semantic distinction:  if
 175  *            defragmentation is not occurring, it is useful to know if this
 176  *            is due to objects in use (LATER) or objects in an unknown state
 177  *            of transition (DONT_KNOW).
 178  *
 179  * 2.3 Object States
 180  *
 181  * Neither kmem nor the client can be assumed to know the object's whereabouts
 182  * at the time of the callback. An object belonging to a kmem cache may be in
 183  * any of the following states:
 184  *
 185  * 1. Uninitialized on the slab
 186  * 2. Allocated from the slab but not constructed (still uninitialized)
 187  * 3. Allocated from the slab, constructed, but not yet ready for business
 188  *    (not in a valid state for the move callback)
 189  * 4. In use (valid and known to the client)
 190  * 5. About to be freed (no longer in a valid state for the move callback)
 191  * 6. Freed to a magazine (still constructed)
 192  * 7. Allocated from a magazine, not yet ready for business (not in a valid
 193  *    state for the move callback), and about to return to state #4
 194  * 8. Deconstructed on a magazine that is about to be freed
 195  * 9. Freed to the slab
 196  *
 197  * Since the move callback may be called at any time while the object is in any


 280  * c_objects_lock is held. Note that after acquiring the lock, the client must
 281  * recheck the o_container pointer in case the object was removed just before
 282  * acquiring the lock.
 283  *
 284  * When the client is about to free an object, it must first remove that object
 285  * from the list, hash, or other structure where it is kept. At that time, to
 286  * mark the object so it can be distinguished from the remaining, known objects,
 287  * the client sets the designated low order bit:
 288  *
 289  *      mutex_enter(&container->c_objects_lock);
 290  *      object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
 291  *      list_remove(&container->c_objects, object);
 292  *      mutex_exit(&container->c_objects_lock);
 293  *
 294  * In the common case, the object is freed to the magazine layer, where it may
 295  * be reused on a subsequent allocation without the overhead of calling the
 296  * constructor. While in the magazine it appears allocated from the point of
 297  * view of the slab layer, making it a candidate for the move callback. Most
 298  * objects unrecognized by the client in the move callback fall into this
 299  * category and are cheaply distinguished from known objects by the test
 300  * described earlier. Because searching magazines is prohibitively expensive
 301  * for kmem, clients that do not mark freed objects (and therefore return
 302  * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
 303  * efficacy reduced.
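      *
      * As a hedged restatement of that test (using the same illustrative
      * container/object fields as the examples in this comment), the client's
      * move callback can screen out these freed objects cheaply:
      *
      *      if ((uintptr_t)object->o_container & 0x1)
      *              return (KMEM_CBRC_DONT_KNOW);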
 304  *
 305  * Invalidating the designated pointer member before freeing the object marks
 306  * the object to be avoided in the callback, and conversely, assigning a valid
 307  * value to the designated pointer member after allocating the object makes the
 308  * object fair game for the callback:
 309  *
 310  *      ... allocate object ...
 311  *      ... set any initial state not set by the constructor ...
 312  *
 313  *      mutex_enter(&container->c_objects_lock);
 314  *      list_insert_tail(&container->c_objects, object);
 315  *      membar_producer();
 316  *      object->o_container = container;
 317  *      mutex_exit(&container->c_objects_lock);
 318  *
 319  * Note that everything else must be valid before setting o_container makes the
 320  * object fair game for the move callback. The membar_producer() call ensures
 321  * that all the object's state is written to memory before setting the pointer
 322  * that transitions the object from state #3 or #7 (allocated, constructed, not
 323  * yet in use) to state #4 (in use, valid). That's important because the move


1034 static kmem_cache_t     *kmem_bufctl_cache;
1035 static kmem_cache_t     *kmem_bufctl_audit_cache;
1036 
1037 static kmutex_t         kmem_cache_lock;        /* inter-cache linkage only */
1038 static list_t           kmem_caches;
1039 
1040 static taskq_t          *kmem_taskq;
1041 static kmutex_t         kmem_flags_lock;
1042 static vmem_t           *kmem_metadata_arena;
1043 static vmem_t           *kmem_msb_arena;        /* arena for metadata caches */
1044 static vmem_t           *kmem_cache_arena;
1045 static vmem_t           *kmem_hash_arena;
1046 static vmem_t           *kmem_log_arena;
1047 static vmem_t           *kmem_oversize_arena;
1048 static vmem_t           *kmem_va_arena;
1049 static vmem_t           *kmem_default_arena;
1050 static vmem_t           *kmem_firewall_va_arena;
1051 static vmem_t           *kmem_firewall_arena;
1052 
1053 /*
1054  * kmem slab consolidator thresholds (tunables)
1055  */
1056 size_t kmem_frag_minslabs = 101;        /* minimum total slabs */
1057 size_t kmem_frag_numer = 1;             /* free buffers (numerator) */
1058 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1059 /*
1060  * Maximum number of slabs from which to move buffers during a single
1061  * maintenance interval while the system is not low on memory.
1062  */
1063 size_t kmem_reclaim_max_slabs = 1;
1064 /*
1065  * Number of slabs to scan backwards from the end of the partial slab list
1066  * when searching for buffers to relocate.
1067  */
1068 size_t kmem_reclaim_scan_range = 12;
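
     /*
      * These are ordinary kernel globals.  As a hedged tuning example (the
      * value shown is arbitrary, not a recommendation), one of them could be
      * set at boot time via /etc/system:
      *
      *      set kmem_reclaim_max_slabs=4
      */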
1069 
1070 /* consolidator knobs */
1071 boolean_t kmem_move_noreap;
1072 boolean_t kmem_move_blocked;
1073 boolean_t kmem_move_fulltilt;
1074 boolean_t kmem_move_any_partial;
1075 
1076 #ifdef  DEBUG
1077 /*
1078  * kmem consolidator debug tunables:
1079  * Ensure code coverage by occasionally running the consolidator even when the
1080  * caches are not fragmented (they may never be). These values are mean times
1081  * expressed in cache maintenance intervals (kmem_cache_update).
1082  */
1083 uint32_t kmem_mtb_move = 60;    /* defrag 1 slab (~15min) */
1084 uint32_t kmem_mtb_reap = 1800;  /* defrag all slabs (~7.5hrs) */
1085 #endif  /* DEBUG */
1086 
1087 static kmem_cache_t     *kmem_defrag_cache;
1088 static kmem_cache_t     *kmem_move_cache;
1089 static taskq_t          *kmem_move_taskq;


1860                          */
1861                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1862                                 list_insert_tail(deadlist, sp);
1863                         } else {
1864                                 list_insert_head(deadlist, sp);
1865                         }
1866                         cp->cache_defrag->kmd_deadcount++;
1867                         mutex_exit(&cp->cache_lock);
1868                 }
1869                 return;
1870         }
1871 
1872         if (bcp->bc_next == NULL) {
1873                 /* Transition the slab from completely allocated to partial. */
1874                 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1875                 ASSERT(sp->slab_chunks > 1);
1876                 list_remove(&cp->cache_complete_slabs, sp);
1877                 cp->cache_complete_slab_count--;
1878                 avl_add(&cp->cache_partial_slabs, sp);
1879         } else {
1880                 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1881         }
1882 
1883         ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1884             (cp->cache_complete_slab_count +
1885             avl_numnodes(&cp->cache_partial_slabs) +
1886             (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1887         mutex_exit(&cp->cache_lock);
1888 }
1889 
1890 /*
1891  * Return -1 if kmem_error, 1 if constructor fails, 0 if successful.
1892  */
1893 static int
1894 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1895     caddr_t caller)
1896 {
1897         kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1898         kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1899         uint32_t mtbf;
1900 


3509                 kmcp->kmc_move_yes.value.ui64         = 0;
3510                 kmcp->kmc_move_no.value.ui64          = 0;
3511                 kmcp->kmc_move_later.value.ui64       = 0;
3512                 kmcp->kmc_move_dont_need.value.ui64   = 0;
3513                 kmcp->kmc_move_dont_know.value.ui64   = 0;
3514                 kmcp->kmc_move_hunt_found.value.ui64  = 0;
3515                 kmcp->kmc_move_slabs_freed.value.ui64 = 0;
3516                 kmcp->kmc_defrag.value.ui64           = 0;
3517                 kmcp->kmc_scan.value.ui64             = 0;
3518                 kmcp->kmc_move_reclaimable.value.ui64 = 0;
3519         } else {
3520                 int64_t reclaimable;
3521 
3522                 kmem_defrag_t *kd = cp->cache_defrag;
3523                 kmcp->kmc_move_callbacks.value.ui64   = kd->kmd_callbacks;
3524                 kmcp->kmc_move_yes.value.ui64         = kd->kmd_yes;
3525                 kmcp->kmc_move_no.value.ui64          = kd->kmd_no;
3526                 kmcp->kmc_move_later.value.ui64       = kd->kmd_later;
3527                 kmcp->kmc_move_dont_need.value.ui64   = kd->kmd_dont_need;
3528                 kmcp->kmc_move_dont_know.value.ui64   = kd->kmd_dont_know;
3529                 kmcp->kmc_move_hunt_found.value.ui64  = 0;
3530                 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
3531                 kmcp->kmc_defrag.value.ui64           = kd->kmd_defrags;
3532                 kmcp->kmc_scan.value.ui64             = kd->kmd_scans;
3533 
3534                 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3535                 reclaimable = MAX(reclaimable, 0);
3536                 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3537                 kmcp->kmc_move_reclaimable.value.ui64 = reclaimable;
3538         }
3539 
3540         mutex_exit(&cp->cache_lock);
3541         return (0);
3542 }
3543 
3544 /*
3545  * Return a named statistic about a particular cache.
3546  * This shouldn't be called very often, so it's currently designed for
3547  * simplicity (leverages existing kstat support) rather than efficiency.
3548  */
3549 uint64_t


4080                 kmem_cache_free(kmem_defrag_cache, defrag); /* unused */
4081         }
4082 }
4083 
4084 void
4085 kmem_cache_destroy(kmem_cache_t *cp)
4086 {
4087         int cpu_seqid;
4088 
4089         /*
4090          * Remove the cache from the global cache list so that no one else
4091          * can schedule tasks on its behalf, wait for any pending tasks to
4092          * complete, purge the cache, and then destroy it.
4093          */
4094         mutex_enter(&kmem_cache_lock);
4095         list_remove(&kmem_caches, cp);
4096         mutex_exit(&kmem_cache_lock);
4097 
4098         if (kmem_taskq != NULL)
4099                 taskq_wait(kmem_taskq);
4100 
4101         if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)
4102                 taskq_wait(kmem_move_taskq);
4103 
4104         kmem_cache_magazine_purge(cp);
4105 
4106         mutex_enter(&cp->cache_lock);
4107         if (cp->cache_buftotal != 0)
4108                 cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty",
4109                     cp->cache_name, (void *)cp);
4110         if (cp->cache_defrag != NULL) {
4111                 avl_destroy(&cp->cache_defrag->kmd_moves_pending);
4112                 list_destroy(&cp->cache_defrag->kmd_deadlist);
4113                 kmem_cache_free(kmem_defrag_cache, cp->cache_defrag);
4114                 cp->cache_defrag = NULL;
4115         }
4116         /*
4117          * The cache is now dead.  There should be no further activity.  We
4118          * enforce this by setting land mines in the constructor, destructor,
4119          * reclaim, and move routines that induce a kernel text fault if
4120          * invoked.
4121          */


4608                 return (B_FALSE);
4609         }
4610 
4611         if ((refcnt == 1) || kmem_move_any_partial) {
4612                 return (refcnt < sp->slab_chunks);
4613         }
4614 
4615         /*
4616          * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4617          * slabs with a progressively higher percentage of used buffers can be
4618          * reclaimed until the cache as a whole is no longer fragmented.
4619          *
4620          *      sp->slab_refcnt   kmd_reclaim_numer
4621          *      --------------- < ------------------
4622          *      sp->slab_chunks   KMEM_VOID_FRACTION
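              *
              * A worked reading of the test below, as an illustration only:
              * if kmd_reclaim_numer were equal to KMEM_VOID_FRACTION, every
              * partial slab (refcnt < slab_chunks) would satisfy it; lowering
              * the numerator restricts reclamation to progressively emptier
              * slabs.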
4623          */
4624         return ((refcnt * KMEM_VOID_FRACTION) <
4625             (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4626 }
4627 
4628 /*
4629  * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4630  * or when the buffer is freed.
4631  */
4632 static void
4633 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4634 {
4635         ASSERT(MUTEX_HELD(&cp->cache_lock));
4636         ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4637 
4638         if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4639                 return;
4640         }
4641 
4642         if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4643                 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4644                         avl_remove(&cp->cache_partial_slabs, sp);
4645                         sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4646                         sp->slab_stuck_offset = (uint32_t)-1;
4647                         avl_add(&cp->cache_partial_slabs, sp);
4648                 }


4671 }
4672 
4673 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4674 
4675 /*
4676  * The move callback takes two buffer addresses, the buffer to be moved, and a
4677  * newly allocated and constructed buffer selected by kmem as the destination.
4678  * It also takes the size of the buffer and an optional user argument specified
4679  * at cache creation time. kmem guarantees that the buffer to be moved has not
4680  * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4681  * guarantee the present whereabouts of the buffer to be moved, so it is up to
4682  * the client to safely determine whether or not it is still using the buffer.
4683  * The client must not free either of the buffers passed to the move callback,
4684  * since kmem wants to free them directly to the slab layer. The client response
4685  * tells kmem which of the two buffers to free:
4686  *
4687  * YES          kmem frees the old buffer (the move was successful)
4688  * NO           kmem frees the new buffer, marks the slab of the old buffer
4689  *              non-reclaimable to avoid bothering the client again
4690  * LATER        kmem frees the new buffer, increments slab_later_count
4691  * DONT_KNOW    kmem frees the new buffer
4692  * DONT_NEED    kmem frees both the old buffer and the new buffer
4693  *
4694  * The pending callback argument now being processed contains both of the
4695  * buffers (old and new) passed to the move callback function, the slab of the
4696  * old buffer, and flags related to the move request, such as whether or not the
4697  * system was desperate for memory.
4698  *
4699  * Slabs are not freed while there is a pending callback, but instead are kept
4700  * on a deadlist, which is drained after the last callback completes. This means
4701  * that slabs are safe to access until kmem_move_end(), no matter how many of
4702  * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4703  * zero for as long as the slab remains on the deadlist and until the slab is
4704  * freed.
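      *
      * As a hedged sketch only (client_lookup(), object_t, and o_lock are
      * hypothetical client-side details, not kmem interfaces), a callback that
      * honors this contract has roughly the following shape:
      *
      *      static kmem_cbrc_t
      *      client_move(void *old, void *new, size_t size, void *arg)
      *      {
      *              object_t *op = client_lookup(old);
      *
      *              if (op == NULL)
      *                      return (KMEM_CBRC_DONT_KNOW);
      *              if (!mutex_tryenter(&op->o_lock))
      *                      return (KMEM_CBRC_LATER);
      *              bcopy(old, new, size);
      *              ... switch the client's references from old to new ...
      *              mutex_exit(&op->o_lock);
      *              return (KMEM_CBRC_YES);
      *      }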
4705  */
4706 static void
4707 kmem_move_buffer(kmem_move_t *callback)
4708 {
4709         kmem_cbrc_t response;
4710         kmem_slab_t *sp = callback->kmm_from_slab;
4711         kmem_cache_t *cp = sp->slab_cache;
4712         boolean_t free_on_slab;
4713 
4714         ASSERT(taskq_member(kmem_move_taskq, curthread));
4715         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4716         ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4717 
4718         /*
4719          * The number of allocated buffers on the slab may have changed since we
4720          * last checked the slab's reclaimability (when the pending move was
4721          * enqueued), or the client may have responded NO when asked to move
4722          * another buffer on the same slab.
4723          */
4724         if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {



4725                 kmem_slab_free(cp, callback->kmm_to_buf);
4726                 kmem_move_end(cp, callback);
4727                 return;
4728         }
4729 
4730         /*
4731          * Checking the slab layer is easy, so we might as well do that here
4732          * in case we can avoid bothering the client.
4733          */
4734         mutex_enter(&cp->cache_lock);
4735         free_on_slab = (kmem_slab_allocated(cp, sp,
4736             callback->kmm_from_buf) == NULL);
4737         mutex_exit(&cp->cache_lock);
4738 
4739         if (free_on_slab) {

4740                 kmem_slab_free(cp, callback->kmm_to_buf);
4741                 kmem_move_end(cp, callback);
4742                 return;
4743         }
4744 
4745         if (cp->cache_flags & KMF_BUFTAG) {
4746                 /*
4747                  * Make kmem_cache_alloc_debug() apply the constructor for us.
4748                  */
4749                 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4750                     KM_NOSLEEP, 1, caller()) != 0) {

4751                         kmem_move_end(cp, callback);
4752                         return;
4753                 }
4754         } else if (cp->cache_constructor != NULL &&
4755             cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4756             KM_NOSLEEP) != 0) {
4757                 atomic_inc_64(&cp->cache_alloc_fail);

4758                 kmem_slab_free(cp, callback->kmm_to_buf);
4759                 kmem_move_end(cp, callback);
4760                 return;
4761         }
4762 



4763         cp->cache_defrag->kmd_callbacks++;
4764         cp->cache_defrag->kmd_thread = curthread;
4765         cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4766         cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4767         DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4768             callback);
4769 
4770         response = cp->cache_move(callback->kmm_from_buf,
4771             callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4772 
4773         DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4774             callback, kmem_cbrc_t, response);
4775         cp->cache_defrag->kmd_thread = NULL;
4776         cp->cache_defrag->kmd_from_buf = NULL;
4777         cp->cache_defrag->kmd_to_buf = NULL;
4778 
4779         if (response == KMEM_CBRC_YES) {

4780                 cp->cache_defrag->kmd_yes++;
4781                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4782                 /* slab safe to access until kmem_move_end() */
4783                 if (sp->slab_refcnt == 0)
4784                         cp->cache_defrag->kmd_slabs_freed++;
4785                 mutex_enter(&cp->cache_lock);
4786                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4787                 mutex_exit(&cp->cache_lock);
4788                 kmem_move_end(cp, callback);
4789                 return;
4790         }
4791 
4792         switch (response) {
4793         case KMEM_CBRC_NO:

4794                 cp->cache_defrag->kmd_no++;
4795                 mutex_enter(&cp->cache_lock);
4796                 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4797                 mutex_exit(&cp->cache_lock);
4798                 break;
4799         case KMEM_CBRC_LATER:

4800                 cp->cache_defrag->kmd_later++;
4801                 mutex_enter(&cp->cache_lock);
4802                 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4803                         mutex_exit(&cp->cache_lock);
4804                         break;
4805                 }
4806 
4807                 if (++sp->slab_later_count >= KMEM_DISBELIEF) {

4808                         kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4809                 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4810                         sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4811                             callback->kmm_from_buf);
4812                 }
4813                 mutex_exit(&cp->cache_lock);
4814                 break;
4815         case KMEM_CBRC_DONT_NEED:

4816                 cp->cache_defrag->kmd_dont_need++;
4817                 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4818                 if (sp->slab_refcnt == 0)
4819                         cp->cache_defrag->kmd_slabs_freed++;
4820                 mutex_enter(&cp->cache_lock);
4821                 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4822                 mutex_exit(&cp->cache_lock);
4823                 break;
4824         case KMEM_CBRC_DONT_KNOW:
4825                 /*
4826                  * If we don't know if we can move this buffer or not, we'll
4827                  * just assume that we can't:  if the buffer is in fact free,
4828                  * then it is sitting in one of the per-CPU magazines or in
4829                  * a full magazine in the depot layer.  Either way, because
4830                  * defrag is induced in the same logic that reaps a cache,
4831                  * it's likely that full magazines will be returned to the
4832                  * system soon (thereby accomplishing what we're trying to
4833                  * accomplish here: return those magazines to their slabs).
4834                  * Given this, any work that we might do now to locate a buffer
4835                  * in a magazine is wasted (and expensive!) work; we bump
4836                  * a counter in this case and otherwise assume that we can't
4837                  * move it.
4838                  */
4839                 cp->cache_defrag->kmd_dont_know++;
4840                 break;
4841         default:
4842                 panic("'%s' (%p) unexpected move callback response %d\n",
4843                     cp->cache_name, (void *)cp, response);
4844         }
4845 
4846         kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
4847         kmem_move_end(cp, callback);
4848 }
4849 
4850 /* Return B_FALSE if there is insufficient memory for the move request. */
4851 static boolean_t
4852 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
4853 {
4854         void *to_buf;
4855         avl_index_t index;
4856         kmem_move_t *callback, *pending;
4857         ulong_t n;
4858 
4859         ASSERT(taskq_member(kmem_taskq, curthread));
4860         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4861         ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
4862 
4863         callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
4864 
4865         if (callback == NULL)
4866                 return (B_FALSE);

4867 
4868         callback->kmm_from_slab = sp;
4869         callback->kmm_from_buf = buf;
4870         callback->kmm_flags = flags;
4871 
4872         mutex_enter(&cp->cache_lock);
4873 
4874         n = avl_numnodes(&cp->cache_partial_slabs);
4875         if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
4876                 mutex_exit(&cp->cache_lock);
4877                 kmem_cache_free(kmem_move_cache, callback);
4878                 return (B_TRUE); /* there is no need for the move request */
4879         }
4880 
4881         pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
4882         if (pending != NULL) {
4883                 /*
4884                  * If the move is already pending and we're desperate now,
4885                  * update the move flags.
4886                  */
4887                 if (flags & KMM_DESPERATE) {
4888                         pending->kmm_flags |= KMM_DESPERATE;
4889                 }
4890                 mutex_exit(&cp->cache_lock);

4891                 kmem_cache_free(kmem_move_cache, callback);
4892                 return (B_TRUE);
4893         }
4894 
4895         to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
4896             B_FALSE);
4897         callback->kmm_to_buf = to_buf;
4898         avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
4899 
4900         mutex_exit(&cp->cache_lock);
4901 
4902         if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
4903             callback, TQ_NOSLEEP)) {

4904                 mutex_enter(&cp->cache_lock);
4905                 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4906                 mutex_exit(&cp->cache_lock);
4907                 kmem_slab_free(cp, to_buf);
4908                 kmem_cache_free(kmem_move_cache, callback);
4909                 return (B_FALSE);
4910         }
4911 
4912         return (B_TRUE);
4913 }
4914 
4915 static void
4916 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
4917 {
4918         avl_index_t index;
4919 
4920         ASSERT(cp->cache_defrag != NULL);
4921         ASSERT(taskq_member(kmem_move_taskq, curthread));
4922         ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4923 


4929                 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
4930                 kmem_slab_t *sp;
4931 
4932                 /*
4933                  * The last pending move completed. Release all slabs from the
4934                  * front of the dead list except for any slab at the tail that
4935                  * needs to be released from the context of kmem_move_buffers().
4936                  * kmem deferred unmapping the buffers on these slabs in order
4937                  * to guarantee that buffers passed to the move callback have
4938                  * been touched only by kmem or by the client itself.
4939                  */
4940                 while ((sp = list_remove_head(deadlist)) != NULL) {
4941                         if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
4942                                 list_insert_tail(deadlist, sp);
4943                                 break;
4944                         }
4945                         cp->cache_defrag->kmd_deadcount--;
4946                         cp->cache_slab_destroy++;
4947                         mutex_exit(&cp->cache_lock);
4948                         kmem_slab_destroy(cp, sp);

4949                         mutex_enter(&cp->cache_lock);
4950                 }
4951         }
4952         mutex_exit(&cp->cache_lock);
4953         kmem_cache_free(kmem_move_cache, callback);
4954 }
4955 
4956 /*
4957  * Move buffers from least used slabs first by scanning backwards from the end
4958  * of the partial slab list. Scan at most max_scan candidate slabs and move
4959  * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
4960  * If desperate to reclaim memory, move buffers from any partial slab, otherwise
4961  * skip slabs with a ratio of allocated buffers at or above the current
4962  * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
4963  * scan is aborted) so that the caller can adjust the reclaimability threshold
4964  * depending on how many reclaimable slabs it finds.
4965  *
4966  * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
4967  * move request, since it is not valid for kmem_move_begin() to call
4968  * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.


5073                                 list_t *deadlist =
5074                                     &cp->cache_defrag->kmd_deadlist;
5075                                 list_remove(deadlist, sp);
5076 
5077                                 if (!avl_is_empty(
5078                                     &cp->cache_defrag->kmd_moves_pending)) {
5079                                         /*
5080                                          * A pending move makes it unsafe to
5081                                          * destroy the slab, because even though
5082                                          * the move is no longer needed, the
5083                                          * context where that is determined
5084                                          * requires the slab to exist.
5085                                          * Fortunately, a pending move also
5086                                          * means we don't need to destroy the
5087                                          * slab here, since it will get
5088                                          * destroyed along with any other slabs
5089                                          * on the deadlist after the last
5090                                          * pending move completes.
5091                                          */
5092                                         list_insert_head(deadlist, sp);


5093                                         return (-1);
5094                                 }
5095 
5096                                 /*
5097                                  * Destroy the slab now if it was completely
5098                                  * freed while we dropped cache_lock and there
5099                                  * are no pending moves. Since slab_refcnt
5100                                  * cannot change once it reaches zero, no new
5101                                  * pending moves from that slab are possible.
5102                                  */
5103                                 cp->cache_defrag->kmd_deadcount--;
5104                                 cp->cache_slab_destroy++;
5105                                 mutex_exit(&cp->cache_lock);
5106                                 kmem_slab_destroy(cp, sp);




5107                                 mutex_enter(&cp->cache_lock);
5108                                 /*
5109                                  * Since we can't pick up the scan where we left
5110                                  * off, abort the scan and say nothing about the
5111                                  * number of reclaimable slabs.
5112                                  */
5113                                 return (-1);
5114                         }
5115 
5116                         if (!success) {
5117                                 /*
5118                                  * Abort the scan if there is not enough memory
5119                                  * for the request and say nothing about the
5120                                  * number of reclaimable slabs.
5121                                  */


5122                                 return (-1);
5123                         }
5124 
5125                         /*
5126                          * The slab's position changed while the lock was
5127                          * dropped, so we don't know where we are in the
5128                          * sequence any more.
5129                          */
5130                         if (sp->slab_refcnt != refcnt) {
5131                                 /*
5132                                  * If this is a KMM_DEBUG move, the slab_refcnt
5133                                  * may have changed because we allocated a
5134                                  * destination buffer on the same slab. In that
5135                                  * case, we're not interested in counting it.
5136                                  */



5137                                 return (-1);
5138                         }
5139                         if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)


5140                                 return (-1);

5141 
5142                         /*
5143                          * Generating a move request allocates a destination
5144                          * buffer from the slab layer, bumping the first partial
5145                          * slab if it is completely allocated. If the current
5146                          * slab becomes the first partial slab as a result, we
5147                          * can't continue to scan backwards.
5148                          *
5149                          * If this is a KMM_DEBUG move and we allocated the
5150                          * destination buffer from the last partial slab, then
5151                          * the buffer we're moving is on the same slab and our
5152                          * slab_refcnt has changed, causing us to return before
5153                          * reaching here if there are no partial slabs left.
5154                          */
5155                         ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5156                         if (sp == avl_first(&cp->cache_partial_slabs)) {
5157                                 /*
5158                                  * We're not interested in a second KMM_DEBUG
5159                                  * move.
5160                                  */
5161                                 goto end_scan;
5162                         }
5163                 }
5164         }
5165 end_scan:
5166 





5167         return (s);
5168 }
5169 
5170 typedef struct kmem_move_notify_args {
5171         kmem_cache_t *kmna_cache;
5172         void *kmna_buf;
5173 } kmem_move_notify_args_t;
5174 
5175 static void
5176 kmem_cache_move_notify_task(void *arg)
5177 {
5178         kmem_move_notify_args_t *args = arg;
5179         kmem_cache_t *cp = args->kmna_cache;
5180         void *buf = args->kmna_buf;
5181         kmem_slab_t *sp;
5182 
5183         ASSERT(taskq_member(kmem_taskq, curthread));
5184         ASSERT(list_link_active(&cp->cache_link));
5185 
5186         kmem_free(args, sizeof (kmem_move_notify_args_t));


5206                         return;
5207                 }
5208 
5209                 kmem_slab_move_yes(cp, sp, buf);
5210                 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5211                 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5212                 mutex_exit(&cp->cache_lock);
5213                 /* see kmem_move_buffers() about dropping the lock */
5214                 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5215                 mutex_enter(&cp->cache_lock);
5216                 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5217                 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5218                 if (sp->slab_refcnt == 0) {
5219                         list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5220                         list_remove(deadlist, sp);
5221 
5222                         if (!avl_is_empty(
5223                             &cp->cache_defrag->kmd_moves_pending)) {
5224                                 list_insert_head(deadlist, sp);
5225                                 mutex_exit(&cp->cache_lock);


5226                                 return;
5227                         }
5228 
5229                         cp->cache_defrag->kmd_deadcount--;
5230                         cp->cache_slab_destroy++;
5231                         mutex_exit(&cp->cache_lock);
5232                         kmem_slab_destroy(cp, sp);



5233                         return;
5234                 }
5235         } else {
5236                 kmem_slab_move_yes(cp, sp, buf);
5237         }
5238         mutex_exit(&cp->cache_lock);
5239 }
5240 
5241 void
5242 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5243 {
5244         kmem_move_notify_args_t *args;
5245 

5246         args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5247         if (args != NULL) {
5248                 args->kmna_cache = cp;
5249                 args->kmna_buf = buf;
5250                 if (!taskq_dispatch(kmem_taskq,
5251                     (task_func_t *)kmem_cache_move_notify_task, args,
5252                     TQ_NOSLEEP))
5253                         kmem_free(args, sizeof (kmem_move_notify_args_t));
5254         }
5255 }
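
     /*
      * Hedged usage sketch: a client that previously could not move an object
      * (e.g., it answered KMEM_CBRC_LATER) can prod kmem once the object
      * becomes movable.  The cache and object below stand in for client state:
      *
      *      kmem_cache_move_notify(client_cache, object);
      */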
5256 
5257 static void
5258 kmem_cache_defrag(kmem_cache_t *cp)
5259 {
5260         size_t n;
5261 
5262         ASSERT(cp->cache_defrag != NULL);
5263 
5264         mutex_enter(&cp->cache_lock);
5265         n = avl_numnodes(&cp->cache_partial_slabs);
5266         if (n > 1) {
5267                 /* kmem_move_buffers() drops and reacquires cache_lock */

5268                 cp->cache_defrag->kmd_defrags++;
5269                 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5270         }
5271         mutex_exit(&cp->cache_lock);
5272 }
5273 
5274 /* Is this cache above the fragmentation threshold? */
5275 static boolean_t
5276 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5277 {
5278         /*
5279          *      nfree          kmem_frag_numer
5280          * ------------------ > ---------------
5281          * cp->cache_buftotal   kmem_frag_denom
5282          */
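              /*
               * For example, with the defaults (kmem_frag_numer = 1,
               * kmem_frag_denom = KMEM_VOID_FRACTION), the cache is over the
               * threshold once more than 1 / KMEM_VOID_FRACTION of its
               * buffers are free.
               */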
5283         return ((nfree * kmem_frag_denom) >
5284             (cp->cache_buftotal * kmem_frag_numer));
5285 }
5286 
5287 static boolean_t


5346         if (kmd->kmd_consolidate > 0) {
5347                 kmd->kmd_consolidate--;
5348                 mutex_exit(&cp->cache_lock);
5349                 kmem_cache_reap(cp);
5350                 return;
5351         }
5352 
5353         if (kmem_cache_is_fragmented(cp, &reap)) {
5354                 long slabs_found;
5355 
5356                 /*
5357                  * Consolidate reclaimable slabs from the end of the partial
5358                  * slab list (scan at most kmem_reclaim_scan_range slabs to find
5359                  * reclaimable slabs). Keep track of how many candidate slabs we
5360                  * looked for and how many we actually found so we can adjust
5361                  * the definition of a candidate slab if we're having trouble
5362                  * finding them.
5363                  *
5364                  * kmem_move_buffers() drops and reacquires cache_lock.
5365                  */

5366                 kmd->kmd_scans++;
5367                 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5368                     kmem_reclaim_max_slabs, 0);
5369                 if (slabs_found >= 0) {
5370                         kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5371                         kmd->kmd_slabs_found += slabs_found;
5372                 }
5373 
5374                 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5375                         kmd->kmd_tries = 0;
5376 
5377                         /*
5378                          * If we had difficulty finding candidate slabs in
5379                          * previous scans, adjust the threshold so that
5380                          * candidates are easier to find.
5381                          */
5382                         if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5383                                 kmem_adjust_reclaim_threshold(kmd, -1);
5384                         } else if ((kmd->kmd_slabs_found * 2) <
5385                             kmd->kmd_slabs_sought) {
5386                                 kmem_adjust_reclaim_threshold(kmd, 1);
5387                         }
5388                         kmd->kmd_slabs_sought = 0;
5389                         kmd->kmd_slabs_found = 0;
5390                 }
5391         } else {
5392                 kmem_reset_reclaim_threshold(cp->cache_defrag);
5393 #ifdef  DEBUG
5394                 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5395                         /*
5396                          * In a debug kernel we want the consolidator to
5397                          * run occasionally even when there is plenty of
5398                          * memory.
5399                          */
5400                         uint16_t debug_rand;
5401 
5402                         (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5403                         if (!kmem_move_noreap &&
5404                             ((debug_rand % kmem_mtb_reap) == 0)) {
5405                                 mutex_exit(&cp->cache_lock);

5406                                 kmem_cache_reap(cp);
5407                                 return;
5408                         } else if ((debug_rand % kmem_mtb_move) == 0) {


5409                                 kmd->kmd_scans++;
5410                                 (void) kmem_move_buffers(cp,
5411                                     kmem_reclaim_scan_range, 1, KMM_DEBUG);
5412                         }
5413                 }
5414 #endif  /* DEBUG */
5415         }
5416 
5417         mutex_exit(&cp->cache_lock);
5418 
5419         if (reap)

5420                 kmem_depot_ws_reap(cp);

5421 }