142 * (The system won't be getting the slab back as long as the
143 * immovable object holds it hostage, so there's no point in moving
144 * any of its objects.)
145 * LATER: The client is using the object and cannot move it now, so kmem
146 * frees the new object (the unused copy destination). kmem still
147 * attempts to move other objects off the slab, since it expects to
148 * succeed in clearing the slab in a later callback. The client
149 * should use LATER instead of NO if the object is likely to become
150 * movable very soon.
151 * DONT_NEED: The client no longer needs the object, so kmem frees the old along
152 * with the new object (the unused copy destination). This response
153 * is the client's opportunity to be a model citizen and give back as
154 * much as it can.
155 * DONT_KNOW: The client does not know about the object because
156 * a) the client has just allocated the object and not yet put it
157 * wherever it expects to find known objects,
158 * b) the client has removed the object from wherever it expects to
159 * find known objects and is about to free it, or
160 * c) the client has freed the object.
161 * In all these cases (a, b, and c) kmem frees the new object (the
162 * unused copy destination) and searches for the old object in the
163 * magazine layer. If found, the object is removed from the magazine
164 * layer and freed to the slab layer so it will no longer hold the
165 * slab hostage.
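 *
 * To make the protocol above concrete, a client's move callback might be
 * shaped roughly as follows. This is only an illustrative sketch, not code
 * from any actual client: object_t, object_is_known(), and the other helpers
 * are hypothetical, and the known-object test itself is sketched later in
 * this comment.
 *
 *     static kmem_cbrc_t
 *     object_move(void *buf, void *newbuf, size_t size, void *arg)
 *     {
 *             object_t *op = buf, *np = newbuf;
 *
 *             if (!object_is_known(op))
 *                     return (KMEM_CBRC_DONT_KNOW);
 *             if (object_is_held(op))            (in use; retry later)
 *                     return (KMEM_CBRC_LATER);
 *             if (object_is_idle_cached(op)) {
 *                     object_teardown(op);       (give the memory back)
 *                     return (KMEM_CBRC_DONT_NEED);
 *             }
 *             if (object_is_wired(op))           (can never be moved)
 *                     return (KMEM_CBRC_NO);
 *
 *             bcopy(op, np, size);               (copy the contents)
 *             object_switch_references(op, np);
 *             return (KMEM_CBRC_YES);
 *     }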
166 *
167 * 2.3 Object States
168 *
169 * Neither kmem nor the client can be assumed to know the object's whereabouts
170 * at the time of the callback. An object belonging to a kmem cache may be in
171 * any of the following states:
172 *
173 * 1. Uninitialized on the slab
174 * 2. Allocated from the slab but not constructed (still uninitialized)
175 * 3. Allocated from the slab, constructed, but not yet ready for business
176 * (not in a valid state for the move callback)
177 * 4. In use (valid and known to the client)
178 * 5. About to be freed (no longer in a valid state for the move callback)
179 * 6. Freed to a magazine (still constructed)
180 * 7. Allocated from a magazine, not yet ready for business (not in a valid
181 * state for the move callback), and about to return to state #4
182 * 8. Deconstructed on a magazine that is about to be freed
183 * 9. Freed to the slab
184 *
185 * Since the move callback may be called at any time while the object is in any
268 * c_objects_lock is held. Note that after acquiring the lock, the client must
269 * recheck the o_container pointer in case the object was removed just before
270 * acquiring the lock.
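 *
 * Purely for illustration (all names here are hypothetical, and the step that
 * validates the container pointer itself is elided), the known-object test
 * and the recheck described here might be coded as:
 *
 *     object_t *op = buf;
 *     container_t *container = op->o_container;
 *
 *     if ((uintptr_t)container & 0x1)
 *             return (KMEM_CBRC_DONT_KNOW);   (marked freed; see below)
 *     ... verify that container refers to a known container ...
 *     mutex_enter(&container->c_objects_lock);
 *     if (op->o_container != container) {     (removed before we got the lock)
 *             mutex_exit(&container->c_objects_lock);
 *             return (KMEM_CBRC_DONT_KNOW);
 *     }
 *     ... the object is known; complete the move while holding the lock ...
 *     mutex_exit(&container->c_objects_lock);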
271 *
272 * When the client is about to free an object, it must first remove that object
273 * from the list, hash, or other structure where it is kept. At that time, to
274 * mark the object so it can be distinguished from the remaining, known objects,
275 * the client sets the designated low order bit:
276 *
277 * mutex_enter(&container->c_objects_lock);
278 * object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
279 * list_remove(&container->c_objects, object);
280 * mutex_exit(&container->c_objects_lock);
281 *
282 * In the common case, the object is freed to the magazine layer, where it may
283 * be reused on a subsequent allocation without the overhead of calling the
284 * constructor. While in the magazine it appears allocated from the point of
285 * view of the slab layer, making it a candidate for the move callback. Most
286 * objects unrecognized by the client in the move callback fall into this
287 * category and are cheaply distinguished from known objects by the test
288 * described earlier. Since recognition is cheap for the client, and searching
289 * magazines is expensive for kmem, kmem defers searching until the client first
290 * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem
291 * elsewhere does what it can to avoid bothering the client unnecessarily.
292 *
293 * Invalidating the designated pointer member before freeing the object marks
294 * the object to be avoided in the callback, and conversely, assigning a valid
295 * value to the designated pointer member after allocating the object makes the
296 * object fair game for the callback:
297 *
298 * ... allocate object ...
299 * ... set any initial state not set by the constructor ...
300 *
301 * mutex_enter(&container->c_objects_lock);
302 * list_insert_tail(&container->c_objects, object);
303 * membar_producer();
304 * object->o_container = container;
305 * mutex_exit(&container->c_objects_lock);
306 *
307 * Note that everything else must be valid before setting o_container makes the
308 * object fair game for the move callback. The membar_producer() call ensures
309 * that all the object's state is written to memory before setting the pointer
310 * that transitions the object from state #3 or #7 (allocated, constructed, not
311 * yet in use) to state #4 (in use, valid). That's important because the move
1022 static kmem_cache_t *kmem_bufctl_cache;
1023 static kmem_cache_t *kmem_bufctl_audit_cache;
1024
1025 static kmutex_t kmem_cache_lock; /* inter-cache linkage only */
1026 static list_t kmem_caches;
1027
1028 static taskq_t *kmem_taskq;
1029 static kmutex_t kmem_flags_lock;
1030 static vmem_t *kmem_metadata_arena;
1031 static vmem_t *kmem_msb_arena; /* arena for metadata caches */
1032 static vmem_t *kmem_cache_arena;
1033 static vmem_t *kmem_hash_arena;
1034 static vmem_t *kmem_log_arena;
1035 static vmem_t *kmem_oversize_arena;
1036 static vmem_t *kmem_va_arena;
1037 static vmem_t *kmem_default_arena;
1038 static vmem_t *kmem_firewall_va_arena;
1039 static vmem_t *kmem_firewall_arena;
1040
1041 /*
1042 * Define KMEM_STATS to turn on statistic gathering. By default, it is only
1043 * turned on when DEBUG is also defined.
1044 */
1045 #ifdef DEBUG
1046 #define KMEM_STATS
1047 #endif /* DEBUG */
1048
1049 #ifdef KMEM_STATS
1050 #define KMEM_STAT_ADD(stat) ((stat)++)
1051 #define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++))
1052 #else
1053 #define KMEM_STAT_ADD(stat) /* nothing */
1054 #define KMEM_STAT_COND_ADD(cond, stat) /* nothing */
1055 #endif /* KMEM_STATS */
1056
1057 /*
1058 * kmem slab consolidator thresholds (tunables)
1059 */
1060 size_t kmem_frag_minslabs = 101; /* minimum total slabs */
1061 size_t kmem_frag_numer = 1; /* free buffers (numerator) */
1062 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
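/*
 * Taken together (see kmem_cache_frag_threshold()), a cache with at least
 * kmem_frag_minslabs total slabs is treated as fragmented once its free
 * buffers exceed kmem_frag_numer / kmem_frag_denom of its total buffers.
 */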
1063 /*
1064 * Maximum number of slabs from which to move buffers during a single
1065 * maintenance interval while the system is not low on memory.
1066 */
1067 size_t kmem_reclaim_max_slabs = 1;
1068 /*
1069 * Number of slabs to scan backwards from the end of the partial slab list
1070 * when searching for buffers to relocate.
1071 */
1072 size_t kmem_reclaim_scan_range = 12;
1073
1074 #ifdef KMEM_STATS
1075 static struct {
1076 uint64_t kms_callbacks;
1077 uint64_t kms_yes;
1078 uint64_t kms_no;
1079 uint64_t kms_later;
1080 uint64_t kms_dont_need;
1081 uint64_t kms_dont_know;
1082 uint64_t kms_hunt_found_mag;
1083 uint64_t kms_hunt_found_slab;
1084 uint64_t kms_hunt_alloc_fail;
1085 uint64_t kms_hunt_lucky;
1086 uint64_t kms_notify;
1087 uint64_t kms_notify_callbacks;
1088 uint64_t kms_disbelief;
1089 uint64_t kms_already_pending;
1090 uint64_t kms_callback_alloc_fail;
1091 uint64_t kms_callback_taskq_fail;
1092 uint64_t kms_endscan_slab_dead;
1093 uint64_t kms_endscan_slab_destroyed;
1094 uint64_t kms_endscan_nomem;
1095 uint64_t kms_endscan_refcnt_changed;
1096 uint64_t kms_endscan_nomove_changed;
1097 uint64_t kms_endscan_freelist;
1098 uint64_t kms_avl_update;
1099 uint64_t kms_avl_noupdate;
1100 uint64_t kms_no_longer_reclaimable;
1101 uint64_t kms_notify_no_longer_reclaimable;
1102 uint64_t kms_notify_slab_dead;
1103 uint64_t kms_notify_slab_destroyed;
1104 uint64_t kms_alloc_fail;
1105 uint64_t kms_constructor_fail;
1106 uint64_t kms_dead_slabs_freed;
1107 uint64_t kms_defrags;
1108 uint64_t kms_scans;
1109 uint64_t kms_scan_depot_ws_reaps;
1110 uint64_t kms_debug_reaps;
1111 uint64_t kms_debug_scans;
1112 } kmem_move_stats;
1113 #endif /* KMEM_STATS */
1114
1115 /* consolidator knobs */
1116 boolean_t kmem_move_noreap;
1117 boolean_t kmem_move_blocked;
1118 boolean_t kmem_move_fulltilt;
1119 boolean_t kmem_move_any_partial;
1120
1121 #ifdef DEBUG
1122 /*
1123 * kmem consolidator debug tunables:
1124 * Ensure code coverage by occasionally running the consolidator even when the
1125 * caches are not fragmented (they may never be). These tunables are the mean
1126 * number of cache maintenance intervals (kmem_cache_update()) between runs.
1127 */
1128 uint32_t kmem_mtb_move = 60; /* defrag 1 slab (~15min) */
1129 uint32_t kmem_mtb_reap = 1800; /* defrag all slabs (~7.5hrs) */
1130 #endif /* DEBUG */
1131
1132 static kmem_cache_t *kmem_defrag_cache;
1133 static kmem_cache_t *kmem_move_cache;
1134 static taskq_t *kmem_move_taskq;
1905 */
1906 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1907 list_insert_tail(deadlist, sp);
1908 } else {
1909 list_insert_head(deadlist, sp);
1910 }
1911 cp->cache_defrag->kmd_deadcount++;
1912 mutex_exit(&cp->cache_lock);
1913 }
1914 return;
1915 }
1916
1917 if (bcp->bc_next == NULL) {
1918 /* Transition the slab from completely allocated to partial. */
1919 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1920 ASSERT(sp->slab_chunks > 1);
1921 list_remove(&cp->cache_complete_slabs, sp);
1922 cp->cache_complete_slab_count--;
1923 avl_add(&cp->cache_partial_slabs, sp);
1924 } else {
1925 #ifdef DEBUG
1926 if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
1927 KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
1928 } else {
1929 KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
1930 }
1931 #else
1932 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1933 #endif
1934 }
1935
1936 ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1937 (cp->cache_complete_slab_count +
1938 avl_numnodes(&cp->cache_partial_slabs) +
1939 (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1940 mutex_exit(&cp->cache_lock);
1941 }
1942
1943 /*
1944 * Return -1 if kmem_error() was called, 1 if the constructor fails, 0 if successful.
1945 */
1946 static int
1947 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1948 caddr_t caller)
1949 {
1950 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1951 kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1952 uint32_t mtbf;
1953
3562 kmcp->kmc_move_yes.value.ui64 = 0;
3563 kmcp->kmc_move_no.value.ui64 = 0;
3564 kmcp->kmc_move_later.value.ui64 = 0;
3565 kmcp->kmc_move_dont_need.value.ui64 = 0;
3566 kmcp->kmc_move_dont_know.value.ui64 = 0;
3567 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3568 kmcp->kmc_move_slabs_freed.value.ui64 = 0;
3569 kmcp->kmc_defrag.value.ui64 = 0;
3570 kmcp->kmc_scan.value.ui64 = 0;
3571 kmcp->kmc_move_reclaimable.value.ui64 = 0;
3572 } else {
3573 int64_t reclaimable;
3574
3575 kmem_defrag_t *kd = cp->cache_defrag;
3576 kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks;
3577 kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes;
3578 kmcp->kmc_move_no.value.ui64 = kd->kmd_no;
3579 kmcp->kmc_move_later.value.ui64 = kd->kmd_later;
3580 kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need;
3581 kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know;
3582 kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found;
3583 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
3584 kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags;
3585 kmcp->kmc_scan.value.ui64 = kd->kmd_scans;
3586
3587 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3588 reclaimable = MAX(reclaimable, 0);
3589 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3590 kmcp->kmc_move_reclaimable.value.ui64 = reclaimable;
3591 }
3592
3593 mutex_exit(&cp->cache_lock);
3594 return (0);
3595 }
3596
3597 /*
3598 * Return a named statistic about a particular cache.
3599 * This shouldn't be called very often, so it's currently designed for
3600 * simplicity (leverages existing kstat support) rather than efficiency.
3601 */
3602 uint64_t
4133 kmem_cache_free(kmem_defrag_cache, defrag); /* unused */
4134 }
4135 }
4136
4137 void
4138 kmem_cache_destroy(kmem_cache_t *cp)
4139 {
4140 int cpu_seqid;
4141
4142 /*
4143 * Remove the cache from the global cache list so that no one else
4144 * can schedule tasks on its behalf, wait for any pending tasks to
4145 * complete, purge the cache, and then destroy it.
4146 */
4147 mutex_enter(&kmem_cache_lock);
4148 list_remove(&kmem_caches, cp);
4149 mutex_exit(&kmem_cache_lock);
4150
4151 if (kmem_taskq != NULL)
4152 taskq_wait(kmem_taskq);
4153 if (kmem_move_taskq != NULL)
4154 taskq_wait(kmem_move_taskq);
4155
4156 kmem_cache_magazine_purge(cp);
4157
4158 mutex_enter(&cp->cache_lock);
4159 if (cp->cache_buftotal != 0)
4160 cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty",
4161 cp->cache_name, (void *)cp);
4162 if (cp->cache_defrag != NULL) {
4163 avl_destroy(&cp->cache_defrag->kmd_moves_pending);
4164 list_destroy(&cp->cache_defrag->kmd_deadlist);
4165 kmem_cache_free(kmem_defrag_cache, cp->cache_defrag);
4166 cp->cache_defrag = NULL;
4167 }
4168 /*
4169 * The cache is now dead. There should be no further activity. We
4170 * enforce this by setting land mines in the constructor, destructor,
4171 * reclaim, and move routines that induce a kernel text fault if
4172 * invoked.
4173 */
4660 return (B_FALSE);
4661 }
4662
4663 if ((refcnt == 1) || kmem_move_any_partial) {
4664 return (refcnt < sp->slab_chunks);
4665 }
4666
4667 /*
4668 * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4669 * slabs with a progressively higher percentage of used buffers can be
4670 * reclaimed until the cache as a whole is no longer fragmented.
4671 *
4672 * sp->slab_refcnt kmd_reclaim_numer
4673 * --------------- < ------------------
4674 * sp->slab_chunks KMEM_VOID_FRACTION
4675 */
4676 return ((refcnt * KMEM_VOID_FRACTION) <
4677 (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4678 }
4679
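/*
 * Search a single magazine for buf. If found, swap tbuf into its place (so
 * the magazine keeps a full complement of rounds) and return buf; otherwise
 * return NULL. When buftags are enabled, tbuf is first run through
 * kmem_cache_free_debug(), since rounds sitting in a magazine are in the
 * freed state as far as the debug layer is concerned.
 */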
4680 static void *
4681 kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf,
4682 void *tbuf)
4683 {
4684 int i; /* magazine round index */
4685
4686 for (i = 0; i < n; i++) {
4687 if (buf == m->mag_round[i]) {
4688 if (cp->cache_flags & KMF_BUFTAG) {
4689 (void) kmem_cache_free_debug(cp, tbuf,
4690 caller());
4691 }
4692 m->mag_round[i] = tbuf;
4693 return (buf);
4694 }
4695 }
4696
4697 return (NULL);
4698 }
4699
4700 /*
4701 * Hunt the magazine layer for the given buffer. If found, the buffer is
4702 * removed from the magazine layer and returned; otherwise NULL is returned.
4703 * The returned buffer is in the freed (still constructed) state.
4704 */
4705 static void *
4706 kmem_hunt_mags(kmem_cache_t *cp, void *buf)
4707 {
4708 kmem_cpu_cache_t *ccp;
4709 kmem_magazine_t *m;
4710 int cpu_seqid;
4711 int n; /* magazine rounds */
4712 void *tbuf; /* temporary swap buffer */
4713
4714 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4715
4716 /*
4717 * Allocate a buffer to swap with the one we hope to pull out of a
4718 * magazine when found.
4719 */
4720 tbuf = kmem_cache_alloc(cp, KM_NOSLEEP);
4721 if (tbuf == NULL) {
4722 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail);
4723 return (NULL);
4724 }
4725 if (tbuf == buf) {
4726 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky);
4727 if (cp->cache_flags & KMF_BUFTAG) {
4728 (void) kmem_cache_free_debug(cp, buf, caller());
4729 }
4730 return (buf);
4731 }
4732
4733 /* Hunt the depot. */
4734 mutex_enter(&cp->cache_depot_lock);
4735 n = cp->cache_magtype->mt_magsize;
4736 for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) {
4737 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4738 mutex_exit(&cp->cache_depot_lock);
4739 return (buf);
4740 }
4741 }
4742 mutex_exit(&cp->cache_depot_lock);
4743
4744 /* Hunt the per-CPU magazines. */
4745 for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
4746 ccp = &cp->cache_cpu[cpu_seqid];
4747
4748 mutex_enter(&ccp->cc_lock);
4749 m = ccp->cc_loaded;
4750 n = ccp->cc_rounds;
4751 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4752 mutex_exit(&ccp->cc_lock);
4753 return (buf);
4754 }
4755 m = ccp->cc_ploaded;
4756 n = ccp->cc_prounds;
4757 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4758 mutex_exit(&ccp->cc_lock);
4759 return (buf);
4760 }
4761 mutex_exit(&ccp->cc_lock);
4762 }
4763
4764 kmem_cache_free(cp, tbuf);
4765 return (NULL);
4766 }
4767
4768 /*
4769 * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4770 * or when the buffer is freed.
4771 */
4772 static void
4773 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4774 {
4775 ASSERT(MUTEX_HELD(&cp->cache_lock));
4776 ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4777
4778 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4779 return;
4780 }
4781
4782 if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4783 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4784 avl_remove(&cp->cache_partial_slabs, sp);
4785 sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4786 sp->slab_stuck_offset = (uint32_t)-1;
4787 avl_add(&cp->cache_partial_slabs, sp);
4788 }
4811 }
4812
4813 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4814
4815 /*
4816 * The move callback takes two buffer addresses, the buffer to be moved, and a
4817 * newly allocated and constructed buffer selected by kmem as the destination.
4818 * It also takes the size of the buffer and an optional user argument specified
4819 * at cache creation time. kmem guarantees that the buffer to be moved has not
4820 * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4821 * guarantee the present whereabouts of the buffer to be moved, so it is up to
4822 * the client to safely determine whether or not it is still using the buffer.
4823 * The client must not free either of the buffers passed to the move callback,
4824 * since kmem wants to free them directly to the slab layer. The client response
4825 * tells kmem which of the two buffers to free:
4826 *
4827 * YES kmem frees the old buffer (the move was successful)
4828 * NO kmem frees the new buffer, marks the slab of the old buffer
4829 * non-reclaimable to avoid bothering the client again
4830 * LATER kmem frees the new buffer, increments slab_later_count
4831 * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer
4832 * DONT_NEED kmem frees both the old buffer and the new buffer
4833 *
4834 * The pending callback argument now being processed contains both of the
4835 * buffers (old and new) passed to the move callback function, the slab of the
4836 * old buffer, and flags related to the move request, such as whether or not the
4837 * system was desperate for memory.
4838 *
4839 * Slabs are not freed while there is a pending callback, but instead are kept
4840 * on a deadlist, which is drained after the last callback completes. This means
4841 * that slabs are safe to access until kmem_move_end(), no matter how many of
4842 * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4843 * zero for as long as the slab remains on the deadlist and until the slab is
4844 * freed.
4845 */
4846 static void
4847 kmem_move_buffer(kmem_move_t *callback)
4848 {
4849 kmem_cbrc_t response;
4850 kmem_slab_t *sp = callback->kmm_from_slab;
4851 kmem_cache_t *cp = sp->slab_cache;
4852 boolean_t free_on_slab;
4853
4854 ASSERT(taskq_member(kmem_move_taskq, curthread));
4855 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4856 ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4857
4858 /*
4859 * The number of allocated buffers on the slab may have changed since we
4860 * last checked the slab's reclaimability (when the pending move was
4861 * enqueued), or the client may have responded NO when asked to move
4862 * another buffer on the same slab.
4863 */
4864 if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4865 KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable);
4866 KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4867 kmem_move_stats.kms_notify_no_longer_reclaimable);
4868 kmem_slab_free(cp, callback->kmm_to_buf);
4869 kmem_move_end(cp, callback);
4870 return;
4871 }
4872
4873 /*
4874 * Hunting magazines is expensive, so we'll wait to do that until the
4875 * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer
4876 * is cheap, so we might as well do that here in case we can avoid
4877 * bothering the client.
4878 */
4879 mutex_enter(&cp->cache_lock);
4880 free_on_slab = (kmem_slab_allocated(cp, sp,
4881 callback->kmm_from_buf) == NULL);
4882 mutex_exit(&cp->cache_lock);
4883
4884 if (free_on_slab) {
4885 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab);
4886 kmem_slab_free(cp, callback->kmm_to_buf);
4887 kmem_move_end(cp, callback);
4888 return;
4889 }
4890
4891 if (cp->cache_flags & KMF_BUFTAG) {
4892 /*
4893 * Make kmem_cache_alloc_debug() apply the constructor for us.
4894 */
4895 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4896 KM_NOSLEEP, 1, caller()) != 0) {
4897 KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail);
4898 kmem_move_end(cp, callback);
4899 return;
4900 }
4901 } else if (cp->cache_constructor != NULL &&
4902 cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4903 KM_NOSLEEP) != 0) {
4904 atomic_inc_64(&cp->cache_alloc_fail);
4905 KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail);
4906 kmem_slab_free(cp, callback->kmm_to_buf);
4907 kmem_move_end(cp, callback);
4908 return;
4909 }
4910
4911 KMEM_STAT_ADD(kmem_move_stats.kms_callbacks);
4912 KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4913 kmem_move_stats.kms_notify_callbacks);
4914 cp->cache_defrag->kmd_callbacks++;
4915 cp->cache_defrag->kmd_thread = curthread;
4916 cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4917 cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4918 DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4919 callback);
4920
4921 response = cp->cache_move(callback->kmm_from_buf,
4922 callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4923
4924 DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4925 callback, kmem_cbrc_t, response);
4926 cp->cache_defrag->kmd_thread = NULL;
4927 cp->cache_defrag->kmd_from_buf = NULL;
4928 cp->cache_defrag->kmd_to_buf = NULL;
4929
4930 if (response == KMEM_CBRC_YES) {
4931 KMEM_STAT_ADD(kmem_move_stats.kms_yes);
4932 cp->cache_defrag->kmd_yes++;
4933 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4934 /* slab safe to access until kmem_move_end() */
4935 if (sp->slab_refcnt == 0)
4936 cp->cache_defrag->kmd_slabs_freed++;
4937 mutex_enter(&cp->cache_lock);
4938 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4939 mutex_exit(&cp->cache_lock);
4940 kmem_move_end(cp, callback);
4941 return;
4942 }
4943
4944 switch (response) {
4945 case KMEM_CBRC_NO:
4946 KMEM_STAT_ADD(kmem_move_stats.kms_no);
4947 cp->cache_defrag->kmd_no++;
4948 mutex_enter(&cp->cache_lock);
4949 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4950 mutex_exit(&cp->cache_lock);
4951 break;
4952 case KMEM_CBRC_LATER:
4953 KMEM_STAT_ADD(kmem_move_stats.kms_later);
4954 cp->cache_defrag->kmd_later++;
4955 mutex_enter(&cp->cache_lock);
4956 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4957 mutex_exit(&cp->cache_lock);
4958 break;
4959 }
4960
4961 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4962 KMEM_STAT_ADD(kmem_move_stats.kms_disbelief);
4963 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4964 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4965 sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4966 callback->kmm_from_buf);
4967 }
4968 mutex_exit(&cp->cache_lock);
4969 break;
4970 case KMEM_CBRC_DONT_NEED:
4971 KMEM_STAT_ADD(kmem_move_stats.kms_dont_need);
4972 cp->cache_defrag->kmd_dont_need++;
4973 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4974 if (sp->slab_refcnt == 0)
4975 cp->cache_defrag->kmd_slabs_freed++;
4976 mutex_enter(&cp->cache_lock);
4977 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4978 mutex_exit(&cp->cache_lock);
4979 break;
4980 case KMEM_CBRC_DONT_KNOW:
4981 KMEM_STAT_ADD(kmem_move_stats.kms_dont_know);
4982 cp->cache_defrag->kmd_dont_know++;
4983 if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) {
4984 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag);
4985 cp->cache_defrag->kmd_hunt_found++;
4986 kmem_slab_free_constructed(cp, callback->kmm_from_buf,
4987 B_TRUE);
4988 if (sp->slab_refcnt == 0)
4989 cp->cache_defrag->kmd_slabs_freed++;
4990 mutex_enter(&cp->cache_lock);
4991 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4992 mutex_exit(&cp->cache_lock);
4993 }
4994 break;
4995 default:
4996 panic("'%s' (%p) unexpected move callback response %d\n",
4997 cp->cache_name, (void *)cp, response);
4998 }
4999
5000 kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
5001 kmem_move_end(cp, callback);
5002 }
5003
5004 /* Return B_FALSE if there is insufficient memory for the move request. */
5005 static boolean_t
5006 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
5007 {
5008 void *to_buf;
5009 avl_index_t index;
5010 kmem_move_t *callback, *pending;
5011 ulong_t n;
5012
5013 ASSERT(taskq_member(kmem_taskq, curthread));
5014 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5015 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5016
5017 callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
5018 if (callback == NULL) {
5019 KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail);
5020 return (B_FALSE);
5021 }
5022
5023 callback->kmm_from_slab = sp;
5024 callback->kmm_from_buf = buf;
5025 callback->kmm_flags = flags;
5026
5027 mutex_enter(&cp->cache_lock);
5028
5029 n = avl_numnodes(&cp->cache_partial_slabs);
5030 if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
5031 mutex_exit(&cp->cache_lock);
5032 kmem_cache_free(kmem_move_cache, callback);
5033 return (B_TRUE); /* there is no need for the move request */
5034 }
5035
5036 pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
5037 if (pending != NULL) {
5038 /*
5039 * If the move is already pending and we're desperate now,
5040 * update the move flags.
5041 */
5042 if (flags & KMM_DESPERATE) {
5043 pending->kmm_flags |= KMM_DESPERATE;
5044 }
5045 mutex_exit(&cp->cache_lock);
5046 KMEM_STAT_ADD(kmem_move_stats.kms_already_pending);
5047 kmem_cache_free(kmem_move_cache, callback);
5048 return (B_TRUE);
5049 }
5050
5051 to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
5052 B_FALSE);
5053 callback->kmm_to_buf = to_buf;
5054 avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
5055
5056 mutex_exit(&cp->cache_lock);
5057
5058 if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
5059 callback, TQ_NOSLEEP)) {
5060 KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail);
5061 mutex_enter(&cp->cache_lock);
5062 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
5063 mutex_exit(&cp->cache_lock);
5064 kmem_slab_free(cp, to_buf);
5065 kmem_cache_free(kmem_move_cache, callback);
5066 return (B_FALSE);
5067 }
5068
5069 return (B_TRUE);
5070 }
5071
5072 static void
5073 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
5074 {
5075 avl_index_t index;
5076
5077 ASSERT(cp->cache_defrag != NULL);
5078 ASSERT(taskq_member(kmem_move_taskq, curthread));
5079 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5080
5086 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5087 kmem_slab_t *sp;
5088
5089 /*
5090 * The last pending move completed. Release all slabs from the
5091 * front of the dead list except for any slab at the tail that
5092 * needs to be released from the context of kmem_move_buffers().
5093 * kmem deferred unmapping the buffers on these slabs in order
5094 * to guarantee that buffers passed to the move callback have
5095 * been touched only by kmem or by the client itself.
5096 */
5097 while ((sp = list_remove_head(deadlist)) != NULL) {
5098 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
5099 list_insert_tail(deadlist, sp);
5100 break;
5101 }
5102 cp->cache_defrag->kmd_deadcount--;
5103 cp->cache_slab_destroy++;
5104 mutex_exit(&cp->cache_lock);
5105 kmem_slab_destroy(cp, sp);
5106 KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5107 mutex_enter(&cp->cache_lock);
5108 }
5109 }
5110 mutex_exit(&cp->cache_lock);
5111 kmem_cache_free(kmem_move_cache, callback);
5112 }
5113
5114 /*
5115 * Move buffers from least used slabs first by scanning backwards from the end
5116 * of the partial slab list. Scan at most max_scan candidate slabs and move
5117 * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
5118 * If desperate to reclaim memory, move buffers from any partial slab, otherwise
5119 * skip slabs with a ratio of allocated buffers at or above the current
5120 * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
5121 * scan is aborted) so that the caller can adjust the reclaimability threshold
5122 * depending on how many reclaimable slabs it finds.
5123 *
5124 * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
5125 * move request, since it is not valid for kmem_move_begin() to call
5126 * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
5231 list_t *deadlist =
5232 &cp->cache_defrag->kmd_deadlist;
5233 list_remove(deadlist, sp);
5234
5235 if (!avl_is_empty(
5236 &cp->cache_defrag->kmd_moves_pending)) {
5237 /*
5238 * A pending move makes it unsafe to
5239 * destroy the slab, because even though
5240 * the move is no longer needed, the
5241 * context where that is determined
5242 * requires the slab to exist.
5243 * Fortunately, a pending move also
5244 * means we don't need to destroy the
5245 * slab here, since it will get
5246 * destroyed along with any other slabs
5247 * on the deadlist after the last
5248 * pending move completes.
5249 */
5250 list_insert_head(deadlist, sp);
5251 KMEM_STAT_ADD(kmem_move_stats.
5252 kms_endscan_slab_dead);
5253 return (-1);
5254 }
5255
5256 /*
5257 * Destroy the slab now if it was completely
5258 * freed while we dropped cache_lock and there
5259 * are no pending moves. Since slab_refcnt
5260 * cannot change once it reaches zero, no new
5261 * pending moves from that slab are possible.
5262 */
5263 cp->cache_defrag->kmd_deadcount--;
5264 cp->cache_slab_destroy++;
5265 mutex_exit(&cp->cache_lock);
5266 kmem_slab_destroy(cp, sp);
5267 KMEM_STAT_ADD(kmem_move_stats.
5268 kms_dead_slabs_freed);
5269 KMEM_STAT_ADD(kmem_move_stats.
5270 kms_endscan_slab_destroyed);
5271 mutex_enter(&cp->cache_lock);
5272 /*
5273 * Since we can't pick up the scan where we left
5274 * off, abort the scan and say nothing about the
5275 * number of reclaimable slabs.
5276 */
5277 return (-1);
5278 }
5279
5280 if (!success) {
5281 /*
5282 * Abort the scan if there is not enough memory
5283 * for the request and say nothing about the
5284 * number of reclaimable slabs.
5285 */
5286 KMEM_STAT_COND_ADD(s < max_slabs,
5287 kmem_move_stats.kms_endscan_nomem);
5288 return (-1);
5289 }
5290
5291 /*
5292 * The slab's position changed while the lock was
5293 * dropped, so we don't know where we are in the
5294 * sequence any more.
5295 */
5296 if (sp->slab_refcnt != refcnt) {
5297 /*
5298 * If this is a KMM_DEBUG move, the slab_refcnt
5299 * may have changed because we allocated a
5300 * destination buffer on the same slab. In that
5301 * case, we're not interested in counting it.
5302 */
5303 KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5304 (s < max_slabs),
5305 kmem_move_stats.kms_endscan_refcnt_changed);
5306 return (-1);
5307 }
5308 if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) {
5309 KMEM_STAT_COND_ADD(s < max_slabs,
5310 kmem_move_stats.kms_endscan_nomove_changed);
5311 return (-1);
5312 }
5313
5314 /*
5315 * Generating a move request allocates a destination
5316 * buffer from the slab layer, bumping the first partial
5317 * slab if it is completely allocated. If the current
5318 * slab becomes the first partial slab as a result, we
5319 * can't continue to scan backwards.
5320 *
5321 * If this is a KMM_DEBUG move and we allocated the
5322 * destination buffer from the last partial slab, then
5323 * the buffer we're moving is on the same slab and our
5324 * slab_refcnt has changed, causing us to return before
5325 * reaching here if there are no partial slabs left.
5326 */
5327 ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5328 if (sp == avl_first(&cp->cache_partial_slabs)) {
5329 /*
5330 * We're not interested in a second KMM_DEBUG
5331 * move.
5332 */
5333 goto end_scan;
5334 }
5335 }
5336 }
5337 end_scan:
5338
5339 KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5340 (s < max_slabs) &&
5341 (sp == avl_first(&cp->cache_partial_slabs)),
5342 kmem_move_stats.kms_endscan_freelist);
5343
5344 return (s);
5345 }
5346
5347 typedef struct kmem_move_notify_args {
5348 kmem_cache_t *kmna_cache;
5349 void *kmna_buf;
5350 } kmem_move_notify_args_t;
5351
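/*
 * Dispatched on kmem_taskq by kmem_cache_move_notify(): look up the slab
 * backing the notified buffer and, if it is still worth relocating, issue a
 * move request flagged KMM_NOTIFY (dropping cache_lock around
 * kmem_move_begin(), as kmem_move_buffers() explains).
 */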
5352 static void
5353 kmem_cache_move_notify_task(void *arg)
5354 {
5355 kmem_move_notify_args_t *args = arg;
5356 kmem_cache_t *cp = args->kmna_cache;
5357 void *buf = args->kmna_buf;
5358 kmem_slab_t *sp;
5359
5360 ASSERT(taskq_member(kmem_taskq, curthread));
5361 ASSERT(list_link_active(&cp->cache_link));
5362
5363 kmem_free(args, sizeof (kmem_move_notify_args_t));
5383 return;
5384 }
5385
5386 kmem_slab_move_yes(cp, sp, buf);
5387 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5388 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5389 mutex_exit(&cp->cache_lock);
5390 /* see kmem_move_buffers() about dropping the lock */
5391 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5392 mutex_enter(&cp->cache_lock);
5393 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5394 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5395 if (sp->slab_refcnt == 0) {
5396 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5397 list_remove(deadlist, sp);
5398
5399 if (!avl_is_empty(
5400 &cp->cache_defrag->kmd_moves_pending)) {
5401 list_insert_head(deadlist, sp);
5402 mutex_exit(&cp->cache_lock);
5403 KMEM_STAT_ADD(kmem_move_stats.
5404 kms_notify_slab_dead);
5405 return;
5406 }
5407
5408 cp->cache_defrag->kmd_deadcount--;
5409 cp->cache_slab_destroy++;
5410 mutex_exit(&cp->cache_lock);
5411 kmem_slab_destroy(cp, sp);
5412 KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5413 KMEM_STAT_ADD(kmem_move_stats.
5414 kms_notify_slab_destroyed);
5415 return;
5416 }
5417 } else {
5418 kmem_slab_move_yes(cp, sp, buf);
5419 }
5420 mutex_exit(&cp->cache_lock);
5421 }
5422
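/*
 * Entry point for clients: once an object that the client earlier could not
 * move (typically one for which it responded KMEM_CBRC_LATER) becomes
 * movable, the client may call this to prompt kmem to issue a fresh move
 * request from task context. A hedged sketch of client usage, where object_t,
 * o_holds, o_move_wanted, and object_cache are all hypothetical:
 *
 *     void
 *     object_rele(object_t *op)
 *     {
 *             if (--op->o_holds == 0 && op->o_move_wanted) {
 *                     op->o_move_wanted = B_FALSE;
 *                     kmem_cache_move_notify(object_cache, op);
 *             }
 *     }
 */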
5423 void
5424 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5425 {
5426 kmem_move_notify_args_t *args;
5427
5428 KMEM_STAT_ADD(kmem_move_stats.kms_notify);
5429 args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5430 if (args != NULL) {
5431 args->kmna_cache = cp;
5432 args->kmna_buf = buf;
5433 if (!taskq_dispatch(kmem_taskq,
5434 (task_func_t *)kmem_cache_move_notify_task, args,
5435 TQ_NOSLEEP))
5436 kmem_free(args, sizeof (kmem_move_notify_args_t));
5437 }
5438 }
5439
5440 static void
5441 kmem_cache_defrag(kmem_cache_t *cp)
5442 {
5443 size_t n;
5444
5445 ASSERT(cp->cache_defrag != NULL);
5446
5447 mutex_enter(&cp->cache_lock);
5448 n = avl_numnodes(&cp->cache_partial_slabs);
5449 if (n > 1) {
5450 /* kmem_move_buffers() drops and reacquires cache_lock */
5451 KMEM_STAT_ADD(kmem_move_stats.kms_defrags);
5452 cp->cache_defrag->kmd_defrags++;
5453 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5454 }
5455 mutex_exit(&cp->cache_lock);
5456 }
5457
5458 /* Is this cache above the fragmentation threshold? */
5459 static boolean_t
5460 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5461 {
5462 /*
5463 * nfree kmem_frag_numer
5464 * ------------------ > ---------------
5465 * cp->cache_buftotal kmem_frag_denom
5466 */
5467 return ((nfree * kmem_frag_denom) >
5468 (cp->cache_buftotal * kmem_frag_numer));
5469 }
5470
5471 static boolean_t
5530 if (kmd->kmd_consolidate > 0) {
5531 kmd->kmd_consolidate--;
5532 mutex_exit(&cp->cache_lock);
5533 kmem_cache_reap(cp);
5534 return;
5535 }
5536
5537 if (kmem_cache_is_fragmented(cp, &reap)) {
5538 size_t slabs_found;
5539
5540 /*
5541 * Consolidate reclaimable slabs from the end of the partial
5542 * slab list (scan at most kmem_reclaim_scan_range slabs to find
5543 * reclaimable slabs). Keep track of how many candidate slabs we
5544 * looked for and how many we actually found so we can adjust
5545 * the definition of a candidate slab if we're having trouble
5546 * finding them.
5547 *
5548 * kmem_move_buffers() drops and reacquires cache_lock.
5549 */
5550 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5551 kmd->kmd_scans++;
5552 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5553 kmem_reclaim_max_slabs, 0);
5554 if (slabs_found >= 0) {
5555 kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5556 kmd->kmd_slabs_found += slabs_found;
5557 }
5558
5559 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5560 kmd->kmd_tries = 0;
5561
5562 /*
5563 * If we had difficulty finding candidate slabs in
5564 * previous scans, adjust the threshold so that
5565 * candidates are easier to find.
5566 */
5567 if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5568 kmem_adjust_reclaim_threshold(kmd, -1);
5569 } else if ((kmd->kmd_slabs_found * 2) <
5570 kmd->kmd_slabs_sought) {
5571 kmem_adjust_reclaim_threshold(kmd, 1);
5572 }
5573 kmd->kmd_slabs_sought = 0;
5574 kmd->kmd_slabs_found = 0;
5575 }
5576 } else {
5577 kmem_reset_reclaim_threshold(cp->cache_defrag);
5578 #ifdef DEBUG
5579 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5580 /*
5581 * In a debug kernel we want the consolidator to
5582 * run occasionally even when there is plenty of
5583 * memory.
5584 */
5585 uint16_t debug_rand;
5586
5587 (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5588 if (!kmem_move_noreap &&
5589 ((debug_rand % kmem_mtb_reap) == 0)) {
5590 mutex_exit(&cp->cache_lock);
5591 KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps);
5592 kmem_cache_reap(cp);
5593 return;
5594 } else if ((debug_rand % kmem_mtb_move) == 0) {
5595 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5596 KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans);
5597 kmd->kmd_scans++;
5598 (void) kmem_move_buffers(cp,
5599 kmem_reclaim_scan_range, 1, KMM_DEBUG);
5600 }
5601 }
5602 #endif /* DEBUG */
5603 }
5604
5605 mutex_exit(&cp->cache_lock);
5606
5607 if (reap) {
5608 KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps);
5609 kmem_depot_ws_reap(cp);
5610 }
5611 }
|
142 * (The system won't be getting the slab back as long as the
143 * immovable object holds it hostage, so there's no point in moving
144 * any of its objects.)
145 * LATER: The client is using the object and cannot move it now, so kmem
146 * frees the new object (the unused copy destination). kmem still
147 * attempts to move other objects off the slab, since it expects to
148 * succeed in clearing the slab in a later callback. The client
149 * should use LATER instead of NO if the object is likely to become
150 * movable very soon.
151 * DONT_NEED: The client no longer needs the object, so kmem frees the old along
152 * with the new object (the unused copy destination). This response
153 * is the client's opportunity to be a model citizen and give back as
154 * much as it can.
155 * DONT_KNOW: The client does not know about the object because
156 * a) the client has just allocated the object and not yet put it
157 * wherever it expects to find known objects,
158 * b) the client has removed the object from wherever it expects to
159 * find known objects and is about to free it, or
160 * c) the client has freed the object.
161 * In all these cases (a, b, and c) kmem frees the new object (the
162 * unused copy destination). In the first case, the object is in
163 * use and the correct action is that for LATER; in the latter two
164 * cases, we know that the object is either freed or about to be
165 * freed, in which case it is either already in a magazine or about
166 * to be in one. In these cases, we know that the object will either
167 * be reallocated and reused, or it will end up in a full magazine
168 * that will be reaped (thereby liberating the slab). Because it
169 * is prohibitively expensive to differentiate these cases, and
170 * because the defrag code is executed when we're low on memory
171 * (thereby biasing the system to reclaim full magazines) we treat
172 * all DONT_KNOW cases as LATER and rely on cache reaping to
173 * generally clean up full magazines. While we take the same action
174 * for these cases, we maintain their semantic distinction: if
175 * defragmentation is not occurring, it is useful to know if this
176 * is due to objects in use (LATER) or objects in an unknown state
177 * of transition (DONT_KNOW).
178 *
179 * 2.3 Object States
180 *
181 * Neither kmem nor the client can be assumed to know the object's whereabouts
182 * at the time of the callback. An object belonging to a kmem cache may be in
183 * any of the following states:
184 *
185 * 1. Uninitialized on the slab
186 * 2. Allocated from the slab but not constructed (still uninitialized)
187 * 3. Allocated from the slab, constructed, but not yet ready for business
188 * (not in a valid state for the move callback)
189 * 4. In use (valid and known to the client)
190 * 5. About to be freed (no longer in a valid state for the move callback)
191 * 6. Freed to a magazine (still constructed)
192 * 7. Allocated from a magazine, not yet ready for business (not in a valid
193 * state for the move callback), and about to return to state #4
194 * 8. Deconstructed on a magazine that is about to be freed
195 * 9. Freed to the slab
196 *
197 * Since the move callback may be called at any time while the object is in any
280 * c_objects_lock is held. Note that after acquiring the lock, the client must
281 * recheck the o_container pointer in case the object was removed just before
282 * acquiring the lock.
283 *
284 * When the client is about to free an object, it must first remove that object
285 * from the list, hash, or other structure where it is kept. At that time, to
286 * mark the object so it can be distinguished from the remaining, known objects,
287 * the client sets the designated low order bit:
288 *
289 * mutex_enter(&container->c_objects_lock);
290 * object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
291 * list_remove(&container->c_objects, object);
292 * mutex_exit(&container->c_objects_lock);
293 *
294 * In the common case, the object is freed to the magazine layer, where it may
295 * be reused on a subsequent allocation without the overhead of calling the
296 * constructor. While in the magazine it appears allocated from the point of
297 * view of the slab layer, making it a candidate for the move callback. Most
298 * objects unrecognized by the client in the move callback fall into this
299 * category and are cheaply distinguished from known objects by the test
300 * described earlier. Because searching magazines is prohibitively expensive
301 * for kmem, clients that do not mark freed objects (and therefore return
302 * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
303 * efficacy reduced.
304 *
305 * Invalidating the designated pointer member before freeing the object marks
306 * the object to be avoided in the callback, and conversely, assigning a valid
307 * value to the designated pointer member after allocating the object makes the
308 * object fair game for the callback:
309 *
310 * ... allocate object ...
311 * ... set any initial state not set by the constructor ...
312 *
313 * mutex_enter(&container->c_objects_lock);
314 * list_insert_tail(&container->c_objects, object);
315 * membar_producer();
316 * object->o_container = container;
317 * mutex_exit(&container->c_objects_lock);
318 *
319 * Note that everything else must be valid before setting o_container makes the
320 * object fair game for the move callback. The membar_producer() call ensures
321 * that all the object's state is written to memory before setting the pointer
322 * that transitions the object from state #3 or #7 (allocated, constructed, not
323 * yet in use) to state #4 (in use, valid). That's important because the move
1034 static kmem_cache_t *kmem_bufctl_cache;
1035 static kmem_cache_t *kmem_bufctl_audit_cache;
1036
1037 static kmutex_t kmem_cache_lock; /* inter-cache linkage only */
1038 static list_t kmem_caches;
1039
1040 static taskq_t *kmem_taskq;
1041 static kmutex_t kmem_flags_lock;
1042 static vmem_t *kmem_metadata_arena;
1043 static vmem_t *kmem_msb_arena; /* arena for metadata caches */
1044 static vmem_t *kmem_cache_arena;
1045 static vmem_t *kmem_hash_arena;
1046 static vmem_t *kmem_log_arena;
1047 static vmem_t *kmem_oversize_arena;
1048 static vmem_t *kmem_va_arena;
1049 static vmem_t *kmem_default_arena;
1050 static vmem_t *kmem_firewall_va_arena;
1051 static vmem_t *kmem_firewall_arena;
1052
1053 /*
1054 * kmem slab consolidator thresholds (tunables)
1055 */
1056 size_t kmem_frag_minslabs = 101; /* minimum total slabs */
1057 size_t kmem_frag_numer = 1; /* free buffers (numerator) */
1058 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1059 /*
1060 * Maximum number of slabs from which to move buffers during a single
1061 * maintenance interval while the system is not low on memory.
1062 */
1063 size_t kmem_reclaim_max_slabs = 1;
1064 /*
1065 * Number of slabs to scan backwards from the end of the partial slab list
1066 * when searching for buffers to relocate.
1067 */
1068 size_t kmem_reclaim_scan_range = 12;
1069
1070 /* consolidator knobs */
1071 boolean_t kmem_move_noreap;
1072 boolean_t kmem_move_blocked;
1073 boolean_t kmem_move_fulltilt;
1074 boolean_t kmem_move_any_partial;
1075
1076 #ifdef DEBUG
1077 /*
1078 * kmem consolidator debug tunables:
1079 * Ensure code coverage by occasionally running the consolidator even when the
1080 * caches are not fragmented (they may never be). These tunables are the mean
1081 * number of cache maintenance intervals (kmem_cache_update()) between runs.
1082 */
1083 uint32_t kmem_mtb_move = 60; /* defrag 1 slab (~15min) */
1084 uint32_t kmem_mtb_reap = 1800; /* defrag all slabs (~7.5hrs) */
1085 #endif /* DEBUG */
1086
1087 static kmem_cache_t *kmem_defrag_cache;
1088 static kmem_cache_t *kmem_move_cache;
1089 static taskq_t *kmem_move_taskq;
1860 */
1861 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1862 list_insert_tail(deadlist, sp);
1863 } else {
1864 list_insert_head(deadlist, sp);
1865 }
1866 cp->cache_defrag->kmd_deadcount++;
1867 mutex_exit(&cp->cache_lock);
1868 }
1869 return;
1870 }
1871
1872 if (bcp->bc_next == NULL) {
1873 /* Transition the slab from completely allocated to partial. */
1874 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1875 ASSERT(sp->slab_chunks > 1);
1876 list_remove(&cp->cache_complete_slabs, sp);
1877 cp->cache_complete_slab_count--;
1878 avl_add(&cp->cache_partial_slabs, sp);
1879 } else {
1880 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1881 }
1882
1883 ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1884 (cp->cache_complete_slab_count +
1885 avl_numnodes(&cp->cache_partial_slabs) +
1886 (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1887 mutex_exit(&cp->cache_lock);
1888 }
1889
1890 /*
1891 * Return -1 if kmem_error() was called, 1 if the constructor fails, 0 if successful.
1892 */
1893 static int
1894 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1895 caddr_t caller)
1896 {
1897 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1898 kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1899 uint32_t mtbf;
1900
3509 kmcp->kmc_move_yes.value.ui64 = 0;
3510 kmcp->kmc_move_no.value.ui64 = 0;
3511 kmcp->kmc_move_later.value.ui64 = 0;
3512 kmcp->kmc_move_dont_need.value.ui64 = 0;
3513 kmcp->kmc_move_dont_know.value.ui64 = 0;
3514 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3515 kmcp->kmc_move_slabs_freed.value.ui64 = 0;
3516 kmcp->kmc_defrag.value.ui64 = 0;
3517 kmcp->kmc_scan.value.ui64 = 0;
3518 kmcp->kmc_move_reclaimable.value.ui64 = 0;
3519 } else {
3520 int64_t reclaimable;
3521
3522 kmem_defrag_t *kd = cp->cache_defrag;
3523 kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks;
3524 kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes;
3525 kmcp->kmc_move_no.value.ui64 = kd->kmd_no;
3526 kmcp->kmc_move_later.value.ui64 = kd->kmd_later;
3527 kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need;
3528 kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know;
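		/* buffer hunting has been removed; this kstat remains but always reads 0 */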
3529 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3530 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
3531 kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags;
3532 kmcp->kmc_scan.value.ui64 = kd->kmd_scans;
3533
3534 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3535 reclaimable = MAX(reclaimable, 0);
3536 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3537 kmcp->kmc_move_reclaimable.value.ui64 = reclaimable;
3538 }
3539
3540 mutex_exit(&cp->cache_lock);
3541 return (0);
3542 }
3543
3544 /*
3545 * Return a named statistic about a particular cache.
3546 * This shouldn't be called very often, so it's currently designed for
3547 * simplicity (leverages existing kstat support) rather than efficiency.
3548 */
3549 uint64_t
4080 kmem_cache_free(kmem_defrag_cache, defrag); /* unused */
4081 }
4082 }
4083
4084 void
4085 kmem_cache_destroy(kmem_cache_t *cp)
4086 {
4087 int cpu_seqid;
4088
4089 /*
4090 * Remove the cache from the global cache list so that no one else
4091 * can schedule tasks on its behalf, wait for any pending tasks to
4092 * complete, purge the cache, and then destroy it.
4093 */
4094 mutex_enter(&kmem_cache_lock);
4095 list_remove(&kmem_caches, cp);
4096 mutex_exit(&kmem_cache_lock);
4097
4098 if (kmem_taskq != NULL)
4099 taskq_wait(kmem_taskq);
4100
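	/*
	 * Only caches set up for defragmentation (cache_defrag != NULL) ever
	 * dispatch work to kmem_move_taskq, so there is nothing to wait for
	 * otherwise.
	 */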
4101 if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)
4102 taskq_wait(kmem_move_taskq);
4103
4104 kmem_cache_magazine_purge(cp);
4105
4106 mutex_enter(&cp->cache_lock);
4107 if (cp->cache_buftotal != 0)
4108 cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty",
4109 cp->cache_name, (void *)cp);
4110 if (cp->cache_defrag != NULL) {
4111 avl_destroy(&cp->cache_defrag->kmd_moves_pending);
4112 list_destroy(&cp->cache_defrag->kmd_deadlist);
4113 kmem_cache_free(kmem_defrag_cache, cp->cache_defrag);
4114 cp->cache_defrag = NULL;
4115 }
4116 /*
4117 * The cache is now dead. There should be no further activity. We
4118 * enforce this by setting land mines in the constructor, destructor,
4119 * reclaim, and move routines that induce a kernel text fault if
4120 * invoked.
4121 */
4608 return (B_FALSE);
4609 }
4610
4611 if ((refcnt == 1) || kmem_move_any_partial) {
4612 return (refcnt < sp->slab_chunks);
4613 }
4614
4615 /*
4616 * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4617 * slabs with a progressively higher percentage of used buffers can be
4618 * reclaimed until the cache as a whole is no longer fragmented.
4619 *
4620 * sp->slab_refcnt kmd_reclaim_numer
4621 * --------------- < ------------------
4622 * sp->slab_chunks KMEM_VOID_FRACTION
4623 */
4624 return ((refcnt * KMEM_VOID_FRACTION) <
4625 (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4626 }
4627
4628 /*
4629 * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4630 * or when the buffer is freed.
4631 */
4632 static void
4633 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4634 {
4635 ASSERT(MUTEX_HELD(&cp->cache_lock));
4636 ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4637
4638 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4639 return;
4640 }
4641
4642 if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4643 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4644 avl_remove(&cp->cache_partial_slabs, sp);
4645 sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4646 sp->slab_stuck_offset = (uint32_t)-1;
4647 avl_add(&cp->cache_partial_slabs, sp);
4648 }
4671 }
4672
4673 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4674
4675 /*
4676 * The move callback takes two buffer addresses, the buffer to be moved, and a
4677 * newly allocated and constructed buffer selected by kmem as the destination.
4678 * It also takes the size of the buffer and an optional user argument specified
4679 * at cache creation time. kmem guarantees that the buffer to be moved has not
4680 * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4681 * guarantee the present whereabouts of the buffer to be moved, so it is up to
4682 * the client to safely determine whether or not it is still using the buffer.
4683 * The client must not free either of the buffers passed to the move callback,
4684 * since kmem wants to free them directly to the slab layer. The client response
4685 * tells kmem which of the two buffers to free:
4686 *
4687 * YES kmem frees the old buffer (the move was successful)
4688 * NO kmem frees the new buffer, marks the slab of the old buffer
4689 * non-reclaimable to avoid bothering the client again
4690 * LATER kmem frees the new buffer, increments slab_later_count
4691 * DONT_KNOW kmem frees the new buffer
4692 * DONT_NEED kmem frees both the old buffer and the new buffer
4693 *
4694 * The pending callback argument now being processed contains both of the
4695 * buffers (old and new) passed to the move callback function, the slab of the
4696 * old buffer, and flags related to the move request, such as whether or not the
4697 * system was desperate for memory.
4698 *
4699 * Slabs are not freed while there is a pending callback, but instead are kept
4700 * on a deadlist, which is drained after the last callback completes. This means
4701 * that slabs are safe to access until kmem_move_end(), no matter how many of
4702 * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4703 * zero for as long as the slab remains on the deadlist and until the slab is
4704 * freed.
4705 */
4706 static void
4707 kmem_move_buffer(kmem_move_t *callback)
4708 {
4709 kmem_cbrc_t response;
4710 kmem_slab_t *sp = callback->kmm_from_slab;
4711 kmem_cache_t *cp = sp->slab_cache;
4712 boolean_t free_on_slab;
4713
4714 ASSERT(taskq_member(kmem_move_taskq, curthread));
4715 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4716 ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4717
4718 /*
4719 * The number of allocated buffers on the slab may have changed since we
4720 * last checked the slab's reclaimability (when the pending move was
4721 * enqueued), or the client may have responded NO when asked to move
4722 * another buffer on the same slab.
4723 */
4724 if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4725 kmem_slab_free(cp, callback->kmm_to_buf);
4726 kmem_move_end(cp, callback);
4727 return;
4728 }
4729
4730 /*
4731 * Checking the slab layer is easy, so we might as well do that here
4732 * in case we can avoid bothering the client.
4733 */
4734 mutex_enter(&cp->cache_lock);
4735 free_on_slab = (kmem_slab_allocated(cp, sp,
4736 callback->kmm_from_buf) == NULL);
4737 mutex_exit(&cp->cache_lock);
4738
4739 if (free_on_slab) {
4740 kmem_slab_free(cp, callback->kmm_to_buf);
4741 kmem_move_end(cp, callback);
4742 return;
4743 }
4744
4745 if (cp->cache_flags & KMF_BUFTAG) {
4746 /*
4747 * Make kmem_cache_alloc_debug() apply the constructor for us.
4748 */
4749 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4750 KM_NOSLEEP, 1, caller()) != 0) {
4751 kmem_move_end(cp, callback);
4752 return;
4753 }
4754 } else if (cp->cache_constructor != NULL &&
4755 cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4756 KM_NOSLEEP) != 0) {
4757 atomic_inc_64(&cp->cache_alloc_fail);
4758 kmem_slab_free(cp, callback->kmm_to_buf);
4759 kmem_move_end(cp, callback);
4760 return;
4761 }
4762
4763 cp->cache_defrag->kmd_callbacks++;
4764 cp->cache_defrag->kmd_thread = curthread;
4765 cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4766 cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4767 DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4768 callback);
4769
4770 response = cp->cache_move(callback->kmm_from_buf,
4771 callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4772
4773 DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4774 callback, kmem_cbrc_t, response);
4775 cp->cache_defrag->kmd_thread = NULL;
4776 cp->cache_defrag->kmd_from_buf = NULL;
4777 cp->cache_defrag->kmd_to_buf = NULL;
4778
4779 if (response == KMEM_CBRC_YES) {
4780 cp->cache_defrag->kmd_yes++;
4781 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4782 /* slab safe to access until kmem_move_end() */
4783 if (sp->slab_refcnt == 0)
4784 cp->cache_defrag->kmd_slabs_freed++;
4785 mutex_enter(&cp->cache_lock);
4786 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4787 mutex_exit(&cp->cache_lock);
4788 kmem_move_end(cp, callback);
4789 return;
4790 }
4791
4792 switch (response) {
4793 case KMEM_CBRC_NO:
4794 cp->cache_defrag->kmd_no++;
4795 mutex_enter(&cp->cache_lock);
4796 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4797 mutex_exit(&cp->cache_lock);
4798 break;
4799 case KMEM_CBRC_LATER:
4800 cp->cache_defrag->kmd_later++;
4801 mutex_enter(&cp->cache_lock);
4802 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4803 mutex_exit(&cp->cache_lock);
4804 break;
4805 }
4806
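		/*
		 * Once this slab has accumulated KMEM_DISBELIEF LATER
		 * responses, stop taking the client at its word and treat
		 * the buffer as if the response had been NO.
		 */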
4807 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4808 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4809 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4810 sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4811 callback->kmm_from_buf);
4812 }
4813 mutex_exit(&cp->cache_lock);
4814 break;
4815 case KMEM_CBRC_DONT_NEED:
4816 cp->cache_defrag->kmd_dont_need++;
4817 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4818 if (sp->slab_refcnt == 0)
4819 cp->cache_defrag->kmd_slabs_freed++;
4820 mutex_enter(&cp->cache_lock);
4821 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4822 mutex_exit(&cp->cache_lock);
4823 break;
4824 case KMEM_CBRC_DONT_KNOW:
4825 /*
4826 * If we don't know if we can move this buffer or not, we'll
4827 * just assume that we can't: if the buffer is in fact free,
4828 * then it is sitting in one of the per-CPU magazines or in
4829 * a full magazine in the depot layer. Either way, because
4830 * defrag is induced in the same logic that reaps a cache,
4831 * it's likely that full magazines will be returned to the
4832 * system soon (thereby accomplishing what we're trying to
4833 * accomplish here: return those magazines to their slabs).
4834 * Given this, any work that we might do now to locate a buffer
4835 * in a magazine is wasted (and expensive!) work; we bump
4836 * a counter in this case and otherwise assume that we can't
4837 * move it.
4838 */
4839 cp->cache_defrag->kmd_dont_know++;
4840 break;
4841 default:
4842 panic("'%s' (%p) unexpected move callback response %d\n",
4843 cp->cache_name, (void *)cp, response);
4844 }
4845
4846 kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
4847 kmem_move_end(cp, callback);
4848 }
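
/*
 * For illustration, a sketch of the client half of the exchange above: a move
 * callback registered with kmem_cache_set_move(). Everything prefixed with
 * object_ below is a hypothetical client construct; only the callback
 * signature and the kmem_cbrc_t responses come from kmem, and a real callback
 * would also return KMEM_CBRC_NO or KMEM_CBRC_DONT_NEED where those apply.
 *
 *	static kmem_cbrc_t
 *	object_move(void *old, void *new, size_t size, void *arg)
 *	{
 *		object_cache_t *ocp = arg;
 *
 *		if (!object_is_known(ocp, old))
 *			return (KMEM_CBRC_DONT_KNOW);
 *
 *		if (object_is_busy(ocp, old))
 *			return (KMEM_CBRC_LATER);
 *
 *		bcopy(old, new, size);
 *		object_switch_references(ocp, old, new);
 *		return (KMEM_CBRC_YES);
 *	}
 *
 *	kmem_cache_set_move(object_cache, object_move);
 */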
4849
4850 /* Return B_FALSE if there is insufficient memory for the move request. */
4851 static boolean_t
4852 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
4853 {
4854 void *to_buf;
4855 avl_index_t index;
4856 kmem_move_t *callback, *pending;
4857 ulong_t n;
4858
4859 ASSERT(taskq_member(kmem_taskq, curthread));
4860 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4861 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
4862
4863 callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
4864
4865 if (callback == NULL)
4866 return (B_FALSE);
4867
4868 callback->kmm_from_slab = sp;
4869 callback->kmm_from_buf = buf;
4870 callback->kmm_flags = flags;
4871
4872 mutex_enter(&cp->cache_lock);
4873
4874 n = avl_numnodes(&cp->cache_partial_slabs);
4875 if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
4876 mutex_exit(&cp->cache_lock);
4877 kmem_cache_free(kmem_move_cache, callback);
4878 return (B_TRUE); /* there is no need for the move request */
4879 }
4880
4881 pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
4882 if (pending != NULL) {
4883 /*
4884 * If the move is already pending and we're desperate now,
4885 * update the move flags.
4886 */
4887 if (flags & KMM_DESPERATE) {
4888 pending->kmm_flags |= KMM_DESPERATE;
4889 }
4890 mutex_exit(&cp->cache_lock);
4891 kmem_cache_free(kmem_move_cache, callback);
4892 return (B_TRUE);
4893 }
4894
4895 to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
4896 B_FALSE);
4897 callback->kmm_to_buf = to_buf;
4898 avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
4899
4900 mutex_exit(&cp->cache_lock);
4901
4902 if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
4903 callback, TQ_NOSLEEP)) {
4904 mutex_enter(&cp->cache_lock);
4905 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4906 mutex_exit(&cp->cache_lock);
4907 kmem_slab_free(cp, to_buf);
4908 kmem_cache_free(kmem_move_cache, callback);
4909 return (B_FALSE);
4910 }
4911
4912 return (B_TRUE);
4913 }
4914
4915 static void
4916 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
4917 {
4918 avl_index_t index;
4919
4920 ASSERT(cp->cache_defrag != NULL);
4921 ASSERT(taskq_member(kmem_move_taskq, curthread));
4922 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4923
4929 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
4930 kmem_slab_t *sp;
4931
4932 /*
4933 * The last pending move completed. Release all slabs from the
4934 * front of the dead list except for any slab at the tail that
4935 * needs to be released from the context of kmem_move_buffers().
4936 * kmem deferred unmapping the buffers on these slabs in order
4937 * to guarantee that buffers passed to the move callback have
4938 * been touched only by kmem or by the client itself.
4939 */
4940 while ((sp = list_remove_head(deadlist)) != NULL) {
4941 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
4942 list_insert_tail(deadlist, sp);
4943 break;
4944 }
4945 cp->cache_defrag->kmd_deadcount--;
4946 cp->cache_slab_destroy++;
4947 mutex_exit(&cp->cache_lock);
4948 kmem_slab_destroy(cp, sp);
4949 mutex_enter(&cp->cache_lock);
4950 }
4951 }
4952 mutex_exit(&cp->cache_lock);
4953 kmem_cache_free(kmem_move_cache, callback);
4954 }
4955
4956 /*
4957 * Move buffers from least used slabs first by scanning backwards from the end
4958 * of the partial slab list. Scan at most max_scan candidate slabs and move
4959 * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
4960 * If desperate to reclaim memory, move buffers from any partial slab; otherwise
4961 * skip slabs with a ratio of allocated buffers at or above the current
4962 * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
4963 * scan is aborted) so that the caller can adjust the reclaimability threshold
4964 * depending on how many reclaimable slabs it finds.
4965 *
4966 * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
4967 * move request, since it is not valid for kmem_move_begin() to call
4968 * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
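 *
 * Around each request the scan uses the same MOVE_PENDING protocol seen in
 * kmem_cache_move_notify_task() below; a condensed sketch of that protocol
 * (not the full body of this function) is:
 *
 *	sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
 *	mutex_exit(&cp->cache_lock);
 *	success = kmem_move_begin(cp, sp, buf, flags);
 *	mutex_enter(&cp->cache_lock);
 *	ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
 *	sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
 *
 * The MOVE_PENDING flag keeps a slab that was completely freed while the lock
 * was dropped from being destroyed out from under this function:
 * kmem_move_end() leaves such a slab on the deadlist for the code below to
 * handle.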
5073 list_t *deadlist =
5074 &cp->cache_defrag->kmd_deadlist;
5075 list_remove(deadlist, sp);
5076
5077 if (!avl_is_empty(
5078 &cp->cache_defrag->kmd_moves_pending)) {
5079 /*
5080 * A pending move makes it unsafe to
5081 * destroy the slab, because even though
5082 * the move is no longer needed, the
5083 * context where that is determined
5084 * requires the slab to exist.
5085 * Fortunately, a pending move also
5086 * means we don't need to destroy the
5087 * slab here, since it will get
5088 * destroyed along with any other slabs
5089 * on the deadlist after the last
5090 * pending move completes.
5091 */
5092 list_insert_head(deadlist, sp);
5093 return (-1);
5094 }
5095
5096 /*
5097 * Destroy the slab now if it was completely
5098 * freed while we dropped cache_lock and there
5099 * are no pending moves. Since slab_refcnt
5100 * cannot change once it reaches zero, no new
5101 * pending moves from that slab are possible.
5102 */
5103 cp->cache_defrag->kmd_deadcount--;
5104 cp->cache_slab_destroy++;
5105 mutex_exit(&cp->cache_lock);
5106 kmem_slab_destroy(cp, sp);
5107 mutex_enter(&cp->cache_lock);
5108 /*
5109 * Since we can't pick up the scan where we left
5110 * off, abort the scan and say nothing about the
5111 * number of reclaimable slabs.
5112 */
5113 return (-1);
5114 }
5115
5116 if (!success) {
5117 /*
5118 * Abort the scan if there is not enough memory
5119 * for the request and say nothing about the
5120 * number of reclaimable slabs.
5121 */
5122 return (-1);
5123 }
5124
5125 /*
5126 * The slab's position changed while the lock was
5127 * dropped, so we don't know where we are in the
5128 * sequence any more.
5129 */
5130 if (sp->slab_refcnt != refcnt) {
5131 /*
5132 * If this is a KMM_DEBUG move, the slab_refcnt
5133 * may have changed because we allocated a
5134 * destination buffer on the same slab. In that
5135 * case, we're not interested in counting it.
5136 */
5137 return (-1);
5138 }
5139 if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)
5140 return (-1);
5141
5142 /*
5143 * Generating a move request allocates a destination
5144 * buffer from the slab layer, bumping the first partial
5145 * slab if it is completely allocated. If the current
5146 * slab becomes the first partial slab as a result, we
5147 * can't continue to scan backwards.
5148 *
5149 * If this is a KMM_DEBUG move and we allocated the
5150 * destination buffer from the last partial slab, then
5151 * the buffer we're moving is on the same slab and our
5152 * slab_refcnt has changed, causing us to return before
5153 * reaching here if there are no partial slabs left.
5154 */
5155 ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5156 if (sp == avl_first(&cp->cache_partial_slabs)) {
5157 /*
5158 * We're not interested in a second KMM_DEBUG
5159 * move.
5160 */
5161 goto end_scan;
5162 }
5163 }
5164 }
5165 end_scan:
5166
5167 return (s);
5168 }
5169
5170 typedef struct kmem_move_notify_args {
5171 kmem_cache_t *kmna_cache;
5172 void *kmna_buf;
5173 } kmem_move_notify_args_t;
5174
5175 static void
5176 kmem_cache_move_notify_task(void *arg)
5177 {
5178 kmem_move_notify_args_t *args = arg;
5179 kmem_cache_t *cp = args->kmna_cache;
5180 void *buf = args->kmna_buf;
5181 kmem_slab_t *sp;
5182
5183 ASSERT(taskq_member(kmem_taskq, curthread));
5184 ASSERT(list_link_active(&cp->cache_link));
5185
5186 kmem_free(args, sizeof (kmem_move_notify_args_t));
5206 return;
5207 }
5208
5209 kmem_slab_move_yes(cp, sp, buf);
5210 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5211 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5212 mutex_exit(&cp->cache_lock);
5213 /* see kmem_move_buffers() about dropping the lock */
5214 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5215 mutex_enter(&cp->cache_lock);
5216 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5217 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5218 if (sp->slab_refcnt == 0) {
5219 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5220 list_remove(deadlist, sp);
5221
5222 if (!avl_is_empty(
5223 &cp->cache_defrag->kmd_moves_pending)) {
5224 list_insert_head(deadlist, sp);
5225 mutex_exit(&cp->cache_lock);
5226 return;
5227 }
5228
5229 cp->cache_defrag->kmd_deadcount--;
5230 cp->cache_slab_destroy++;
5231 mutex_exit(&cp->cache_lock);
5232 kmem_slab_destroy(cp, sp);
5233 return;
5234 }
5235 } else {
5236 kmem_slab_move_yes(cp, sp, buf);
5237 }
5238 mutex_exit(&cp->cache_lock);
5239 }
5240
5241 void
5242 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5243 {
5244 kmem_move_notify_args_t *args;
5245
5246 args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5247 if (args != NULL) {
5248 args->kmna_cache = cp;
5249 args->kmna_buf = buf;
5250 if (!taskq_dispatch(kmem_taskq,
5251 (task_func_t *)kmem_cache_move_notify_task, args,
5252 TQ_NOSLEEP))
5253 kmem_free(args, sizeof (kmem_move_notify_args_t));
5254 }
5255 }
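
/*
 * A sketch of how a client might use the notification above: once an object
 * that previously forced a KMEM_CBRC_LATER (or NO) response becomes movable
 * again, telling kmem lets the copy be retried without waiting for the next
 * scan. The object type, hold count, and cache handle are hypothetical client
 * constructs.
 *
 *	static void
 *	object_rele(object_t *op)
 *	{
 *		if (atomic_dec_32_nv(&op->o_hold_count) == 0)
 *			kmem_cache_move_notify(object_cache, op);
 *	}
 */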
5256
5257 static void
5258 kmem_cache_defrag(kmem_cache_t *cp)
5259 {
5260 size_t n;
5261
5262 ASSERT(cp->cache_defrag != NULL);
5263
5264 mutex_enter(&cp->cache_lock);
5265 n = avl_numnodes(&cp->cache_partial_slabs);
5266 if (n > 1) {
5267 /* kmem_move_buffers() drops and reacquires cache_lock */
5268 cp->cache_defrag->kmd_defrags++;
5269 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5270 }
5271 mutex_exit(&cp->cache_lock);
5272 }
5273
5274 /* Is this cache above the fragmentation threshold? */
5275 static boolean_t
5276 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5277 {
5278 /*
5279 *	nfree			kmem_frag_numer
5280 * ------------------ > ---------------
5281 * cp->cache_buftotal	kmem_frag_denom
5282 */
5283 return ((nfree * kmem_frag_denom) >
5284 (cp->cache_buftotal * kmem_frag_numer));
5285 }
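
/*
 * A worked instance of the cross-multiplied test above, assuming purely for
 * illustration that kmem_frag_numer / kmem_frag_denom is 1/8: a cache with
 * cache_buftotal == 1000 and nfree == 200 gives 200 * 8 = 1600 > 1000 * 1 =
 * 1000, so the cache is over the threshold, while nfree == 100 gives 800,
 * which is not. Cross-multiplying keeps the test in integer arithmetic and
 * sidesteps division entirely.
 */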
5286
5287 static boolean_t
5346 if (kmd->kmd_consolidate > 0) {
5347 kmd->kmd_consolidate--;
5348 mutex_exit(&cp->cache_lock);
5349 kmem_cache_reap(cp);
5350 return;
5351 }
5352
5353 if (kmem_cache_is_fragmented(cp, &reap)) {
5354 		long slabs_found;	/* signed: kmem_move_buffers() may return -1 */
5355
5356 /*
5357 * Consolidate reclaimable slabs from the end of the partial
5358 * slab list (scan at most kmem_reclaim_scan_range slabs to find
5359 * reclaimable slabs). Keep track of how many candidate slabs we
5360 * looked for and how many we actually found so we can adjust
5361 * the definition of a candidate slab if we're having trouble
5362 * finding them.
5363 *
5364 * kmem_move_buffers() drops and reacquires cache_lock.
5365 */
5366 kmd->kmd_scans++;
5367 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5368 kmem_reclaim_max_slabs, 0);
5369 if (slabs_found >= 0) {
5370 kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5371 kmd->kmd_slabs_found += slabs_found;
5372 }
5373
5374 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5375 kmd->kmd_tries = 0;
5376
5377 /*
5378 * If we had difficulty finding candidate slabs in
5379 * previous scans, adjust the threshold so that
5380 * candidates are easier to find.
5381 */
5382 if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5383 kmem_adjust_reclaim_threshold(kmd, -1);
5384 } else if ((kmd->kmd_slabs_found * 2) <
5385 kmd->kmd_slabs_sought) {
5386 kmem_adjust_reclaim_threshold(kmd, 1);
5387 }
5388 kmd->kmd_slabs_sought = 0;
5389 kmd->kmd_slabs_found = 0;
5390 }
5391 } else {
5392 kmem_reset_reclaim_threshold(cp->cache_defrag);
5393 #ifdef DEBUG
5394 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5395 /*
5396 * In a debug kernel we want the consolidator to
5397 * run occasionally even when there is plenty of
5398 * memory.
5399 */
5400 uint16_t debug_rand;
5401
5402 (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5403 if (!kmem_move_noreap &&
5404 ((debug_rand % kmem_mtb_reap) == 0)) {
5405 mutex_exit(&cp->cache_lock);
5406 kmem_cache_reap(cp);
5407 return;
5408 } else if ((debug_rand % kmem_mtb_move) == 0) {
5409 kmd->kmd_scans++;
5410 (void) kmem_move_buffers(cp,
5411 kmem_reclaim_scan_range, 1, KMM_DEBUG);
5412 }
5413 }
5414 #endif /* DEBUG */
5415 }
5416
5417 mutex_exit(&cp->cache_lock);
5418
5419 if (reap)
5420 kmem_depot_ws_reap(cp);
5421 }