3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015 Joyent, Inc. All rights reserved.
24 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
25 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 /*
29 * Kernel memory allocator, as described in the following two papers and a
30 * statement about the consolidator:
31 *
32 * Jeff Bonwick,
33 * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
34 * Proceedings of the Summer 1994 Usenix Conference.
35 * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
36 *
37 * Jeff Bonwick and Jonathan Adams,
38 * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
39 * Arbitrary Resources.
40 * Proceedings of the 2001 Usenix Conference.
41 * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
42 *
43 * kmem Slab Consolidator Big Theory Statement:
143 * (The system won't be getting the slab back as long as the
144 * immovable object holds it hostage, so there's no point in moving
145 * any of its objects.)
146 * LATER: The client is using the object and cannot move it now, so kmem
147 * frees the new object (the unused copy destination). kmem still
148 * attempts to move other objects off the slab, since it expects to
149 * succeed in clearing the slab in a later callback. The client
150 * should use LATER instead of NO if the object is likely to become
151 * movable very soon.
152 * DONT_NEED: The client no longer needs the object, so kmem frees the old along
153 * with the new object (the unused copy destination). This response
154 * is the client's opportunity to be a model citizen and give back as
155 * much as it can.
156 * DONT_KNOW: The client does not know about the object because
157 * a) the client has just allocated the object and not yet put it
158 * wherever it expects to find known objects
159 * b) the client has removed the object from wherever it expects to
160 * find known objects and is about to free it, or
161 * c) the client has freed the object.
162 * In all these cases (a, b, and c) kmem frees the new object (the
163 * unused copy destination). In the first case, the object is in
164 * use and the correct action is that for LATER; in the latter two
165 * cases, we know that the object is either freed or about to be
166 * freed, in which case it is either already in a magazine or about
167 * to be in one. In these cases, we know that the object will either
168 * be reallocated and reused, or it will end up in a full magazine
169 * that will be reaped (thereby liberating the slab). Because it
170 * is prohibitively expensive to differentiate these cases, and
171 * because the defrag code is executed when we're low on memory
172 * (thereby biasing the system to reclaim full magazines) we treat
173 * all DONT_KNOW cases as LATER and rely on cache reaping to
174 * generally clean up full magazines. While we take the same action
175 * for these cases, we maintain their semantic distinction: if
176 * defragmentation is not occurring, it is useful to know if this
177 * is due to objects in use (LATER) or objects in an unknown state
178 * of transition (DONT_KNOW).
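 *
 * As an illustrative sketch (not part of this file), a client move callback
 * registered with kmem_cache_set_move() might map the responses above onto
 * its own bookkeeping like this, where object_t, container_t,
 * object_is_known(), object_is_pinned(), and object_switch() are
 * hypothetical client names:
 *
 *	static kmem_cbrc_t
 *	object_move(void *old, void *new, size_t size, void *arg)
 *	{
 *		object_t *op = old, *np = new;
 *		container_t *container = arg;
 *
 *		if (!object_is_known(container, op))
 *			return (KMEM_CBRC_DONT_KNOW);
 *		if (object_is_pinned(op))
 *			return (KMEM_CBRC_LATER);
 *		bcopy(op, np, size);
 *		object_switch(container, op, np);
 *		return (KMEM_CBRC_YES);
 *	}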
179 *
180 * 2.3 Object States
181 *
182 * Neither kmem nor the client can be assumed to know the object's whereabouts
183 * at the time of the callback. An object belonging to a kmem cache may be in
184 * any of the following states:
185 *
186 * 1. Uninitialized on the slab
187 * 2. Allocated from the slab but not constructed (still uninitialized)
188 * 3. Allocated from the slab, constructed, but not yet ready for business
189 * (not in a valid state for the move callback)
190 * 4. In use (valid and known to the client)
191 * 5. About to be freed (no longer in a valid state for the move callback)
192 * 6. Freed to a magazine (still constructed)
193 * 7. Allocated from a magazine, not yet ready for business (not in a valid
194 * state for the move callback), and about to return to state #4
195 * 8. Deconstructed on a magazine that is about to be freed
196 * 9. Freed to the slab
197 *
198 * Since the move callback may be called at any time while the object is in any
281 * c_objects_lock is held. Note that after acquiring the lock, the client must
282 * recheck the o_container pointer in case the object was removed just before
283 * acquiring the lock.
284 *
285 * When the client is about to free an object, it must first remove that object
286 * from the list, hash, or other structure where it is kept. At that time, to
287 * mark the object so it can be distinguished from the remaining, known objects,
288 * the client sets the designated low order bit:
289 *
290 * mutex_enter(&container->c_objects_lock);
291 * object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
292 * list_remove(&container->c_objects, object);
293 * mutex_exit(&container->c_objects_lock);
294 *
295 * In the common case, the object is freed to the magazine layer, where it may
296 * be reused on a subsequent allocation without the overhead of calling the
297 * constructor. While in the magazine it appears allocated from the point of
298 * view of the slab layer, making it a candidate for the move callback. Most
299 * objects unrecognized by the client in the move callback fall into this
300 * category and are cheaply distinguished from known objects by the test
301 * described earlier. Because searching magazines is prohibitively expensive
302 * for kmem, clients that do not mark freed objects (and therefore return
303 * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
304 * efficacy reduced.
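 *
 * A minimal sketch of that test (roughly what a client's "is this object
 * known?" check amounts to; it must also honor c_objects_lock as described
 * earlier, and the names follow the examples above):
 *
 *	if (op->o_container != container)
 *		return (KMEM_CBRC_DONT_KNOW);	/* freed, or not yet known */
 *
 * Because a freed object's o_container has the designated low order bit set,
 * and a newly allocated object has not yet been assigned a valid pointer,
 * this single comparison cheaply rejects objects sitting in magazines
 * without any searching.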
305 *
306 * Invalidating the designated pointer member before freeing the object marks
307 * the object to be avoided in the callback, and conversely, assigning a valid
308 * value to the designated pointer member after allocating the object makes the
309 * object fair game for the callback:
310 *
311 * ... allocate object ...
312 * ... set any initial state not set by the constructor ...
313 *
314 * mutex_enter(&container->c_objects_lock);
315 * list_insert_tail(&container->c_objects, object);
316 * membar_producer();
317 * object->o_container = container;
318 * mutex_exit(&container->c_objects_lock);
319 *
320 * Note that everything else must be valid before setting o_container makes the
321 * object fair game for the move callback. The membar_producer() call ensures
322 * that all the object's state is written to memory before setting the pointer
323 * that transitions the object from state #3 or #7 (allocated, constructed, not
324 * yet in use) to state #4 (in use, valid). That's important because the move
994 { 95, 64, 0, 512 },
995 { 143, 64, 0, 0 },
996 };
997
998 static uint32_t kmem_reaping;
999 static uint32_t kmem_reaping_idspace;
1000
1001 /*
1002 * kmem tunables
1003 */
1004 clock_t kmem_reap_interval; /* cache reaping rate [15 * HZ ticks] */
1005 int kmem_depot_contention = 3; /* max failed tryenters per real interval */
1006 pgcnt_t kmem_reapahead = 0; /* start reaping N pages before pageout */
1007 int kmem_panic = 1; /* whether to panic on error */
1008 int kmem_logging = 1; /* kmem_log_enter() override */
1009 uint32_t kmem_mtbf = 0; /* mean time between failures [default: off] */
1010 size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
1011 size_t kmem_content_log_size; /* content log size [2% of memory] */
1012 size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */
1013 size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */
1014 size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */
1015 size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
1016 size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */
1017 size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
1018 int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */
1019 size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */
1020 size_t kmem_minfirewall; /* hardware-enforced redzone threshold */
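
/*
 * Illustrative only: these tunables are ordinary kernel variables, so on a
 * debug system they are typically overridden from /etc/system before the
 * allocator initializes, for example (values are examples, not
 * recommendations):
 *
 *	set kmem_flags=0xf
 *	set kmem_content_maxsave=1024
 *
 * or patched on a live system with mdb -kw.
 */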
1021
1022 #ifdef DEBUG
1023 int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */
1024 #else
1025 int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */
1026 #endif
1027
1028 int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */
1029
1030 #ifdef _LP64
1031 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */
1032 #else
1033 size_t kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */
1034 #endif
1035
1036 #ifdef DEBUG
1037 int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
1038 #else
1039 int kmem_flags = 0;
1040 #endif
1041 int kmem_ready;
1042
1043 static kmem_cache_t *kmem_slab_cache;
1044 static kmem_cache_t *kmem_bufctl_cache;
1045 static kmem_cache_t *kmem_bufctl_audit_cache;
1046
1047 static kmutex_t kmem_cache_lock; /* inter-cache linkage only */
1048 static list_t kmem_caches;
1049
1050 static taskq_t *kmem_taskq;
1051 static kmutex_t kmem_flags_lock;
1052 static vmem_t *kmem_metadata_arena;
1053 static vmem_t *kmem_msb_arena; /* arena for metadata caches */
1054 static vmem_t *kmem_cache_arena;
1055 static vmem_t *kmem_hash_arena;
1056 static vmem_t *kmem_log_arena;
1057 static vmem_t *kmem_oversize_arena;
1058 static vmem_t *kmem_va_arena;
1059 static vmem_t *kmem_default_arena;
1060 static vmem_t *kmem_firewall_va_arena;
1061 static vmem_t *kmem_firewall_arena;
1062
1063 static int kmem_zerosized; /* # of zero-sized allocs */
1064
1065 /*
1066 * kmem slab consolidator thresholds (tunables)
1067 */
1068 size_t kmem_frag_minslabs = 101; /* minimum total slabs */
1069 size_t kmem_frag_numer = 1; /* free buffers (numerator) */
1070 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1071 /*
1072 * Maximum number of slabs from which to move buffers during a single
1073 * maintenance interval while the system is not low on memory.
1074 */
1075 size_t kmem_reclaim_max_slabs = 1;
1076 /*
1077 * Number of slabs to scan backwards from the end of the partial slab list
1078 * when searching for buffers to relocate.
1079 */
1080 size_t kmem_reclaim_scan_range = 12;
1081
1082 /* consolidator knobs */
1083 static boolean_t kmem_move_noreap;
1084 static boolean_t kmem_move_blocked;
1085 static boolean_t kmem_move_fulltilt;
1086 static boolean_t kmem_move_any_partial;
1087
1088 #ifdef DEBUG
1089 /*
1090 * kmem consolidator debug tunables:
1091 * Ensure code coverage by occasionally running the consolidator even when the
1092 * caches are not fragmented (they may never be). These intervals are mean time
1093 * in cache maintenance intervals (kmem_cache_update).
1094 */
1095 uint32_t kmem_mtb_move = 60; /* defrag 1 slab (~15min) */
1096 uint32_t kmem_mtb_reap = 1800; /* defrag all slabs (~7.5hrs) */
1097 #endif /* DEBUG */
1098
1099 static kmem_cache_t *kmem_defrag_cache;
1100 static kmem_cache_t *kmem_move_cache;
1101 static taskq_t *kmem_move_taskq;
1102
1103 static void kmem_cache_scan(kmem_cache_t *);
1104 static void kmem_cache_defrag(kmem_cache_t *);
1105 static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
1106
1107
1108 kmem_log_header_t *kmem_transaction_log;
1109 kmem_log_header_t *kmem_content_log;
1110 kmem_log_header_t *kmem_failure_log;
1111 kmem_log_header_t *kmem_slab_log;
1112 kmem_log_header_t *kmem_zerosized_log;
1113
1114 static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
1115
1116 #define KMEM_BUFTAG_LITE_ENTER(bt, count, caller) \
1117 if ((count) > 0) { \
1118 pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \
1119 pc_t *_e; \
1120 /* memmove() the old entries down one notch */ \
1121 for (_e = &_s[(count) - 1]; _e > _s; _e--) \
1122 *_e = *(_e - 1); \
1123 *_s = (uintptr_t)(caller); \
1124 }
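
/*
 * For example (illustrative), with count == 4 and an existing bt_history of
 * { A, B, C, D }, KMEM_BUFTAG_LITE_ENTER(bt, 4, E) leaves { E, A, B, C }:
 * the most recent caller always lands in bt_history[0] and the oldest entry
 * is shifted off the end.
 */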
1125
1126 #define KMERR_MODIFIED 0 /* buffer modified while on freelist */
1127 #define KMERR_REDZONE 1 /* redzone violation (write past end of buf) */
1128 #define KMERR_DUPFREE 2 /* freed a buffer twice */
1129 #define KMERR_BADADDR 3 /* freed a bad (unallocated) address */
1130 #define KMERR_BADBUFTAG 4 /* buftag corrupted */
1131 #define KMERR_BADBUFCTL 5 /* bufctl corrupted */
1132 #define KMERR_BADCACHE 6 /* freed a buffer to the wrong cache */
1873 */
1874 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1875 list_insert_tail(deadlist, sp);
1876 } else {
1877 list_insert_head(deadlist, sp);
1878 }
1879 cp->cache_defrag->kmd_deadcount++;
1880 mutex_exit(&cp->cache_lock);
1881 }
1882 return;
1883 }
1884
1885 if (bcp->bc_next == NULL) {
1886 /* Transition the slab from completely allocated to partial. */
1887 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1888 ASSERT(sp->slab_chunks > 1);
1889 list_remove(&cp->cache_complete_slabs, sp);
1890 cp->cache_complete_slab_count--;
1891 avl_add(&cp->cache_partial_slabs, sp);
1892 } else {
1893 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1894 }
1895
1896 ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1897 (cp->cache_complete_slab_count +
1898 avl_numnodes(&cp->cache_partial_slabs) +
1899 (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1900 mutex_exit(&cp->cache_lock);
1901 }
1902
1903 /*
1904 * Return -1 if kmem_error, 1 if constructor fails, 0 if successful.
1905 */
1906 static int
1907 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1908 caddr_t caller)
1909 {
1910 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1911 kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1912 uint32_t mtbf;
1913
2907 return (buf);
2908 }
2909
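/*
 * Worked example of the table lookup below (a sketch assuming the usual
 * KMEM_ALIGN_SHIFT of 3, i.e. 8-byte granularity): a 100-byte request
 * computes index = (100 - 1) >> 3 = 12, and kmem_alloc_table[12] points at
 * the smallest fixed-size cache whose buffers can hold 100 bytes. Requests
 * too large for both tables fall through to the oversize arena.
 */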
2910 void *
2911 kmem_alloc(size_t size, int kmflag)
2912 {
2913 size_t index;
2914 kmem_cache_t *cp;
2915 void *buf;
2916
2917 if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2918 cp = kmem_alloc_table[index];
2919 /* fall through to kmem_cache_alloc() */
2920
2921 } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2922 kmem_big_alloc_table_max) {
2923 cp = kmem_big_alloc_table[index];
2924 /* fall through to kmem_cache_alloc() */
2925
2926 } else {
2927 if (size == 0) {
2928 if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
2929 return (NULL);
2930
2931 /*
2932 * If this is a sleeping allocation or one that has
2933 * been specified to panic on allocation failure, we
2934 * consider it to be deprecated behavior to allocate
2935 * 0 bytes. If we have been configured to panic under
2936 * this condition, we panic; if to warn, we warn -- and
2937 * regardless, we log to the kmem_zerosized_log that
 2938 	 * this condition has occurred (which gives us
2939 * enough information to be able to debug it).
2940 */
2941 if (kmem_panic && kmem_panic_zerosized)
2942 panic("attempted to kmem_alloc() size of 0");
2943
2944 if (kmem_warn_zerosized) {
2945 cmn_err(CE_WARN, "kmem_alloc(): sleeping "
2946 "allocation with size of 0; "
2947 "see kmem_zerosized_log for details");
2948 }
2949
2950 kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
2951
2952 return (NULL);
2953 }
2954
2955 buf = vmem_alloc(kmem_oversize_arena, size,
2956 kmflag & KM_VMFLAGS);
2957 if (buf == NULL)
2958 kmem_log_event(kmem_failure_log, NULL, NULL,
2959 (void *)size);
2960 else if (KMEM_DUMP(kmem_slab_cache)) {
2961 /* stats for dump intercept */
2962 kmem_dump_oversize_allocs++;
2963 if (size > kmem_dump_oversize_max)
2964 kmem_dump_oversize_max = size;
2965 }
2966 return (buf);
2967 }
2968
2969 buf = kmem_cache_alloc(cp, kmflag);
2970 if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) {
2971 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2972 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2973 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2974
3547 kmcp->kmc_move_yes.value.ui64 = 0;
3548 kmcp->kmc_move_no.value.ui64 = 0;
3549 kmcp->kmc_move_later.value.ui64 = 0;
3550 kmcp->kmc_move_dont_need.value.ui64 = 0;
3551 kmcp->kmc_move_dont_know.value.ui64 = 0;
3552 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3553 kmcp->kmc_move_slabs_freed.value.ui64 = 0;
3554 kmcp->kmc_defrag.value.ui64 = 0;
3555 kmcp->kmc_scan.value.ui64 = 0;
3556 kmcp->kmc_move_reclaimable.value.ui64 = 0;
3557 } else {
3558 int64_t reclaimable;
3559
3560 kmem_defrag_t *kd = cp->cache_defrag;
3561 kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks;
3562 kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes;
3563 kmcp->kmc_move_no.value.ui64 = kd->kmd_no;
3564 kmcp->kmc_move_later.value.ui64 = kd->kmd_later;
3565 kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need;
3566 kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know;
3567 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3568 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
3569 kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags;
3570 kmcp->kmc_scan.value.ui64 = kd->kmd_scans;
3571
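		/*
		 * Hypothetical numbers to illustrate the estimate computed
		 * just below: with cache_bufslab == 500 free buffers on
		 * partial slabs, cache_maxchunks == 16 buffers per slab, and
		 * reap == 3 full magazines of mt_magsize == 15 rounds, the
		 * result is max(500 - 15, 0) + 3 * 15 = 530 reclaimable
		 * buffers.
		 */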
3572 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3573 reclaimable = MAX(reclaimable, 0);
3574 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3575 kmcp->kmc_move_reclaimable.value.ui64 = reclaimable;
3576 }
3577
3578 mutex_exit(&cp->cache_lock);
3579 return (0);
3580 }
3581
3582 /*
3583 * Return a named statistic about a particular cache.
3584 * This shouldn't be called very often, so it's currently designed for
3585 * simplicity (leverages existing kstat support) rather than efficiency.
3586 */
3587 uint64_t
4457 segkmem_alloc, segkmem_free, kmem_minfirewall < ULONG_MAX?
4458 kmem_firewall_va_arena : heap_arena, 0, VMC_DUMPSAFE |
4459 VM_SLEEP);
4460 }
4461
4462 kmem_cache_init(2, use_large_pages);
4463
4464 if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) {
4465 if (kmem_transaction_log_size == 0)
4466 kmem_transaction_log_size = kmem_maxavail() / 50;
4467 kmem_transaction_log = kmem_log_init(kmem_transaction_log_size);
4468 }
4469
4470 if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) {
4471 if (kmem_content_log_size == 0)
4472 kmem_content_log_size = kmem_maxavail() / 50;
4473 kmem_content_log = kmem_log_init(kmem_content_log_size);
4474 }
4475
4476 kmem_failure_log = kmem_log_init(kmem_failure_log_size);
4477 kmem_slab_log = kmem_log_init(kmem_slab_log_size);
4478 kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
4479
4480 /*
4481 * Initialize STREAMS message caches so allocb() is available.
4482 * This allows us to initialize the logging framework (cmn_err(9F),
4483 * strlog(9F), etc) so we can start recording messages.
4484 */
4485 streams_msg_init();
4486
4487 /*
4488 * Initialize the ZSD framework in Zones so modules loaded henceforth
4489 * can register their callbacks.
4490 */
4491 zone_zsd_init();
4492
4493 log_init();
4494 taskq_init();
4495
4496 /*
4497 * Warn about invalid or dangerous values of kmem_flags.
4498 * Always warn about unsupported values.
4646 return (B_FALSE);
4647 }
4648
4649 if ((refcnt == 1) || kmem_move_any_partial) {
4650 return (refcnt < sp->slab_chunks);
4651 }
4652
4653 /*
4654 * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4655 * slabs with a progressively higher percentage of used buffers can be
4656 * reclaimed until the cache as a whole is no longer fragmented.
4657 *
4658 * sp->slab_refcnt kmd_reclaim_numer
4659 * --------------- < ------------------
4660 * sp->slab_chunks KMEM_VOID_FRACTION
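 *
 * For example, assuming the usual KMEM_VOID_FRACTION of 8 and the initial
 * kmd_reclaim_numer of 1, a 64-chunk slab qualifies only while fewer than
 * 8 of its chunks are allocated; as kmem_cache_scan() relaxes the threshold,
 * kmd_reclaim_numer grows and progressively fuller slabs qualify.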
4661 */
4662 return ((refcnt * KMEM_VOID_FRACTION) <
4663 (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4664 }
4665
4666 /*
4667 * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4668 * or when the buffer is freed.
4669 */
4670 static void
4671 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4672 {
4673 ASSERT(MUTEX_HELD(&cp->cache_lock));
4674 ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4675
4676 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4677 return;
4678 }
4679
4680 if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4681 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4682 avl_remove(&cp->cache_partial_slabs, sp);
4683 sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4684 sp->slab_stuck_offset = (uint32_t)-1;
4685 avl_add(&cp->cache_partial_slabs, sp);
4686 }
4709 }
4710
4711 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4712
4713 /*
4714 * The move callback takes two buffer addresses, the buffer to be moved, and a
4715 * newly allocated and constructed buffer selected by kmem as the destination.
4716 * It also takes the size of the buffer and an optional user argument specified
4717 * at cache creation time. kmem guarantees that the buffer to be moved has not
4718 * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4719 * guarantee the present whereabouts of the buffer to be moved, so it is up to
4720 * the client to safely determine whether or not it is still using the buffer.
4721 * The client must not free either of the buffers passed to the move callback,
4722 * since kmem wants to free them directly to the slab layer. The client response
4723 * tells kmem which of the two buffers to free:
4724 *
4725 * YES kmem frees the old buffer (the move was successful)
4726 * NO kmem frees the new buffer, marks the slab of the old buffer
4727 * non-reclaimable to avoid bothering the client again
4728 * LATER kmem frees the new buffer, increments slab_later_count
4729 * DONT_KNOW kmem frees the new buffer
4730 * DONT_NEED kmem frees both the old buffer and the new buffer
4731 *
4732 * The pending callback argument now being processed contains both of the
4733 * buffers (old and new) passed to the move callback function, the slab of the
4734 * old buffer, and flags related to the move request, such as whether or not the
4735 * system was desperate for memory.
4736 *
4737 * Slabs are not freed while there is a pending callback, but instead are kept
4738 * on a deadlist, which is drained after the last callback completes. This means
4739 * that slabs are safe to access until kmem_move_end(), no matter how many of
4740 * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4741 * zero for as long as the slab remains on the deadlist and until the slab is
4742 * freed.
4743 */
4744 static void
4745 kmem_move_buffer(kmem_move_t *callback)
4746 {
4747 kmem_cbrc_t response;
4748 kmem_slab_t *sp = callback->kmm_from_slab;
4749 kmem_cache_t *cp = sp->slab_cache;
4750 boolean_t free_on_slab;
4751
4752 ASSERT(taskq_member(kmem_move_taskq, curthread));
4753 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4754 ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4755
4756 /*
4757 * The number of allocated buffers on the slab may have changed since we
4758 * last checked the slab's reclaimability (when the pending move was
4759 * enqueued), or the client may have responded NO when asked to move
4760 * another buffer on the same slab.
4761 */
4762 if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4763 kmem_slab_free(cp, callback->kmm_to_buf);
4764 kmem_move_end(cp, callback);
4765 return;
4766 }
4767
4768 /*
4769 * Checking the slab layer is easy, so we might as well do that here
4770 * in case we can avoid bothering the client.
4771 */
4772 mutex_enter(&cp->cache_lock);
4773 free_on_slab = (kmem_slab_allocated(cp, sp,
4774 callback->kmm_from_buf) == NULL);
4775 mutex_exit(&cp->cache_lock);
4776
4777 if (free_on_slab) {
4778 kmem_slab_free(cp, callback->kmm_to_buf);
4779 kmem_move_end(cp, callback);
4780 return;
4781 }
4782
4783 if (cp->cache_flags & KMF_BUFTAG) {
4784 /*
4785 * Make kmem_cache_alloc_debug() apply the constructor for us.
4786 */
4787 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4788 KM_NOSLEEP, 1, caller()) != 0) {
4789 kmem_move_end(cp, callback);
4790 return;
4791 }
4792 } else if (cp->cache_constructor != NULL &&
4793 cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4794 KM_NOSLEEP) != 0) {
4795 atomic_inc_64(&cp->cache_alloc_fail);
4796 kmem_slab_free(cp, callback->kmm_to_buf);
4797 kmem_move_end(cp, callback);
4798 return;
4799 }
4800
4801 cp->cache_defrag->kmd_callbacks++;
4802 cp->cache_defrag->kmd_thread = curthread;
4803 cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4804 cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4805 DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4806 callback);
4807
4808 response = cp->cache_move(callback->kmm_from_buf,
4809 callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4810
4811 DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4812 callback, kmem_cbrc_t, response);
4813 cp->cache_defrag->kmd_thread = NULL;
4814 cp->cache_defrag->kmd_from_buf = NULL;
4815 cp->cache_defrag->kmd_to_buf = NULL;
4816
4817 if (response == KMEM_CBRC_YES) {
4818 cp->cache_defrag->kmd_yes++;
4819 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4820 /* slab safe to access until kmem_move_end() */
4821 if (sp->slab_refcnt == 0)
4822 cp->cache_defrag->kmd_slabs_freed++;
4823 mutex_enter(&cp->cache_lock);
4824 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4825 mutex_exit(&cp->cache_lock);
4826 kmem_move_end(cp, callback);
4827 return;
4828 }
4829
4830 switch (response) {
4831 case KMEM_CBRC_NO:
4832 cp->cache_defrag->kmd_no++;
4833 mutex_enter(&cp->cache_lock);
4834 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4835 mutex_exit(&cp->cache_lock);
4836 break;
4837 case KMEM_CBRC_LATER:
4838 cp->cache_defrag->kmd_later++;
4839 mutex_enter(&cp->cache_lock);
4840 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4841 mutex_exit(&cp->cache_lock);
4842 break;
4843 }
4844
4845 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4846 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4847 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4848 sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4849 callback->kmm_from_buf);
4850 }
4851 mutex_exit(&cp->cache_lock);
4852 break;
4853 case KMEM_CBRC_DONT_NEED:
4854 cp->cache_defrag->kmd_dont_need++;
4855 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4856 if (sp->slab_refcnt == 0)
4857 cp->cache_defrag->kmd_slabs_freed++;
4858 mutex_enter(&cp->cache_lock);
4859 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4860 mutex_exit(&cp->cache_lock);
4861 break;
4862 case KMEM_CBRC_DONT_KNOW:
4863 /*
4864 * If we don't know if we can move this buffer or not, we'll
4865 * just assume that we can't: if the buffer is in fact free,
4866 * then it is sitting in one of the per-CPU magazines or in
4867 * a full magazine in the depot layer. Either way, because
4868 * defrag is induced in the same logic that reaps a cache,
4869 * it's likely that full magazines will be returned to the
4870 * system soon (thereby accomplishing what we're trying to
4871 * accomplish here: return those magazines to their slabs).
4872 * Given this, any work that we might do now to locate a buffer
4873 * in a magazine is wasted (and expensive!) work; we bump
4874 * a counter in this case and otherwise assume that we can't
4875 * move it.
4876 */
4877 cp->cache_defrag->kmd_dont_know++;
4878 break;
4879 default:
4880 panic("'%s' (%p) unexpected move callback response %d\n",
4881 cp->cache_name, (void *)cp, response);
4882 }
4883
4884 kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
4885 kmem_move_end(cp, callback);
4886 }
4887
4888 /* Return B_FALSE if there is insufficient memory for the move request. */
4889 static boolean_t
4890 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
4891 {
4892 void *to_buf;
4893 avl_index_t index;
4894 kmem_move_t *callback, *pending;
4895 ulong_t n;
4896
4897 ASSERT(taskq_member(kmem_taskq, curthread));
4898 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4899 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
4900
4901 callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
4902
4903 if (callback == NULL)
4904 return (B_FALSE);
4905
4906 callback->kmm_from_slab = sp;
4907 callback->kmm_from_buf = buf;
4908 callback->kmm_flags = flags;
4909
4910 mutex_enter(&cp->cache_lock);
4911
4912 n = avl_numnodes(&cp->cache_partial_slabs);
4913 if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
4914 mutex_exit(&cp->cache_lock);
4915 kmem_cache_free(kmem_move_cache, callback);
4916 return (B_TRUE); /* there is no need for the move request */
4917 }
4918
4919 pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
4920 if (pending != NULL) {
4921 /*
4922 * If the move is already pending and we're desperate now,
4923 * update the move flags.
4924 */
4925 if (flags & KMM_DESPERATE) {
4926 pending->kmm_flags |= KMM_DESPERATE;
4927 }
4928 mutex_exit(&cp->cache_lock);
4929 kmem_cache_free(kmem_move_cache, callback);
4930 return (B_TRUE);
4931 }
4932
4933 to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
4934 B_FALSE);
4935 callback->kmm_to_buf = to_buf;
4936 avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
4937
4938 mutex_exit(&cp->cache_lock);
4939
4940 if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
4941 callback, TQ_NOSLEEP)) {
4942 mutex_enter(&cp->cache_lock);
4943 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4944 mutex_exit(&cp->cache_lock);
4945 kmem_slab_free(cp, to_buf);
4946 kmem_cache_free(kmem_move_cache, callback);
4947 return (B_FALSE);
4948 }
4949
4950 return (B_TRUE);
4951 }
4952
4953 static void
4954 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
4955 {
4956 avl_index_t index;
4957
4958 ASSERT(cp->cache_defrag != NULL);
4959 ASSERT(taskq_member(kmem_move_taskq, curthread));
4960 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4961
4967 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
4968 kmem_slab_t *sp;
4969
4970 /*
4971 * The last pending move completed. Release all slabs from the
4972 * front of the dead list except for any slab at the tail that
4973 * needs to be released from the context of kmem_move_buffers().
4974 * kmem deferred unmapping the buffers on these slabs in order
4975 * to guarantee that buffers passed to the move callback have
4976 * been touched only by kmem or by the client itself.
4977 */
4978 while ((sp = list_remove_head(deadlist)) != NULL) {
4979 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
4980 list_insert_tail(deadlist, sp);
4981 break;
4982 }
4983 cp->cache_defrag->kmd_deadcount--;
4984 cp->cache_slab_destroy++;
4985 mutex_exit(&cp->cache_lock);
4986 kmem_slab_destroy(cp, sp);
4987 mutex_enter(&cp->cache_lock);
4988 }
4989 }
4990 mutex_exit(&cp->cache_lock);
4991 kmem_cache_free(kmem_move_cache, callback);
4992 }
4993
4994 /*
4995 * Move buffers from least used slabs first by scanning backwards from the end
4996 * of the partial slab list. Scan at most max_scan candidate slabs and move
4997 * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
4998 * If desperate to reclaim memory, move buffers from any partial slab, otherwise
4999 * skip slabs with a ratio of allocated buffers at or above the current
5000 * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
5001 * scan is aborted) so that the caller can adjust the reclaimability threshold
5002 * depending on how many reclaimable slabs it finds.
5003 *
5004 * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
5005 * move request, since it is not valid for kmem_move_begin() to call
5006 * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
5111 list_t *deadlist =
5112 &cp->cache_defrag->kmd_deadlist;
5113 list_remove(deadlist, sp);
5114
5115 if (!avl_is_empty(
5116 &cp->cache_defrag->kmd_moves_pending)) {
5117 /*
5118 * A pending move makes it unsafe to
5119 * destroy the slab, because even though
5120 * the move is no longer needed, the
5121 * context where that is determined
5122 * requires the slab to exist.
5123 * Fortunately, a pending move also
5124 * means we don't need to destroy the
5125 * slab here, since it will get
5126 * destroyed along with any other slabs
5127 * on the deadlist after the last
5128 * pending move completes.
5129 */
5130 list_insert_head(deadlist, sp);
5131 return (-1);
5132 }
5133
5134 /*
5135 * Destroy the slab now if it was completely
5136 * freed while we dropped cache_lock and there
5137 * are no pending moves. Since slab_refcnt
5138 * cannot change once it reaches zero, no new
5139 * pending moves from that slab are possible.
5140 */
5141 cp->cache_defrag->kmd_deadcount--;
5142 cp->cache_slab_destroy++;
5143 mutex_exit(&cp->cache_lock);
5144 kmem_slab_destroy(cp, sp);
5145 mutex_enter(&cp->cache_lock);
5146 /*
5147 * Since we can't pick up the scan where we left
5148 * off, abort the scan and say nothing about the
5149 * number of reclaimable slabs.
5150 */
5151 return (-1);
5152 }
5153
5154 if (!success) {
5155 /*
5156 * Abort the scan if there is not enough memory
5157 * for the request and say nothing about the
5158 * number of reclaimable slabs.
5159 */
5160 return (-1);
5161 }
5162
5163 /*
5164 * The slab's position changed while the lock was
5165 * dropped, so we don't know where we are in the
5166 * sequence any more.
5167 */
5168 if (sp->slab_refcnt != refcnt) {
5169 /*
5170 * If this is a KMM_DEBUG move, the slab_refcnt
5171 * may have changed because we allocated a
5172 * destination buffer on the same slab. In that
5173 * case, we're not interested in counting it.
5174 */
5175 return (-1);
5176 }
5177 if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)
5178 return (-1);
5179
5180 /*
5181 * Generating a move request allocates a destination
5182 * buffer from the slab layer, bumping the first partial
5183 * slab if it is completely allocated. If the current
5184 * slab becomes the first partial slab as a result, we
5185 * can't continue to scan backwards.
5186 *
5187 * If this is a KMM_DEBUG move and we allocated the
5188 * destination buffer from the last partial slab, then
5189 * the buffer we're moving is on the same slab and our
5190 * slab_refcnt has changed, causing us to return before
5191 * reaching here if there are no partial slabs left.
5192 */
5193 ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5194 if (sp == avl_first(&cp->cache_partial_slabs)) {
5195 /*
5196 * We're not interested in a second KMM_DEBUG
5197 * move.
5198 */
5199 goto end_scan;
5200 }
5201 }
5202 }
5203 end_scan:
5204
5205 return (s);
5206 }
5207
5208 typedef struct kmem_move_notify_args {
5209 kmem_cache_t *kmna_cache;
5210 void *kmna_buf;
5211 } kmem_move_notify_args_t;
5212
5213 static void
5214 kmem_cache_move_notify_task(void *arg)
5215 {
5216 kmem_move_notify_args_t *args = arg;
5217 kmem_cache_t *cp = args->kmna_cache;
5218 void *buf = args->kmna_buf;
5219 kmem_slab_t *sp;
5220
5221 ASSERT(taskq_member(kmem_taskq, curthread));
5222 ASSERT(list_link_active(&cp->cache_link));
5223
5224 kmem_free(args, sizeof (kmem_move_notify_args_t));
5244 return;
5245 }
5246
5247 kmem_slab_move_yes(cp, sp, buf);
5248 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5249 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5250 mutex_exit(&cp->cache_lock);
5251 /* see kmem_move_buffers() about dropping the lock */
5252 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5253 mutex_enter(&cp->cache_lock);
5254 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5255 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5256 if (sp->slab_refcnt == 0) {
5257 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5258 list_remove(deadlist, sp);
5259
5260 if (!avl_is_empty(
5261 &cp->cache_defrag->kmd_moves_pending)) {
5262 list_insert_head(deadlist, sp);
5263 mutex_exit(&cp->cache_lock);
5264 return;
5265 }
5266
5267 cp->cache_defrag->kmd_deadcount--;
5268 cp->cache_slab_destroy++;
5269 mutex_exit(&cp->cache_lock);
5270 kmem_slab_destroy(cp, sp);
5271 return;
5272 }
5273 } else {
5274 kmem_slab_move_yes(cp, sp, buf);
5275 }
5276 mutex_exit(&cp->cache_lock);
5277 }
5278
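/*
 * Usage sketch (client side, illustrative): a client that had to answer
 * KMEM_CBRC_LATER for a temporarily pinned object can prod kmem once the
 * object becomes movable again, for example from its unpin path:
 *
 *	if (--op->o_pins == 0)
 *		kmem_cache_move_notify(object_cache, op);
 *
 * where object_cache and o_pins are hypothetical client names.
 */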
5279 void
5280 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5281 {
5282 kmem_move_notify_args_t *args;
5283
5284 args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5285 if (args != NULL) {
5286 args->kmna_cache = cp;
5287 args->kmna_buf = buf;
5288 if (!taskq_dispatch(kmem_taskq,
5289 (task_func_t *)kmem_cache_move_notify_task, args,
5290 TQ_NOSLEEP))
5291 kmem_free(args, sizeof (kmem_move_notify_args_t));
5292 }
5293 }
5294
5295 static void
5296 kmem_cache_defrag(kmem_cache_t *cp)
5297 {
5298 size_t n;
5299
5300 ASSERT(cp->cache_defrag != NULL);
5301
5302 mutex_enter(&cp->cache_lock);
5303 n = avl_numnodes(&cp->cache_partial_slabs);
5304 if (n > 1) {
5305 /* kmem_move_buffers() drops and reacquires cache_lock */
5306 cp->cache_defrag->kmd_defrags++;
5307 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5308 }
5309 mutex_exit(&cp->cache_lock);
5310 }
5311
5312 /* Is this cache above the fragmentation threshold? */
5313 static boolean_t
5314 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5315 {
5316 /*
5317 * nfree kmem_frag_numer
5318 * ------------------ > ---------------
5319 * cp->cache_buftotal kmem_frag_denom
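 *
 * For example, with the default kmem_frag_numer / kmem_frag_denom of
 * 1 / KMEM_VOID_FRACTION (1/8, assuming the usual KMEM_VOID_FRACTION of 8),
 * a cache with 8000 buffers in total is over the threshold once more than
 * 1000 of them sit free on its slabs.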
5320 */
5321 return ((nfree * kmem_frag_denom) >
5322 (cp->cache_buftotal * kmem_frag_numer));
5323 }
5324
5325 static boolean_t
5384 if (kmd->kmd_consolidate > 0) {
5385 kmd->kmd_consolidate--;
5386 mutex_exit(&cp->cache_lock);
5387 kmem_cache_reap(cp);
5388 return;
5389 }
5390
5391 if (kmem_cache_is_fragmented(cp, &reap)) {
 5392 		long slabs_found;
5393
5394 /*
5395 * Consolidate reclaimable slabs from the end of the partial
5396 * slab list (scan at most kmem_reclaim_scan_range slabs to find
5397 * reclaimable slabs). Keep track of how many candidate slabs we
5398 * looked for and how many we actually found so we can adjust
5399 * the definition of a candidate slab if we're having trouble
5400 * finding them.
5401 *
5402 * kmem_move_buffers() drops and reacquires cache_lock.
5403 */
5404 kmd->kmd_scans++;
5405 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5406 kmem_reclaim_max_slabs, 0);
5407 if (slabs_found >= 0) {
5408 kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5409 kmd->kmd_slabs_found += slabs_found;
5410 }
5411
5412 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5413 kmd->kmd_tries = 0;
5414
5415 /*
5416 * If we had difficulty finding candidate slabs in
5417 * previous scans, adjust the threshold so that
5418 * candidates are easier to find.
5419 */
5420 if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5421 kmem_adjust_reclaim_threshold(kmd, -1);
5422 } else if ((kmd->kmd_slabs_found * 2) <
5423 kmd->kmd_slabs_sought) {
5424 kmem_adjust_reclaim_threshold(kmd, 1);
5425 }
5426 kmd->kmd_slabs_sought = 0;
5427 kmd->kmd_slabs_found = 0;
5428 }
5429 } else {
5430 kmem_reset_reclaim_threshold(cp->cache_defrag);
5431 #ifdef DEBUG
5432 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5433 /*
5434 * In a debug kernel we want the consolidator to
5435 * run occasionally even when there is plenty of
5436 * memory.
5437 */
5438 uint16_t debug_rand;
5439
5440 (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5441 if (!kmem_move_noreap &&
5442 ((debug_rand % kmem_mtb_reap) == 0)) {
5443 mutex_exit(&cp->cache_lock);
5444 kmem_cache_reap(cp);
5445 return;
5446 } else if ((debug_rand % kmem_mtb_move) == 0) {
5447 kmd->kmd_scans++;
5448 (void) kmem_move_buffers(cp,
5449 kmem_reclaim_scan_range, 1, KMM_DEBUG);
5450 }
5451 }
5452 #endif /* DEBUG */
5453 }
5454
5455 mutex_exit(&cp->cache_lock);
5456
5457 if (reap)
5458 kmem_depot_ws_reap(cp);
5459 }
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 */
26
27 /*
28 * Kernel memory allocator, as described in the following two papers and a
29 * statement about the consolidator:
30 *
31 * Jeff Bonwick,
32 * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
33 * Proceedings of the Summer 1994 Usenix Conference.
34 * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
35 *
36 * Jeff Bonwick and Jonathan Adams,
37 * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
38 * Arbitrary Resources.
39 * Proceedings of the 2001 Usenix Conference.
40 * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
41 *
42 * kmem Slab Consolidator Big Theory Statement:
142 * (The system won't be getting the slab back as long as the
143 * immovable object holds it hostage, so there's no point in moving
144 * any of its objects.)
145 * LATER: The client is using the object and cannot move it now, so kmem
146 * frees the new object (the unused copy destination). kmem still
147 * attempts to move other objects off the slab, since it expects to
148 * succeed in clearing the slab in a later callback. The client
149 * should use LATER instead of NO if the object is likely to become
150 * movable very soon.
151 * DONT_NEED: The client no longer needs the object, so kmem frees the old along
152 * with the new object (the unused copy destination). This response
153 * is the client's opportunity to be a model citizen and give back as
154 * much as it can.
155 * DONT_KNOW: The client does not know about the object because
156 * a) the client has just allocated the object and not yet put it
157 * wherever it expects to find known objects
158 * b) the client has removed the object from wherever it expects to
159 * find known objects and is about to free it, or
160 * c) the client has freed the object.
161 * In all these cases (a, b, and c) kmem frees the new object (the
162 * unused copy destination) and searches for the old object in the
163 * magazine layer. If found, the object is removed from the magazine
164 * layer and freed to the slab layer so it will no longer hold the
165 * slab hostage.
166 *
167 * 2.3 Object States
168 *
169 * Neither kmem nor the client can be assumed to know the object's whereabouts
170 * at the time of the callback. An object belonging to a kmem cache may be in
171 * any of the following states:
172 *
173 * 1. Uninitialized on the slab
174 * 2. Allocated from the slab but not constructed (still uninitialized)
175 * 3. Allocated from the slab, constructed, but not yet ready for business
176 * (not in a valid state for the move callback)
177 * 4. In use (valid and known to the client)
178 * 5. About to be freed (no longer in a valid state for the move callback)
179 * 6. Freed to a magazine (still constructed)
180 * 7. Allocated from a magazine, not yet ready for business (not in a valid
181 * state for the move callback), and about to return to state #4
182 * 8. Deconstructed on a magazine that is about to be freed
183 * 9. Freed to the slab
184 *
185 * Since the move callback may be called at any time while the object is in any
268 * c_objects_lock is held. Note that after acquiring the lock, the client must
269 * recheck the o_container pointer in case the object was removed just before
270 * acquiring the lock.
271 *
272 * When the client is about to free an object, it must first remove that object
273 * from the list, hash, or other structure where it is kept. At that time, to
274 * mark the object so it can be distinguished from the remaining, known objects,
275 * the client sets the designated low order bit:
276 *
277 * mutex_enter(&container->c_objects_lock);
278 * object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
279 * list_remove(&container->c_objects, object);
280 * mutex_exit(&container->c_objects_lock);
281 *
282 * In the common case, the object is freed to the magazine layer, where it may
283 * be reused on a subsequent allocation without the overhead of calling the
284 * constructor. While in the magazine it appears allocated from the point of
285 * view of the slab layer, making it a candidate for the move callback. Most
286 * objects unrecognized by the client in the move callback fall into this
287 * category and are cheaply distinguished from known objects by the test
288 * described earlier. Since recognition is cheap for the client, and searching
289 * magazines is expensive for kmem, kmem defers searching until the client first
290 * returns KMEM_CBRC_DONT_KNOW. As long as the needed effort is reasonable, kmem
291 * elsewhere does what it can to avoid bothering the client unnecessarily.
292 *
293 * Invalidating the designated pointer member before freeing the object marks
294 * the object to be avoided in the callback, and conversely, assigning a valid
295 * value to the designated pointer member after allocating the object makes the
296 * object fair game for the callback:
297 *
298 * ... allocate object ...
299 * ... set any initial state not set by the constructor ...
300 *
301 * mutex_enter(&container->c_objects_lock);
302 * list_insert_tail(&container->c_objects, object);
303 * membar_producer();
304 * object->o_container = container;
305 * mutex_exit(&container->c_objects_lock);
306 *
307 * Note that everything else must be valid before setting o_container makes the
308 * object fair game for the move callback. The membar_producer() call ensures
309 * that all the object's state is written to memory before setting the pointer
310 * that transitions the object from state #3 or #7 (allocated, constructed, not
311 * yet in use) to state #4 (in use, valid). That's important because the move
981 { 95, 64, 0, 512 },
982 { 143, 64, 0, 0 },
983 };
984
985 static uint32_t kmem_reaping;
986 static uint32_t kmem_reaping_idspace;
987
988 /*
989 * kmem tunables
990 */
991 clock_t kmem_reap_interval; /* cache reaping rate [15 * HZ ticks] */
992 int kmem_depot_contention = 3; /* max failed tryenters per real interval */
993 pgcnt_t kmem_reapahead = 0; /* start reaping N pages before pageout */
994 int kmem_panic = 1; /* whether to panic on error */
995 int kmem_logging = 1; /* kmem_log_enter() override */
996 uint32_t kmem_mtbf = 0; /* mean time between failures [default: off] */
997 size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
998 size_t kmem_content_log_size; /* content log size [2% of memory] */
999 size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */
1000 size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */
1001 size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
1002 size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */
1003 size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
1004 int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */
1005 size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */
1006 size_t kmem_minfirewall; /* hardware-enforced redzone threshold */
1007
1008 #ifdef _LP64
1009 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */
1010 #else
1011 size_t kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */
1012 #endif
1013
1014 #ifdef DEBUG
1015 int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
1016 #else
1017 int kmem_flags = 0;
1018 #endif
1019 int kmem_ready;
1020
1021 static kmem_cache_t *kmem_slab_cache;
1022 static kmem_cache_t *kmem_bufctl_cache;
1023 static kmem_cache_t *kmem_bufctl_audit_cache;
1024
1025 static kmutex_t kmem_cache_lock; /* inter-cache linkage only */
1026 static list_t kmem_caches;
1027
1028 static taskq_t *kmem_taskq;
1029 static kmutex_t kmem_flags_lock;
1030 static vmem_t *kmem_metadata_arena;
1031 static vmem_t *kmem_msb_arena; /* arena for metadata caches */
1032 static vmem_t *kmem_cache_arena;
1033 static vmem_t *kmem_hash_arena;
1034 static vmem_t *kmem_log_arena;
1035 static vmem_t *kmem_oversize_arena;
1036 static vmem_t *kmem_va_arena;
1037 static vmem_t *kmem_default_arena;
1038 static vmem_t *kmem_firewall_va_arena;
1039 static vmem_t *kmem_firewall_arena;
1040
1041 /*
1042 * Define KMEM_STATS to turn on statistic gathering. By default, it is only
1043 * turned on when DEBUG is also defined.
1044 */
1045 #ifdef DEBUG
1046 #define KMEM_STATS
1047 #endif /* DEBUG */
1048
1049 #ifdef KMEM_STATS
1050 #define KMEM_STAT_ADD(stat) ((stat)++)
1051 #define KMEM_STAT_COND_ADD(cond, stat) ((void) (!(cond) || (stat)++))
1052 #else
1053 #define KMEM_STAT_ADD(stat) /* nothing */
1054 #define KMEM_STAT_COND_ADD(cond, stat) /* nothing */
1055 #endif /* KMEM_STATS */
1056
1057 /*
1058 * kmem slab consolidator thresholds (tunables)
1059 */
1060 size_t kmem_frag_minslabs = 101; /* minimum total slabs */
1061 size_t kmem_frag_numer = 1; /* free buffers (numerator) */
1062 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1063 /*
1064 * Maximum number of slabs from which to move buffers during a single
1065 * maintenance interval while the system is not low on memory.
1066 */
1067 size_t kmem_reclaim_max_slabs = 1;
1068 /*
1069 * Number of slabs to scan backwards from the end of the partial slab list
1070 * when searching for buffers to relocate.
1071 */
1072 size_t kmem_reclaim_scan_range = 12;
1073
1074 #ifdef KMEM_STATS
1075 static struct {
1076 uint64_t kms_callbacks;
1077 uint64_t kms_yes;
1078 uint64_t kms_no;
1079 uint64_t kms_later;
1080 uint64_t kms_dont_need;
1081 uint64_t kms_dont_know;
1082 uint64_t kms_hunt_found_mag;
1083 uint64_t kms_hunt_found_slab;
1084 uint64_t kms_hunt_alloc_fail;
1085 uint64_t kms_hunt_lucky;
1086 uint64_t kms_notify;
1087 uint64_t kms_notify_callbacks;
1088 uint64_t kms_disbelief;
1089 uint64_t kms_already_pending;
1090 uint64_t kms_callback_alloc_fail;
1091 uint64_t kms_callback_taskq_fail;
1092 uint64_t kms_endscan_slab_dead;
1093 uint64_t kms_endscan_slab_destroyed;
1094 uint64_t kms_endscan_nomem;
1095 uint64_t kms_endscan_refcnt_changed;
1096 uint64_t kms_endscan_nomove_changed;
1097 uint64_t kms_endscan_freelist;
1098 uint64_t kms_avl_update;
1099 uint64_t kms_avl_noupdate;
1100 uint64_t kms_no_longer_reclaimable;
1101 uint64_t kms_notify_no_longer_reclaimable;
1102 uint64_t kms_notify_slab_dead;
1103 uint64_t kms_notify_slab_destroyed;
1104 uint64_t kms_alloc_fail;
1105 uint64_t kms_constructor_fail;
1106 uint64_t kms_dead_slabs_freed;
1107 uint64_t kms_defrags;
1108 uint64_t kms_scans;
1109 uint64_t kms_scan_depot_ws_reaps;
1110 uint64_t kms_debug_reaps;
1111 uint64_t kms_debug_scans;
1112 } kmem_move_stats;
1113 #endif /* KMEM_STATS */
1114
1115 /* consolidator knobs */
1116 static boolean_t kmem_move_noreap;
1117 static boolean_t kmem_move_blocked;
1118 static boolean_t kmem_move_fulltilt;
1119 static boolean_t kmem_move_any_partial;
1120
1121 #ifdef DEBUG
1122 /*
1123 * kmem consolidator debug tunables:
1124 * Ensure code coverage by occasionally running the consolidator even when the
1125 * caches are not fragmented (they may never be). These intervals are mean time
1126 * in cache maintenance intervals (kmem_cache_update).
1127 */
1128 uint32_t kmem_mtb_move = 60; /* defrag 1 slab (~15min) */
1129 uint32_t kmem_mtb_reap = 1800; /* defrag all slabs (~7.5hrs) */
1130 #endif /* DEBUG */
1131
1132 static kmem_cache_t *kmem_defrag_cache;
1133 static kmem_cache_t *kmem_move_cache;
1134 static taskq_t *kmem_move_taskq;
1135
1136 static void kmem_cache_scan(kmem_cache_t *);
1137 static void kmem_cache_defrag(kmem_cache_t *);
1138 static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
1139
1140
1141 kmem_log_header_t *kmem_transaction_log;
1142 kmem_log_header_t *kmem_content_log;
1143 kmem_log_header_t *kmem_failure_log;
1144 kmem_log_header_t *kmem_slab_log;
1145
1146 static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
1147
1148 #define KMEM_BUFTAG_LITE_ENTER(bt, count, caller) \
1149 if ((count) > 0) { \
1150 pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \
1151 pc_t *_e; \
1152 /* memmove() the old entries down one notch */ \
1153 for (_e = &_s[(count) - 1]; _e > _s; _e--) \
1154 *_e = *(_e - 1); \
1155 *_s = (uintptr_t)(caller); \
1156 }
1157
1158 #define KMERR_MODIFIED 0 /* buffer modified while on freelist */
1159 #define KMERR_REDZONE 1 /* redzone violation (write past end of buf) */
1160 #define KMERR_DUPFREE 2 /* freed a buffer twice */
1161 #define KMERR_BADADDR 3 /* freed a bad (unallocated) address */
1162 #define KMERR_BADBUFTAG 4 /* buftag corrupted */
1163 #define KMERR_BADBUFCTL 5 /* bufctl corrupted */
1164 #define KMERR_BADCACHE 6 /* freed a buffer to the wrong cache */
1905 */
1906 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1907 list_insert_tail(deadlist, sp);
1908 } else {
1909 list_insert_head(deadlist, sp);
1910 }
1911 cp->cache_defrag->kmd_deadcount++;
1912 mutex_exit(&cp->cache_lock);
1913 }
1914 return;
1915 }
1916
1917 if (bcp->bc_next == NULL) {
1918 /* Transition the slab from completely allocated to partial. */
1919 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1920 ASSERT(sp->slab_chunks > 1);
1921 list_remove(&cp->cache_complete_slabs, sp);
1922 cp->cache_complete_slab_count--;
1923 avl_add(&cp->cache_partial_slabs, sp);
1924 } else {
1925 #ifdef DEBUG
1926 if (avl_update_gt(&cp->cache_partial_slabs, sp)) {
1927 KMEM_STAT_ADD(kmem_move_stats.kms_avl_update);
1928 } else {
1929 KMEM_STAT_ADD(kmem_move_stats.kms_avl_noupdate);
1930 }
1931 #else
1932 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1933 #endif
1934 }
1935
1936 ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1937 (cp->cache_complete_slab_count +
1938 avl_numnodes(&cp->cache_partial_slabs) +
1939 (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1940 mutex_exit(&cp->cache_lock);
1941 }
1942
1943 /*
1944 * Return -1 if kmem_error, 1 if constructor fails, 0 if successful.
1945 */
1946 static int
1947 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1948 caddr_t caller)
1949 {
1950 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1951 kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1952 uint32_t mtbf;
1953
2947 return (buf);
2948 }
2949
2950 void *
2951 kmem_alloc(size_t size, int kmflag)
2952 {
2953 size_t index;
2954 kmem_cache_t *cp;
2955 void *buf;
2956
2957 if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2958 cp = kmem_alloc_table[index];
2959 /* fall through to kmem_cache_alloc() */
2960
2961 } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2962 kmem_big_alloc_table_max) {
2963 cp = kmem_big_alloc_table[index];
2964 /* fall through to kmem_cache_alloc() */
2965
2966 } else {
2967 if (size == 0)
2968 return (NULL);
2969
2970 buf = vmem_alloc(kmem_oversize_arena, size,
2971 kmflag & KM_VMFLAGS);
2972 if (buf == NULL)
2973 kmem_log_event(kmem_failure_log, NULL, NULL,
2974 (void *)size);
2975 else if (KMEM_DUMP(kmem_slab_cache)) {
2976 /* stats for dump intercept */
2977 kmem_dump_oversize_allocs++;
2978 if (size > kmem_dump_oversize_max)
2979 kmem_dump_oversize_max = size;
2980 }
2981 return (buf);
2982 }
2983
2984 buf = kmem_cache_alloc(cp, kmflag);
2985 if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) {
2986 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2987 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2988 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2989
3562 kmcp->kmc_move_yes.value.ui64 = 0;
3563 kmcp->kmc_move_no.value.ui64 = 0;
3564 kmcp->kmc_move_later.value.ui64 = 0;
3565 kmcp->kmc_move_dont_need.value.ui64 = 0;
3566 kmcp->kmc_move_dont_know.value.ui64 = 0;
3567 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3568 kmcp->kmc_move_slabs_freed.value.ui64 = 0;
3569 kmcp->kmc_defrag.value.ui64 = 0;
3570 kmcp->kmc_scan.value.ui64 = 0;
3571 kmcp->kmc_move_reclaimable.value.ui64 = 0;
3572 } else {
3573 int64_t reclaimable;
3574
3575 kmem_defrag_t *kd = cp->cache_defrag;
3576 kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks;
3577 kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes;
3578 kmcp->kmc_move_no.value.ui64 = kd->kmd_no;
3579 kmcp->kmc_move_later.value.ui64 = kd->kmd_later;
3580 kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need;
3581 kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know;
3582 kmcp->kmc_move_hunt_found.value.ui64 = kd->kmd_hunt_found;
3583 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
3584 kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags;
3585 kmcp->kmc_scan.value.ui64 = kd->kmd_scans;
3586
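/*
 * Estimate how many buffers defragmentation could reclaim: free buffers in
 * the slab layer beyond what a single partial slab could hold on its own,
 * plus the rounds in depot magazines eligible for a working-set reap.
 */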
3587 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3588 reclaimable = MAX(reclaimable, 0);
3589 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3590 kmcp->kmc_move_reclaimable.value.ui64 = reclaimable;
3591 }
3592
3593 mutex_exit(&cp->cache_lock);
3594 return (0);
3595 }
3596
3597 /*
3598 * Return a named statistic about a particular cache.
3599 * This shouldn't be called very often, so it's currently designed for
3600 * simplicity (leverages existing kstat support) rather than efficiency.
3601 */
3602 uint64_t
4472 segkmem_alloc, segkmem_free, kmem_minfirewall < ULONG_MAX ?
4473 kmem_firewall_va_arena : heap_arena, 0, VMC_DUMPSAFE |
4474 VM_SLEEP);
4475 }
4476
4477 kmem_cache_init(2, use_large_pages);
4478
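/*
 * Unless the log sizes were explicitly tuned, size the transaction and
 * content logs at roughly 2% (1/50) of available memory.
 */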
4479 if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) {
4480 if (kmem_transaction_log_size == 0)
4481 kmem_transaction_log_size = kmem_maxavail() / 50;
4482 kmem_transaction_log = kmem_log_init(kmem_transaction_log_size);
4483 }
4484
4485 if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) {
4486 if (kmem_content_log_size == 0)
4487 kmem_content_log_size = kmem_maxavail() / 50;
4488 kmem_content_log = kmem_log_init(kmem_content_log_size);
4489 }
4490
4491 kmem_failure_log = kmem_log_init(kmem_failure_log_size);
4492
4493 kmem_slab_log = kmem_log_init(kmem_slab_log_size);
4494
4495 /*
4496 * Initialize STREAMS message caches so allocb() is available.
4497 * This allows us to initialize the logging framework (cmn_err(9F),
4498 * strlog(9F), etc) so we can start recording messages.
4499 */
4500 streams_msg_init();
4501
4502 /*
4503 * Initialize the ZSD framework in Zones so modules loaded henceforth
4504 * can register their callbacks.
4505 */
4506 zone_zsd_init();
4507
4508 log_init();
4509 taskq_init();
4510
4511 /*
4512 * Warn about invalid or dangerous values of kmem_flags.
4513 * Always warn about unsupported values.
4661 return (B_FALSE);
4662 }
4663
4664 if ((refcnt == 1) || kmem_move_any_partial) {
4665 return (refcnt < sp->slab_chunks);
4666 }
4667
4668 /*
4669 * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4670 * slabs with a progressively higher percentage of used buffers can be
4671 * reclaimed until the cache as a whole is no longer fragmented.
4672 *
4673 * sp->slab_refcnt kmd_reclaim_numer
4674 * --------------- < ------------------
4675 * sp->slab_chunks KMEM_VOID_FRACTION
4676 */
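/*
 * For example, if KMEM_VOID_FRACTION were 8 and kmd_reclaim_numer were 2
 * (illustrative values only), a slab would qualify as reclaimable while
 * fewer than a quarter of its chunks are allocated.
 */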
4677 return ((refcnt * KMEM_VOID_FRACTION) <
4678 (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4679 }
4680
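/*
 * Search a single magazine's rounds for buf. If found, swap tbuf into buf's
 * slot (keeping the magazine full) and return buf; otherwise return NULL.
 * With KMF_BUFTAG, tbuf passes through kmem_cache_free_debug() first so that
 * its buftag reflects a freed buffer while it sits in the magazine.
 */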
4681 static void *
4682 kmem_hunt_mag(kmem_cache_t *cp, kmem_magazine_t *m, int n, void *buf,
4683 void *tbuf)
4684 {
4685 int i; /* magazine round index */
4686
4687 for (i = 0; i < n; i++) {
4688 if (buf == m->mag_round[i]) {
4689 if (cp->cache_flags & KMF_BUFTAG) {
4690 (void) kmem_cache_free_debug(cp, tbuf,
4691 caller());
4692 }
4693 m->mag_round[i] = tbuf;
4694 return (buf);
4695 }
4696 }
4697
4698 return (NULL);
4699 }
4700
4701 /*
4702 * Hunt the magazine layer for the given buffer. If found, the buffer is
4703 * removed from the magazine layer and returned, otherwise NULL is returned.
4704 * The returned buffer is in the freed, constructed state.
4705 */
4706 static void *
4707 kmem_hunt_mags(kmem_cache_t *cp, void *buf)
4708 {
4709 kmem_cpu_cache_t *ccp;
4710 kmem_magazine_t *m;
4711 int cpu_seqid;
4712 int n; /* magazine rounds */
4713 void *tbuf; /* temporary swap buffer */
4714
4715 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4716
4717 /*
4718 * Allocate a buffer to swap with the one we hope to pull out of a
4719 * magazine when we find it.
4720 */
4721 tbuf = kmem_cache_alloc(cp, KM_NOSLEEP);
4722 if (tbuf == NULL) {
4723 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_alloc_fail);
4724 return (NULL);
4725 }
4726 if (tbuf == buf) {
4727 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_lucky);
4728 if (cp->cache_flags & KMF_BUFTAG) {
4729 (void) kmem_cache_free_debug(cp, buf, caller());
4730 }
4731 return (buf);
4732 }
4733
4734 /* Hunt the depot. */
4735 mutex_enter(&cp->cache_depot_lock);
4736 n = cp->cache_magtype->mt_magsize;
4737 for (m = cp->cache_full.ml_list; m != NULL; m = m->mag_next) {
4738 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4739 mutex_exit(&cp->cache_depot_lock);
4740 return (buf);
4741 }
4742 }
4743 mutex_exit(&cp->cache_depot_lock);
4744
4745 /* Hunt the per-CPU magazines. */
4746 for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
4747 ccp = &cp->cache_cpu[cpu_seqid];
4748
4749 mutex_enter(&ccp->cc_lock);
4750 m = ccp->cc_loaded;
4751 n = ccp->cc_rounds;
4752 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4753 mutex_exit(&ccp->cc_lock);
4754 return (buf);
4755 }
4756 m = ccp->cc_ploaded;
4757 n = ccp->cc_prounds;
4758 if (kmem_hunt_mag(cp, m, n, buf, tbuf) != NULL) {
4759 mutex_exit(&ccp->cc_lock);
4760 return (buf);
4761 }
4762 mutex_exit(&ccp->cc_lock);
4763 }
4764
4765 kmem_cache_free(cp, tbuf);
4766 return (NULL);
4767 }
4768
4769 /*
4770 * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4771 * or when the buffer is freed.
4772 */
4773 static void
4774 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4775 {
4776 ASSERT(MUTEX_HELD(&cp->cache_lock));
4777 ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4778
4779 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4780 return;
4781 }
4782
4783 if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4784 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4785 avl_remove(&cp->cache_partial_slabs, sp);
4786 sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4787 sp->slab_stuck_offset = (uint32_t)-1;
4788 avl_add(&cp->cache_partial_slabs, sp);
4789 }
4812 }
4813
4814 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4815
4816 /*
4817 * The move callback takes two buffer addresses, the buffer to be moved, and a
4818 * newly allocated and constructed buffer selected by kmem as the destination.
4819 * It also takes the size of the buffer and an optional user argument specified
4820 * at cache creation time. kmem guarantees that the buffer to be moved has not
4821 * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4822 * guarantee the present whereabouts of the buffer to be moved, so it is up to
4823 * the client to safely determine whether or not it is still using the buffer.
4824 * The client must not free either of the buffers passed to the move callback,
4825 * since kmem wants to free them directly to the slab layer. The client response
4826 * tells kmem which of the two buffers to free:
4827 *
4828 * YES kmem frees the old buffer (the move was successful)
4829 * NO kmem frees the new buffer, marks the slab of the old buffer
4830 * non-reclaimable to avoid bothering the client again
4831 * LATER kmem frees the new buffer, increments slab_later_count
4832 * DONT_KNOW kmem frees the new buffer, searches mags for the old buffer
4833 * DONT_NEED kmem frees both the old buffer and the new buffer
4834 *
4835 * The pending callback argument now being processed contains both of the
4836 * buffers (old and new) passed to the move callback function, the slab of the
4837 * old buffer, and flags related to the move request, such as whether or not the
4838 * system was desperate for memory.
4839 *
4840 * Slabs are not freed while there is a pending callback, but instead are kept
4841 * on a deadlist, which is drained after the last callback completes. This means
4842 * that slabs are safe to access until kmem_move_end(), no matter how many of
4843 * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4844 * zero for as long as the slab remains on the deadlist and until the slab is
4845 * freed.
4846 */
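/*
 * For illustration only, a client's move callback for a hypothetical object_t
 * guarded by a hypothetical object_lock might be structured as follows (none
 * of these names are part of kmem; this is a sketch, not a prescription):
 *
 *	static kmem_cbrc_t
 *	object_move(void *buf, void *newbuf, size_t size, void *arg)
 *	{
 *		object_t *op = buf, *np = newbuf;
 *
 *		rw_enter(&object_lock, RW_WRITER);
 *		if (!object_is_known(op)) {
 *			rw_exit(&object_lock);
 *			return (KMEM_CBRC_DONT_KNOW);
 *		}
 *		if (object_is_held(op)) {
 *			rw_exit(&object_lock);
 *			return (KMEM_CBRC_LATER);
 *		}
 *		bcopy(op, np, size);
 *		object_relink(op, np);
 *		rw_exit(&object_lock);
 *		return (KMEM_CBRC_YES);
 *	}
 */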
4847 static void
4848 kmem_move_buffer(kmem_move_t *callback)
4849 {
4850 kmem_cbrc_t response;
4851 kmem_slab_t *sp = callback->kmm_from_slab;
4852 kmem_cache_t *cp = sp->slab_cache;
4853 boolean_t free_on_slab;
4854
4855 ASSERT(taskq_member(kmem_move_taskq, curthread));
4856 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4857 ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4858
4859 /*
4860 * The number of allocated buffers on the slab may have changed since we
4861 * last checked the slab's reclaimability (when the pending move was
4862 * enqueued), or the client may have responded NO when asked to move
4863 * another buffer on the same slab.
4864 */
4865 if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4866 KMEM_STAT_ADD(kmem_move_stats.kms_no_longer_reclaimable);
4867 KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4868 kmem_move_stats.kms_notify_no_longer_reclaimable);
4869 kmem_slab_free(cp, callback->kmm_to_buf);
4870 kmem_move_end(cp, callback);
4871 return;
4872 }
4873
4874 /*
4875 * Hunting magazines is expensive, so we'll wait to do that until the
4876 * client responds KMEM_CBRC_DONT_KNOW. However, checking the slab layer
4877 * is cheap, so we might as well do that here in case we can avoid
4878 * bothering the client.
4879 */
4880 mutex_enter(&cp->cache_lock);
4881 free_on_slab = (kmem_slab_allocated(cp, sp,
4882 callback->kmm_from_buf) == NULL);
4883 mutex_exit(&cp->cache_lock);
4884
4885 if (free_on_slab) {
4886 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_slab);
4887 kmem_slab_free(cp, callback->kmm_to_buf);
4888 kmem_move_end(cp, callback);
4889 return;
4890 }
4891
4892 if (cp->cache_flags & KMF_BUFTAG) {
4893 /*
4894 * Make kmem_cache_alloc_debug() apply the constructor for us.
4895 */
4896 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4897 KM_NOSLEEP, 1, caller()) != 0) {
4898 KMEM_STAT_ADD(kmem_move_stats.kms_alloc_fail);
4899 kmem_move_end(cp, callback);
4900 return;
4901 }
4902 } else if (cp->cache_constructor != NULL &&
4903 cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4904 KM_NOSLEEP) != 0) {
4905 atomic_inc_64(&cp->cache_alloc_fail);
4906 KMEM_STAT_ADD(kmem_move_stats.kms_constructor_fail);
4907 kmem_slab_free(cp, callback->kmm_to_buf);
4908 kmem_move_end(cp, callback);
4909 return;
4910 }
4911
4912 KMEM_STAT_ADD(kmem_move_stats.kms_callbacks);
4913 KMEM_STAT_COND_ADD((callback->kmm_flags & KMM_NOTIFY),
4914 kmem_move_stats.kms_notify_callbacks);
4915 cp->cache_defrag->kmd_callbacks++;
4916 cp->cache_defrag->kmd_thread = curthread;
4917 cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4918 cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4919 DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4920 callback);
4921
4922 response = cp->cache_move(callback->kmm_from_buf,
4923 callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4924
4925 DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4926 callback, kmem_cbrc_t, response);
4927 cp->cache_defrag->kmd_thread = NULL;
4928 cp->cache_defrag->kmd_from_buf = NULL;
4929 cp->cache_defrag->kmd_to_buf = NULL;
4930
4931 if (response == KMEM_CBRC_YES) {
4932 KMEM_STAT_ADD(kmem_move_stats.kms_yes);
4933 cp->cache_defrag->kmd_yes++;
4934 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4935 /* slab safe to access until kmem_move_end() */
4936 if (sp->slab_refcnt == 0)
4937 cp->cache_defrag->kmd_slabs_freed++;
4938 mutex_enter(&cp->cache_lock);
4939 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4940 mutex_exit(&cp->cache_lock);
4941 kmem_move_end(cp, callback);
4942 return;
4943 }
4944
4945 switch (response) {
4946 case KMEM_CBRC_NO:
4947 KMEM_STAT_ADD(kmem_move_stats.kms_no);
4948 cp->cache_defrag->kmd_no++;
4949 mutex_enter(&cp->cache_lock);
4950 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4951 mutex_exit(&cp->cache_lock);
4952 break;
4953 case KMEM_CBRC_LATER:
4954 KMEM_STAT_ADD(kmem_move_stats.kms_later);
4955 cp->cache_defrag->kmd_later++;
4956 mutex_enter(&cp->cache_lock);
4957 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4958 mutex_exit(&cp->cache_lock);
4959 break;
4960 }
4961
4962 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4963 KMEM_STAT_ADD(kmem_move_stats.kms_disbelief);
4964 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4965 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4966 sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4967 callback->kmm_from_buf);
4968 }
4969 mutex_exit(&cp->cache_lock);
4970 break;
4971 case KMEM_CBRC_DONT_NEED:
4972 KMEM_STAT_ADD(kmem_move_stats.kms_dont_need);
4973 cp->cache_defrag->kmd_dont_need++;
4974 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4975 if (sp->slab_refcnt == 0)
4976 cp->cache_defrag->kmd_slabs_freed++;
4977 mutex_enter(&cp->cache_lock);
4978 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4979 mutex_exit(&cp->cache_lock);
4980 break;
4981 case KMEM_CBRC_DONT_KNOW:
4982 KMEM_STAT_ADD(kmem_move_stats.kms_dont_know);
4983 cp->cache_defrag->kmd_dont_know++;
4984 if (kmem_hunt_mags(cp, callback->kmm_from_buf) != NULL) {
4985 KMEM_STAT_ADD(kmem_move_stats.kms_hunt_found_mag);
4986 cp->cache_defrag->kmd_hunt_found++;
4987 kmem_slab_free_constructed(cp, callback->kmm_from_buf,
4988 B_TRUE);
4989 if (sp->slab_refcnt == 0)
4990 cp->cache_defrag->kmd_slabs_freed++;
4991 mutex_enter(&cp->cache_lock);
4992 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4993 mutex_exit(&cp->cache_lock);
4994 }
4995 break;
4996 default:
4997 panic("'%s' (%p) unexpected move callback response %d\n",
4998 cp->cache_name, (void *)cp, response);
4999 }
5000
5001 kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
5002 kmem_move_end(cp, callback);
5003 }
5004
5005 /* Return B_FALSE if there is insufficient memory for the move request. */
5006 static boolean_t
5007 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
5008 {
5009 void *to_buf;
5010 avl_index_t index;
5011 kmem_move_t *callback, *pending;
5012 ulong_t n;
5013
5014 ASSERT(taskq_member(kmem_taskq, curthread));
5015 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5016 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5017
5018 callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
5019 if (callback == NULL) {
5020 KMEM_STAT_ADD(kmem_move_stats.kms_callback_alloc_fail);
5021 return (B_FALSE);
5022 }
5023
5024 callback->kmm_from_slab = sp;
5025 callback->kmm_from_buf = buf;
5026 callback->kmm_flags = flags;
5027
5028 mutex_enter(&cp->cache_lock);
5029
5030 n = avl_numnodes(&cp->cache_partial_slabs);
5031 if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
5032 mutex_exit(&cp->cache_lock);
5033 kmem_cache_free(kmem_move_cache, callback);
5034 return (B_TRUE); /* there is no need for the move request */
5035 }
5036
5037 pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
5038 if (pending != NULL) {
5039 /*
5040 * If the move is already pending and we're desperate now,
5041 * update the move flags.
5042 */
5043 if (flags & KMM_DESPERATE) {
5044 pending->kmm_flags |= KMM_DESPERATE;
5045 }
5046 mutex_exit(&cp->cache_lock);
5047 KMEM_STAT_ADD(kmem_move_stats.kms_already_pending);
5048 kmem_cache_free(kmem_move_cache, callback);
5049 return (B_TRUE);
5050 }
5051
5052 to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
5053 B_FALSE);
5054 callback->kmm_to_buf = to_buf;
5055 avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
5056
5057 mutex_exit(&cp->cache_lock);
5058
5059 if (!taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
5060 callback, TQ_NOSLEEP)) {
5061 KMEM_STAT_ADD(kmem_move_stats.kms_callback_taskq_fail);
5062 mutex_enter(&cp->cache_lock);
5063 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
5064 mutex_exit(&cp->cache_lock);
5065 kmem_slab_free(cp, to_buf);
5066 kmem_cache_free(kmem_move_cache, callback);
5067 return (B_FALSE);
5068 }
5069
5070 return (B_TRUE);
5071 }
5072
5073 static void
5074 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
5075 {
5076 avl_index_t index;
5077
5078 ASSERT(cp->cache_defrag != NULL);
5079 ASSERT(taskq_member(kmem_move_taskq, curthread));
5080 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
5081
5087 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5088 kmem_slab_t *sp;
5089
5090 /*
5091 * The last pending move completed. Release all slabs from the
5092 * front of the dead list except for any slab at the tail that
5093 * needs to be released from the context of kmem_move_buffers().
5094 * kmem deferred unmapping the buffers on these slabs in order
5095 * to guarantee that buffers passed to the move callback have
5096 * been touched only by kmem or by the client itself.
5097 */
5098 while ((sp = list_remove_head(deadlist)) != NULL) {
5099 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
5100 list_insert_tail(deadlist, sp);
5101 break;
5102 }
5103 cp->cache_defrag->kmd_deadcount--;
5104 cp->cache_slab_destroy++;
5105 mutex_exit(&cp->cache_lock);
5106 kmem_slab_destroy(cp, sp);
5107 KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5108 mutex_enter(&cp->cache_lock);
5109 }
5110 }
5111 mutex_exit(&cp->cache_lock);
5112 kmem_cache_free(kmem_move_cache, callback);
5113 }
5114
5115 /*
5116 * Move buffers from least used slabs first by scanning backwards from the end
5117 * of the partial slab list. Scan at most max_scan candidate slabs and move
5118 * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
5119 * If desperate to reclaim memory, move buffers from any partial slab, otherwise
5120 * skip slabs with a ratio of allocated buffers at or above the current
5121 * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
5122 * scan is aborted) so that the caller can adjust the reclaimability threshold
5123 * depending on how many reclaimable slabs it finds.
5124 *
5125 * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
5126 * move request, since it is not valid for kmem_move_begin() to call
5127 * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
5232 list_t *deadlist =
5233 &cp->cache_defrag->kmd_deadlist;
5234 list_remove(deadlist, sp);
5235
5236 if (!avl_is_empty(
5237 &cp->cache_defrag->kmd_moves_pending)) {
5238 /*
5239 * A pending move makes it unsafe to
5240 * destroy the slab, because even though
5241 * the move is no longer needed, the
5242 * context where that is determined
5243 * requires the slab to exist.
5244 * Fortunately, a pending move also
5245 * means we don't need to destroy the
5246 * slab here, since it will get
5247 * destroyed along with any other slabs
5248 * on the deadlist after the last
5249 * pending move completes.
5250 */
5251 list_insert_head(deadlist, sp);
5252 KMEM_STAT_ADD(kmem_move_stats.
5253 kms_endscan_slab_dead);
5254 return (-1);
5255 }
5256
5257 /*
5258 * Destroy the slab now if it was completely
5259 * freed while we dropped cache_lock and there
5260 * are no pending moves. Since slab_refcnt
5261 * cannot change once it reaches zero, no new
5262 * pending moves from that slab are possible.
5263 */
5264 cp->cache_defrag->kmd_deadcount--;
5265 cp->cache_slab_destroy++;
5266 mutex_exit(&cp->cache_lock);
5267 kmem_slab_destroy(cp, sp);
5268 KMEM_STAT_ADD(kmem_move_stats.
5269 kms_dead_slabs_freed);
5270 KMEM_STAT_ADD(kmem_move_stats.
5271 kms_endscan_slab_destroyed);
5272 mutex_enter(&cp->cache_lock);
5273 /*
5274 * Since we can't pick up the scan where we left
5275 * off, abort the scan and say nothing about the
5276 * number of reclaimable slabs.
5277 */
5278 return (-1);
5279 }
5280
5281 if (!success) {
5282 /*
5283 * Abort the scan if there is not enough memory
5284 * for the request and say nothing about the
5285 * number of reclaimable slabs.
5286 */
5287 KMEM_STAT_COND_ADD(s < max_slabs,
5288 kmem_move_stats.kms_endscan_nomem);
5289 return (-1);
5290 }
5291
5292 /*
5293 * The slab's position changed while the lock was
5294 * dropped, so we don't know where we are in the
5295 * sequence any more.
5296 */
5297 if (sp->slab_refcnt != refcnt) {
5298 /*
5299 * If this is a KMM_DEBUG move, the slab_refcnt
5300 * may have changed because we allocated a
5301 * destination buffer on the same slab. In that
5302 * case, we're not interested in counting it.
5303 */
5304 KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5305 (s < max_slabs),
5306 kmem_move_stats.kms_endscan_refcnt_changed);
5307 return (-1);
5308 }
5309 if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove) {
5310 KMEM_STAT_COND_ADD(s < max_slabs,
5311 kmem_move_stats.kms_endscan_nomove_changed);
5312 return (-1);
5313 }
5314
5315 /*
5316 * Generating a move request allocates a destination
5317 * buffer from the slab layer, bumping the first partial
5318 * slab if it is completely allocated. If the current
5319 * slab becomes the first partial slab as a result, we
5320 * can't continue to scan backwards.
5321 *
5322 * If this is a KMM_DEBUG move and we allocated the
5323 * destination buffer from the last partial slab, then
5324 * the buffer we're moving is on the same slab and our
5325 * slab_refcnt has changed, causing us to return before
5326 * reaching here if there are no partial slabs left.
5327 */
5328 ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5329 if (sp == avl_first(&cp->cache_partial_slabs)) {
5330 /*
5331 * We're not interested in a second KMM_DEBUG
5332 * move.
5333 */
5334 goto end_scan;
5335 }
5336 }
5337 }
5338 end_scan:
5339
5340 KMEM_STAT_COND_ADD(!(flags & KMM_DEBUG) &&
5341 (s < max_slabs) &&
5342 (sp == avl_first(&cp->cache_partial_slabs)),
5343 kmem_move_stats.kms_endscan_freelist);
5344
5345 return (s);
5346 }
5347
5348 typedef struct kmem_move_notify_args {
5349 kmem_cache_t *kmna_cache;
5350 void *kmna_buf;
5351 } kmem_move_notify_args_t;
5352
5353 static void
5354 kmem_cache_move_notify_task(void *arg)
5355 {
5356 kmem_move_notify_args_t *args = arg;
5357 kmem_cache_t *cp = args->kmna_cache;
5358 void *buf = args->kmna_buf;
5359 kmem_slab_t *sp;
5360
5361 ASSERT(taskq_member(kmem_taskq, curthread));
5362 ASSERT(list_link_active(&cp->cache_link));
5363
5364 kmem_free(args, sizeof (kmem_move_notify_args_t));
5384 return;
5385 }
5386
5387 kmem_slab_move_yes(cp, sp, buf);
5388 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5389 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5390 mutex_exit(&cp->cache_lock);
5391 /* see kmem_move_buffers() about dropping the lock */
5392 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5393 mutex_enter(&cp->cache_lock);
5394 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5395 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5396 if (sp->slab_refcnt == 0) {
5397 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5398 list_remove(deadlist, sp);
5399
5400 if (!avl_is_empty(
5401 &cp->cache_defrag->kmd_moves_pending)) {
5402 list_insert_head(deadlist, sp);
5403 mutex_exit(&cp->cache_lock);
5404 KMEM_STAT_ADD(kmem_move_stats.
5405 kms_notify_slab_dead);
5406 return;
5407 }
5408
5409 cp->cache_defrag->kmd_deadcount--;
5410 cp->cache_slab_destroy++;
5411 mutex_exit(&cp->cache_lock);
5412 kmem_slab_destroy(cp, sp);
5413 KMEM_STAT_ADD(kmem_move_stats.kms_dead_slabs_freed);
5414 KMEM_STAT_ADD(kmem_move_stats.
5415 kms_notify_slab_destroyed);
5416 return;
5417 }
5418 } else {
5419 kmem_slab_move_yes(cp, sp, buf);
5420 }
5421 mutex_exit(&cp->cache_lock);
5422 }
5423
5424 void
5425 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5426 {
5427 kmem_move_notify_args_t *args;
5428
5429 KMEM_STAT_ADD(kmem_move_stats.kms_notify);
5430 args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5431 if (args != NULL) {
5432 args->kmna_cache = cp;
5433 args->kmna_buf = buf;
5434 if (!taskq_dispatch(kmem_taskq,
5435 (task_func_t *)kmem_cache_move_notify_task, args,
5436 TQ_NOSLEEP))
5437 kmem_free(args, sizeof (kmem_move_notify_args_t));
5438 }
5439 }
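/*
 * For illustration only (hypothetical client names): a client that earlier
 * responded KMEM_CBRC_NO or KMEM_CBRC_LATER might notify kmem from the path
 * where the object finally becomes movable, for example when its last hold
 * is released:
 *
 *	static void
 *	object_rele(object_t *op)
 *	{
 *		if (atomic_dec_32_nv(&op->o_holds) == 0)
 *			kmem_cache_move_notify(object_cache, op);
 *	}
 */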
5440
5441 static void
5442 kmem_cache_defrag(kmem_cache_t *cp)
5443 {
5444 size_t n;
5445
5446 ASSERT(cp->cache_defrag != NULL);
5447
5448 mutex_enter(&cp->cache_lock);
5449 n = avl_numnodes(&cp->cache_partial_slabs);
5450 if (n > 1) {
5451 /* kmem_move_buffers() drops and reacquires cache_lock */
5452 KMEM_STAT_ADD(kmem_move_stats.kms_defrags);
5453 cp->cache_defrag->kmd_defrags++;
5454 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5455 }
5456 mutex_exit(&cp->cache_lock);
5457 }
5458
5459 /* Is this cache above the fragmentation threshold? */
5460 static boolean_t
5461 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5462 {
5463 /*
5464 * nfree kmem_frag_numer
5465 * ------------------ > ---------------
5466 * cp->cache_buftotal kmem_frag_denom
5467 */
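/*
 * For example, if kmem_frag_numer / kmem_frag_denom were 1/8 (illustrative
 * values only), the cache would be considered fragmented once more than one
 * eighth of its buffers were free.
 */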
5468 return ((nfree * kmem_frag_denom) >
5469 (cp->cache_buftotal * kmem_frag_numer));
5470 }
5471
5472 static boolean_t
5531 if (kmd->kmd_consolidate > 0) {
5532 kmd->kmd_consolidate--;
5533 mutex_exit(&cp->cache_lock);
5534 kmem_cache_reap(cp);
5535 return;
5536 }
5537
5538 if (kmem_cache_is_fragmented(cp, &reap)) {
5539 long slabs_found; /* signed: kmem_move_buffers() returns -1 on abort */
5540
5541 /*
5542 * Consolidate reclaimable slabs from the end of the partial
5543 * slab list (scan at most kmem_reclaim_scan_range slabs to find
5544 * reclaimable slabs). Keep track of how many candidate slabs we
5545 * looked for and how many we actually found so we can adjust
5546 * the definition of a candidate slab if we're having trouble
5547 * finding them.
5548 *
5549 * kmem_move_buffers() drops and reacquires cache_lock.
5550 */
5551 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5552 kmd->kmd_scans++;
5553 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5554 kmem_reclaim_max_slabs, 0);
5555 if (slabs_found >= 0) {
5556 kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5557 kmd->kmd_slabs_found += slabs_found;
5558 }
5559
5560 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5561 kmd->kmd_tries = 0;
5562
5563 /*
5564 * Adjust the threshold based on the previous scans: relax it if
5565 * we had difficulty finding candidate slabs (so they are easier
5566 * to find), or move it back the other way if every one was found.
5567 */
5568 if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5569 kmem_adjust_reclaim_threshold(kmd, -1);
5570 } else if ((kmd->kmd_slabs_found * 2) <
5571 kmd->kmd_slabs_sought) {
5572 kmem_adjust_reclaim_threshold(kmd, 1);
5573 }
5574 kmd->kmd_slabs_sought = 0;
5575 kmd->kmd_slabs_found = 0;
5576 }
5577 } else {
5578 kmem_reset_reclaim_threshold(cp->cache_defrag);
5579 #ifdef DEBUG
5580 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5581 /*
5582 * In a debug kernel we want the consolidator to
5583 * run occasionally even when there is plenty of
5584 * memory.
5585 */
5586 uint16_t debug_rand;
5587
5588 (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5589 if (!kmem_move_noreap &&
5590 ((debug_rand % kmem_mtb_reap) == 0)) {
5591 mutex_exit(&cp->cache_lock);
5592 KMEM_STAT_ADD(kmem_move_stats.kms_debug_reaps);
5593 kmem_cache_reap(cp);
5594 return;
5595 } else if ((debug_rand % kmem_mtb_move) == 0) {
5596 KMEM_STAT_ADD(kmem_move_stats.kms_scans);
5597 KMEM_STAT_ADD(kmem_move_stats.kms_debug_scans);
5598 kmd->kmd_scans++;
5599 (void) kmem_move_buffers(cp,
5600 kmem_reclaim_scan_range, 1, KMM_DEBUG);
5601 }
5602 }
5603 #endif /* DEBUG */
5604 }
5605
5606 mutex_exit(&cp->cache_lock);
5607
5608 if (reap) {
5609 KMEM_STAT_ADD(kmem_move_stats.kms_scan_depot_ws_reaps);
5610 kmem_depot_ws_reap(cp);
5611 }
5612 }
|