1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2017, Joyent, Inc.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2018, Joyent, Inc.
27 * Copyright 2020 Oxide Computer Company
28 */
29
30 /*
31 * Kernel memory allocator, as described in the following two papers and a
32 * statement about the consolidator:
33 *
34 * Jeff Bonwick,
35 * The Slab Allocator: An Object-Caching Kernel Memory Allocator.
36 * Proceedings of the Summer 1994 Usenix Conference.
37 * Available as /shared/sac/PSARC/1994/028/materials/kmem.pdf.
38 *
39 * Jeff Bonwick and Jonathan Adams,
40 * Magazines and vmem: Extending the Slab Allocator to Many CPUs and
41 * Arbitrary Resources.
42 * Proceedings of the 2001 Usenix Conference.
43 * Available as /shared/sac/PSARC/2000/550/materials/vmem.pdf.
44 *
45 * kmem Slab Consolidator Big Theory Statement:
46 *
47 * 1. Motivation
48 *
49 * As stated in Bonwick94, slabs provide the following advantages over other
50 * allocation structures in terms of memory fragmentation:
51 *
52 * - Internal fragmentation (per-buffer wasted space) is minimal.
53 * - Severe external fragmentation (unused buffers on the free list) is
54 * unlikely.
55 *
56 * Segregating objects by size eliminates one source of external fragmentation,
57 * and according to Bonwick:
58 *
59 * The other reason that slabs reduce external fragmentation is that all
60 * objects in a slab are of the same type, so they have the same lifetime
61 * distribution. The resulting segregation of short-lived and long-lived
62 * objects at slab granularity reduces the likelihood of an entire page being
63 * held hostage due to a single long-lived allocation [Barrett93, Hanson90].
64 *
65 * While unlikely, severe external fragmentation remains possible. Clients that
66 * allocate both short- and long-lived objects from the same cache cannot
67 * anticipate the distribution of long-lived objects within the allocator's slab
68 * implementation. Even a small percentage of long-lived objects distributed
69 * randomly across many slabs can lead to a worst case scenario where the client
70 * frees the majority of its objects and the system gets back almost none of the
71 * slabs. Despite the client doing what it reasonably can to help the system
72 * reclaim memory, the allocator cannot shake free enough slabs because of
73 * lonely allocations stubbornly hanging on. Although the allocator is in a
74 * position to diagnose the fragmentation, there is nothing that the allocator
75 * by itself can do about it. It only takes a single allocated object to prevent
76 * an entire slab from being reclaimed, and any object handed out by
77 * kmem_cache_alloc() is by definition in the client's control. Conversely,
78 * although the client is in a position to move a long-lived object, it has no
79 * way of knowing if the object is causing fragmentation, and if so, where to
80 * move it. A solution necessarily requires further cooperation between the
81 * allocator and the client.
82 *
83 * 2. Move Callback
84 *
85 * The kmem slab consolidator therefore adds a move callback to the
86 * allocator/client interface, improving worst-case external fragmentation in
87 * kmem caches that supply a function to move objects from one memory location
88  * to another. In a situation of low memory, kmem attempts to consolidate all of
89 * a cache's slabs at once; otherwise it works slowly to bring external
90 * fragmentation within the 1/8 limit guaranteed for internal fragmentation,
91 * thereby helping to avoid a low memory situation in the future.
92 *
93 * The callback has the following signature:
94 *
95 * kmem_cbrc_t move(void *old, void *new, size_t size, void *user_arg)
96 *
97 * It supplies the kmem client with two addresses: the allocated object that
98 * kmem wants to move and a buffer selected by kmem for the client to use as the
99 * copy destination. The callback is kmem's way of saying "Please get off of
100 * this buffer and use this one instead." kmem knows where it wants to move the
101 * object in order to best reduce fragmentation. All the client needs to know
102 * about the second argument (void *new) is that it is an allocated, constructed
103 * object ready to take the contents of the old object. When the move function
104 * is called, the system is likely to be low on memory, and the new object
105 * spares the client from having to worry about allocating memory for the
106 * requested move. The third argument supplies the size of the object, in case a
107 * single move function handles multiple caches whose objects differ only in
108 * size (such as zio_buf_512, zio_buf_1024, etc). Finally, the same optional
109 * user argument passed to the constructor, destructor, and reclaim functions is
110 * also passed to the move callback.
111 *
112 * 2.1 Setting the Move Callback
113 *
114 * The client sets the move callback after creating the cache and before
115 * allocating from it:
116 *
117 * object_cache = kmem_cache_create(...);
118 * kmem_cache_set_move(object_cache, object_move);
119 *
120 * 2.2 Move Callback Return Values
121 *
122 * Only the client knows about its own data and when is a good time to move it.
123 * The client is cooperating with kmem to return unused memory to the system,
124 * and kmem respectfully accepts this help at the client's convenience. When
125 * asked to move an object, the client can respond with any of the following:
126 *
127 * typedef enum kmem_cbrc {
128 * KMEM_CBRC_YES,
129 * KMEM_CBRC_NO,
130 * KMEM_CBRC_LATER,
131 * KMEM_CBRC_DONT_NEED,
132 * KMEM_CBRC_DONT_KNOW
133 * } kmem_cbrc_t;
134 *
135 * The client must not explicitly kmem_cache_free() either of the objects passed
136 * to the callback, since kmem wants to free them directly to the slab layer
137 * (bypassing the per-CPU magazine layer). The response tells kmem which of the
138 * objects to free:
139 *
140 * YES: (Did it) The client moved the object, so kmem frees the old one.
141 * NO: (Never) The client refused, so kmem frees the new object (the
142 * unused copy destination). kmem also marks the slab of the old
143 * object so as not to bother the client with further callbacks for
144 * that object as long as the slab remains on the partial slab list.
145 * (The system won't be getting the slab back as long as the
146 * immovable object holds it hostage, so there's no point in moving
147 * any of its objects.)
148 * LATER: The client is using the object and cannot move it now, so kmem
149 * frees the new object (the unused copy destination). kmem still
150 * attempts to move other objects off the slab, since it expects to
151 * succeed in clearing the slab in a later callback. The client
152 * should use LATER instead of NO if the object is likely to become
153 * movable very soon.
154 * DONT_NEED: The client no longer needs the object, so kmem frees the old along
155 * with the new object (the unused copy destination). This response
156 * is the client's opportunity to be a model citizen and give back as
157 * much as it can.
158 * DONT_KNOW: The client does not know about the object because
159 * a) the client has just allocated the object and not yet put it
160 * wherever it expects to find known objects
161 * b) the client has removed the object from wherever it expects to
162 * find known objects and is about to free it, or
163 * c) the client has freed the object.
164 * In all these cases (a, b, and c) kmem frees the new object (the
165 * unused copy destination). In the first case, the object is in
166 * use and the correct action is that for LATER; in the latter two
167 * cases, we know that the object is either freed or about to be
168 * freed, in which case it is either already in a magazine or about
169 * to be in one. In these cases, we know that the object will either
170 * be reallocated and reused, or it will end up in a full magazine
171 * that will be reaped (thereby liberating the slab). Because it
172 * is prohibitively expensive to differentiate these cases, and
173 * because the defrag code is executed when we're low on memory
174 * (thereby biasing the system to reclaim full magazines) we treat
175 * all DONT_KNOW cases as LATER and rely on cache reaping to
176 * generally clean up full magazines. While we take the same action
177 * for these cases, we maintain their semantic distinction: if
178 * defragmentation is not occurring, it is useful to know if this
179 * is due to objects in use (LATER) or objects in an unknown state
180 * of transition (DONT_KNOW).
181 *
182 * 2.3 Object States
183 *
184 * Neither kmem nor the client can be assumed to know the object's whereabouts
185 * at the time of the callback. An object belonging to a kmem cache may be in
186 * any of the following states:
187 *
188 * 1. Uninitialized on the slab
189 * 2. Allocated from the slab but not constructed (still uninitialized)
190 * 3. Allocated from the slab, constructed, but not yet ready for business
191 * (not in a valid state for the move callback)
192 * 4. In use (valid and known to the client)
193 * 5. About to be freed (no longer in a valid state for the move callback)
194 * 6. Freed to a magazine (still constructed)
195 * 7. Allocated from a magazine, not yet ready for business (not in a valid
196 * state for the move callback), and about to return to state #4
197 * 8. Deconstructed on a magazine that is about to be freed
198 * 9. Freed to the slab
199 *
200 * Since the move callback may be called at any time while the object is in any
201 * of the above states (except state #1), the client needs a safe way to
202 * determine whether or not it knows about the object. Specifically, the client
203 * needs to know whether or not the object is in state #4, the only state in
204 * which a move is valid. If the object is in any other state, the client should
205 * immediately return KMEM_CBRC_DONT_KNOW, since it is unsafe to access any of
206 * the object's fields.
207 *
208 * Note that although an object may be in state #4 when kmem initiates the move
209 * request, the object may no longer be in that state by the time kmem actually
210 * calls the move function. Not only does the client free objects
211  * asynchronously, kmem itself puts move requests on a queue where they are
212 * pending until kmem processes them from another context. Also, objects freed
213 * to a magazine appear allocated from the point of view of the slab layer, so
214 * kmem may even initiate requests for objects in a state other than state #4.
215 *
216 * 2.3.1 Magazine Layer
217 *
218 * An important insight revealed by the states listed above is that the magazine
219 * layer is populated only by kmem_cache_free(). Magazines of constructed
220 * objects are never populated directly from the slab layer (which contains raw,
221 * unconstructed objects). Whenever an allocation request cannot be satisfied
222 * from the magazine layer, the magazines are bypassed and the request is
223 * satisfied from the slab layer (creating a new slab if necessary). kmem calls
224 * the object constructor only when allocating from the slab layer, and only in
225 * response to kmem_cache_alloc() or to prepare the destination buffer passed in
226 * the move callback. kmem does not preconstruct objects in anticipation of
227 * kmem_cache_alloc().
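 *
 * A purely illustrative sketch of the allocation path just described (the
 * helper names below are hypothetical stand-ins for kmem's internal layers,
 * not real functions):
 *
 *	// Conceptually, kmem_cache_alloc(cp, kmflag) proceeds as:
 *	if ((buf = magazine_layer_alloc(cp)) != NULL)
 *		return (buf);			// already constructed
 *	buf = slab_layer_alloc(cp, kmflag);	// raw, unconstructed buffer
 *	if (buf != NULL && cp->cache_constructor != NULL)
 *		(void) cp->cache_constructor(buf, cp->cache_private, kmflag);
 *	return (buf);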
228 *
229 * 2.3.2 Object Constructor and Destructor
230 *
231 * If the client supplies a destructor, it must be valid to call the destructor
232 * on a newly created object (immediately after the constructor).
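 *
 * For example (a sketch with hypothetical names), a constructor that fully
 * initializes everything its matching destructor touches satisfies this
 * requirement:
 *
 *	static int
 *	object_construct(void *buf, void *arg, int kmflags)
 *	{
 *		object_t *op = buf;
 *
 *		op->o_container = NULL;
 *		mutex_init(&op->o_lock, NULL, MUTEX_DEFAULT, NULL);
 *		return (0);
 *	}
 *
 *	static void
 *	object_destruct(void *buf, void *arg)
 *	{
 *		object_t *op = buf;
 *
 *		mutex_destroy(&op->o_lock);
 *	}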
233 *
234 * 2.4 Recognizing Known Objects
235 *
236 * There is a simple test to determine safely whether or not the client knows
237 * about a given object in the move callback. It relies on the fact that kmem
238 * guarantees that the object of the move callback has only been touched by the
239 * client itself or else by kmem. kmem does this by ensuring that none of the
240 * cache's slabs are freed to the virtual memory (VM) subsystem while a move
241 * callback is pending. When the last object on a slab is freed, if there is a
242 * pending move, kmem puts the slab on a per-cache dead list and defers freeing
243 * slabs on that list until all pending callbacks are completed. That way,
244 * clients can be certain that the object of a move callback is in one of the
245 * states listed above, making it possible to distinguish known objects (in
246 * state #4) using the two low order bits of any pointer member (with the
247 * exception of 'char *' or 'short *' which may not be 4-byte aligned on some
248 * platforms).
249 *
250 * The test works as long as the client always transitions objects from state #4
251 * (known, in use) to state #5 (about to be freed, invalid) by setting the low
252 * order bit of the client-designated pointer member. Since kmem only writes
253 * invalid memory patterns, such as 0xbaddcafe to uninitialized memory and
254 * 0xdeadbeef to freed memory, any scribbling on the object done by kmem is
255 * guaranteed to set at least one of the two low order bits. Therefore, given an
256 * object with a back pointer to a 'container_t *o_container', the client can
257 * test
258 *
259 * container_t *container = object->o_container;
260 * if ((uintptr_t)container & 0x3) {
261 * return (KMEM_CBRC_DONT_KNOW);
262 * }
263 *
264 * Typically, an object will have a pointer to some structure with a list or
265 * hash where objects from the cache are kept while in use. Assuming that the
266 * client has some way of knowing that the container structure is valid and will
267 * not go away during the move, and assuming that the structure includes a lock
268 * to protect whatever collection is used, then the client would continue as
269 * follows:
270 *
271 * // Ensure that the container structure does not go away.
272 * if (container_hold(container) == 0) {
273 * return (KMEM_CBRC_DONT_KNOW);
274 * }
275 * mutex_enter(&container->c_objects_lock);
276 * if (container != object->o_container) {
277 * mutex_exit(&container->c_objects_lock);
278 * container_rele(container);
279 * return (KMEM_CBRC_DONT_KNOW);
280 * }
281 *
282 * At this point the client knows that the object cannot be freed as long as
283 * c_objects_lock is held. Note that after acquiring the lock, the client must
284 * recheck the o_container pointer in case the object was removed just before
285 * acquiring the lock.
286 *
287 * When the client is about to free an object, it must first remove that object
288 * from the list, hash, or other structure where it is kept. At that time, to
289 * mark the object so it can be distinguished from the remaining, known objects,
290 * the client sets the designated low order bit:
291 *
292 * mutex_enter(&container->c_objects_lock);
293 * object->o_container = (void *)((uintptr_t)object->o_container | 0x1);
294 * list_remove(&container->c_objects, object);
295 * mutex_exit(&container->c_objects_lock);
296 *
297 * In the common case, the object is freed to the magazine layer, where it may
298 * be reused on a subsequent allocation without the overhead of calling the
299 * constructor. While in the magazine it appears allocated from the point of
300 * view of the slab layer, making it a candidate for the move callback. Most
301 * objects unrecognized by the client in the move callback fall into this
302 * category and are cheaply distinguished from known objects by the test
303 * described earlier. Because searching magazines is prohibitively expensive
304 * for kmem, clients that do not mark freed objects (and therefore return
305 * KMEM_CBRC_DONT_KNOW for large numbers of objects) may find defragmentation
306 * efficacy reduced.
307 *
308 * Invalidating the designated pointer member before freeing the object marks
309 * the object to be avoided in the callback, and conversely, assigning a valid
310 * value to the designated pointer member after allocating the object makes the
311 * object fair game for the callback:
312 *
313 * ... allocate object ...
314 * ... set any initial state not set by the constructor ...
315 *
316 * mutex_enter(&container->c_objects_lock);
317 * list_insert_tail(&container->c_objects, object);
318 * membar_producer();
319 * object->o_container = container;
320 * mutex_exit(&container->c_objects_lock);
321 *
322 * Note that everything else must be valid before setting o_container makes the
323 * object fair game for the move callback. The membar_producer() call ensures
324 * that all the object's state is written to memory before setting the pointer
325 * that transitions the object from state #3 or #7 (allocated, constructed, not
326 * yet in use) to state #4 (in use, valid). That's important because the move
327 * function has to check the validity of the pointer before it can safely
328 * acquire the lock protecting the collection where it expects to find known
329 * objects.
330 *
331 * This method of distinguishing known objects observes the usual symmetry:
332 * invalidating the designated pointer is the first thing the client does before
333 * freeing the object, and setting the designated pointer is the last thing the
334 * client does after allocating the object. Of course, the client is not
335 * required to use this method. Fundamentally, how the client recognizes known
336 * objects is completely up to the client, but this method is recommended as an
337 * efficient and safe way to take advantage of the guarantees made by kmem. If
338 * the entire object is arbitrary data without any markable bits from a suitable
339 * pointer member, then the client must find some other method, such as
340 * searching a hash table of known objects.
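 *
 * For instance, a client without a suitable pointer member might keep a hash
 * table of the objects it currently knows about and consult it in the move
 * callback (a minimal sketch, assuming a hypothetical known_object_lookup()
 * protected by known_objects_lock):
 *
 *	mutex_enter(&known_objects_lock);
 *	if (!known_object_lookup(old)) {
 *		mutex_exit(&known_objects_lock);
 *		return (KMEM_CBRC_DONT_KNOW);
 *	}
 *	// ... the object is known; proceed to decide whether it can move ...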
341 *
342 * 2.5 Preventing Objects From Moving
343 *
344 * Besides a way to distinguish known objects, the other thing that the client
345 * needs is a strategy to ensure that an object will not move while the client
346 * is actively using it. The details of satisfying this requirement tend to be
347 * highly cache-specific. It might seem that the same rules that let a client
348 * remove an object safely should also decide when an object can be moved
349 * safely. However, any object state that makes a removal attempt invalid is
350 * likely to be long-lasting for objects that the client does not expect to
351 * remove. kmem knows nothing about the object state and is equally likely (from
352 * the client's point of view) to request a move for any object in the cache,
353 * whether prepared for removal or not. Even a low percentage of objects stuck
354 * in place by unremovability will defeat the consolidator if the stuck objects
355 * are the same long-lived allocations likely to hold slabs hostage.
356 * Fundamentally, the consolidator is not aimed at common cases. Severe external
357 * fragmentation is a worst case scenario manifested as sparsely allocated
358 * slabs, by definition a low percentage of the cache's objects. When deciding
359 * what makes an object movable, keep in mind the goal of the consolidator: to
360 * bring worst-case external fragmentation within the limits guaranteed for
361 * internal fragmentation. Removability is a poor criterion if it is likely to
362 * exclude more than an insignificant percentage of objects for long periods of
363 * time.
364 *
365 * A tricky general solution exists, and it has the advantage of letting you
366 * move any object at almost any moment, practically eliminating the likelihood
367 * that an object can hold a slab hostage. However, if there is a cache-specific
368 * way to ensure that an object is not actively in use in the vast majority of
369 * cases, a simpler solution that leverages this cache-specific knowledge is
370 * preferred.
371 *
372 * 2.5.1 Cache-Specific Solution
373 *
374 * As an example of a cache-specific solution, the ZFS znode cache takes
375 * advantage of the fact that the vast majority of znodes are only being
376 * referenced from the DNLC. (A typical case might be a few hundred in active
377 * use and a hundred thousand in the DNLC.) In the move callback, after the ZFS
378 * client has established that it recognizes the znode and can access its fields
379 * safely (using the method described earlier), it then tests whether the znode
380 * is referenced by anything other than the DNLC. If so, it assumes that the
381 * znode may be in active use and is unsafe to move, so it drops its locks and
382 * returns KMEM_CBRC_LATER. The advantage of this strategy is that everywhere
383 * else znodes are used, no change is needed to protect against the possibility
384 * of the znode moving. The disadvantage is that it remains possible for an
385 * application to hold a znode slab hostage with an open file descriptor.
386 * However, this case ought to be rare and the consolidator has a way to deal
387 * with it: If the client responds KMEM_CBRC_LATER repeatedly for the same
388 * object, kmem eventually stops believing it and treats the slab as if the
389 * client had responded KMEM_CBRC_NO. Having marked the hostage slab, kmem can
390 * then focus on getting it off of the partial slab list by allocating rather
391 * than freeing all of its objects. (Either way of getting a slab off the
392 * free list reduces fragmentation.)
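 *
 * The same idea applied to a hypothetical cache whose objects carry a
 * reference count, where a count of one means "held only by the client's own
 * lookup structure", might look like this inside the move callback (o_refcnt
 * is illustrative, not part of any interface described here):
 *
 *	if (op->o_refcnt > 1) {		// in use beyond the lookup structure
 *		mutex_exit(&container->c_objects_lock);
 *		container_rele(container);
 *		return (KMEM_CBRC_LATER);
 *	}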
393 *
394 * 2.5.2 General Solution
395 *
396 * The general solution, on the other hand, requires an explicit hold everywhere
397 * the object is used to prevent it from moving. To keep the client locking
398 * strategy as uncomplicated as possible, kmem guarantees the simplifying
399 * assumption that move callbacks are sequential, even across multiple caches.
400 * Internally, a global queue processed by a single thread supports all caches
401 * implementing the callback function. No matter how many caches supply a move
402 * function, the consolidator never moves more than one object at a time, so the
403 * client does not have to worry about tricky lock ordering involving several
404 * related objects from different kmem caches.
405 *
406 * The general solution implements the explicit hold as a read-write lock, which
407 * allows multiple readers to access an object from the cache simultaneously
408 * while a single writer is excluded from moving it. A single rwlock for the
409 * entire cache would lock out all threads from using any of the cache's objects
410 * even though only a single object is being moved, so to reduce contention,
411 * the client can fan out the single rwlock into an array of rwlocks hashed by
412 * the object address, making it probable that moving one object will not
413 * prevent other threads from using a different object. The rwlock cannot be a
414 * member of the object itself, because the possibility of the object moving
415 * makes it unsafe to access any of the object's fields until the lock is
416 * acquired.
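 *
 * One possible shape for such a hashed array of locks, and for the
 * OBJECT_RWLOCK macro used in the examples below, is sketched here (the array
 * size and shift are illustrative only; the shift simply discards low-order
 * alignment bits of the object address):
 *
 *	#define	OBJECT_LOCK_COUNT	64	// any power of two
 *	static krwlock_t object_rwlock[OBJECT_LOCK_COUNT];
 *	#define	OBJECT_RWLOCK(op)	\
 *	    (&object_rwlock[((uintptr_t)(op) >> 3) & (OBJECT_LOCK_COUNT - 1)])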
417 *
418 * Assuming a small, fixed number of locks, it's possible that multiple objects
419 * will hash to the same lock. A thread that needs to use multiple objects in
420 * the same function may acquire the same lock multiple times. Since rwlocks are
421 * reentrant for readers, and since there is never more than a single writer at
422 * a time (assuming that the client acquires the lock as a writer only when
423 * moving an object inside the callback), there would seem to be no problem.
424 * However, a client locking multiple objects in the same function must handle
425 * one case of potential deadlock: Assume that thread A needs to prevent both
426 * object 1 and object 2 from moving, and thread B, the callback, meanwhile
427 * tries to move object 3. It's possible, if objects 1, 2, and 3 all hash to the
428 * same lock, that thread A will acquire the lock for object 1 as a reader
429 * before thread B sets the lock's write-wanted bit, preventing thread A from
430 * reacquiring the lock for object 2 as a reader. Unable to make forward
431 * progress, thread A will never release the lock for object 1, resulting in
432 * deadlock.
433 *
434 * There are two ways of avoiding the deadlock just described. The first is to
435 * use rw_tryenter() rather than rw_enter() in the callback function when
436 * attempting to acquire the lock as a writer. If tryenter discovers that the
437 * same object (or another object hashed to the same lock) is already in use, it
438 * aborts the callback and returns KMEM_CBRC_LATER. The second way is to use
439 * rprwlock_t (declared in common/fs/zfs/sys/rprwlock.h) instead of rwlock_t,
440 * since it allows a thread to acquire the lock as a reader in spite of a
441 * waiting writer. This second approach insists on moving the object now, no
442 * matter how many readers the move function must wait for in order to do so,
443 * and could delay the completion of the callback indefinitely (blocking
444 * callbacks to other clients). In practice, a less insistent callback using
445 * rw_tryenter() returns KMEM_CBRC_LATER infrequently enough that there seems
446 * little reason to use anything else.
447 *
448 * Avoiding deadlock is not the only problem that an implementation using an
449 * explicit hold needs to solve. Locking the object in the first place (to
450 * prevent it from moving) remains a problem, since the object could move
451 * between the time you obtain a pointer to the object and the time you acquire
452 * the rwlock hashed to that pointer value. Therefore the client needs to
453 * recheck the value of the pointer after acquiring the lock, drop the lock if
454 * the value has changed, and try again. This requires a level of indirection:
455 * something that points to the object rather than the object itself, that the
456 * client can access safely while attempting to acquire the lock. (The object
457 * itself cannot be referenced safely because it can move at any time.)
458 * The following lock-acquisition function takes whatever is safe to reference
459 * (arg), follows its pointer to the object (using function f), and tries as
460 * often as necessary to acquire the hashed lock and verify that the object
461 * still has not moved:
462 *
463 * object_t *
464 * object_hold(object_f f, void *arg)
465 * {
466 * object_t *op;
467 *
468 * op = f(arg);
469 * if (op == NULL) {
470 * return (NULL);
471 * }
472 *
473 * rw_enter(OBJECT_RWLOCK(op), RW_READER);
474 * while (op != f(arg)) {
475 * rw_exit(OBJECT_RWLOCK(op));
476 * op = f(arg);
477 * if (op == NULL) {
478 * break;
479 * }
480 * rw_enter(OBJECT_RWLOCK(op), RW_READER);
481 * }
482 *
483 * return (op);
484 * }
485 *
486 * The OBJECT_RWLOCK macro hashes the object address to obtain the rwlock. The
487 * lock reacquisition loop, while necessary, almost never executes. The function
488 * pointer f (used to obtain the object pointer from arg) has the following type
489 * definition:
490 *
491 * typedef object_t *(*object_f)(void *arg);
492 *
493 * An object_f implementation is likely to be as simple as accessing a structure
494 * member:
495 *
496 * object_t *
497 * s_object(void *arg)
498 * {
499 * something_t *sp = arg;
500 * return (sp->s_object);
501 * }
502 *
503 * The flexibility of a function pointer allows the path to the object to be
504 * arbitrarily complex and also supports the notion that depending on where you
505 * are using the object, you may need to get it from someplace different.
506 *
507 * The function that releases the explicit hold is simpler because it does not
508 * have to worry about the object moving:
509 *
510 * void
511 * object_rele(object_t *op)
512 * {
513 * rw_exit(OBJECT_RWLOCK(op));
514 * }
515 *
516 * The caller is spared these details so that obtaining and releasing an
517 * explicit hold feels like a simple mutex_enter()/mutex_exit() pair. The caller
518 * of object_hold() only needs to know that the returned object pointer is valid
519 * if not NULL and that the object will not move until released.
520 *
521 * Although object_hold() prevents an object from moving, it does not prevent it
522 * from being freed. The caller must take measures before calling object_hold()
523 * (afterwards is too late) to ensure that the held object cannot be freed. The
524 * caller must do so without accessing the unsafe object reference, so any lock
525 * or reference count used to ensure the continued existence of the object must
526 * live outside the object itself.
527 *
528 * Obtaining a new object is a special case where an explicit hold is impossible
529 * for the caller. Any function that returns a newly allocated object (either as
530  * a return value, or as an in-out parameter) must return it already held; after
531 * the caller gets it is too late, since the object cannot be safely accessed
532 * without the level of indirection described earlier. The following
533 * object_alloc() example uses the same code shown earlier to transition a new
534 * object into the state of being recognized (by the client) as a known object.
535 * The function must acquire the hold (rw_enter) before that state transition
536 * makes the object movable:
537 *
538 * static object_t *
539 * object_alloc(container_t *container)
540 * {
541 * object_t *object = kmem_cache_alloc(object_cache, 0);
542 * ... set any initial state not set by the constructor ...
543 * rw_enter(OBJECT_RWLOCK(object), RW_READER);
544 * mutex_enter(&container->c_objects_lock);
545 * list_insert_tail(&container->c_objects, object);
546 * membar_producer();
547 * object->o_container = container;
548 * mutex_exit(&container->c_objects_lock);
549 * return (object);
550 * }
551 *
552 * Functions that implicitly acquire an object hold (any function that calls
553 * object_alloc() to supply an object for the caller) need to be carefully noted
554 * so that the matching object_rele() is not neglected. Otherwise, leaked holds
555 * prevent all objects hashed to the affected rwlocks from ever being moved.
556 *
557 * The pointer to a held object can be hashed to the holding rwlock even after
558 * the object has been freed. Although it is possible to release the hold
559 * after freeing the object, you may decide to release the hold implicitly in
560 * whatever function frees the object, so as to release the hold as soon as
561 * possible, and for the sake of symmetry with the function that implicitly
562 * acquires the hold when it allocates the object. Here, object_free() releases
563 * the hold acquired by object_alloc(). Its implicit object_rele() forms a
564 * matching pair with object_hold():
565 *
566 * void
567 * object_free(object_t *object)
568 * {
569 * container_t *container;
570 *
571 * ASSERT(object_held(object));
572 * container = object->o_container;
573 * mutex_enter(&container->c_objects_lock);
574 * object->o_container =
575 * (void *)((uintptr_t)object->o_container | 0x1);
576 * list_remove(&container->c_objects, object);
577 * mutex_exit(&container->c_objects_lock);
578 * object_rele(object);
579 * kmem_cache_free(object_cache, object);
580 * }
581 *
582 * Note that object_free() cannot safely accept an object pointer as an argument
583 * unless the object is already held. Any function that calls object_free()
584 * needs to be carefully noted since it similarly forms a matching pair with
585 * object_hold().
586 *
587 * To complete the picture, the following callback function implements the
588 * general solution by moving objects only if they are currently unheld:
589 *
590 * static kmem_cbrc_t
591 * object_move(void *buf, void *newbuf, size_t size, void *arg)
592 * {
593 * object_t *op = buf, *np = newbuf;
594 * container_t *container;
595 *
596 * container = op->o_container;
597 * if ((uintptr_t)container & 0x3) {
598 * return (KMEM_CBRC_DONT_KNOW);
599 * }
600 *
601 * // Ensure that the container structure does not go away.
602 * if (container_hold(container) == 0) {
603 * return (KMEM_CBRC_DONT_KNOW);
604 * }
605 *
606 * mutex_enter(&container->c_objects_lock);
607 * if (container != op->o_container) {
608 * mutex_exit(&container->c_objects_lock);
609 * container_rele(container);
610 * return (KMEM_CBRC_DONT_KNOW);
611 * }
612 *
613 * if (rw_tryenter(OBJECT_RWLOCK(op), RW_WRITER) == 0) {
614 * mutex_exit(&container->c_objects_lock);
615 * container_rele(container);
616 * return (KMEM_CBRC_LATER);
617 * }
618 *
619 * object_move_impl(op, np); // critical section
620 * rw_exit(OBJECT_RWLOCK(op));
621 *
622 * op->o_container = (void *)((uintptr_t)op->o_container | 0x1);
623 * list_link_replace(&op->o_link_node, &np->o_link_node);
624 * mutex_exit(&container->c_objects_lock);
625 * container_rele(container);
626 * return (KMEM_CBRC_YES);
627 * }
628 *
629 * Note that object_move() must invalidate the designated o_container pointer of
630 * the old object in the same way that object_free() does, since kmem will free
631 * the object in response to the KMEM_CBRC_YES return value.
632 *
633 * The lock order in object_move() differs from object_alloc(), which locks
634 * OBJECT_RWLOCK first and &container->c_objects_lock second, but as long as the
635 * callback uses rw_tryenter() (preventing the deadlock described earlier), it's
636 * not a problem. Holding the lock on the object list in the example above
637 * through the entire callback not only prevents the object from going away, it
638 * also allows you to lock the list elsewhere and know that none of its elements
639 * will move during iteration.
640 *
641 * Adding an explicit hold everywhere an object from the cache is used is tricky
642 * and involves much more change to client code than a cache-specific solution
643 * that leverages existing state to decide whether or not an object is
644 * movable. However, this approach has the advantage that no object remains
645 * immovable for any significant length of time, making it extremely unlikely
646 * that long-lived allocations can continue holding slabs hostage; and it works
647 * for any cache.
648 *
649 * 3. Consolidator Implementation
650 *
651 * Once the client supplies a move function that a) recognizes known objects and
652 * b) avoids moving objects that are actively in use, the remaining work is up
653 * to the consolidator to decide which objects to move and when to issue
654 * callbacks.
655 *
656 * The consolidator relies on the fact that a cache's slabs are ordered by
657 * usage. Each slab has a fixed number of objects. Depending on the slab's
658 * "color" (the offset of the first object from the beginning of the slab;
659 * offsets are staggered to mitigate false sharing of cache lines) it is either
660 * the maximum number of objects per slab determined at cache creation time or
661 * else the number closest to the maximum that fits within the space remaining
662 * after the initial offset. A completely allocated slab may contribute some
663 * internal fragmentation (per-slab overhead) but no external fragmentation, so
664 * it is of no interest to the consolidator. At the other extreme, slabs whose
665 * objects have all been freed to the slab are released to the virtual memory
666 * (VM) subsystem (objects freed to magazines are still allocated as far as the
667 * slab is concerned). External fragmentation exists when there are slabs
668 * somewhere between these extremes. A partial slab has at least one but not all
669 * of its objects allocated. The more partial slabs, and the fewer allocated
670 * objects on each of them, the higher the fragmentation. Hence the
671 * consolidator's overall strategy is to reduce the number of partial slabs by
672 * moving allocated objects from the least allocated slabs to the most allocated
673 * slabs.
674 *
675 * Partial slabs are kept in an AVL tree ordered by usage. Completely allocated
676 * slabs are kept separately in an unordered list. Since the majority of slabs
677 * tend to be completely allocated (a typical unfragmented cache may have
678 * thousands of complete slabs and only a single partial slab), separating
679 * complete slabs improves the efficiency of partial slab ordering, since the
680 * complete slabs do not affect the depth or balance of the AVL tree. This
681 * ordered sequence of partial slabs acts as a "free list" supplying objects for
682 * allocation requests.
683 *
684 * Objects are always allocated from the first partial slab in the free list,
685 * where the allocation is most likely to eliminate a partial slab (by
686 * completely allocating it). Conversely, when a single object from a completely
687 * allocated slab is freed to the slab, that slab is added to the front of the
688 * free list. Since most free list activity involves highly allocated slabs
689 * coming and going at the front of the list, slabs tend naturally toward the
690 * ideal order: highly allocated at the front, sparsely allocated at the back.
691 * Slabs with few allocated objects are likely to become completely free if they
692 * keep a safe distance away from the front of the free list. Slab misorders
693 * interfere with the natural tendency of slabs to become completely free or
694 * completely allocated. For example, a slab with a single allocated object
695 * needs only a single free to escape the cache; its natural desire is
696 * frustrated when it finds itself at the front of the list where a second
697 * allocation happens just before the free could have released it. Another slab
698 * with all but one object allocated might have supplied the buffer instead, so
699 * that both (as opposed to neither) of the slabs would have been taken off the
700 * free list.
701 *
702 * Although slabs tend naturally toward the ideal order, misorders allowed by a
703 * simple list implementation defeat the consolidator's strategy of merging
704 * least- and most-allocated slabs. Without an AVL tree to guarantee order, kmem
705 * needs another way to fix misorders to optimize its callback strategy. One
706 * approach is to periodically scan a limited number of slabs, advancing a
707 * marker to hold the current scan position, and to move extreme misorders to
708 * the front or back of the free list and to the front or back of the current
709 * scan range. By making consecutive scan ranges overlap by one slab, the least
710 * allocated slab in the current range can be carried along from the end of one
711 * scan to the start of the next.
712 *
713 * Maintaining partial slabs in an AVL tree relieves kmem of this additional
714 * task, however. Since most of the cache's activity is in the magazine layer,
715 * and allocations from the slab layer represent only a startup cost, the
716 * overhead of maintaining a balanced tree is not a significant concern compared
717 * to the opportunity of reducing complexity by eliminating the partial slab
718 * scanner just described. The overhead of an AVL tree is minimized by
719 * maintaining only partial slabs in the tree and keeping completely allocated
720 * slabs separately in a list. To avoid increasing the size of the slab
721 * structure the AVL linkage pointers are reused for the slab's list linkage,
722 * since the slab will always be either partial or complete, never stored both
723 * ways at the same time. To further minimize the overhead of the AVL tree the
724 * compare function that orders partial slabs by usage divides the range of
725 * allocated object counts into bins such that counts within the same bin are
726 * considered equal. Binning partial slabs makes it less likely that allocating
727 * or freeing a single object will change the slab's order, requiring a tree
728 * reinsertion (an avl_remove() followed by an avl_add(), both potentially
729 * requiring some rebalancing of the tree). Allocation counts closest to
730 * completely free and completely allocated are left unbinned (finely sorted) to
731 * better support the consolidator's strategy of merging slabs at either
732 * extreme.
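 *
 * A simplified sketch of such a binned compare function (it ignores the
 * finely sorted extremes and the special treatment of slabs marked
 * non-reclaimable, both described elsewhere in this statement; the bin width
 * shown is arbitrary):
 *
 *	// More-allocated slabs sort toward the front of the free list.
 *	static int
 *	partial_slab_cmp_sketch(const void *a0, const void *a1)
 *	{
 *		const kmem_slab_t *s0 = a0;
 *		const kmem_slab_t *s1 = a1;
 *		size_t w0 = s0->slab_refcnt / 8;	// bin by allocated count
 *		size_t w1 = s1->slab_refcnt / 8;
 *
 *		if (w0 > w1)
 *			return (-1);
 *		if (w0 < w1)
 *			return (1);
 *		// Equal bins: fall back to address so AVL keys stay unique.
 *		if ((uintptr_t)s0 < (uintptr_t)s1)
 *			return (-1);
 *		return ((uintptr_t)s0 > (uintptr_t)s1);
 *	}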
733 *
734 * 3.1 Assessing Fragmentation and Selecting Candidate Slabs
735 *
736 * The consolidator piggybacks on the kmem maintenance thread and is called on
737 * the same interval as kmem_cache_update(), once per cache every fifteen
738 * seconds. kmem maintains a running count of unallocated objects in the slab
739 * layer (cache_bufslab). The consolidator checks whether that number exceeds
740 * 12.5% (1/8) of the total objects in the cache (cache_buftotal), and whether
741 * there is a significant number of slabs in the cache (arbitrarily a minimum
742 * 101 total slabs). Unused objects that have fallen out of the magazine layer's
743 * working set are included in the assessment, and magazines in the depot are
744 * reaped if those objects would lift cache_bufslab above the fragmentation
745 * threshold. Once the consolidator decides that a cache is fragmented, it looks
746 * for a candidate slab to reclaim, starting at the end of the partial slab free
747 * list and scanning backwards. At first the consolidator is choosy: only a slab
748 * with fewer than 12.5% (1/8) of its objects allocated qualifies (or else a
749 * single allocated object, regardless of percentage). If there is difficulty
750 * finding a candidate slab, kmem raises the allocation threshold incrementally,
751 * up to a maximum 87.5% (7/8), so that eventually the consolidator will reduce
752 * external fragmentation (unused objects on the free list) below 12.5% (1/8),
753 * even in the worst case of every slab in the cache being almost 7/8 allocated.
754 * The threshold can also be lowered incrementally when candidate slabs are easy
755 * to find, and the threshold is reset to the minimum 1/8 as soon as the cache
756 * is no longer fragmented.
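 *
 * In terms of the consolidator tunables declared later in this file, the
 * basic fragmentation test amounts to something like the following sketch
 * (simplified; it ignores the depot-reap adjustment described above):
 *
 *	nslabs = cp->cache_slab_create - cp->cache_slab_destroy;
 *	fragmented = (nslabs >= kmem_frag_minslabs &&
 *	    cp->cache_bufslab * kmem_frag_denom >
 *	    cp->cache_buftotal * kmem_frag_numer);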
757 *
758 * 3.2 Generating Callbacks
759 *
760 * Once an eligible slab is chosen, a callback is generated for every allocated
761 * object on the slab, in the hope that the client will move everything off the
762 * slab and make it reclaimable. Objects selected as move destinations are
763 * chosen from slabs at the front of the free list. Assuming slabs in the ideal
764 * order (most allocated at the front, least allocated at the back) and a
765 * cooperative client, the consolidator will succeed in removing slabs from both
766 * ends of the free list, completely allocating on the one hand and completely
767 * freeing on the other. Objects selected as move destinations are allocated in
768 * the kmem maintenance thread where move requests are enqueued. A separate
769 * callback thread removes pending callbacks from the queue and calls the
770 * client. The separate thread ensures that client code (the move function) does
771 * not interfere with internal kmem maintenance tasks. A map of pending
772 * callbacks keyed by object address (the object to be moved) is checked to
773 * ensure that duplicate callbacks are not generated for the same object.
774 * Allocating the move destination (the object to move to) prevents subsequent
775 * callbacks from selecting the same destination as an earlier pending callback.
776 *
777 * Move requests can also be generated by kmem_cache_reap() when the system is
778 * desperate for memory and by kmem_cache_move_notify(), called by the client to
779 * notify kmem that a move refused earlier with KMEM_CBRC_LATER is now possible.
780 * The map of pending callbacks is protected by the same lock that protects the
781 * slab layer.
782 *
783 * When the system is desperate for memory, kmem does not bother to determine
784 * whether or not the cache exceeds the fragmentation threshold, but tries to
785 * consolidate as many slabs as possible. Normally, the consolidator chews
786 * slowly, one sparsely allocated slab at a time during each maintenance
787 * interval that the cache is fragmented. When desperate, the consolidator
788 * starts at the last partial slab and enqueues callbacks for every allocated
789 * object on every partial slab, working backwards until it reaches the first
790 * partial slab. The first partial slab, meanwhile, advances in pace with the
791 * consolidator as allocations to supply move destinations for the enqueued
792 * callbacks use up the highly allocated slabs at the front of the free list.
793 * Ideally, the overgrown free list collapses like an accordion, starting at
794 * both ends and ending at the center with a single partial slab.
795 *
796 * 3.3 Client Responses
797 *
798 * When the client returns KMEM_CBRC_NO in response to the move callback, kmem
799  * marks the slab that supplied the stuck object non-reclaimable and moves it to
800  * the front of the free list. The slab remains marked as long as it remains on the
801 * free list, and it appears more allocated to the partial slab compare function
802 * than any unmarked slab, no matter how many of its objects are allocated.
803 * Since even one immovable object ties up the entire slab, the goal is to
804 * completely allocate any slab that cannot be completely freed. kmem does not
805 * bother generating callbacks to move objects from a marked slab unless the
806 * system is desperate.
807 *
808 * When the client responds KMEM_CBRC_LATER, kmem increments a count for the
809 * slab. If the client responds LATER too many times, kmem disbelieves and
810 * treats the response as a NO. The count is cleared when the slab is taken off
811 * the partial slab list or when the client moves one of the slab's objects.
812 *
813 * 4. Observability
814 *
815 * A kmem cache's external fragmentation is best observed with 'mdb -k' using
816 * the ::kmem_slabs dcmd. For a complete description of the command, enter
817 * '::help kmem_slabs' at the mdb prompt.
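 *
 * For example, at the mdb prompt:
 *
 *	> ::kmem_slabs
 *
 * summarizes slab usage for every kmem cache; '::help kmem_slabs' describes
 * the available options.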
818 */
819
820 #include <sys/kmem_impl.h>
821 #include <sys/vmem_impl.h>
822 #include <sys/param.h>
823 #include <sys/sysmacros.h>
824 #include <sys/vm.h>
825 #include <sys/proc.h>
826 #include <sys/tuneable.h>
827 #include <sys/systm.h>
828 #include <sys/cmn_err.h>
829 #include <sys/debug.h>
830 #include <sys/sdt.h>
831 #include <sys/mutex.h>
832 #include <sys/bitmap.h>
833 #include <sys/atomic.h>
834 #include <sys/kobj.h>
835 #include <sys/disp.h>
836 #include <vm/seg_kmem.h>
837 #include <sys/log.h>
838 #include <sys/callb.h>
839 #include <sys/taskq.h>
840 #include <sys/modctl.h>
841 #include <sys/reboot.h>
842 #include <sys/id32.h>
843 #include <sys/zone.h>
844 #include <sys/netstack.h>
845 #ifdef DEBUG
846 #include <sys/random.h>
847 #endif
848
849 extern void streams_msg_init(void);
850 extern int segkp_fromheap;
851 extern void segkp_cache_free(void);
852 extern int callout_init_done;
853
854 struct kmem_cache_kstat {
855 kstat_named_t kmc_buf_size;
856 kstat_named_t kmc_align;
857 kstat_named_t kmc_chunk_size;
858 kstat_named_t kmc_slab_size;
859 kstat_named_t kmc_alloc;
860 kstat_named_t kmc_alloc_fail;
861 kstat_named_t kmc_free;
862 kstat_named_t kmc_depot_alloc;
863 kstat_named_t kmc_depot_free;
864 kstat_named_t kmc_depot_contention;
865 kstat_named_t kmc_slab_alloc;
866 kstat_named_t kmc_slab_free;
867 kstat_named_t kmc_buf_constructed;
868 kstat_named_t kmc_buf_avail;
869 kstat_named_t kmc_buf_inuse;
870 kstat_named_t kmc_buf_total;
871 kstat_named_t kmc_buf_max;
872 kstat_named_t kmc_slab_create;
873 kstat_named_t kmc_slab_destroy;
874 kstat_named_t kmc_vmem_source;
875 kstat_named_t kmc_hash_size;
876 kstat_named_t kmc_hash_lookup_depth;
877 kstat_named_t kmc_hash_rescale;
878 kstat_named_t kmc_full_magazines;
879 kstat_named_t kmc_empty_magazines;
880 kstat_named_t kmc_magazine_size;
881 kstat_named_t kmc_reap; /* number of kmem_cache_reap() calls */
882 kstat_named_t kmc_defrag; /* attempts to defrag all partial slabs */
883 kstat_named_t kmc_scan; /* attempts to defrag one partial slab */
884 kstat_named_t kmc_move_callbacks; /* sum of yes, no, later, dn, dk */
885 kstat_named_t kmc_move_yes;
886 kstat_named_t kmc_move_no;
887 kstat_named_t kmc_move_later;
888 kstat_named_t kmc_move_dont_need;
889 kstat_named_t kmc_move_dont_know; /* obj unrecognized by client ... */
890 kstat_named_t kmc_move_hunt_found; /* ... but found in mag layer */
891 kstat_named_t kmc_move_slabs_freed; /* slabs freed by consolidator */
892 kstat_named_t kmc_move_reclaimable; /* buffers, if consolidator ran */
893 } kmem_cache_kstat = {
894 { "buf_size", KSTAT_DATA_UINT64 },
895 { "align", KSTAT_DATA_UINT64 },
896 { "chunk_size", KSTAT_DATA_UINT64 },
897 { "slab_size", KSTAT_DATA_UINT64 },
898 { "alloc", KSTAT_DATA_UINT64 },
899 { "alloc_fail", KSTAT_DATA_UINT64 },
900 { "free", KSTAT_DATA_UINT64 },
901 { "depot_alloc", KSTAT_DATA_UINT64 },
902 { "depot_free", KSTAT_DATA_UINT64 },
903 { "depot_contention", KSTAT_DATA_UINT64 },
904 { "slab_alloc", KSTAT_DATA_UINT64 },
905 { "slab_free", KSTAT_DATA_UINT64 },
906 { "buf_constructed", KSTAT_DATA_UINT64 },
907 { "buf_avail", KSTAT_DATA_UINT64 },
908 { "buf_inuse", KSTAT_DATA_UINT64 },
909 { "buf_total", KSTAT_DATA_UINT64 },
910 { "buf_max", KSTAT_DATA_UINT64 },
911 { "slab_create", KSTAT_DATA_UINT64 },
912 { "slab_destroy", KSTAT_DATA_UINT64 },
913 { "vmem_source", KSTAT_DATA_UINT64 },
914 { "hash_size", KSTAT_DATA_UINT64 },
915 { "hash_lookup_depth", KSTAT_DATA_UINT64 },
916 { "hash_rescale", KSTAT_DATA_UINT64 },
917 { "full_magazines", KSTAT_DATA_UINT64 },
918 { "empty_magazines", KSTAT_DATA_UINT64 },
919 { "magazine_size", KSTAT_DATA_UINT64 },
920 { "reap", KSTAT_DATA_UINT64 },
921 { "defrag", KSTAT_DATA_UINT64 },
922 { "scan", KSTAT_DATA_UINT64 },
923 { "move_callbacks", KSTAT_DATA_UINT64 },
924 { "move_yes", KSTAT_DATA_UINT64 },
925 { "move_no", KSTAT_DATA_UINT64 },
926 { "move_later", KSTAT_DATA_UINT64 },
927 { "move_dont_need", KSTAT_DATA_UINT64 },
928 { "move_dont_know", KSTAT_DATA_UINT64 },
929 { "move_hunt_found", KSTAT_DATA_UINT64 },
930 { "move_slabs_freed", KSTAT_DATA_UINT64 },
931 { "move_reclaimable", KSTAT_DATA_UINT64 },
932 };
933
934 static kmutex_t kmem_cache_kstat_lock;
935
936 /*
937 * The default set of caches to back kmem_alloc().
938 * These sizes should be reevaluated periodically.
939 *
940 * We want allocations that are multiples of the coherency granularity
941 * (64 bytes) to be satisfied from a cache which is a multiple of 64
942 * bytes, so that it will be 64-byte aligned. For all multiples of 64,
943 * the next kmem_cache_size greater than or equal to it must be a
944 * multiple of 64.
945 *
946 * We split the table into two sections: size <= 4k and size > 4k. This
947 * saves a lot of space and cache footprint in our cache tables.
948 */
949 static const int kmem_alloc_sizes[] = {
950 1 * 8,
951 2 * 8,
952 3 * 8,
953 4 * 8, 5 * 8, 6 * 8, 7 * 8,
954 4 * 16, 5 * 16, 6 * 16, 7 * 16,
955 4 * 32, 5 * 32, 6 * 32, 7 * 32,
956 4 * 64, 5 * 64, 6 * 64, 7 * 64,
957 4 * 128, 5 * 128, 6 * 128, 7 * 128,
958 P2ALIGN(8192 / 7, 64),
959 P2ALIGN(8192 / 6, 64),
960 P2ALIGN(8192 / 5, 64),
961 P2ALIGN(8192 / 4, 64),
962 P2ALIGN(8192 / 3, 64),
963 P2ALIGN(8192 / 2, 64),
964 };
965
966 static const int kmem_big_alloc_sizes[] = {
967 2 * 4096, 3 * 4096,
968 2 * 8192, 3 * 8192,
969 4 * 8192, 5 * 8192, 6 * 8192, 7 * 8192,
970 8 * 8192, 9 * 8192, 10 * 8192, 11 * 8192,
971 12 * 8192, 13 * 8192, 14 * 8192, 15 * 8192,
972 16 * 8192
973 };
974
975 #define KMEM_MAXBUF 4096
976 #define KMEM_BIG_MAXBUF_32BIT 32768
977 #define KMEM_BIG_MAXBUF 131072
978
979 #define KMEM_BIG_MULTIPLE 4096 /* big_alloc_sizes must be a multiple */
980 #define KMEM_BIG_SHIFT 12 /* lg(KMEM_BIG_MULTIPLE) */
981
982 static kmem_cache_t *kmem_alloc_table[KMEM_MAXBUF >> KMEM_ALIGN_SHIFT];
983 static kmem_cache_t *kmem_big_alloc_table[KMEM_BIG_MAXBUF >> KMEM_BIG_SHIFT];
984
985 #define KMEM_ALLOC_TABLE_MAX (KMEM_MAXBUF >> KMEM_ALIGN_SHIFT)
986 static size_t kmem_big_alloc_table_max = 0; /* # of filled elements */
987
988 static kmem_magtype_t kmem_magtype[] = {
989 { 1, 8, 3200, 65536 },
990 { 3, 16, 256, 32768 },
991 { 7, 32, 64, 16384 },
992 { 15, 64, 0, 8192 },
993 { 31, 64, 0, 4096 },
994 { 47, 64, 0, 2048 },
995 { 63, 64, 0, 1024 },
996 { 95, 64, 0, 512 },
997 { 143, 64, 0, 0 },
998 };
999
1000 static uint32_t kmem_reaping;
1001 static uint32_t kmem_reaping_idspace;
1002
1003 /*
1004 * kmem tunables
1005 */
1006 clock_t kmem_reap_interval; /* cache reaping rate [15 * HZ ticks] */
1007 int kmem_depot_contention = 3; /* max failed tryenters per real interval */
1008 pgcnt_t kmem_reapahead = 0; /* start reaping N pages before pageout */
1009 int kmem_panic = 1; /* whether to panic on error */
1010 int kmem_logging = 1; /* kmem_log_enter() override */
1011 uint32_t kmem_mtbf = 0; /* mean time between failures [default: off] */
1012 size_t kmem_transaction_log_size; /* transaction log size [2% of memory] */
1013 size_t kmem_content_log_size; /* content log size [2% of memory] */
1014 size_t kmem_failure_log_size; /* failure log [4 pages per CPU] */
1015 size_t kmem_slab_log_size; /* slab create log [4 pages per CPU] */
1016 size_t kmem_zerosized_log_size; /* zero-sized log [4 pages per CPU] */
1017 size_t kmem_content_maxsave = 256; /* KMF_CONTENTS max bytes to log */
1018 size_t kmem_lite_minsize = 0; /* minimum buffer size for KMF_LITE */
1019 size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */
1020 int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */
1021 size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */
1022 size_t kmem_minfirewall; /* hardware-enforced redzone threshold */
1023
1024 #ifdef DEBUG
1025 int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */
1026 #else
1027 int kmem_warn_zerosized = 0; /* whether to warn on zero-sized KM_SLEEP */
1028 #endif
1029
1030 int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */
1031
1032 #ifdef _LP64
1033 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */
1034 #else
1035 size_t kmem_max_cached = KMEM_BIG_MAXBUF_32BIT; /* maximum kmem_alloc cache */
1036 #endif
1037
1038 #ifdef DEBUG
1039 int kmem_flags = KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE | KMF_CONTENTS;
1040 #else
1041 int kmem_flags = 0;
1042 #endif
1043 int kmem_ready;
1044
1045 static kmem_cache_t *kmem_slab_cache;
1046 static kmem_cache_t *kmem_bufctl_cache;
1047 static kmem_cache_t *kmem_bufctl_audit_cache;
1048
1049 static kmutex_t kmem_cache_lock; /* inter-cache linkage only */
1050 static list_t kmem_caches;
1051
1052 static taskq_t *kmem_taskq;
1053 static kmutex_t kmem_flags_lock;
1054 static vmem_t *kmem_metadata_arena;
1055 static vmem_t *kmem_msb_arena; /* arena for metadata caches */
1056 static vmem_t *kmem_cache_arena;
1057 static vmem_t *kmem_hash_arena;
1058 static vmem_t *kmem_log_arena;
1059 static vmem_t *kmem_oversize_arena;
1060 static vmem_t *kmem_va_arena;
1061 static vmem_t *kmem_default_arena;
1062 static vmem_t *kmem_firewall_va_arena;
1063 static vmem_t *kmem_firewall_arena;
1064
1065 static int kmem_zerosized; /* # of zero-sized allocs */
1066
1067 /*
1068 * kmem slab consolidator thresholds (tunables)
1069 */
1070 size_t kmem_frag_minslabs = 101; /* minimum total slabs */
1071 size_t kmem_frag_numer = 1; /* free buffers (numerator) */
1072 size_t kmem_frag_denom = KMEM_VOID_FRACTION; /* buffers (denominator) */
1073 /*
1074 * Maximum number of slabs from which to move buffers during a single
1075 * maintenance interval while the system is not low on memory.
1076 */
1077 size_t kmem_reclaim_max_slabs = 1;
1078 /*
1079 * Number of slabs to scan backwards from the end of the partial slab list
1080 * when searching for buffers to relocate.
1081 */
1082 size_t kmem_reclaim_scan_range = 12;
1083
1084 /* consolidator knobs */
1085 boolean_t kmem_move_noreap;
1086 boolean_t kmem_move_blocked;
1087 boolean_t kmem_move_fulltilt;
1088 boolean_t kmem_move_any_partial;
1089
1090 #ifdef DEBUG
1091 /*
1092 * kmem consolidator debug tunables:
1093 * Ensure code coverage by occasionally running the consolidator even when the
1094 * caches are not fragmented (they may never be). These intervals are mean time
1095 * in cache maintenance intervals (kmem_cache_update).
1096 */
1097 uint32_t kmem_mtb_move = 60; /* defrag 1 slab (~15min) */
1098 uint32_t kmem_mtb_reap = 1800; /* defrag all slabs (~7.5hrs) */
1099 #endif /* DEBUG */
1100
1101 static kmem_cache_t *kmem_defrag_cache;
1102 static kmem_cache_t *kmem_move_cache;
1103 static taskq_t *kmem_move_taskq;
1104
1105 static void kmem_cache_scan(kmem_cache_t *);
1106 static void kmem_cache_defrag(kmem_cache_t *);
1107 static void kmem_slab_prefill(kmem_cache_t *, kmem_slab_t *);
1108
1109
1110 kmem_log_header_t *kmem_transaction_log;
1111 kmem_log_header_t *kmem_content_log;
1112 kmem_log_header_t *kmem_failure_log;
1113 kmem_log_header_t *kmem_slab_log;
1114 kmem_log_header_t *kmem_zerosized_log;
1115
1116 static int kmem_lite_count; /* # of PCs in kmem_buftag_lite_t */
1117
1118 #define KMEM_BUFTAG_LITE_ENTER(bt, count, caller) \
1119 if ((count) > 0) { \
1120 pc_t *_s = ((kmem_buftag_lite_t *)(bt))->bt_history; \
1121 pc_t *_e; \
1122 /* memmove() the old entries down one notch */ \
1123 for (_e = &_s[(count) - 1]; _e > _s; _e--) \
1124 *_e = *(_e - 1); \
1125 *_s = (uintptr_t)(caller); \
1126 }
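/*
 * KMEM_BUFTAG_LITE_ENTER() keeps a small, newest-first history of the program
 * counters that have touched a buffer in KMF_LITE mode: the existing entries
 * are shifted down one slot and the current caller is recorded in slot 0.
 * Illustrative invocation, as used by the allocation and free paths below:
 *
 *	KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller());
 */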
1127
1128 #define KMERR_MODIFIED 0 /* buffer modified while on freelist */
1129 #define KMERR_REDZONE 1 /* redzone violation (write past end of buf) */
1130 #define KMERR_DUPFREE 2 /* freed a buffer twice */
1131 #define KMERR_BADADDR 3 /* freed a bad (unallocated) address */
1132 #define KMERR_BADBUFTAG 4 /* buftag corrupted */
1133 #define KMERR_BADBUFCTL 5 /* bufctl corrupted */
1134 #define KMERR_BADCACHE 6 /* freed a buffer to the wrong cache */
1135 #define KMERR_BADSIZE 7 /* alloc size != free size */
1136 #define KMERR_BADBASE 8 /* buffer base address wrong */
1137
1138 struct {
1139 hrtime_t kmp_timestamp; /* timestamp of panic */
1140 int kmp_error; /* type of kmem error */
1141 void *kmp_buffer; /* buffer that induced panic */
1142 void *kmp_realbuf; /* real start address for buffer */
1143 kmem_cache_t *kmp_cache; /* buffer's cache according to client */
1144 kmem_cache_t *kmp_realcache; /* actual cache containing buffer */
1145 	kmem_slab_t	*kmp_slab;	/* slab according to kmem_findslab() */
1146 kmem_bufctl_t *kmp_bufctl; /* bufctl */
1147 } kmem_panic_info;
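/*
 * kmem_error() fills in kmem_panic_info before panicking (or dropping into
 * the debugger), so that the details of the failure (the offending buffer,
 * its actual cache and slab, and the error classification above) remain
 * available for post-mortem inspection of the crash dump.
 */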
1148
1149
1150 static void
1151 copy_pattern(uint64_t pattern, void *buf_arg, size_t size)
1152 {
1153 uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
1154 uint64_t *buf = buf_arg;
1155
1156 while (buf < bufend)
1157 *buf++ = pattern;
1158 }
1159
1160 static void *
1161 verify_pattern(uint64_t pattern, void *buf_arg, size_t size)
1162 {
1163 uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
1164 uint64_t *buf;
1165
1166 for (buf = buf_arg; buf < bufend; buf++)
1167 if (*buf != pattern)
1168 return (buf);
1169 return (NULL);
1170 }
1171
1172 static void *
1173 verify_and_copy_pattern(uint64_t old, uint64_t new, void *buf_arg, size_t size)
1174 {
1175 uint64_t *bufend = (uint64_t *)((char *)buf_arg + size);
1176 uint64_t *buf;
1177
1178 for (buf = buf_arg; buf < bufend; buf++) {
1179 if (*buf != old) {
1180 copy_pattern(old, buf_arg,
1181 (char *)buf - (char *)buf_arg);
1182 return (buf);
1183 }
1184 *buf = new;
1185 }
1186
1187 return (NULL);
1188 }
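/*
 * The pattern helpers above implement the debugging memory patterns: a freed
 * buffer is filled with KMEM_FREE_PATTERN, and verify_and_copy_pattern()
 * checks that pattern word by word while rewriting the buffer with
 * KMEM_UNINITIALIZED_PATTERN on allocation. If any word was modified while
 * the buffer sat on the freelist, the free pattern is restored over the
 * already-rewritten prefix and the address of the offending word is returned
 * so that kmem_error() can report the exact offset of the corruption.
 */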
1189
1190 static void
1191 kmem_cache_applyall(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
1192 {
1193 kmem_cache_t *cp;
1194
1195 mutex_enter(&kmem_cache_lock);
1196 for (cp = list_head(&kmem_caches); cp != NULL;
1197 cp = list_next(&kmem_caches, cp))
1198 if (tq != NULL)
1199 (void) taskq_dispatch(tq, (task_func_t *)func, cp,
1200 tqflag);
1201 else
1202 func(cp);
1203 mutex_exit(&kmem_cache_lock);
1204 }
1205
1206 static void
1207 kmem_cache_applyall_id(void (*func)(kmem_cache_t *), taskq_t *tq, int tqflag)
1208 {
1209 kmem_cache_t *cp;
1210
1211 mutex_enter(&kmem_cache_lock);
1212 for (cp = list_head(&kmem_caches); cp != NULL;
1213 cp = list_next(&kmem_caches, cp)) {
1214 if (!(cp->cache_cflags & KMC_IDENTIFIER))
1215 continue;
1216 if (tq != NULL)
1217 (void) taskq_dispatch(tq, (task_func_t *)func, cp,
1218 tqflag);
1219 else
1220 func(cp);
1221 }
1222 mutex_exit(&kmem_cache_lock);
1223 }
1224
1225 /*
1226 * Debugging support. Given a buffer address, find its slab.
1227 */
1228 static kmem_slab_t *
1229 kmem_findslab(kmem_cache_t *cp, void *buf)
1230 {
1231 kmem_slab_t *sp;
1232
1233 mutex_enter(&cp->cache_lock);
1234 for (sp = list_head(&cp->cache_complete_slabs); sp != NULL;
1235 sp = list_next(&cp->cache_complete_slabs, sp)) {
1236 if (KMEM_SLAB_MEMBER(sp, buf)) {
1237 mutex_exit(&cp->cache_lock);
1238 return (sp);
1239 }
1240 }
1241 for (sp = avl_first(&cp->cache_partial_slabs); sp != NULL;
1242 sp = AVL_NEXT(&cp->cache_partial_slabs, sp)) {
1243 if (KMEM_SLAB_MEMBER(sp, buf)) {
1244 mutex_exit(&cp->cache_lock);
1245 return (sp);
1246 }
1247 }
1248 mutex_exit(&cp->cache_lock);
1249
1250 return (NULL);
1251 }
1252
1253 static void
1254 kmem_error(int error, kmem_cache_t *cparg, void *bufarg)
1255 {
1256 kmem_buftag_t *btp = NULL;
1257 kmem_bufctl_t *bcp = NULL;
1258 kmem_cache_t *cp = cparg;
1259 kmem_slab_t *sp;
1260 uint64_t *off;
1261 void *buf = bufarg;
1262
1263 kmem_logging = 0; /* stop logging when a bad thing happens */
1264
1265 kmem_panic_info.kmp_timestamp = gethrtime();
1266
1267 sp = kmem_findslab(cp, buf);
1268 if (sp == NULL) {
1269 for (cp = list_tail(&kmem_caches); cp != NULL;
1270 cp = list_prev(&kmem_caches, cp)) {
1271 if ((sp = kmem_findslab(cp, buf)) != NULL)
1272 break;
1273 }
1274 }
1275
1276 if (sp == NULL) {
1277 cp = NULL;
1278 error = KMERR_BADADDR;
1279 } else {
1280 if (cp != cparg)
1281 error = KMERR_BADCACHE;
1282 else
1283 buf = (char *)bufarg - ((uintptr_t)bufarg -
1284 (uintptr_t)sp->slab_base) % cp->cache_chunksize;
1285 if (buf != bufarg)
1286 error = KMERR_BADBASE;
1287 if (cp->cache_flags & KMF_BUFTAG)
1288 btp = KMEM_BUFTAG(cp, buf);
1289 if (cp->cache_flags & KMF_HASH) {
1290 mutex_enter(&cp->cache_lock);
1291 for (bcp = *KMEM_HASH(cp, buf); bcp; bcp = bcp->bc_next)
1292 if (bcp->bc_addr == buf)
1293 break;
1294 mutex_exit(&cp->cache_lock);
1295 if (bcp == NULL && btp != NULL)
1296 bcp = btp->bt_bufctl;
1297 if (kmem_findslab(cp->cache_bufctl_cache, bcp) ==
1298 NULL || P2PHASE((uintptr_t)bcp, KMEM_ALIGN) ||
1299 bcp->bc_addr != buf) {
1300 error = KMERR_BADBUFCTL;
1301 bcp = NULL;
1302 }
1303 }
1304 }
1305
1306 kmem_panic_info.kmp_error = error;
1307 kmem_panic_info.kmp_buffer = bufarg;
1308 kmem_panic_info.kmp_realbuf = buf;
1309 kmem_panic_info.kmp_cache = cparg;
1310 kmem_panic_info.kmp_realcache = cp;
1311 kmem_panic_info.kmp_slab = sp;
1312 kmem_panic_info.kmp_bufctl = bcp;
1313
1314 printf("kernel memory allocator: ");
1315
1316 switch (error) {
1317
1318 case KMERR_MODIFIED:
1319 printf("buffer modified after being freed\n");
1320 off = verify_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
1321 if (off == NULL) /* shouldn't happen */
1322 off = buf;
1323 printf("modification occurred at offset 0x%lx "
1324 "(0x%llx replaced by 0x%llx)\n",
1325 (uintptr_t)off - (uintptr_t)buf,
1326 (longlong_t)KMEM_FREE_PATTERN, (longlong_t)*off);
1327 break;
1328
1329 case KMERR_REDZONE:
1330 printf("redzone violation: write past end of buffer\n");
1331 break;
1332
1333 case KMERR_BADADDR:
1334 printf("invalid free: buffer not in cache\n");
1335 break;
1336
1337 case KMERR_DUPFREE:
1338 printf("duplicate free: buffer freed twice\n");
1339 break;
1340
1341 case KMERR_BADBUFTAG:
1342 printf("boundary tag corrupted\n");
1343 printf("bcp ^ bxstat = %lx, should be %lx\n",
1344 (intptr_t)btp->bt_bufctl ^ btp->bt_bxstat,
1345 KMEM_BUFTAG_FREE);
1346 break;
1347
1348 case KMERR_BADBUFCTL:
1349 printf("bufctl corrupted\n");
1350 break;
1351
1352 case KMERR_BADCACHE:
1353 printf("buffer freed to wrong cache\n");
1354 printf("buffer was allocated from %s,\n", cp->cache_name);
1355 printf("caller attempting free to %s.\n", cparg->cache_name);
1356 break;
1357
1358 case KMERR_BADSIZE:
1359 printf("bad free: free size (%u) != alloc size (%u)\n",
1360 KMEM_SIZE_DECODE(((uint32_t *)btp)[0]),
1361 KMEM_SIZE_DECODE(((uint32_t *)btp)[1]));
1362 break;
1363
1364 case KMERR_BADBASE:
1365 printf("bad free: free address (%p) != alloc address (%p)\n",
1366 bufarg, buf);
1367 break;
1368 }
1369
1370 printf("buffer=%p bufctl=%p cache: %s\n",
1371 bufarg, (void *)bcp, cparg->cache_name);
1372
1373 if (bcp != NULL && (cp->cache_flags & KMF_AUDIT) &&
1374 error != KMERR_BADBUFCTL) {
1375 int d;
1376 timestruc_t ts;
1377 kmem_bufctl_audit_t *bcap = (kmem_bufctl_audit_t *)bcp;
1378
1379 hrt2ts(kmem_panic_info.kmp_timestamp - bcap->bc_timestamp, &ts);
1380 printf("previous transaction on buffer %p:\n", buf);
1381 printf("thread=%p time=T-%ld.%09ld slab=%p cache: %s\n",
1382 (void *)bcap->bc_thread, ts.tv_sec, ts.tv_nsec,
1383 (void *)sp, cp->cache_name);
1384 for (d = 0; d < MIN(bcap->bc_depth, KMEM_STACK_DEPTH); d++) {
1385 ulong_t off;
1386 char *sym = kobj_getsymname(bcap->bc_stack[d], &off);
1387 printf("%s+%lx\n", sym ? sym : "?", off);
1388 }
1389 }
1390 if (kmem_panic > 0)
1391 panic("kernel heap corruption detected");
1392 if (kmem_panic == 0)
1393 debug_enter(NULL);
1394 kmem_logging = 1; /* resume logging */
1395 }
1396
1397 static kmem_log_header_t *
1398 kmem_log_init(size_t logsize)
1399 {
1400 kmem_log_header_t *lhp;
1401 int nchunks = 4 * max_ncpus;
1402 size_t lhsize = (size_t)&((kmem_log_header_t *)0)->lh_cpu[max_ncpus];
1403 int i;
1404
1405 /*
1406 * Make sure that lhp->lh_cpu[] is nicely aligned
1407 * to prevent false sharing of cache lines.
1408 */
1409 lhsize = P2ROUNDUP(lhsize, KMEM_ALIGN);
1410 lhp = vmem_xalloc(kmem_log_arena, lhsize, 64, P2NPHASE(lhsize, 64), 0,
1411 NULL, NULL, VM_SLEEP);
1412 bzero(lhp, lhsize);
1413
1414 mutex_init(&lhp->lh_lock, NULL, MUTEX_DEFAULT, NULL);
1415 lhp->lh_nchunks = nchunks;
1416 lhp->lh_chunksize = P2ROUNDUP(logsize / nchunks + 1, PAGESIZE);
1417 lhp->lh_base = vmem_alloc(kmem_log_arena,
1418 lhp->lh_chunksize * nchunks, VM_SLEEP);
1419 lhp->lh_free = vmem_alloc(kmem_log_arena,
1420 nchunks * sizeof (int), VM_SLEEP);
1421 bzero(lhp->lh_base, lhp->lh_chunksize * nchunks);
1422
1423 for (i = 0; i < max_ncpus; i++) {
1424 kmem_cpu_log_header_t *clhp = &lhp->lh_cpu[i];
1425 mutex_init(&clhp->clh_lock, NULL, MUTEX_DEFAULT, NULL);
1426 clhp->clh_chunk = i;
1427 }
1428
1429 for (i = max_ncpus; i < nchunks; i++)
1430 lhp->lh_free[i] = i;
1431
1432 lhp->lh_head = max_ncpus;
1433 lhp->lh_tail = 0;
1434
1435 return (lhp);
1436 }
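/*
 * Layout produced by kmem_log_init(): the log is divided into 4 * max_ncpus
 * fixed-size chunks, each at least a page. Every CPU starts out owning one
 * chunk (clh_chunk == cpu_seqid); the remaining chunks sit on the circular
 * lh_free[] list between lh_head and lh_tail and are handed out as CPUs fill
 * their current chunks in kmem_log_enter() below.
 */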
1437
1438 static void *
1439 kmem_log_enter(kmem_log_header_t *lhp, void *data, size_t size)
1440 {
1441 void *logspace;
1442 kmem_cpu_log_header_t *clhp;
1443
1444 if (lhp == NULL || kmem_logging == 0 || panicstr)
1445 return (NULL);
1446
1447 clhp = &lhp->lh_cpu[CPU->cpu_seqid];
1448
1449 mutex_enter(&clhp->clh_lock);
1450 clhp->clh_hits++;
1451 if (size > clhp->clh_avail) {
1452 mutex_enter(&lhp->lh_lock);
1453 lhp->lh_hits++;
1454 lhp->lh_free[lhp->lh_tail] = clhp->clh_chunk;
1455 lhp->lh_tail = (lhp->lh_tail + 1) % lhp->lh_nchunks;
1456 clhp->clh_chunk = lhp->lh_free[lhp->lh_head];
1457 lhp->lh_head = (lhp->lh_head + 1) % lhp->lh_nchunks;
1458 clhp->clh_current = lhp->lh_base +
1459 clhp->clh_chunk * lhp->lh_chunksize;
1460 clhp->clh_avail = lhp->lh_chunksize;
1461 if (size > lhp->lh_chunksize)
1462 size = lhp->lh_chunksize;
1463 mutex_exit(&lhp->lh_lock);
1464 }
1465 logspace = clhp->clh_current;
1466 clhp->clh_current += size;
1467 clhp->clh_avail -= size;
1468 bcopy(data, logspace, size);
1469 mutex_exit(&clhp->clh_lock);
1470 return (logspace);
1471 }
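/*
 * Chunk rotation in kmem_log_enter(): when a record does not fit in the
 * CPU's current chunk, the full chunk is pushed onto the tail of the free
 * list and a replacement is popped from the head, so the least recently
 * filled chunks are recycled first and the log always retains the most
 * recent transactions. A record larger than a chunk is truncated to the
 * chunk size. The fast path takes only the per-CPU clh_lock; lh_lock is
 * needed only for the chunk exchange.
 */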
1472
1473 #define KMEM_AUDIT(lp, cp, bcp) \
1474 { \
1475 kmem_bufctl_audit_t *_bcp = (kmem_bufctl_audit_t *)(bcp); \
1476 _bcp->bc_timestamp = gethrtime(); \
1477 _bcp->bc_thread = curthread; \
1478 _bcp->bc_depth = getpcstack(_bcp->bc_stack, KMEM_STACK_DEPTH); \
1479 _bcp->bc_lastlog = kmem_log_enter((lp), _bcp, sizeof (*_bcp)); \
1480 }
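/*
 * Each audited transaction recorded by KMEM_AUDIT() captures the current
 * thread, a high-resolution timestamp, and a kernel stack trace in the
 * buffer's kmem_bufctl_audit_t, and also copies the whole record into the
 * given log so that the history survives reuse of the bufctl itself.
 */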
1481
1482 static void
1483 kmem_log_event(kmem_log_header_t *lp, kmem_cache_t *cp,
1484 kmem_slab_t *sp, void *addr)
1485 {
1486 kmem_bufctl_audit_t bca;
1487
1488 bzero(&bca, sizeof (kmem_bufctl_audit_t));
1489 bca.bc_addr = addr;
1490 bca.bc_slab = sp;
1491 bca.bc_cache = cp;
1492 KMEM_AUDIT(lp, cp, &bca);
1493 }
1494
1495 /*
1496 * Create a new slab for cache cp.
1497 */
1498 static kmem_slab_t *
1499 kmem_slab_create(kmem_cache_t *cp, int kmflag)
1500 {
1501 size_t slabsize = cp->cache_slabsize;
1502 size_t chunksize = cp->cache_chunksize;
1503 int cache_flags = cp->cache_flags;
1504 size_t color, chunks;
1505 char *buf, *slab;
1506 kmem_slab_t *sp;
1507 kmem_bufctl_t *bcp;
1508 vmem_t *vmp = cp->cache_arena;
1509
1510 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
1511
1512 color = cp->cache_color + cp->cache_align;
1513 if (color > cp->cache_maxcolor)
1514 color = cp->cache_mincolor;
1515 cp->cache_color = color;
1516
1517 slab = vmem_alloc(vmp, slabsize, kmflag & KM_VMFLAGS);
1518
1519 if (slab == NULL)
1520 goto vmem_alloc_failure;
1521
1522 ASSERT(P2PHASE((uintptr_t)slab, vmp->vm_quantum) == 0);
1523
1524 /*
1525 * Reverify what was already checked in kmem_cache_set_move(), since the
1526 * consolidator depends (for correctness) on slabs being initialized
1527 * with the 0xbaddcafe memory pattern (setting a low order bit usable by
1528 * clients to distinguish uninitialized memory from known objects).
1529 */
1530 ASSERT((cp->cache_move == NULL) || !(cp->cache_cflags & KMC_NOTOUCH));
1531 if (!(cp->cache_cflags & KMC_NOTOUCH))
1532 copy_pattern(KMEM_UNINITIALIZED_PATTERN, slab, slabsize);
1533
1534 if (cache_flags & KMF_HASH) {
1535 if ((sp = kmem_cache_alloc(kmem_slab_cache, kmflag)) == NULL)
1536 goto slab_alloc_failure;
1537 chunks = (slabsize - color) / chunksize;
1538 } else {
1539 sp = KMEM_SLAB(cp, slab);
1540 chunks = (slabsize - sizeof (kmem_slab_t) - color) / chunksize;
1541 }
1542
1543 sp->slab_cache = cp;
1544 sp->slab_head = NULL;
1545 sp->slab_refcnt = 0;
1546 sp->slab_base = buf = slab + color;
1547 sp->slab_chunks = chunks;
1548 sp->slab_stuck_offset = (uint32_t)-1;
1549 sp->slab_later_count = 0;
1550 sp->slab_flags = 0;
1551
1552 ASSERT(chunks > 0);
1553 while (chunks-- != 0) {
1554 if (cache_flags & KMF_HASH) {
1555 bcp = kmem_cache_alloc(cp->cache_bufctl_cache, kmflag);
1556 if (bcp == NULL)
1557 goto bufctl_alloc_failure;
1558 if (cache_flags & KMF_AUDIT) {
1559 kmem_bufctl_audit_t *bcap =
1560 (kmem_bufctl_audit_t *)bcp;
1561 bzero(bcap, sizeof (kmem_bufctl_audit_t));
1562 bcap->bc_cache = cp;
1563 }
1564 bcp->bc_addr = buf;
1565 bcp->bc_slab = sp;
1566 } else {
1567 bcp = KMEM_BUFCTL(cp, buf);
1568 }
1569 if (cache_flags & KMF_BUFTAG) {
1570 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1571 btp->bt_redzone = KMEM_REDZONE_PATTERN;
1572 btp->bt_bufctl = bcp;
1573 btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
1574 if (cache_flags & KMF_DEADBEEF) {
1575 copy_pattern(KMEM_FREE_PATTERN, buf,
1576 cp->cache_verify);
1577 }
1578 }
1579 bcp->bc_next = sp->slab_head;
1580 sp->slab_head = bcp;
1581 buf += chunksize;
1582 }
1583
1584 kmem_log_event(kmem_slab_log, cp, sp, slab);
1585
1586 return (sp);
1587
1588 bufctl_alloc_failure:
1589
1590 while ((bcp = sp->slab_head) != NULL) {
1591 sp->slab_head = bcp->bc_next;
1592 kmem_cache_free(cp->cache_bufctl_cache, bcp);
1593 }
1594 kmem_cache_free(kmem_slab_cache, sp);
1595
1596 slab_alloc_failure:
1597
1598 vmem_free(vmp, slab, slabsize);
1599
1600 vmem_alloc_failure:
1601
1602 kmem_log_event(kmem_failure_log, cp, NULL, NULL);
1603 atomic_inc_64(&cp->cache_alloc_fail);
1604
1605 return (NULL);
1606 }
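/*
 * Slab layout notes for kmem_slab_create(): for KMF_HASH caches the
 * kmem_slab_t and its bufctls come from kmem_slab_cache and
 * cp->cache_bufctl_cache, so the slab memory itself holds only buffers. For
 * small, non-hashed caches the kmem_slab_t is carved out of the slab itself
 * (hence the sizeof (kmem_slab_t) subtracted from the chunk count above).
 * The per-slab color advances by cache_align on each new slab, staggering
 * slab_base so that buffers from different slabs are spread across hardware
 * cache lines.
 */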
1607
1608 /*
1609 * Destroy a slab.
1610 */
1611 static void
1612 kmem_slab_destroy(kmem_cache_t *cp, kmem_slab_t *sp)
1613 {
1614 vmem_t *vmp = cp->cache_arena;
1615 void *slab = (void *)P2ALIGN((uintptr_t)sp->slab_base, vmp->vm_quantum);
1616
1617 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
1618 ASSERT(sp->slab_refcnt == 0);
1619
1620 if (cp->cache_flags & KMF_HASH) {
1621 kmem_bufctl_t *bcp;
1622 while ((bcp = sp->slab_head) != NULL) {
1623 sp->slab_head = bcp->bc_next;
1624 kmem_cache_free(cp->cache_bufctl_cache, bcp);
1625 }
1626 kmem_cache_free(kmem_slab_cache, sp);
1627 }
1628 vmem_free(vmp, slab, cp->cache_slabsize);
1629 }
1630
1631 static void *
1632 kmem_slab_alloc_impl(kmem_cache_t *cp, kmem_slab_t *sp, boolean_t prefill)
1633 {
1634 kmem_bufctl_t *bcp, **hash_bucket;
1635 void *buf;
1636 boolean_t new_slab = (sp->slab_refcnt == 0);
1637
1638 ASSERT(MUTEX_HELD(&cp->cache_lock));
1639 /*
1640 * kmem_slab_alloc() drops cache_lock when it creates a new slab, so we
1641 * can't ASSERT(avl_is_empty(&cp->cache_partial_slabs)) here when the
1642 * slab is newly created.
1643 */
1644 ASSERT(new_slab || (KMEM_SLAB_IS_PARTIAL(sp) &&
1645 (sp == avl_first(&cp->cache_partial_slabs))));
1646 ASSERT(sp->slab_cache == cp);
1647
1648 cp->cache_slab_alloc++;
1649 cp->cache_bufslab--;
1650 sp->slab_refcnt++;
1651
1652 bcp = sp->slab_head;
1653 sp->slab_head = bcp->bc_next;
1654
1655 if (cp->cache_flags & KMF_HASH) {
1656 /*
1657 * Add buffer to allocated-address hash table.
1658 */
1659 buf = bcp->bc_addr;
1660 hash_bucket = KMEM_HASH(cp, buf);
1661 bcp->bc_next = *hash_bucket;
1662 *hash_bucket = bcp;
1663 if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
1664 KMEM_AUDIT(kmem_transaction_log, cp, bcp);
1665 }
1666 } else {
1667 buf = KMEM_BUF(cp, bcp);
1668 }
1669
1670 ASSERT(KMEM_SLAB_MEMBER(sp, buf));
1671
1672 if (sp->slab_head == NULL) {
1673 ASSERT(KMEM_SLAB_IS_ALL_USED(sp));
1674 if (new_slab) {
1675 ASSERT(sp->slab_chunks == 1);
1676 } else {
1677 ASSERT(sp->slab_chunks > 1); /* the slab was partial */
1678 avl_remove(&cp->cache_partial_slabs, sp);
1679 sp->slab_later_count = 0; /* clear history */
1680 sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
1681 sp->slab_stuck_offset = (uint32_t)-1;
1682 }
1683 list_insert_head(&cp->cache_complete_slabs, sp);
1684 cp->cache_complete_slab_count++;
1685 return (buf);
1686 }
1687
1688 ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
1689 /*
1690 * Peek to see if the magazine layer is enabled before
1691 * we prefill. We're not holding the cpu cache lock,
1692 * so the peek could be wrong, but there's no harm in it.
1693 */
1694 if (new_slab && prefill && (cp->cache_flags & KMF_PREFILL) &&
1695 (KMEM_CPU_CACHE(cp)->cc_magsize != 0)) {
1696 kmem_slab_prefill(cp, sp);
1697 return (buf);
1698 }
1699
1700 if (new_slab) {
1701 avl_add(&cp->cache_partial_slabs, sp);
1702 return (buf);
1703 }
1704
1705 /*
1706 * The slab is now more allocated than it was, so the
1707 * order remains unchanged.
1708 */
1709 ASSERT(!avl_update(&cp->cache_partial_slabs, sp));
1710 return (buf);
1711 }
1712
1713 /*
1714 * Allocate a raw (unconstructed) buffer from cp's slab layer.
1715 */
1716 static void *
1717 kmem_slab_alloc(kmem_cache_t *cp, int kmflag)
1718 {
1719 kmem_slab_t *sp;
1720 void *buf;
1721 boolean_t test_destructor;
1722
1723 mutex_enter(&cp->cache_lock);
1724 test_destructor = (cp->cache_slab_alloc == 0);
1725 sp = avl_first(&cp->cache_partial_slabs);
1726 if (sp == NULL) {
1727 ASSERT(cp->cache_bufslab == 0);
1728
1729 /*
1730 * The freelist is empty. Create a new slab.
1731 */
1732 mutex_exit(&cp->cache_lock);
1733 if ((sp = kmem_slab_create(cp, kmflag)) == NULL) {
1734 return (NULL);
1735 }
1736 mutex_enter(&cp->cache_lock);
1737 cp->cache_slab_create++;
1738 if ((cp->cache_buftotal += sp->slab_chunks) > cp->cache_bufmax)
1739 cp->cache_bufmax = cp->cache_buftotal;
1740 cp->cache_bufslab += sp->slab_chunks;
1741 }
1742
1743 buf = kmem_slab_alloc_impl(cp, sp, B_TRUE);
1744 ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1745 (cp->cache_complete_slab_count +
1746 avl_numnodes(&cp->cache_partial_slabs) +
1747 (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1748 mutex_exit(&cp->cache_lock);
1749
1750 if (test_destructor && cp->cache_destructor != NULL) {
1751 /*
1752 * On the first kmem_slab_alloc(), assert that it is valid to
1753 * call the destructor on a newly constructed object without any
1754 * client involvement.
1755 */
1756 if ((cp->cache_constructor == NULL) ||
1757 cp->cache_constructor(buf, cp->cache_private,
1758 kmflag) == 0) {
1759 cp->cache_destructor(buf, cp->cache_private);
1760 }
1761 copy_pattern(KMEM_UNINITIALIZED_PATTERN, buf,
1762 cp->cache_bufsize);
1763 if (cp->cache_flags & KMF_DEADBEEF) {
1764 copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
1765 }
1766 }
1767
1768 return (buf);
1769 }
1770
1771 static void kmem_slab_move_yes(kmem_cache_t *, kmem_slab_t *, void *);
1772
1773 /*
1774 * Free a raw (unconstructed) buffer to cp's slab layer.
1775 */
1776 static void
1777 kmem_slab_free(kmem_cache_t *cp, void *buf)
1778 {
1779 kmem_slab_t *sp;
1780 kmem_bufctl_t *bcp, **prev_bcpp;
1781
1782 ASSERT(buf != NULL);
1783
1784 mutex_enter(&cp->cache_lock);
1785 cp->cache_slab_free++;
1786
1787 if (cp->cache_flags & KMF_HASH) {
1788 /*
1789 * Look up buffer in allocated-address hash table.
1790 */
1791 prev_bcpp = KMEM_HASH(cp, buf);
1792 while ((bcp = *prev_bcpp) != NULL) {
1793 if (bcp->bc_addr == buf) {
1794 *prev_bcpp = bcp->bc_next;
1795 sp = bcp->bc_slab;
1796 break;
1797 }
1798 cp->cache_lookup_depth++;
1799 prev_bcpp = &bcp->bc_next;
1800 }
1801 } else {
1802 bcp = KMEM_BUFCTL(cp, buf);
1803 sp = KMEM_SLAB(cp, buf);
1804 }
1805
1806 if (bcp == NULL || sp->slab_cache != cp || !KMEM_SLAB_MEMBER(sp, buf)) {
1807 mutex_exit(&cp->cache_lock);
1808 kmem_error(KMERR_BADADDR, cp, buf);
1809 return;
1810 }
1811
1812 if (KMEM_SLAB_OFFSET(sp, buf) == sp->slab_stuck_offset) {
1813 /*
1814 * If this is the buffer that prevented the consolidator from
1815 * clearing the slab, we can reset the slab flags now that the
1816 * buffer is freed. (It makes sense to do this in
1817 * kmem_cache_free(), where the client gives up ownership of the
1818 * buffer, but on the hot path the test is too expensive.)
1819 */
1820 kmem_slab_move_yes(cp, sp, buf);
1821 }
1822
1823 if ((cp->cache_flags & (KMF_AUDIT | KMF_BUFTAG)) == KMF_AUDIT) {
1824 if (cp->cache_flags & KMF_CONTENTS)
1825 ((kmem_bufctl_audit_t *)bcp)->bc_contents =
1826 kmem_log_enter(kmem_content_log, buf,
1827 cp->cache_contents);
1828 KMEM_AUDIT(kmem_transaction_log, cp, bcp);
1829 }
1830
1831 bcp->bc_next = sp->slab_head;
1832 sp->slab_head = bcp;
1833
1834 cp->cache_bufslab++;
1835 ASSERT(sp->slab_refcnt >= 1);
1836
1837 if (--sp->slab_refcnt == 0) {
1838 /*
1839 * There are no outstanding allocations from this slab,
1840 * so we can reclaim the memory.
1841 */
1842 if (sp->slab_chunks == 1) {
1843 list_remove(&cp->cache_complete_slabs, sp);
1844 cp->cache_complete_slab_count--;
1845 } else {
1846 avl_remove(&cp->cache_partial_slabs, sp);
1847 }
1848
1849 cp->cache_buftotal -= sp->slab_chunks;
1850 cp->cache_bufslab -= sp->slab_chunks;
1851 /*
1852 * Defer releasing the slab to the virtual memory subsystem
1853 * while there is a pending move callback, since we guarantee
1854 * that buffers passed to the move callback have only been
1855 * touched by kmem or by the client itself. Since the memory
1856 * patterns baddcafe (uninitialized) and deadbeef (freed) both
1857 * set at least one of the two lowest order bits, the client can
1858 * test those bits in the move callback to determine whether or
1859 * not it knows about the buffer (assuming that the client also
1860 * sets one of those low order bits whenever it frees a buffer).
1861 */
1862 if (cp->cache_defrag == NULL ||
1863 (avl_is_empty(&cp->cache_defrag->kmd_moves_pending) &&
1864 !(sp->slab_flags & KMEM_SLAB_MOVE_PENDING))) {
1865 cp->cache_slab_destroy++;
1866 mutex_exit(&cp->cache_lock);
1867 kmem_slab_destroy(cp, sp);
1868 } else {
1869 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
1870 /*
1871 * Slabs are inserted at both ends of the deadlist to
1872 * distinguish between slabs freed while move callbacks
1873 * are pending (list head) and a slab freed while the
1874 * lock is dropped in kmem_move_buffers() (list tail) so
1875 * that in both cases slab_destroy() is called from the
1876 * right context.
1877 */
1878 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
1879 list_insert_tail(deadlist, sp);
1880 } else {
1881 list_insert_head(deadlist, sp);
1882 }
1883 cp->cache_defrag->kmd_deadcount++;
1884 mutex_exit(&cp->cache_lock);
1885 }
1886 return;
1887 }
1888
1889 if (bcp->bc_next == NULL) {
1890 /* Transition the slab from completely allocated to partial. */
1891 ASSERT(sp->slab_refcnt == (sp->slab_chunks - 1));
1892 ASSERT(sp->slab_chunks > 1);
1893 list_remove(&cp->cache_complete_slabs, sp);
1894 cp->cache_complete_slab_count--;
1895 avl_add(&cp->cache_partial_slabs, sp);
1896 } else {
1897 (void) avl_update_gt(&cp->cache_partial_slabs, sp);
1898 }
1899
1900 ASSERT((cp->cache_slab_create - cp->cache_slab_destroy) ==
1901 (cp->cache_complete_slab_count +
1902 avl_numnodes(&cp->cache_partial_slabs) +
1903 (cp->cache_defrag == NULL ? 0 : cp->cache_defrag->kmd_deadcount)));
1904 mutex_exit(&cp->cache_lock);
1905 }
1906
1907 /*
1908 * Return -1 if kmem_error, 1 if constructor fails, 0 if successful.
1909 */
1910 static int
1911 kmem_cache_alloc_debug(kmem_cache_t *cp, void *buf, int kmflag, int construct,
1912 caddr_t caller)
1913 {
1914 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1915 kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1916 uint32_t mtbf;
1917
1918 if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
1919 kmem_error(KMERR_BADBUFTAG, cp, buf);
1920 return (-1);
1921 }
1922
1923 btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_ALLOC;
1924
1925 if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
1926 kmem_error(KMERR_BADBUFCTL, cp, buf);
1927 return (-1);
1928 }
1929
1930 if (cp->cache_flags & KMF_DEADBEEF) {
1931 if (!construct && (cp->cache_flags & KMF_LITE)) {
1932 if (*(uint64_t *)buf != KMEM_FREE_PATTERN) {
1933 kmem_error(KMERR_MODIFIED, cp, buf);
1934 return (-1);
1935 }
1936 if (cp->cache_constructor != NULL)
1937 *(uint64_t *)buf = btp->bt_redzone;
1938 else
1939 *(uint64_t *)buf = KMEM_UNINITIALIZED_PATTERN;
1940 } else {
1941 construct = 1;
1942 if (verify_and_copy_pattern(KMEM_FREE_PATTERN,
1943 KMEM_UNINITIALIZED_PATTERN, buf,
1944 cp->cache_verify)) {
1945 kmem_error(KMERR_MODIFIED, cp, buf);
1946 return (-1);
1947 }
1948 }
1949 }
1950 btp->bt_redzone = KMEM_REDZONE_PATTERN;
1951
1952 if ((mtbf = kmem_mtbf | cp->cache_mtbf) != 0 &&
1953 gethrtime() % mtbf == 0 &&
1954 (kmflag & (KM_NOSLEEP | KM_PANIC)) == KM_NOSLEEP) {
1955 kmem_log_event(kmem_failure_log, cp, NULL, NULL);
1956 if (!construct && cp->cache_destructor != NULL)
1957 cp->cache_destructor(buf, cp->cache_private);
1958 } else {
1959 mtbf = 0;
1960 }
1961
1962 if (mtbf || (construct && cp->cache_constructor != NULL &&
1963 cp->cache_constructor(buf, cp->cache_private, kmflag) != 0)) {
1964 atomic_inc_64(&cp->cache_alloc_fail);
1965 btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
1966 if (cp->cache_flags & KMF_DEADBEEF)
1967 copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
1968 kmem_slab_free(cp, buf);
1969 return (1);
1970 }
1971
1972 if (cp->cache_flags & KMF_AUDIT) {
1973 KMEM_AUDIT(kmem_transaction_log, cp, bcp);
1974 }
1975
1976 if ((cp->cache_flags & KMF_LITE) &&
1977 !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
1978 KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
1979 }
1980
1981 return (0);
1982 }
1983
1984 static int
1985 kmem_cache_free_debug(kmem_cache_t *cp, void *buf, caddr_t caller)
1986 {
1987 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
1988 kmem_bufctl_audit_t *bcp = (kmem_bufctl_audit_t *)btp->bt_bufctl;
1989 kmem_slab_t *sp;
1990
1991 if (btp->bt_bxstat != ((intptr_t)bcp ^ KMEM_BUFTAG_ALLOC)) {
1992 if (btp->bt_bxstat == ((intptr_t)bcp ^ KMEM_BUFTAG_FREE)) {
1993 kmem_error(KMERR_DUPFREE, cp, buf);
1994 return (-1);
1995 }
1996 sp = kmem_findslab(cp, buf);
1997 if (sp == NULL || sp->slab_cache != cp)
1998 kmem_error(KMERR_BADADDR, cp, buf);
1999 else
2000 kmem_error(KMERR_REDZONE, cp, buf);
2001 return (-1);
2002 }
2003
2004 btp->bt_bxstat = (intptr_t)bcp ^ KMEM_BUFTAG_FREE;
2005
2006 if ((cp->cache_flags & KMF_HASH) && bcp->bc_addr != buf) {
2007 kmem_error(KMERR_BADBUFCTL, cp, buf);
2008 return (-1);
2009 }
2010
2011 if (btp->bt_redzone != KMEM_REDZONE_PATTERN) {
2012 kmem_error(KMERR_REDZONE, cp, buf);
2013 return (-1);
2014 }
2015
2016 if (cp->cache_flags & KMF_AUDIT) {
2017 if (cp->cache_flags & KMF_CONTENTS)
2018 bcp->bc_contents = kmem_log_enter(kmem_content_log,
2019 buf, cp->cache_contents);
2020 KMEM_AUDIT(kmem_transaction_log, cp, bcp);
2021 }
2022
2023 if ((cp->cache_flags & KMF_LITE) &&
2024 !(cp->cache_cflags & KMC_KMEM_ALLOC)) {
2025 KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller);
2026 }
2027
2028 if (cp->cache_flags & KMF_DEADBEEF) {
2029 if (cp->cache_flags & KMF_LITE)
2030 btp->bt_redzone = *(uint64_t *)buf;
2031 else if (cp->cache_destructor != NULL)
2032 cp->cache_destructor(buf, cp->cache_private);
2033
2034 copy_pattern(KMEM_FREE_PATTERN, buf, cp->cache_verify);
2035 }
2036
2037 return (0);
2038 }
2039
2040 /*
2041 * Free each object in magazine mp to cp's slab layer, and free mp itself.
2042 */
2043 static void
2044 kmem_magazine_destroy(kmem_cache_t *cp, kmem_magazine_t *mp, int nrounds)
2045 {
2046 int round;
2047
2048 ASSERT(!list_link_active(&cp->cache_link) ||
2049 taskq_member(kmem_taskq, curthread));
2050
2051 for (round = 0; round < nrounds; round++) {
2052 void *buf = mp->mag_round[round];
2053
2054 if (cp->cache_flags & KMF_DEADBEEF) {
2055 if (verify_pattern(KMEM_FREE_PATTERN, buf,
2056 cp->cache_verify) != NULL) {
2057 kmem_error(KMERR_MODIFIED, cp, buf);
2058 continue;
2059 }
2060 if ((cp->cache_flags & KMF_LITE) &&
2061 cp->cache_destructor != NULL) {
2062 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2063 *(uint64_t *)buf = btp->bt_redzone;
2064 cp->cache_destructor(buf, cp->cache_private);
2065 *(uint64_t *)buf = KMEM_FREE_PATTERN;
2066 }
2067 } else if (cp->cache_destructor != NULL) {
2068 cp->cache_destructor(buf, cp->cache_private);
2069 }
2070
2071 kmem_slab_free(cp, buf);
2072 }
2073 ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
2074 kmem_cache_free(cp->cache_magtype->mt_cache, mp);
2075 }
2076
2077 /*
2078 * Allocate a magazine from the depot.
2079 */
2080 static kmem_magazine_t *
2081 kmem_depot_alloc(kmem_cache_t *cp, kmem_maglist_t *mlp)
2082 {
2083 kmem_magazine_t *mp;
2084
2085 /*
2086 * If we can't get the depot lock without contention,
2087 * update our contention count. We use the depot
2088 * contention rate to determine whether we need to
2089 * increase the magazine size for better scalability.
2090 */
2091 if (!mutex_tryenter(&cp->cache_depot_lock)) {
2092 mutex_enter(&cp->cache_depot_lock);
2093 cp->cache_depot_contention++;
2094 }
2095
2096 if ((mp = mlp->ml_list) != NULL) {
2097 ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
2098 mlp->ml_list = mp->mag_next;
2099 if (--mlp->ml_total < mlp->ml_min)
2100 mlp->ml_min = mlp->ml_total;
2101 mlp->ml_alloc++;
2102 }
2103
2104 mutex_exit(&cp->cache_depot_lock);
2105
2106 return (mp);
2107 }
2108
2109 /*
2110 * Free a magazine to the depot.
2111 */
2112 static void
2113 kmem_depot_free(kmem_cache_t *cp, kmem_maglist_t *mlp, kmem_magazine_t *mp)
2114 {
2115 mutex_enter(&cp->cache_depot_lock);
2116 ASSERT(KMEM_MAGAZINE_VALID(cp, mp));
2117 mp->mag_next = mlp->ml_list;
2118 mlp->ml_list = mp;
2119 mlp->ml_total++;
2120 mutex_exit(&cp->cache_depot_lock);
2121 }
2122
2123 /*
2124 * Update the working set statistics for cp's depot.
2125 */
2126 static void
2127 kmem_depot_ws_update(kmem_cache_t *cp)
2128 {
2129 mutex_enter(&cp->cache_depot_lock);
2130 cp->cache_full.ml_reaplimit = cp->cache_full.ml_min;
2131 cp->cache_full.ml_min = cp->cache_full.ml_total;
2132 cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_min;
2133 cp->cache_empty.ml_min = cp->cache_empty.ml_total;
2134 mutex_exit(&cp->cache_depot_lock);
2135 }
2136
2137 /*
2138 * Set the working set statistics for cp's depot to zero. (Everything is
2139 * eligible for reaping.)
2140 */
2141 static void
2142 kmem_depot_ws_zero(kmem_cache_t *cp)
2143 {
2144 mutex_enter(&cp->cache_depot_lock);
2145 cp->cache_full.ml_reaplimit = cp->cache_full.ml_total;
2146 cp->cache_full.ml_min = cp->cache_full.ml_total;
2147 cp->cache_empty.ml_reaplimit = cp->cache_empty.ml_total;
2148 cp->cache_empty.ml_min = cp->cache_empty.ml_total;
2149 mutex_exit(&cp->cache_depot_lock);
2150 }
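/*
 * Working-set accounting: between calls to kmem_depot_ws_update(), ml_min
 * tracks the smallest number of magazines a depot list has held; that many
 * magazines went unused for the entire interval. ws_update() rolls ml_min
 * into ml_reaplimit, and kmem_depot_ws_reap() below frees at most
 * MIN(ml_reaplimit, ml_min) magazines, i.e. only those demonstrably outside
 * the working set. kmem_depot_ws_zero() marks the whole list reapable for
 * the cases where memory must be given back unconditionally.
 */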
2151
2152 /*
2153 * The number of bytes to reap before we call kpreempt(). The default (1MB)
2154 * causes us to preempt reaping up to hundreds of times per second. Using a
2155 * larger value (1GB) causes this to have virtually no effect.
2156 */
2157 size_t kmem_reap_preempt_bytes = 1024 * 1024;
2158
2159 /*
2160 * Reap all magazines that have fallen out of the depot's working set.
2161 */
2162 static void
2163 kmem_depot_ws_reap(kmem_cache_t *cp)
2164 {
2165 size_t bytes = 0;
2166 long reap;
2167 kmem_magazine_t *mp;
2168
2169 ASSERT(!list_link_active(&cp->cache_link) ||
2170 taskq_member(kmem_taskq, curthread));
2171
2172 reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
2173 while (reap-- &&
2174 (mp = kmem_depot_alloc(cp, &cp->cache_full)) != NULL) {
2175 kmem_magazine_destroy(cp, mp, cp->cache_magtype->mt_magsize);
2176 bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize;
2177 if (bytes > kmem_reap_preempt_bytes) {
2178 kpreempt(KPREEMPT_SYNC);
2179 bytes = 0;
2180 }
2181 }
2182
2183 reap = MIN(cp->cache_empty.ml_reaplimit, cp->cache_empty.ml_min);
2184 while (reap-- &&
2185 (mp = kmem_depot_alloc(cp, &cp->cache_empty)) != NULL) {
2186 kmem_magazine_destroy(cp, mp, 0);
2187 bytes += cp->cache_magtype->mt_magsize * cp->cache_bufsize;
2188 if (bytes > kmem_reap_preempt_bytes) {
2189 kpreempt(KPREEMPT_SYNC);
2190 bytes = 0;
2191 }
2192 }
2193 }
2194
2195 static void
2196 kmem_cpu_reload(kmem_cpu_cache_t *ccp, kmem_magazine_t *mp, int rounds)
2197 {
2198 ASSERT((ccp->cc_loaded == NULL && ccp->cc_rounds == -1) ||
2199 (ccp->cc_loaded && ccp->cc_rounds + rounds == ccp->cc_magsize));
2200 ASSERT(ccp->cc_magsize > 0);
2201
2202 ccp->cc_ploaded = ccp->cc_loaded;
2203 ccp->cc_prounds = ccp->cc_rounds;
2204 ccp->cc_loaded = mp;
2205 ccp->cc_rounds = rounds;
2206 }
2207
2208 /*
2209 * Intercept kmem alloc/free calls during crash dump in order to avoid
2210 * changing kmem state while memory is being saved to the dump device.
2211 * Otherwise, ::kmem_verify will report "corrupt buffers". Note that
2212 * there are no locks because only one CPU calls kmem during a crash
2213 * dump. To enable this feature, first create the associated vmem
2214 * arena with VMC_DUMPSAFE.
2215 */
2216 static void *kmem_dump_start; /* start of pre-reserved heap */
2217 static void *kmem_dump_end; /* end of heap area */
2218 static void *kmem_dump_curr; /* current free heap pointer */
2219 static size_t kmem_dump_size; /* size of heap area */
2220
2221 /* append to each buf created in the pre-reserved heap */
2222 typedef struct kmem_dumpctl {
2223 void *kdc_next; /* cache dump free list linkage */
2224 } kmem_dumpctl_t;
2225
2226 #define KMEM_DUMPCTL(cp, buf) \
2227 ((kmem_dumpctl_t *)P2ROUNDUP((uintptr_t)(buf) + (cp)->cache_bufsize, \
2228 sizeof (void *)))
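/*
 * KMEM_DUMPCTL() places the kmem_dumpctl_t at the first pointer-aligned
 * address past the client-visible part of the buffer, so each buffer carved
 * from the pre-reserved dump heap carries its freelist linkage just beyond
 * its cache_bufsize bytes. Illustrative example (hypothetical size): for a
 * cache with cache_bufsize == 96, kdc_next lands at offset 96 and
 * kmem_cache_alloc_dump() reserves space through the end of that struct.
 */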
2229
2230 /* set nonzero for a full report */
2231 uint_t kmem_dump_verbose = 0;
2232
2233 /* stats for oversize heap */
2234 uint_t kmem_dump_oversize_allocs = 0;
2235 uint_t kmem_dump_oversize_max = 0;
2236
2237 static void
2238 kmem_dumppr(char **pp, char *e, const char *format, ...)
2239 {
2240 char *p = *pp;
2241
2242 if (p < e) {
2243 int n;
2244 va_list ap;
2245
2246 va_start(ap, format);
2247 n = vsnprintf(p, e - p, format, ap);
2248 va_end(ap);
2249 *pp = p + n;
2250 }
2251 }
2252
2253 /*
2254 * Called when dumpadm(8) configures dump parameters.
2255 */
2256 void
2257 kmem_dump_init(size_t size)
2258 {
2259 /* Our caller ensures size is always set. */
2260 ASSERT3U(size, >, 0);
2261
2262 if (kmem_dump_start != NULL)
2263 kmem_free(kmem_dump_start, kmem_dump_size);
2264
2265 kmem_dump_start = kmem_alloc(size, KM_SLEEP);
2266 kmem_dump_size = size;
2267 kmem_dump_curr = kmem_dump_start;
2268 kmem_dump_end = (void *)((char *)kmem_dump_start + size);
2269 copy_pattern(KMEM_UNINITIALIZED_PATTERN, kmem_dump_start, size);
2270 }
2271
2272 /*
2273  * Set a flag on each kmem_cache_t indicating whether it is safe to use
2274  * alternate dump memory. Called just before the panic crash dump starts.
2275  * The per-CPU flags are set for the calling CPU only.
2276 */
2277 void
2278 kmem_dump_begin(void)
2279 {
2280 kmem_cache_t *cp;
2281
2282 ASSERT(panicstr != NULL);
2283
2284 for (cp = list_head(&kmem_caches); cp != NULL;
2285 cp = list_next(&kmem_caches, cp)) {
2286 kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2287
2288 if (cp->cache_arena->vm_cflags & VMC_DUMPSAFE) {
2289 cp->cache_flags |= KMF_DUMPDIVERT;
2290 ccp->cc_flags |= KMF_DUMPDIVERT;
2291 ccp->cc_dump_rounds = ccp->cc_rounds;
2292 ccp->cc_dump_prounds = ccp->cc_prounds;
2293 ccp->cc_rounds = ccp->cc_prounds = -1;
2294 } else {
2295 cp->cache_flags |= KMF_DUMPUNSAFE;
2296 ccp->cc_flags |= KMF_DUMPUNSAFE;
2297 }
2298 }
2299 }
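/*
 * Setting cc_rounds and cc_prounds to -1 above forces subsequent allocations
 * from dump-safe caches out of the per-CPU magazines and into the
 * KMF_DUMPDIVERT path of kmem_cache_alloc(), so the pre-panic kmem state is
 * left untouched while memory is written to the dump device. The saved
 * cc_dump_rounds and cc_dump_prounds preserve the pre-dump counts.
 */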
2300
2301 /*
2302  * Finish the dump intercept: print any warnings on the console and return
2303  * verbose information to dumpsys() in the given buffer.
2304  * The return value is the number of bytes of 'buf' actually used.
2305 */
2306 size_t
2307 kmem_dump_finish(char *buf, size_t size)
2308 {
2309 int percent = 0;
2310 size_t used;
2311 char *e = buf + size;
2312 char *p = buf;
2313
2314 if (kmem_dump_curr == kmem_dump_end) {
2315 cmn_err(CE_WARN, "exceeded kmem_dump space of %lu "
2316 "bytes: kmem state in dump may be inconsistent",
2317 kmem_dump_size);
2318 }
2319
2320 if (kmem_dump_verbose == 0)
2321 return (0);
2322
2323 used = (char *)kmem_dump_curr - (char *)kmem_dump_start;
2324 percent = (used * 100) / kmem_dump_size;
2325
2326 kmem_dumppr(&p, e, "%% heap used,%d\n", percent);
2327 kmem_dumppr(&p, e, "used bytes,%ld\n", used);
2328 kmem_dumppr(&p, e, "heap size,%ld\n", kmem_dump_size);
2329 kmem_dumppr(&p, e, "Oversize allocs,%d\n",
2330 kmem_dump_oversize_allocs);
2331 kmem_dumppr(&p, e, "Oversize max size,%ld\n",
2332 kmem_dump_oversize_max);
2333
2334 /* return buffer size used */
2335 if (p < e)
2336 bzero(p, e - p);
2337 return (p - buf);
2338 }
2339
2340 /*
2341 * Allocate a constructed object from alternate dump memory.
2342 */
2343 void *
2344 kmem_cache_alloc_dump(kmem_cache_t *cp, int kmflag)
2345 {
2346 void *buf;
2347 void *curr;
2348 char *bufend;
2349
2350 /* return a constructed object */
2351 if ((buf = cp->cache_dump.kd_freelist) != NULL) {
2352 cp->cache_dump.kd_freelist = KMEM_DUMPCTL(cp, buf)->kdc_next;
2353 return (buf);
2354 }
2355
2356 /* create a new constructed object */
2357 curr = kmem_dump_curr;
2358 buf = (void *)P2ROUNDUP((uintptr_t)curr, cp->cache_align);
2359 bufend = (char *)KMEM_DUMPCTL(cp, buf) + sizeof (kmem_dumpctl_t);
2360
2361 /* hat layer objects cannot cross a page boundary */
2362 if (cp->cache_align < PAGESIZE) {
2363 char *page = (char *)P2ROUNDUP((uintptr_t)buf, PAGESIZE);
2364 if (bufend > page) {
2365 bufend += page - (char *)buf;
2366 buf = (void *)page;
2367 }
2368 }
2369
2370 /* fall back to normal alloc if reserved area is used up */
2371 if (bufend > (char *)kmem_dump_end) {
2372 kmem_dump_curr = kmem_dump_end;
2373 cp->cache_dump.kd_alloc_fails++;
2374 return (NULL);
2375 }
2376
2377 /*
2378 * Must advance curr pointer before calling a constructor that
2379 * may also allocate memory.
2380 */
2381 kmem_dump_curr = bufend;
2382
2383 /* run constructor */
2384 if (cp->cache_constructor != NULL &&
2385 cp->cache_constructor(buf, cp->cache_private, kmflag)
2386 != 0) {
2387 #ifdef DEBUG
2388 printf("name='%s' cache=0x%p: kmem cache constructor failed\n",
2389 cp->cache_name, (void *)cp);
2390 #endif
2391 /* reset curr pointer iff no allocs were done */
2392 if (kmem_dump_curr == bufend)
2393 kmem_dump_curr = curr;
2394
2395 cp->cache_dump.kd_alloc_fails++;
2396 /* fall back to normal alloc if the constructor fails */
2397 return (NULL);
2398 }
2399
2400 return (buf);
2401 }
2402
2403 /*
2404 * Free a constructed object in alternate dump memory.
2405 */
2406 int
2407 kmem_cache_free_dump(kmem_cache_t *cp, void *buf)
2408 {
2409 /* save constructed buffers for next time */
2410 if ((char *)buf >= (char *)kmem_dump_start &&
2411 (char *)buf < (char *)kmem_dump_end) {
2412 KMEM_DUMPCTL(cp, buf)->kdc_next = cp->cache_dump.kd_freelist;
2413 cp->cache_dump.kd_freelist = buf;
2414 return (0);
2415 }
2416
2417 /* just drop buffers that were allocated before dump started */
2418 if (kmem_dump_curr < kmem_dump_end)
2419 return (0);
2420
2421 /* fall back to normal free if reserved area is used up */
2422 return (1);
2423 }
2424
2425 /*
2426 * Allocate a constructed object from cache cp.
2427 */
2428 void *
2429 kmem_cache_alloc(kmem_cache_t *cp, int kmflag)
2430 {
2431 kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2432 kmem_magazine_t *fmp;
2433 void *buf;
2434
2435 mutex_enter(&ccp->cc_lock);
2436 for (;;) {
2437 /*
2438 * If there's an object available in the current CPU's
2439 * loaded magazine, just take it and return.
2440 */
2441 if (ccp->cc_rounds > 0) {
2442 buf = ccp->cc_loaded->mag_round[--ccp->cc_rounds];
2443 ccp->cc_alloc++;
2444 mutex_exit(&ccp->cc_lock);
2445 if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPUNSAFE)) {
2446 if (ccp->cc_flags & KMF_DUMPUNSAFE) {
2447 ASSERT(!(ccp->cc_flags &
2448 KMF_DUMPDIVERT));
2449 cp->cache_dump.kd_unsafe++;
2450 }
2451 if ((ccp->cc_flags & KMF_BUFTAG) &&
2452 kmem_cache_alloc_debug(cp, buf, kmflag, 0,
2453 caller()) != 0) {
2454 if (kmflag & KM_NOSLEEP)
2455 return (NULL);
2456 mutex_enter(&ccp->cc_lock);
2457 continue;
2458 }
2459 }
2460 return (buf);
2461 }
2462
2463 /*
2464 * The loaded magazine is empty. If the previously loaded
2465 * magazine was full, exchange them and try again.
2466 */
2467 if (ccp->cc_prounds > 0) {
2468 kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
2469 continue;
2470 }
2471
2472 /*
2473 * Return an alternate buffer at dump time to preserve
2474 * the heap.
2475 */
2476 if (ccp->cc_flags & (KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) {
2477 if (ccp->cc_flags & KMF_DUMPUNSAFE) {
2478 ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT));
2479 /* log it so that we can warn about it */
2480 cp->cache_dump.kd_unsafe++;
2481 } else {
2482 if ((buf = kmem_cache_alloc_dump(cp, kmflag)) !=
2483 NULL) {
2484 mutex_exit(&ccp->cc_lock);
2485 return (buf);
2486 }
2487 break; /* fall back to slab layer */
2488 }
2489 }
2490
2491 /*
2492 * If the magazine layer is disabled, break out now.
2493 */
2494 if (ccp->cc_magsize == 0)
2495 break;
2496
2497 /*
2498 * Try to get a full magazine from the depot.
2499 */
2500 fmp = kmem_depot_alloc(cp, &cp->cache_full);
2501 if (fmp != NULL) {
2502 if (ccp->cc_ploaded != NULL)
2503 kmem_depot_free(cp, &cp->cache_empty,
2504 ccp->cc_ploaded);
2505 kmem_cpu_reload(ccp, fmp, ccp->cc_magsize);
2506 continue;
2507 }
2508
2509 /*
2510 * There are no full magazines in the depot,
2511 * so fall through to the slab layer.
2512 */
2513 break;
2514 }
2515 mutex_exit(&ccp->cc_lock);
2516
2517 /*
2518 * We couldn't allocate a constructed object from the magazine layer,
2519 * so get a raw buffer from the slab layer and apply its constructor.
2520 */
2521 buf = kmem_slab_alloc(cp, kmflag);
2522
2523 if (buf == NULL)
2524 return (NULL);
2525
2526 if (cp->cache_flags & KMF_BUFTAG) {
2527 /*
2528 * Make kmem_cache_alloc_debug() apply the constructor for us.
2529 */
2530 int rc = kmem_cache_alloc_debug(cp, buf, kmflag, 1, caller());
2531 if (rc != 0) {
2532 if (kmflag & KM_NOSLEEP)
2533 return (NULL);
2534 /*
2535 * kmem_cache_alloc_debug() detected corruption
2536 * but didn't panic (kmem_panic <= 0). We should not be
2537 * here because the constructor failed (indicated by a
2538 * return code of 1). Try again.
2539 */
2540 ASSERT(rc == -1);
2541 return (kmem_cache_alloc(cp, kmflag));
2542 }
2543 return (buf);
2544 }
2545
2546 if (cp->cache_constructor != NULL &&
2547 cp->cache_constructor(buf, cp->cache_private, kmflag) != 0) {
2548 atomic_inc_64(&cp->cache_alloc_fail);
2549 kmem_slab_free(cp, buf);
2550 return (NULL);
2551 }
2552
2553 return (buf);
2554 }
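/*
 * Illustrative client usage (a sketch only; 'my_obj_t', 'my_obj_cache' and
 * 'op' are hypothetical names, and the cache is assumed to have been created
 * elsewhere with kmem_cache_create()):
 *
 *	my_obj_t *op = kmem_cache_alloc(my_obj_cache, KM_SLEEP);
 *	... use the constructed object ...
 *	kmem_cache_free(my_obj_cache, op);
 *
 * A KM_SLEEP allocation sleeps until memory is available rather than
 * failing, whereas KM_NOSLEEP callers must be prepared for a NULL return
 * (as in the debug and constructor-failure paths above).
 */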
2555
2556 /*
2557 * The freed argument tells whether or not kmem_cache_free_debug() has already
2558 * been called so that we can avoid the duplicate free error. For example, a
2559 * buffer on a magazine has already been freed by the client but is still
2560 * constructed.
2561 */
2562 static void
2563 kmem_slab_free_constructed(kmem_cache_t *cp, void *buf, boolean_t freed)
2564 {
2565 if (!freed && (cp->cache_flags & KMF_BUFTAG))
2566 if (kmem_cache_free_debug(cp, buf, caller()) == -1)
2567 return;
2568
2569 /*
2570 * Note that if KMF_DEADBEEF is in effect and KMF_LITE is not,
2571 * kmem_cache_free_debug() will have already applied the destructor.
2572 */
2573 if ((cp->cache_flags & (KMF_DEADBEEF | KMF_LITE)) != KMF_DEADBEEF &&
2574 cp->cache_destructor != NULL) {
2575 if (cp->cache_flags & KMF_DEADBEEF) { /* KMF_LITE implied */
2576 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2577 *(uint64_t *)buf = btp->bt_redzone;
2578 cp->cache_destructor(buf, cp->cache_private);
2579 *(uint64_t *)buf = KMEM_FREE_PATTERN;
2580 } else {
2581 cp->cache_destructor(buf, cp->cache_private);
2582 }
2583 }
2584
2585 kmem_slab_free(cp, buf);
2586 }
2587
2588 /*
2589 * Used when there's no room to free a buffer to the per-CPU cache.
2590 * Drops and re-acquires &ccp->cc_lock, and returns non-zero if the
2591 * caller should try freeing to the per-CPU cache again.
2592 * Note that we don't directly install the magazine in the cpu cache,
2593 * since its state may have changed wildly while the lock was dropped.
2594 */
2595 static int
2596 kmem_cpucache_magazine_alloc(kmem_cpu_cache_t *ccp, kmem_cache_t *cp)
2597 {
2598 kmem_magazine_t *emp;
2599 kmem_magtype_t *mtp;
2600
2601 ASSERT(MUTEX_HELD(&ccp->cc_lock));
2602 ASSERT(((uint_t)ccp->cc_rounds == ccp->cc_magsize ||
2603 ((uint_t)ccp->cc_rounds == -1)) &&
2604 ((uint_t)ccp->cc_prounds == ccp->cc_magsize ||
2605 ((uint_t)ccp->cc_prounds == -1)));
2606
2607 emp = kmem_depot_alloc(cp, &cp->cache_empty);
2608 if (emp != NULL) {
2609 if (ccp->cc_ploaded != NULL)
2610 kmem_depot_free(cp, &cp->cache_full,
2611 ccp->cc_ploaded);
2612 kmem_cpu_reload(ccp, emp, 0);
2613 return (1);
2614 }
2615 /*
2616 * There are no empty magazines in the depot,
2617 * so try to allocate a new one. We must drop all locks
2618 * across kmem_cache_alloc() because lower layers may
2619 * attempt to allocate from this cache.
2620 */
2621 mtp = cp->cache_magtype;
2622 mutex_exit(&ccp->cc_lock);
2623 emp = kmem_cache_alloc(mtp->mt_cache, KM_NOSLEEP);
2624 mutex_enter(&ccp->cc_lock);
2625
2626 if (emp != NULL) {
2627 /*
2628 * We successfully allocated an empty magazine.
2629 * However, we had to drop ccp->cc_lock to do it,
2630 * so the cache's magazine size may have changed.
2631 * If so, free the magazine and try again.
2632 */
2633 if (ccp->cc_magsize != mtp->mt_magsize) {
2634 mutex_exit(&ccp->cc_lock);
2635 kmem_cache_free(mtp->mt_cache, emp);
2636 mutex_enter(&ccp->cc_lock);
2637 return (1);
2638 }
2639
2640 /*
2641 * We got a magazine of the right size. Add it to
2642 * the depot and try the whole dance again.
2643 */
2644 kmem_depot_free(cp, &cp->cache_empty, emp);
2645 return (1);
2646 }
2647
2648 /*
2649 * We couldn't allocate an empty magazine,
2650 * so fall through to the slab layer.
2651 */
2652 return (0);
2653 }
2654
2655 /*
2656 * Free a constructed object to cache cp.
2657 */
2658 void
2659 kmem_cache_free(kmem_cache_t *cp, void *buf)
2660 {
2661 kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2662
2663 /*
2664 * The client must not free either of the buffers passed to the move
2665 * callback function.
2666 */
2667 ASSERT(cp->cache_defrag == NULL ||
2668 cp->cache_defrag->kmd_thread != curthread ||
2669 (buf != cp->cache_defrag->kmd_from_buf &&
2670 buf != cp->cache_defrag->kmd_to_buf));
2671
2672 if (ccp->cc_flags & (KMF_BUFTAG | KMF_DUMPDIVERT | KMF_DUMPUNSAFE)) {
2673 if (ccp->cc_flags & KMF_DUMPUNSAFE) {
2674 ASSERT(!(ccp->cc_flags & KMF_DUMPDIVERT));
2675 /* log it so that we can warn about it */
2676 cp->cache_dump.kd_unsafe++;
2677 } else if (KMEM_DUMPCC(ccp) && !kmem_cache_free_dump(cp, buf)) {
2678 return;
2679 }
2680 if (ccp->cc_flags & KMF_BUFTAG) {
2681 if (kmem_cache_free_debug(cp, buf, caller()) == -1)
2682 return;
2683 }
2684 }
2685
2686 mutex_enter(&ccp->cc_lock);
2687 /*
2688 * Any changes to this logic should be reflected in kmem_slab_prefill()
2689 */
2690 for (;;) {
2691 /*
2692 * If there's a slot available in the current CPU's
2693 * loaded magazine, just put the object there and return.
2694 */
2695 if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
2696 ccp->cc_loaded->mag_round[ccp->cc_rounds++] = buf;
2697 ccp->cc_free++;
2698 mutex_exit(&ccp->cc_lock);
2699 return;
2700 }
2701
2702 /*
2703 * The loaded magazine is full. If the previously loaded
2704 * magazine was empty, exchange them and try again.
2705 */
2706 if (ccp->cc_prounds == 0) {
2707 kmem_cpu_reload(ccp, ccp->cc_ploaded, ccp->cc_prounds);
2708 continue;
2709 }
2710
2711 /*
2712 * If the magazine layer is disabled, break out now.
2713 */
2714 if (ccp->cc_magsize == 0)
2715 break;
2716
2717 if (!kmem_cpucache_magazine_alloc(ccp, cp)) {
2718 /*
2719 * We couldn't free our constructed object to the
2720 * magazine layer, so apply its destructor and free it
2721 * to the slab layer.
2722 */
2723 break;
2724 }
2725 }
2726 mutex_exit(&ccp->cc_lock);
2727 kmem_slab_free_constructed(cp, buf, B_TRUE);
2728 }
2729
2730 static void
2731 kmem_slab_prefill(kmem_cache_t *cp, kmem_slab_t *sp)
2732 {
2733 kmem_cpu_cache_t *ccp = KMEM_CPU_CACHE(cp);
2734 int cache_flags = cp->cache_flags;
2735
2736 kmem_bufctl_t *next, *head;
2737 size_t nbufs;
2738
2739 /*
2740 * Completely allocate the newly created slab and put the pre-allocated
2741 * buffers in magazines. Any of the buffers that cannot be put in
2742 * magazines must be returned to the slab.
2743 */
2744 ASSERT(MUTEX_HELD(&cp->cache_lock));
2745 ASSERT((cache_flags & (KMF_PREFILL|KMF_BUFTAG)) == KMF_PREFILL);
2746 ASSERT(cp->cache_constructor == NULL);
2747 ASSERT(sp->slab_cache == cp);
2748 ASSERT(sp->slab_refcnt == 1);
2749 ASSERT(sp->slab_head != NULL && sp->slab_chunks > sp->slab_refcnt);
2750 ASSERT(avl_find(&cp->cache_partial_slabs, sp, NULL) == NULL);
2751
2752 head = sp->slab_head;
2753 nbufs = (sp->slab_chunks - sp->slab_refcnt);
2754 sp->slab_head = NULL;
2755 sp->slab_refcnt += nbufs;
2756 cp->cache_bufslab -= nbufs;
2757 cp->cache_slab_alloc += nbufs;
2758 list_insert_head(&cp->cache_complete_slabs, sp);
2759 cp->cache_complete_slab_count++;
2760 mutex_exit(&cp->cache_lock);
2761 mutex_enter(&ccp->cc_lock);
2762
2763 while (head != NULL) {
2764 void *buf = KMEM_BUF(cp, head);
2765 /*
2766 * If there's a slot available in the current CPU's
2767 * loaded magazine, just put the object there and
2768 * continue.
2769 */
2770 if ((uint_t)ccp->cc_rounds < ccp->cc_magsize) {
2771 ccp->cc_loaded->mag_round[ccp->cc_rounds++] =
2772 buf;
2773 ccp->cc_free++;
2774 nbufs--;
2775 head = head->bc_next;
2776 continue;
2777 }
2778
2779 /*
2780 * The loaded magazine is full. If the previously
2781 * loaded magazine was empty, exchange them and try
2782 * again.
2783 */
2784 if (ccp->cc_prounds == 0) {
2785 kmem_cpu_reload(ccp, ccp->cc_ploaded,
2786 ccp->cc_prounds);
2787 continue;
2788 }
2789
2790 /*
2791 * If the magazine layer is disabled, break out now.
2792 */
2793
2794 if (ccp->cc_magsize == 0) {
2795 break;
2796 }
2797
2798 if (!kmem_cpucache_magazine_alloc(ccp, cp))
2799 break;
2800 }
2801 mutex_exit(&ccp->cc_lock);
2802 if (nbufs != 0) {
2803 ASSERT(head != NULL);
2804
2805 /*
2806 * If there was a failure, return remaining objects to
2807 * the slab
2808 */
2809 while (head != NULL) {
2810 ASSERT(nbufs != 0);
2811 next = head->bc_next;
2812 head->bc_next = NULL;
2813 kmem_slab_free(cp, KMEM_BUF(cp, head));
2814 head = next;
2815 nbufs--;
2816 }
2817 }
2818 ASSERT(head == NULL);
2819 ASSERT(nbufs == 0);
2820 mutex_enter(&cp->cache_lock);
2821 }
2822
2823 void *
2824 kmem_zalloc(size_t size, int kmflag)
2825 {
2826 size_t index;
2827 void *buf;
2828
2829 if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2830 kmem_cache_t *cp = kmem_alloc_table[index];
2831 buf = kmem_cache_alloc(cp, kmflag);
2832 if (buf != NULL) {
2833 if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) {
2834 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2835 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2836 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2837
2838 if (cp->cache_flags & KMF_LITE) {
2839 KMEM_BUFTAG_LITE_ENTER(btp,
2840 kmem_lite_count, caller());
2841 }
2842 }
2843 bzero(buf, size);
2844 }
2845 } else {
2846 buf = kmem_alloc(size, kmflag);
2847 if (buf != NULL)
2848 bzero(buf, size);
2849 }
2850 return (buf);
2851 }
2852
2853 void *
2854 kmem_alloc(size_t size, int kmflag)
2855 {
2856 size_t index;
2857 kmem_cache_t *cp;
2858 void *buf;
2859
2860 if ((index = ((size - 1) >> KMEM_ALIGN_SHIFT)) < KMEM_ALLOC_TABLE_MAX) {
2861 cp = kmem_alloc_table[index];
2862 /* fall through to kmem_cache_alloc() */
2863
2864 } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2865 kmem_big_alloc_table_max) {
2866 cp = kmem_big_alloc_table[index];
2867 /* fall through to kmem_cache_alloc() */
2868
2869 } else {
2870 if (size == 0) {
2871 if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC))
2872 return (NULL);
2873
2874 /*
2875 * If this is a sleeping allocation or one that has
2876 * been specified to panic on allocation failure, we
2877 * consider it to be deprecated behavior to allocate
2878 * 0 bytes. If we have been configured to panic under
2879 * this condition, we panic; if to warn, we warn -- and
2880 			 * regardless, we log to the kmem_zerosized_log that
2881 			 * this condition has occurred (which gives us enough
2882 			 * information to be able to debug it).
2883 */
2884 if (kmem_panic && kmem_panic_zerosized)
2885 panic("attempted to kmem_alloc() size of 0");
2886
2887 if (kmem_warn_zerosized) {
2888 cmn_err(CE_WARN, "kmem_alloc(): sleeping "
2889 "allocation with size of 0; "
2890 "see kmem_zerosized_log for details");
2891 }
2892
2893 kmem_log_event(kmem_zerosized_log, NULL, NULL, NULL);
2894
2895 return (NULL);
2896 }
2897
2898 buf = vmem_alloc(kmem_oversize_arena, size,
2899 kmflag & KM_VMFLAGS);
2900 if (buf == NULL)
2901 kmem_log_event(kmem_failure_log, NULL, NULL,
2902 (void *)size);
2903 else if (KMEM_DUMP(kmem_slab_cache)) {
2904 /* stats for dump intercept */
2905 kmem_dump_oversize_allocs++;
2906 if (size > kmem_dump_oversize_max)
2907 kmem_dump_oversize_max = size;
2908 }
2909 return (buf);
2910 }
2911
2912 buf = kmem_cache_alloc(cp, kmflag);
2913 if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp) && buf != NULL) {
2914 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2915 ((uint8_t *)buf)[size] = KMEM_REDZONE_BYTE;
2916 ((uint32_t *)btp)[1] = KMEM_SIZE_ENCODE(size);
2917
2918 if (cp->cache_flags & KMF_LITE) {
2919 KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count, caller());
2920 }
2921 }
2922 return (buf);
2923 }
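/*
 * kmem_alloc() chooses its backing store by size: small requests are served
 * by the fixed-size kmem_alloc_table caches, larger ones (up to
 * kmem_max_cached) by the coarser kmem_big_alloc_table caches, and anything
 * beyond that goes straight to kmem_oversize_arena. Zero-sized KM_SLEEP or
 * KM_PANIC requests are treated as deprecated: they are logged to
 * kmem_zerosized_log (and warned about or panicked on according to
 * kmem_warn_zerosized and kmem_panic_zerosized) and return NULL.
 */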
2924
2925 void
2926 kmem_free(void *buf, size_t size)
2927 {
2928 size_t index;
2929 kmem_cache_t *cp;
2930
2931 if ((index = (size - 1) >> KMEM_ALIGN_SHIFT) < KMEM_ALLOC_TABLE_MAX) {
2932 cp = kmem_alloc_table[index];
2933 /* fall through to kmem_cache_free() */
2934
2935 } else if ((index = ((size - 1) >> KMEM_BIG_SHIFT)) <
2936 kmem_big_alloc_table_max) {
2937 cp = kmem_big_alloc_table[index];
2938 /* fall through to kmem_cache_free() */
2939
2940 } else {
2941 EQUIV(buf == NULL, size == 0);
2942 if (buf == NULL && size == 0)
2943 return;
2944 vmem_free(kmem_oversize_arena, buf, size);
2945 return;
2946 }
2947
2948 if ((cp->cache_flags & KMF_BUFTAG) && !KMEM_DUMP(cp)) {
2949 kmem_buftag_t *btp = KMEM_BUFTAG(cp, buf);
2950 uint32_t *ip = (uint32_t *)btp;
2951 if (ip[1] != KMEM_SIZE_ENCODE(size)) {
2952 if (*(uint64_t *)buf == KMEM_FREE_PATTERN) {
2953 kmem_error(KMERR_DUPFREE, cp, buf);
2954 return;
2955 }
2956 if (KMEM_SIZE_VALID(ip[1])) {
2957 ip[0] = KMEM_SIZE_ENCODE(size);
2958 kmem_error(KMERR_BADSIZE, cp, buf);
2959 } else {
2960 kmem_error(KMERR_REDZONE, cp, buf);
2961 }
2962 return;
2963 }
2964 if (((uint8_t *)buf)[size] != KMEM_REDZONE_BYTE) {
2965 kmem_error(KMERR_REDZONE, cp, buf);
2966 return;
2967 }
2968 btp->bt_redzone = KMEM_REDZONE_PATTERN;
2969 if (cp->cache_flags & KMF_LITE) {
2970 KMEM_BUFTAG_LITE_ENTER(btp, kmem_lite_count,
2971 caller());
2972 }
2973 }
2974 kmem_cache_free(cp, buf);
2975 }
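/*
 * The debug checks in kmem_free() rely on the size encoding written at
 * allocation time: the second 32-bit word of the buftag holds
 * KMEM_SIZE_ENCODE(size) and the byte at buf[size] holds KMEM_REDZONE_BYTE.
 * A mismatch therefore distinguishes three failures: a buffer already
 * bearing the free pattern (duplicate free), a valid encoding of a different
 * size (alloc/free size mismatch), or a trashed encoding (redzone
 * overwrite). For the size-mismatch case the freed size is encoded into the
 * first word so that kmem_error() can print both sizes.
 */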
2976
2977 void *
2978 kmem_firewall_va_alloc(vmem_t *vmp, size_t size, int vmflag)
2979 {
2980 size_t realsize = size + vmp->vm_quantum;
2981 void *addr;
2982
2983 /*
2984 * Annoying edge case: if 'size' is just shy of ULONG_MAX, adding
2985 * vm_quantum will cause integer wraparound. Check for this, and
2986 * blow off the firewall page in this case. Note that such a
2987 * giant allocation (the entire kernel address space) can never
2988 * be satisfied, so it will either fail immediately (VM_NOSLEEP)
2989 * or sleep forever (VM_SLEEP). Thus, there is no need for a
2990 * corresponding check in kmem_firewall_va_free().
2991 */
2992 if (realsize < size)
2993 realsize = size;
2994
2995 /*
2996 * While boot still owns resource management, make sure that this
2997 * redzone virtual address allocation is properly accounted for in
2998 	 * OBP's "virtual-memory" "available" lists because we're
2999 * effectively claiming them for a red zone. If we don't do this,
3000 * the available lists become too fragmented and too large for the
3001 * current boot/kernel memory list interface.
3002 */
3003 addr = vmem_alloc(vmp, realsize, vmflag | VM_NEXTFIT);
3004
3005 if (addr != NULL && kvseg.s_base == NULL && realsize != size)
3006 (void) boot_virt_alloc((char *)addr + size, vmp->vm_quantum);
3007
3008 return (addr);
3009 }
3010
3011 void
3012 kmem_firewall_va_free(vmem_t *vmp, void *addr, size_t size)
3013 {
3014 ASSERT((kvseg.s_base == NULL ?
3015 va_to_pfn((char *)addr + size) :
3016 hat_getpfnum(kas.a_hat, (caddr_t)addr + size)) == PFN_INVALID);
3017
3018 vmem_free(vmp, addr, size + vmp->vm_quantum);
3019 }
3020
3021 /*
3022 * Try to allocate at least `size' bytes of memory without sleeping or
3023 * panicking. Return actual allocated size in `asize'. If allocation failed,
3024 * try final allocation with sleep or panic allowed.
3025 */
3026 void *
3027 kmem_alloc_tryhard(size_t size, size_t *asize, int kmflag)
3028 {
3029 void *p;
3030
3031 *asize = P2ROUNDUP(size, KMEM_ALIGN);
3032 do {
3033 p = kmem_alloc(*asize, (kmflag | KM_NOSLEEP) & ~KM_PANIC);
3034 if (p != NULL)
3035 return (p);
3036 *asize += KMEM_ALIGN;
3037 } while (*asize <= PAGESIZE);
3038
3039 *asize = P2ROUNDUP(size, KMEM_ALIGN);
3040 return (kmem_alloc(*asize, kmflag));
3041 }
3042
3043 /*
3044 * Reclaim all unused memory from a cache.
3045 */
3046 static void
3047 kmem_cache_reap(kmem_cache_t *cp)
3048 {
3049 ASSERT(taskq_member(kmem_taskq, curthread));
3050 cp->cache_reap++;
3051
3052 /*
3053 * Ask the cache's owner to free some memory if possible.
3054 * The idea is to handle things like the inode cache, which
3055 * typically sits on a bunch of memory that it doesn't truly
3056 * *need*. Reclaim policy is entirely up to the owner; this
3057 * callback is just an advisory plea for help.
3058 */
3059 if (cp->cache_reclaim != NULL) {
3060 long delta;
3061
3062 /*
3063 * Reclaimed memory should be reapable (not included in the
3064 * depot's working set).
3065 */
3066 delta = cp->cache_full.ml_total;
3067 cp->cache_reclaim(cp->cache_private);
3068 delta = cp->cache_full.ml_total - delta;
3069 if (delta > 0) {
3070 mutex_enter(&cp->cache_depot_lock);
3071 cp->cache_full.ml_reaplimit += delta;
3072 cp->cache_full.ml_min += delta;
3073 mutex_exit(&cp->cache_depot_lock);
3074 }
3075 }
3076
3077 kmem_depot_ws_reap(cp);
3078
3079 if (cp->cache_defrag != NULL && !kmem_move_noreap) {
3080 kmem_cache_defrag(cp);
3081 }
3082 }
3083
3084 static void
3085 kmem_reap_timeout(void *flag_arg)
3086 {
3087 uint32_t *flag = (uint32_t *)flag_arg;
3088
3089 ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
3090 *flag = 0;
3091 }
3092
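/*
 * Clear the reaping flag after kmem_reap_interval ticks, or immediately if
 * the callout subsystem is not yet ready to schedule a timeout.
 */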
3093 static void
3094 kmem_reap_done(void *flag)
3095 {
3096 if (!callout_init_done) {
3097 /* can't schedule a timeout at this point */
3098 kmem_reap_timeout(flag);
3099 } else {
3100 (void) timeout(kmem_reap_timeout, flag, kmem_reap_interval);
3101 }
3102 }
3103
3104 static void
3105 kmem_reap_start(void *flag)
3106 {
3107 ASSERT(flag == &kmem_reaping || flag == &kmem_reaping_idspace);
3108
3109 if (flag == &kmem_reaping) {
3110 kmem_cache_applyall(kmem_cache_reap, kmem_taskq, TQ_NOSLEEP);
3111 /*
3112 * if we have segkp under heap, reap segkp cache.
3113 */
3114 if (segkp_fromheap)
3115 segkp_cache_free();
	} else {
		kmem_cache_applyall_id(kmem_cache_reap, kmem_taskq,
		    TQ_NOSLEEP);
	}
3119
3120 /*
3121 * We use taskq_dispatch() to schedule a timeout to clear
3122 * the flag so that kmem_reap() becomes self-throttling:
3123 * we won't reap again until the current reap completes *and*
3124 * at least kmem_reap_interval ticks have elapsed.
3125 */
3126 if (taskq_dispatch(kmem_taskq, kmem_reap_done, flag, TQ_NOSLEEP) ==
3127 TASKQID_INVALID)
3128 kmem_reap_done(flag);
3129 }
3130
3131 static void
3132 kmem_reap_common(void *flag_arg)
3133 {
3134 uint32_t *flag = (uint32_t *)flag_arg;
3135
3136 if (MUTEX_HELD(&kmem_cache_lock) || kmem_taskq == NULL ||
3137 atomic_cas_32(flag, 0, 1) != 0)
3138 return;
3139
3140 /*
3141 * It may not be kosher to do memory allocation when a reap is called
3142 * (for example, if vmem_populate() is in the call chain). So we
3143 * start the reap going with a TQ_NOALLOC dispatch. If the dispatch
3144 * fails, we reset the flag, and the next reap will try again.
3145 */
3146 if (taskq_dispatch(kmem_taskq, kmem_reap_start, flag, TQ_NOALLOC) ==
3147 TASKQID_INVALID)
3148 *flag = 0;
3149 }
3150
3151 /*
3152 * Reclaim all unused memory from all caches. Called from the VM system
3153 * when memory gets tight.
3154 */
3155 void
3156 kmem_reap(void)
3157 {
3158 kmem_reap_common(&kmem_reaping);
3159 }
3160
3161 /*
3162 * Reclaim all unused memory from identifier arenas, called when a vmem
 * arena not backed by memory is exhausted. Since reaping memory-backed caches
3164 * cannot help with identifier exhaustion, we avoid both a large amount of
3165 * work and unwanted side-effects from reclaim callbacks.
3166 */
3167 void
3168 kmem_reap_idspace(void)
3169 {
3170 kmem_reap_common(&kmem_reaping_idspace);
3171 }
3172
3173 /*
3174 * Purge all magazines from a cache and set its magazine limit to zero.
3175 * All calls are serialized by the kmem_taskq lock, except for the final
3176 * call from kmem_cache_destroy().
3177 */
3178 static void
3179 kmem_cache_magazine_purge(kmem_cache_t *cp)
3180 {
3181 kmem_cpu_cache_t *ccp;
3182 kmem_magazine_t *mp, *pmp;
3183 int rounds, prounds, cpu_seqid;
3184
3185 ASSERT(!list_link_active(&cp->cache_link) ||
3186 taskq_member(kmem_taskq, curthread));
3187 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
3188
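	/*
	 * Detach each CPU's loaded and previously loaded magazines while
	 * holding cc_lock, then destroy them after the lock has been
	 * dropped so that magazine destruction never runs under a per-CPU
	 * cache lock.
	 */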
3189 for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3190 ccp = &cp->cache_cpu[cpu_seqid];
3191
3192 mutex_enter(&ccp->cc_lock);
3193 mp = ccp->cc_loaded;
3194 pmp = ccp->cc_ploaded;
3195 rounds = ccp->cc_rounds;
3196 prounds = ccp->cc_prounds;
3197 ccp->cc_loaded = NULL;
3198 ccp->cc_ploaded = NULL;
3199 ccp->cc_rounds = -1;
3200 ccp->cc_prounds = -1;
3201 ccp->cc_magsize = 0;
3202 mutex_exit(&ccp->cc_lock);
3203
3204 if (mp)
3205 kmem_magazine_destroy(cp, mp, rounds);
3206 if (pmp)
3207 kmem_magazine_destroy(cp, pmp, prounds);
3208 }
3209
3210 kmem_depot_ws_zero(cp);
3211 kmem_depot_ws_reap(cp);
3212 }
3213
3214 /*
3215 * Enable per-cpu magazines on a cache.
3216 */
3217 static void
3218 kmem_cache_magazine_enable(kmem_cache_t *cp)
3219 {
3220 int cpu_seqid;
3221
3222 if (cp->cache_flags & KMF_NOMAGAZINE)
3223 return;
3224
3225 for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3226 kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
3227 mutex_enter(&ccp->cc_lock);
3228 ccp->cc_magsize = cp->cache_magtype->mt_magsize;
3229 mutex_exit(&ccp->cc_lock);
3230 }
}
3233
3234 /*
3235 * Allow our caller to determine if there are running reaps.
3236 *
3237 * This call is very conservative and may return B_TRUE even when
3238 * reaping activity isn't active. If it returns B_FALSE, then reaping
3239 * activity is definitely inactive.
3240 */
3241 boolean_t
3242 kmem_cache_reap_active(void)
3243 {
3244 return (!taskq_empty(kmem_taskq));
3245 }
3246
3247 /*
3248 * Reap (almost) everything soon.
3249 *
3250 * Note: this does not wait for the reap-tasks to complete. Caller
3251 * should use kmem_cache_reap_active() (above) and/or moderation to
3252 * avoid scheduling too many reap-tasks.
3253 */
3254 void
3255 kmem_cache_reap_soon(kmem_cache_t *cp)
3256 {
3257 ASSERT(list_link_active(&cp->cache_link));
3258
3259 kmem_depot_ws_zero(cp);
3260
3261 (void) taskq_dispatch(kmem_taskq,
3262 (task_func_t *)kmem_depot_ws_reap, cp, TQ_SLEEP);
3263 }
3264
3265 /*
3266 * Recompute a cache's magazine size. The trade-off is that larger magazines
3267 * provide a higher transfer rate with the depot, while smaller magazines
3268 * reduce memory consumption. Magazine resizing is an expensive operation;
3269 * it should not be done frequently.
3270 *
3271 * Changes to the magazine size are serialized by the kmem_taskq lock.
3272 *
3273 * Note: at present this only grows the magazine size. It might be useful
3274 * to allow shrinkage too.
3275 */
3276 static void
3277 kmem_cache_magazine_resize(kmem_cache_t *cp)
3278 {
3279 kmem_magtype_t *mtp = cp->cache_magtype;
3280
3281 ASSERT(taskq_member(kmem_taskq, curthread));
3282
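	/*
	 * Move up to the next entry in the kmem_magtype table (ordered by
	 * increasing magazine size). Pushing the contention baseline far
	 * ahead prevents an immediate follow-up resize.
	 */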
3283 if (cp->cache_chunksize < mtp->mt_maxbuf) {
3284 kmem_cache_magazine_purge(cp);
3285 mutex_enter(&cp->cache_depot_lock);
3286 cp->cache_magtype = ++mtp;
3287 cp->cache_depot_contention_prev =
3288 cp->cache_depot_contention + INT_MAX;
3289 mutex_exit(&cp->cache_depot_lock);
3290 kmem_cache_magazine_enable(cp);
3291 }
3292 }
3293
3294 /*
 * Rescale a cache's hash table so that the number of hash buckets is roughly
 * the number of buffers in the cache. We want the average lookup time to be
 * extremely small.
3297 */
3298 static void
3299 kmem_hash_rescale(kmem_cache_t *cp)
3300 {
3301 kmem_bufctl_t **old_table, **new_table, *bcp;
3302 size_t old_size, new_size, h;
3303
3304 ASSERT(taskq_member(kmem_taskq, curthread));
3305
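	/*
	 * Pick a power-of-two bucket count close to the number of buffers
	 * currently in the cache (within roughly a factor of 1.5), but never
	 * smaller than KMEM_HASH_INITIAL.
	 */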
3306 new_size = MAX(KMEM_HASH_INITIAL,
3307 1 << (highbit(3 * cp->cache_buftotal + 4) - 2));
3308 old_size = cp->cache_hash_mask + 1;
3309
3310 if ((old_size >> 1) <= new_size && new_size <= (old_size << 1))
3311 return;
3312
3313 new_table = vmem_alloc(kmem_hash_arena, new_size * sizeof (void *),
3314 VM_NOSLEEP);
3315 if (new_table == NULL)
3316 return;
3317 bzero(new_table, new_size * sizeof (void *));
3318
3319 mutex_enter(&cp->cache_lock);
3320
3321 old_size = cp->cache_hash_mask + 1;
3322 old_table = cp->cache_hash_table;
3323
3324 cp->cache_hash_mask = new_size - 1;
3325 cp->cache_hash_table = new_table;
3326 cp->cache_rescale++;
3327
3328 for (h = 0; h < old_size; h++) {
3329 bcp = old_table[h];
3330 while (bcp != NULL) {
3331 void *addr = bcp->bc_addr;
3332 kmem_bufctl_t *next_bcp = bcp->bc_next;
3333 kmem_bufctl_t **hash_bucket = KMEM_HASH(cp, addr);
3334 bcp->bc_next = *hash_bucket;
3335 *hash_bucket = bcp;
3336 bcp = next_bcp;
3337 }
3338 }
3339
3340 mutex_exit(&cp->cache_lock);
3341
3342 vmem_free(kmem_hash_arena, old_table, old_size * sizeof (void *));
3343 }
3344
3345 /*
3346 * Perform periodic maintenance on a cache: hash rescaling, depot working-set
3347 * update, magazine resizing, and slab consolidation.
3348 */
3349 static void
3350 kmem_cache_update(kmem_cache_t *cp)
3351 {
3352 int need_hash_rescale = 0;
3353 int need_magazine_resize = 0;
3354
3355 ASSERT(MUTEX_HELD(&kmem_cache_lock));
3356
3357 /*
3358 * If the cache has become much larger or smaller than its hash table,
3359 * fire off a request to rescale the hash table.
3360 */
3361 mutex_enter(&cp->cache_lock);
3362
3363 if ((cp->cache_flags & KMF_HASH) &&
3364 (cp->cache_buftotal > (cp->cache_hash_mask << 1) ||
3365 (cp->cache_buftotal < (cp->cache_hash_mask >> 1) &&
3366 cp->cache_hash_mask > KMEM_HASH_INITIAL)))
3367 need_hash_rescale = 1;
3368
3369 mutex_exit(&cp->cache_lock);
3370
3371 /*
3372 * Update the depot working set statistics.
3373 */
3374 kmem_depot_ws_update(cp);
3375
3376 /*
3377 * If there's a lot of contention in the depot,
3378 * increase the magazine size.
3379 */
3380 mutex_enter(&cp->cache_depot_lock);
3381
3382 if (cp->cache_chunksize < cp->cache_magtype->mt_maxbuf &&
3383 (int)(cp->cache_depot_contention -
3384 cp->cache_depot_contention_prev) > kmem_depot_contention)
3385 need_magazine_resize = 1;
3386
3387 cp->cache_depot_contention_prev = cp->cache_depot_contention;
3388
3389 mutex_exit(&cp->cache_depot_lock);
3390
3391 if (need_hash_rescale)
3392 (void) taskq_dispatch(kmem_taskq,
3393 (task_func_t *)kmem_hash_rescale, cp, TQ_NOSLEEP);
3394
3395 if (need_magazine_resize)
3396 (void) taskq_dispatch(kmem_taskq,
3397 (task_func_t *)kmem_cache_magazine_resize, cp, TQ_NOSLEEP);
3398
3399 if (cp->cache_defrag != NULL)
3400 (void) taskq_dispatch(kmem_taskq,
3401 (task_func_t *)kmem_cache_scan, cp, TQ_NOSLEEP);
3402 }
3403
3404 static void kmem_update(void *);
3405
3406 static void
3407 kmem_update_timeout(void *dummy)
3408 {
3409 (void) timeout(kmem_update, dummy, kmem_reap_interval);
3410 }
3411
3412 static void
3413 kmem_update(void *dummy)
3414 {
3415 kmem_cache_applyall(kmem_cache_update, NULL, TQ_NOSLEEP);
3416
3417 /*
3418 * We use taskq_dispatch() to reschedule the timeout so that
3419 * kmem_update() becomes self-throttling: it won't schedule
3420 * new tasks until all previous tasks have completed.
3421 */
3422 if (taskq_dispatch(kmem_taskq, kmem_update_timeout, dummy, TQ_NOSLEEP)
3423 == TASKQID_INVALID)
3424 kmem_update_timeout(NULL);
3425 }
3426
3427 static int
3428 kmem_cache_kstat_update(kstat_t *ksp, int rw)
3429 {
3430 struct kmem_cache_kstat *kmcp = &kmem_cache_kstat;
3431 kmem_cache_t *cp = ksp->ks_private;
3432 uint64_t cpu_buf_avail;
3433 uint64_t buf_avail = 0;
3434 int cpu_seqid;
3435 long reap;
3436
3437 ASSERT(MUTEX_HELD(&kmem_cache_kstat_lock));
3438
3439 if (rw == KSTAT_WRITE)
3440 return (EACCES);
3441
3442 mutex_enter(&cp->cache_lock);
3443
3444 kmcp->kmc_alloc_fail.value.ui64 = cp->cache_alloc_fail;
3445 kmcp->kmc_alloc.value.ui64 = cp->cache_slab_alloc;
3446 kmcp->kmc_free.value.ui64 = cp->cache_slab_free;
3447 kmcp->kmc_slab_alloc.value.ui64 = cp->cache_slab_alloc;
3448 kmcp->kmc_slab_free.value.ui64 = cp->cache_slab_free;
3449
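	/*
	 * Fold in the per-CPU layer: rounds sitting in each CPU's loaded and
	 * previously loaded magazines count as constructed, available
	 * buffers, and the per-CPU alloc/free counts are added to the
	 * cache-wide totals.
	 */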
3450 for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3451 kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
3452
3453 mutex_enter(&ccp->cc_lock);
3454
3455 cpu_buf_avail = 0;
3456 if (ccp->cc_rounds > 0)
3457 cpu_buf_avail += ccp->cc_rounds;
3458 if (ccp->cc_prounds > 0)
3459 cpu_buf_avail += ccp->cc_prounds;
3460
3461 kmcp->kmc_alloc.value.ui64 += ccp->cc_alloc;
3462 kmcp->kmc_free.value.ui64 += ccp->cc_free;
3463 buf_avail += cpu_buf_avail;
3464
3465 mutex_exit(&ccp->cc_lock);
3466 }
3467
3468 mutex_enter(&cp->cache_depot_lock);
3469
3470 kmcp->kmc_depot_alloc.value.ui64 = cp->cache_full.ml_alloc;
3471 kmcp->kmc_depot_free.value.ui64 = cp->cache_empty.ml_alloc;
3472 kmcp->kmc_depot_contention.value.ui64 = cp->cache_depot_contention;
3473 kmcp->kmc_full_magazines.value.ui64 = cp->cache_full.ml_total;
3474 kmcp->kmc_empty_magazines.value.ui64 = cp->cache_empty.ml_total;
3475 kmcp->kmc_magazine_size.value.ui64 =
3476 (cp->cache_flags & KMF_NOMAGAZINE) ?
3477 0 : cp->cache_magtype->mt_magsize;
3478
3479 kmcp->kmc_alloc.value.ui64 += cp->cache_full.ml_alloc;
3480 kmcp->kmc_free.value.ui64 += cp->cache_empty.ml_alloc;
3481 buf_avail += cp->cache_full.ml_total * cp->cache_magtype->mt_magsize;
3482
3483 reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
3484 reap = MIN(reap, cp->cache_full.ml_total);
3485
3486 mutex_exit(&cp->cache_depot_lock);
3487
3488 kmcp->kmc_buf_size.value.ui64 = cp->cache_bufsize;
3489 kmcp->kmc_align.value.ui64 = cp->cache_align;
3490 kmcp->kmc_chunk_size.value.ui64 = cp->cache_chunksize;
3491 kmcp->kmc_slab_size.value.ui64 = cp->cache_slabsize;
3492 kmcp->kmc_buf_constructed.value.ui64 = buf_avail;
3493 buf_avail += cp->cache_bufslab;
3494 kmcp->kmc_buf_avail.value.ui64 = buf_avail;
3495 kmcp->kmc_buf_inuse.value.ui64 = cp->cache_buftotal - buf_avail;
3496 kmcp->kmc_buf_total.value.ui64 = cp->cache_buftotal;
3497 kmcp->kmc_buf_max.value.ui64 = cp->cache_bufmax;
3498 kmcp->kmc_slab_create.value.ui64 = cp->cache_slab_create;
3499 kmcp->kmc_slab_destroy.value.ui64 = cp->cache_slab_destroy;
3500 kmcp->kmc_hash_size.value.ui64 = (cp->cache_flags & KMF_HASH) ?
3501 cp->cache_hash_mask + 1 : 0;
3502 kmcp->kmc_hash_lookup_depth.value.ui64 = cp->cache_lookup_depth;
3503 kmcp->kmc_hash_rescale.value.ui64 = cp->cache_rescale;
3504 kmcp->kmc_vmem_source.value.ui64 = cp->cache_arena->vm_id;
3505 kmcp->kmc_reap.value.ui64 = cp->cache_reap;
3506
3507 if (cp->cache_defrag == NULL) {
3508 kmcp->kmc_move_callbacks.value.ui64 = 0;
3509 kmcp->kmc_move_yes.value.ui64 = 0;
3510 kmcp->kmc_move_no.value.ui64 = 0;
3511 kmcp->kmc_move_later.value.ui64 = 0;
3512 kmcp->kmc_move_dont_need.value.ui64 = 0;
3513 kmcp->kmc_move_dont_know.value.ui64 = 0;
3514 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3515 kmcp->kmc_move_slabs_freed.value.ui64 = 0;
3516 kmcp->kmc_defrag.value.ui64 = 0;
3517 kmcp->kmc_scan.value.ui64 = 0;
3518 kmcp->kmc_move_reclaimable.value.ui64 = 0;
3519 } else {
3520 int64_t reclaimable;
3521
3522 kmem_defrag_t *kd = cp->cache_defrag;
3523 kmcp->kmc_move_callbacks.value.ui64 = kd->kmd_callbacks;
3524 kmcp->kmc_move_yes.value.ui64 = kd->kmd_yes;
3525 kmcp->kmc_move_no.value.ui64 = kd->kmd_no;
3526 kmcp->kmc_move_later.value.ui64 = kd->kmd_later;
3527 kmcp->kmc_move_dont_need.value.ui64 = kd->kmd_dont_need;
3528 kmcp->kmc_move_dont_know.value.ui64 = kd->kmd_dont_know;
3529 kmcp->kmc_move_hunt_found.value.ui64 = 0;
3530 kmcp->kmc_move_slabs_freed.value.ui64 = kd->kmd_slabs_freed;
3531 kmcp->kmc_defrag.value.ui64 = kd->kmd_defrags;
3532 kmcp->kmc_scan.value.ui64 = kd->kmd_scans;
3533
3534 reclaimable = cp->cache_bufslab - (cp->cache_maxchunks - 1);
3535 reclaimable = MAX(reclaimable, 0);
3536 reclaimable += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
3537 kmcp->kmc_move_reclaimable.value.ui64 = reclaimable;
3538 }
3539
3540 mutex_exit(&cp->cache_lock);
3541 return (0);
3542 }
3543
3544 /*
3545 * Return a named statistic about a particular cache.
3546 * This shouldn't be called very often, so it's currently designed for
3547 * simplicity (leverages existing kstat support) rather than efficiency.
3548 */
3549 uint64_t
3550 kmem_cache_stat(kmem_cache_t *cp, char *name)
3551 {
3552 int i;
3553 kstat_t *ksp = cp->cache_kstat;
3554 kstat_named_t *knp = (kstat_named_t *)&kmem_cache_kstat;
3555 uint64_t value = 0;
3556
3557 if (ksp != NULL) {
3558 mutex_enter(&kmem_cache_kstat_lock);
3559 (void) kmem_cache_kstat_update(ksp, KSTAT_READ);
3560 for (i = 0; i < ksp->ks_ndata; i++) {
3561 if (strcmp(knp[i].name, name) == 0) {
3562 value = knp[i].value.ui64;
3563 break;
3564 }
3565 }
3566 mutex_exit(&kmem_cache_kstat_lock);
3567 }
3568 return (value);
3569 }
3570
3571 /*
3572 * Return an estimate of currently available kernel heap memory.
 * On 32-bit systems, physical memory may exceed virtual memory, so we
 * just truncate the result at 1GB.
3575 */
3576 size_t
3577 kmem_avail(void)
3578 {
3579 spgcnt_t rmem = availrmem - tune.t_minarmem;
3580 spgcnt_t fmem = freemem - minfree;
3581
3582 return ((size_t)ptob(MIN(MAX(MIN(rmem, fmem), 0),
3583 1 << (30 - PAGESHIFT))));
3584 }
3585
3586 /*
3587 * Return the maximum amount of memory that is (in theory) allocatable
 * from the heap. This may be used as an estimate only, since there is
 * no guarantee that this space will still be available when an allocation
 * request is made, nor that it can be allocated in one big request due to
 * kernel heap fragmentation.
3592 */
3593 size_t
3594 kmem_maxavail(void)
3595 {
3596 spgcnt_t pmem = availrmem - tune.t_minarmem;
3597 spgcnt_t vmem = btop(vmem_size(heap_arena, VMEM_FREE));
3598
3599 return ((size_t)ptob(MAX(MIN(pmem, vmem), 0)));
3600 }
3601
3602 /*
3603 * Indicate whether memory-intensive kmem debugging is enabled.
3604 */
3605 int
3606 kmem_debugging(void)
3607 {
3608 return (kmem_flags & (KMF_AUDIT | KMF_REDZONE));
3609 }
3610
3611 /* binning function, sorts finely at the two extremes */
3612 #define KMEM_PARTIAL_SLAB_WEIGHT(sp, binshift) \
3613 ((((sp)->slab_refcnt <= (binshift)) || \
3614 (((sp)->slab_chunks - (sp)->slab_refcnt) <= (binshift))) \
3615 ? -(sp)->slab_refcnt \
3616 : -((binshift) + ((sp)->slab_refcnt >> (binshift))))
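/*
 * For example, with binshift = 3, a partial slab with at most 3 allocated
 * buffers, or within 3 buffers of being full, is weighted by its exact
 * slab_refcnt; slabs in between collapse into bins of 8 refcnt values, so
 * small refcnt changes in the middle of the range do not alter a slab's
 * weight.
 */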
3617
3618 /*
3619 * Minimizing the number of partial slabs on the freelist minimizes
3620 * fragmentation (the ratio of unused buffers held by the slab layer). There are
3621 * two ways to get a slab off of the freelist: 1) free all the buffers on the
3622 * slab, and 2) allocate all the buffers on the slab. It follows that we want
3623 * the most-used slabs at the front of the list where they have the best chance
3624 * of being completely allocated, and the least-used slabs at a safe distance
3625 * from the front to improve the odds that the few remaining buffers will all be
3626 * freed before another allocation can tie up the slab. For that reason a slab
 * with a higher slab_refcnt sorts less than a slab with a lower
3628 * slab_refcnt.
3629 *
3630 * However, if a slab has at least one buffer that is deemed unfreeable, we
3631 * would rather have that slab at the front of the list regardless of
3632 * slab_refcnt, since even one unfreeable buffer makes the entire slab
3633 * unfreeable. If the client returns KMEM_CBRC_NO in response to a cache_move()
3634 * callback, the slab is marked unfreeable for as long as it remains on the
3635 * freelist.
3636 */
3637 static int
3638 kmem_partial_slab_cmp(const void *p0, const void *p1)
3639 {
3640 const kmem_cache_t *cp;
3641 const kmem_slab_t *s0 = p0;
3642 const kmem_slab_t *s1 = p1;
3643 int w0, w1;
3644 size_t binshift;
3645
3646 ASSERT(KMEM_SLAB_IS_PARTIAL(s0));
3647 ASSERT(KMEM_SLAB_IS_PARTIAL(s1));
3648 ASSERT(s0->slab_cache == s1->slab_cache);
3649 cp = s1->slab_cache;
3650 ASSERT(MUTEX_HELD(&cp->cache_lock));
3651 binshift = cp->cache_partial_binshift;
3652
3653 /* weight of first slab */
3654 w0 = KMEM_PARTIAL_SLAB_WEIGHT(s0, binshift);
3655 if (s0->slab_flags & KMEM_SLAB_NOMOVE) {
3656 w0 -= cp->cache_maxchunks;
3657 }
3658
3659 /* weight of second slab */
3660 w1 = KMEM_PARTIAL_SLAB_WEIGHT(s1, binshift);
3661 if (s1->slab_flags & KMEM_SLAB_NOMOVE) {
3662 w1 -= cp->cache_maxchunks;
3663 }
3664
3665 if (w0 < w1)
3666 return (-1);
3667 if (w0 > w1)
3668 return (1);
3669
3670 /* compare pointer values */
3671 if ((uintptr_t)s0 < (uintptr_t)s1)
3672 return (-1);
3673 if ((uintptr_t)s0 > (uintptr_t)s1)
3674 return (1);
3675
3676 return (0);
3677 }
3678
3679 /*
3680 * It must be valid to call the destructor (if any) on a newly created object.
3681 * That is, the constructor (if any) must leave the object in a valid state for
3682 * the destructor.
3683 */
3684 kmem_cache_t *
3685 kmem_cache_create(
3686 char *name, /* descriptive name for this cache */
3687 size_t bufsize, /* size of the objects it manages */
3688 size_t align, /* required object alignment */
3689 int (*constructor)(void *, void *, int), /* object constructor */
3690 void (*destructor)(void *, void *), /* object destructor */
3691 void (*reclaim)(void *), /* memory reclaim callback */
3692 void *private, /* pass-thru arg for constr/destr/reclaim */
3693 vmem_t *vmp, /* vmem source for slab allocation */
3694 int cflags) /* cache creation flags */
3695 {
3696 int cpu_seqid;
3697 size_t chunksize;
3698 kmem_cache_t *cp;
3699 kmem_magtype_t *mtp;
3700 size_t csize = KMEM_CACHE_SIZE(max_ncpus);
3701
3702 #ifdef DEBUG
3703 /*
3704 * Cache names should conform to the rules for valid C identifiers
3705 */
3706 if (!strident_valid(name)) {
3707 cmn_err(CE_CONT,
3708 "kmem_cache_create: '%s' is an invalid cache name\n"
3709 "cache names must conform to the rules for "
3710 "C identifiers\n", name);
3711 }
3712 #endif /* DEBUG */
3713
3714 if (vmp == NULL)
3715 vmp = kmem_default_arena;
3716
3717 /*
3718 * If this kmem cache has an identifier vmem arena as its source, mark
3719 * it such to allow kmem_reap_idspace().
3720 */
3721 ASSERT(!(cflags & KMC_IDENTIFIER)); /* consumer should not set this */
3722 if (vmp->vm_cflags & VMC_IDENTIFIER)
3723 cflags |= KMC_IDENTIFIER;
3724
3725 /*
3726 * Get a kmem_cache structure. We arrange that cp->cache_cpu[]
3727 * is aligned on a KMEM_CPU_CACHE_SIZE boundary to prevent
3728 * false sharing of per-CPU data.
3729 */
3730 cp = vmem_xalloc(kmem_cache_arena, csize, KMEM_CPU_CACHE_SIZE,
3731 P2NPHASE(csize, KMEM_CPU_CACHE_SIZE), 0, NULL, NULL, VM_SLEEP);
3732 bzero(cp, csize);
3733 list_link_init(&cp->cache_link);
3734
3735 if (align == 0)
3736 align = KMEM_ALIGN;
3737
3738 /*
3739 * If we're not at least KMEM_ALIGN aligned, we can't use free
3740 * memory to hold bufctl information (because we can't safely
3741 * perform word loads and stores on it).
3742 */
3743 if (align < KMEM_ALIGN)
3744 cflags |= KMC_NOTOUCH;
3745
3746 if (!ISP2(align) || align > vmp->vm_quantum)
3747 panic("kmem_cache_create: bad alignment %lu", align);
3748
3749 mutex_enter(&kmem_flags_lock);
3750 if (kmem_flags & KMF_RANDOMIZE)
3751 kmem_flags = (((kmem_flags | ~KMF_RANDOM) + 1) & KMF_RANDOM) |
3752 KMF_RANDOMIZE;
3753 cp->cache_flags = (kmem_flags | cflags) & KMF_DEBUG;
3754 mutex_exit(&kmem_flags_lock);
3755
3756 /*
3757 * Make sure all the various flags are reasonable.
3758 */
3759 ASSERT(!(cflags & KMC_NOHASH) || !(cflags & KMC_NOTOUCH));
3760
3761 if (cp->cache_flags & KMF_LITE) {
3762 if (bufsize >= kmem_lite_minsize &&
3763 align <= kmem_lite_maxalign &&
3764 P2PHASE(bufsize, kmem_lite_maxalign) != 0) {
3765 cp->cache_flags |= KMF_BUFTAG;
3766 cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
3767 } else {
3768 cp->cache_flags &= ~KMF_DEBUG;
3769 }
3770 }
3771
3772 if (cp->cache_flags & KMF_DEADBEEF)
3773 cp->cache_flags |= KMF_REDZONE;
3774
3775 if ((cflags & KMC_QCACHE) && (cp->cache_flags & KMF_AUDIT))
3776 cp->cache_flags |= KMF_NOMAGAZINE;
3777
3778 if (cflags & KMC_NODEBUG)
3779 cp->cache_flags &= ~KMF_DEBUG;
3780
3781 if (cflags & KMC_NOTOUCH)
3782 cp->cache_flags &= ~KMF_TOUCH;
3783
3784 if (cflags & KMC_PREFILL)
3785 cp->cache_flags |= KMF_PREFILL;
3786
3787 if (cflags & KMC_NOHASH)
3788 cp->cache_flags &= ~(KMF_AUDIT | KMF_FIREWALL);
3789
3790 if (cflags & KMC_NOMAGAZINE)
3791 cp->cache_flags |= KMF_NOMAGAZINE;
3792
3793 if ((cp->cache_flags & KMF_AUDIT) && !(cflags & KMC_NOTOUCH))
3794 cp->cache_flags |= KMF_REDZONE;
3795
3796 if (!(cp->cache_flags & KMF_AUDIT))
3797 cp->cache_flags &= ~KMF_CONTENTS;
3798
3799 if ((cp->cache_flags & KMF_BUFTAG) && bufsize >= kmem_minfirewall &&
3800 !(cp->cache_flags & KMF_LITE) && !(cflags & KMC_NOHASH))
3801 cp->cache_flags |= KMF_FIREWALL;
3802
3803 if (vmp != kmem_default_arena || kmem_firewall_arena == NULL)
3804 cp->cache_flags &= ~KMF_FIREWALL;
3805
3806 if (cp->cache_flags & KMF_FIREWALL) {
3807 cp->cache_flags &= ~KMF_BUFTAG;
3808 cp->cache_flags |= KMF_NOMAGAZINE;
3809 ASSERT(vmp == kmem_default_arena);
3810 vmp = kmem_firewall_arena;
3811 }
3812
3813 /*
3814 * Set cache properties.
3815 */
3816 (void) strncpy(cp->cache_name, name, KMEM_CACHE_NAMELEN);
3817 strident_canon(cp->cache_name, KMEM_CACHE_NAMELEN + 1);
3818 cp->cache_bufsize = bufsize;
3819 cp->cache_align = align;
3820 cp->cache_constructor = constructor;
3821 cp->cache_destructor = destructor;
3822 cp->cache_reclaim = reclaim;
3823 cp->cache_private = private;
3824 cp->cache_arena = vmp;
3825 cp->cache_cflags = cflags;
3826
3827 /*
3828 * Determine the chunk size.
3829 */
3830 chunksize = bufsize;
3831
3832 if (align >= KMEM_ALIGN) {
3833 chunksize = P2ROUNDUP(chunksize, KMEM_ALIGN);
3834 cp->cache_bufctl = chunksize - KMEM_ALIGN;
3835 }
3836
3837 if (cp->cache_flags & KMF_BUFTAG) {
3838 cp->cache_bufctl = chunksize;
3839 cp->cache_buftag = chunksize;
3840 if (cp->cache_flags & KMF_LITE)
3841 chunksize += KMEM_BUFTAG_LITE_SIZE(kmem_lite_count);
3842 else
3843 chunksize += sizeof (kmem_buftag_t);
3844 }
3845
3846 if (cp->cache_flags & KMF_DEADBEEF) {
3847 cp->cache_verify = MIN(cp->cache_buftag, kmem_maxverify);
3848 if (cp->cache_flags & KMF_LITE)
3849 cp->cache_verify = sizeof (uint64_t);
3850 }
3851
3852 cp->cache_contents = MIN(cp->cache_bufctl, kmem_content_maxsave);
3853
3854 cp->cache_chunksize = chunksize = P2ROUNDUP(chunksize, align);
3855
3856 /*
3857 * Now that we know the chunk size, determine the optimal slab size.
3858 */
3859 if (vmp == kmem_firewall_arena) {
3860 cp->cache_slabsize = P2ROUNDUP(chunksize, vmp->vm_quantum);
3861 cp->cache_mincolor = cp->cache_slabsize - chunksize;
3862 cp->cache_maxcolor = cp->cache_mincolor;
3863 cp->cache_flags |= KMF_HASH;
3864 ASSERT(!(cp->cache_flags & KMF_BUFTAG));
3865 } else if ((cflags & KMC_NOHASH) || (!(cflags & KMC_NOTOUCH) &&
3866 !(cp->cache_flags & KMF_AUDIT) &&
3867 chunksize < vmp->vm_quantum / KMEM_VOID_FRACTION)) {
3868 cp->cache_slabsize = vmp->vm_quantum;
3869 cp->cache_mincolor = 0;
3870 cp->cache_maxcolor =
3871 (cp->cache_slabsize - sizeof (kmem_slab_t)) % chunksize;
3872 ASSERT(chunksize + sizeof (kmem_slab_t) <= cp->cache_slabsize);
3873 ASSERT(!(cp->cache_flags & KMF_AUDIT));
3874 } else {
3875 size_t chunks, bestfit, waste, slabsize;
3876 size_t minwaste = LONG_MAX;
3877
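		/*
		 * Try slab sizes of 1 through KMEM_VOID_FRACTION chunks,
		 * each rounded up to the arena quantum, and remember the
		 * size that wastes the least space per chunk.
		 */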
3878 bestfit = 0;
3879 for (chunks = 1; chunks <= KMEM_VOID_FRACTION; chunks++) {
3880 slabsize = P2ROUNDUP(chunksize * chunks,
3881 vmp->vm_quantum);
3882 chunks = slabsize / chunksize;
3883 waste = (slabsize % chunksize) / chunks;
3884 if (waste < minwaste) {
3885 minwaste = waste;
3886 bestfit = slabsize;
3887 }
3888 }
3889 if (cflags & KMC_QCACHE)
3890 bestfit = VMEM_QCACHE_SLABSIZE(vmp->vm_qcache_max);
3891 cp->cache_slabsize = bestfit;
3892 cp->cache_mincolor = 0;
3893 cp->cache_maxcolor = bestfit % chunksize;
3894 cp->cache_flags |= KMF_HASH;
3895 }
3896
3897 cp->cache_maxchunks = (cp->cache_slabsize / cp->cache_chunksize);
3898 cp->cache_partial_binshift = highbit(cp->cache_maxchunks / 16) + 1;
3899
3900 /*
3901 * Disallowing prefill when either the DEBUG or HASH flag is set or when
3902 * there is a constructor avoids some tricky issues with debug setup
3903 * that may be revisited later. We cannot allow prefill in a
3904 * metadata cache because of potential recursion.
3905 */
3906 if (vmp == kmem_msb_arena ||
3907 cp->cache_flags & (KMF_HASH | KMF_BUFTAG) ||
3908 cp->cache_constructor != NULL)
3909 cp->cache_flags &= ~KMF_PREFILL;
3910
3911 if (cp->cache_flags & KMF_HASH) {
3912 ASSERT(!(cflags & KMC_NOHASH));
3913 cp->cache_bufctl_cache = (cp->cache_flags & KMF_AUDIT) ?
3914 kmem_bufctl_audit_cache : kmem_bufctl_cache;
3915 }
3916
3917 if (cp->cache_maxcolor >= vmp->vm_quantum)
3918 cp->cache_maxcolor = vmp->vm_quantum - 1;
3919
3920 cp->cache_color = cp->cache_mincolor;
3921
3922 /*
3923 * Initialize the rest of the slab layer.
3924 */
3925 mutex_init(&cp->cache_lock, NULL, MUTEX_DEFAULT, NULL);
3926
3927 avl_create(&cp->cache_partial_slabs, kmem_partial_slab_cmp,
3928 sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link));
3929 /* LINTED: E_TRUE_LOGICAL_EXPR */
3930 ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t));
3931 /* reuse partial slab AVL linkage for complete slab list linkage */
3932 list_create(&cp->cache_complete_slabs,
3933 sizeof (kmem_slab_t), offsetof(kmem_slab_t, slab_link));
3934
3935 if (cp->cache_flags & KMF_HASH) {
3936 cp->cache_hash_table = vmem_alloc(kmem_hash_arena,
3937 KMEM_HASH_INITIAL * sizeof (void *), VM_SLEEP);
3938 bzero(cp->cache_hash_table,
3939 KMEM_HASH_INITIAL * sizeof (void *));
3940 cp->cache_hash_mask = KMEM_HASH_INITIAL - 1;
3941 cp->cache_hash_shift = highbit((ulong_t)chunksize) - 1;
3942 }
3943
3944 /*
3945 * Initialize the depot.
3946 */
3947 mutex_init(&cp->cache_depot_lock, NULL, MUTEX_DEFAULT, NULL);
3948
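	/*
	 * Select the first magazine type whose minimum buffer size is below
	 * this cache's chunk size; the kmem_magtype table is ordered so that
	 * larger chunks get smaller magazines.
	 */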
3949 for (mtp = kmem_magtype; chunksize <= mtp->mt_minbuf; mtp++)
3950 continue;
3951
3952 cp->cache_magtype = mtp;
3953
3954 /*
3955 * Initialize the CPU layer.
3956 */
3957 for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++) {
3958 kmem_cpu_cache_t *ccp = &cp->cache_cpu[cpu_seqid];
3959 mutex_init(&ccp->cc_lock, NULL, MUTEX_DEFAULT, NULL);
3960 ccp->cc_flags = cp->cache_flags;
3961 ccp->cc_rounds = -1;
3962 ccp->cc_prounds = -1;
3963 }
3964
3965 /*
3966 * Create the cache's kstats.
3967 */
3968 if ((cp->cache_kstat = kstat_create("unix", 0, cp->cache_name,
3969 "kmem_cache", KSTAT_TYPE_NAMED,
3970 sizeof (kmem_cache_kstat) / sizeof (kstat_named_t),
3971 KSTAT_FLAG_VIRTUAL)) != NULL) {
3972 cp->cache_kstat->ks_data = &kmem_cache_kstat;
3973 cp->cache_kstat->ks_update = kmem_cache_kstat_update;
3974 cp->cache_kstat->ks_private = cp;
3975 cp->cache_kstat->ks_lock = &kmem_cache_kstat_lock;
3976 kstat_install(cp->cache_kstat);
3977 }
3978
3979 /*
3980 * Add the cache to the global list. This makes it visible
3981 * to kmem_update(), so the cache must be ready for business.
3982 */
3983 mutex_enter(&kmem_cache_lock);
3984 list_insert_tail(&kmem_caches, cp);
3985 mutex_exit(&kmem_cache_lock);
3986
3987 if (kmem_ready)
3988 kmem_cache_magazine_enable(cp);
3989
3990 return (cp);
3991 }
3992
3993 static int
3994 kmem_move_cmp(const void *buf, const void *p)
3995 {
3996 const kmem_move_t *kmm = p;
3997 uintptr_t v1 = (uintptr_t)buf;
3998 uintptr_t v2 = (uintptr_t)kmm->kmm_from_buf;
3999 return (v1 < v2 ? -1 : (v1 > v2 ? 1 : 0));
4000 }
4001
4002 static void
4003 kmem_reset_reclaim_threshold(kmem_defrag_t *kmd)
4004 {
4005 kmd->kmd_reclaim_numer = 1;
4006 }
4007
4008 /*
4009 * Initially, when choosing candidate slabs for buffers to move, we want to be
4010 * very selective and take only slabs that are less than
4011 * (1 / KMEM_VOID_FRACTION) allocated. If we have difficulty finding candidate
4012 * slabs, then we raise the allocation ceiling incrementally. The reclaim
4013 * threshold is reset to (1 / KMEM_VOID_FRACTION) as soon as the cache is no
4014 * longer fragmented.
4015 */
4016 static void
4017 kmem_adjust_reclaim_threshold(kmem_defrag_t *kmd, int direction)
4018 {
4019 if (direction > 0) {
4020 /* make it easier to find a candidate slab */
4021 if (kmd->kmd_reclaim_numer < (KMEM_VOID_FRACTION - 1)) {
4022 kmd->kmd_reclaim_numer++;
4023 }
4024 } else {
4025 /* be more selective */
4026 if (kmd->kmd_reclaim_numer > 1) {
4027 kmd->kmd_reclaim_numer--;
4028 }
4029 }
4030 }
4031
4032 void
4033 kmem_cache_set_move(kmem_cache_t *cp,
4034 kmem_cbrc_t (*move)(void *, void *, size_t, void *))
4035 {
4036 kmem_defrag_t *defrag;
4037
4038 ASSERT(move != NULL);
4039 /*
4040 * The consolidator does not support NOTOUCH caches because kmem cannot
4041 * initialize their slabs with the 0xbaddcafe memory pattern, which sets
4042 * a low order bit usable by clients to distinguish uninitialized memory
4043 * from known objects (see kmem_slab_create).
4044 */
4045 ASSERT(!(cp->cache_cflags & KMC_NOTOUCH));
4046 ASSERT(!(cp->cache_cflags & KMC_IDENTIFIER));
4047
4048 /*
4049 * We should not be holding anyone's cache lock when calling
4050 * kmem_cache_alloc(), so allocate in all cases before acquiring the
4051 * lock.
4052 */
4053 defrag = kmem_cache_alloc(kmem_defrag_cache, KM_SLEEP);
4054
4055 mutex_enter(&cp->cache_lock);
4056
4057 if (KMEM_IS_MOVABLE(cp)) {
4058 if (cp->cache_move == NULL) {
4059 ASSERT(cp->cache_slab_alloc == 0);
4060
4061 cp->cache_defrag = defrag;
4062 defrag = NULL; /* nothing to free */
4063 bzero(cp->cache_defrag, sizeof (kmem_defrag_t));
4064 avl_create(&cp->cache_defrag->kmd_moves_pending,
4065 kmem_move_cmp, sizeof (kmem_move_t),
4066 offsetof(kmem_move_t, kmm_entry));
4067 /* LINTED: E_TRUE_LOGICAL_EXPR */
4068 ASSERT(sizeof (list_node_t) <= sizeof (avl_node_t));
4069 /* reuse the slab's AVL linkage for deadlist linkage */
4070 list_create(&cp->cache_defrag->kmd_deadlist,
4071 sizeof (kmem_slab_t),
4072 offsetof(kmem_slab_t, slab_link));
4073 kmem_reset_reclaim_threshold(cp->cache_defrag);
4074 }
4075 cp->cache_move = move;
4076 }
4077
4078 mutex_exit(&cp->cache_lock);
4079
4080 if (defrag != NULL) {
4081 kmem_cache_free(kmem_defrag_cache, defrag); /* unused */
4082 }
4083 }
4084
4085 void
4086 kmem_cache_destroy(kmem_cache_t *cp)
4087 {
4088 int cpu_seqid;
4089
4090 /*
4091 * Remove the cache from the global cache list so that no one else
4092 * can schedule tasks on its behalf, wait for any pending tasks to
4093 * complete, purge the cache, and then destroy it.
4094 */
4095 mutex_enter(&kmem_cache_lock);
4096 list_remove(&kmem_caches, cp);
4097 mutex_exit(&kmem_cache_lock);
4098
4099 if (kmem_taskq != NULL)
4100 taskq_wait(kmem_taskq);
4101
4102 if (kmem_move_taskq != NULL && cp->cache_defrag != NULL)
4103 taskq_wait(kmem_move_taskq);
4104
4105 kmem_cache_magazine_purge(cp);
4106
4107 mutex_enter(&cp->cache_lock);
4108 if (cp->cache_buftotal != 0)
4109 cmn_err(CE_WARN, "kmem_cache_destroy: '%s' (%p) not empty",
4110 cp->cache_name, (void *)cp);
4111 if (cp->cache_defrag != NULL) {
4112 avl_destroy(&cp->cache_defrag->kmd_moves_pending);
4113 list_destroy(&cp->cache_defrag->kmd_deadlist);
4114 kmem_cache_free(kmem_defrag_cache, cp->cache_defrag);
4115 cp->cache_defrag = NULL;
4116 }
4117 /*
4118 * The cache is now dead. There should be no further activity. We
4119 * enforce this by setting land mines in the constructor, destructor,
4120 * reclaim, and move routines that induce a kernel text fault if
4121 * invoked.
4122 */
4123 cp->cache_constructor = (int (*)(void *, void *, int))1;
4124 cp->cache_destructor = (void (*)(void *, void *))2;
4125 cp->cache_reclaim = (void (*)(void *))3;
4126 cp->cache_move = (kmem_cbrc_t (*)(void *, void *, size_t, void *))4;
4127 mutex_exit(&cp->cache_lock);
4128
4129 kstat_delete(cp->cache_kstat);
4130
4131 if (cp->cache_hash_table != NULL)
4132 vmem_free(kmem_hash_arena, cp->cache_hash_table,
4133 (cp->cache_hash_mask + 1) * sizeof (void *));
4134
4135 for (cpu_seqid = 0; cpu_seqid < max_ncpus; cpu_seqid++)
4136 mutex_destroy(&cp->cache_cpu[cpu_seqid].cc_lock);
4137
4138 mutex_destroy(&cp->cache_depot_lock);
4139 mutex_destroy(&cp->cache_lock);
4140
4141 vmem_free(kmem_cache_arena, cp, KMEM_CACHE_SIZE(max_ncpus));
4142 }
4143
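/*
 * CPU reconfiguration callback. When a CPU is unconfigured, purge every
 * cache's magazine layer, which returns any buffers cached on the departing
 * CPU, and then re-enable per-CPU magazines.
 */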
4144 /*ARGSUSED*/
4145 static int
4146 kmem_cpu_setup(cpu_setup_t what, int id, void *arg)
4147 {
4148 ASSERT(MUTEX_HELD(&cpu_lock));
4149 if (what == CPU_UNCONFIG) {
4150 kmem_cache_applyall(kmem_cache_magazine_purge,
4151 kmem_taskq, TQ_SLEEP);
4152 kmem_cache_applyall(kmem_cache_magazine_enable,
4153 kmem_taskq, TQ_SLEEP);
4154 }
4155 return (0);
4156 }
4157
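/*
 * Create the kmem_alloc_<size> caches named by 'array' and populate
 * 'alloc_table' so that kmem_alloc() and kmem_free() can map a request size
 * to its cache with a shift and a table lookup. Every table entry from the
 * previous cache size up to 'cache_size' points at the newly created cache.
 */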
4158 static void
4159 kmem_alloc_caches_create(const int *array, size_t count,
4160 kmem_cache_t **alloc_table, size_t maxbuf, uint_t shift)
4161 {
4162 char name[KMEM_CACHE_NAMELEN + 1];
4163 size_t table_unit = (1 << shift); /* range of one alloc_table entry */
4164 size_t size = table_unit;
4165 int i;
4166
4167 for (i = 0; i < count; i++) {
4168 size_t cache_size = array[i];
4169 size_t align = KMEM_ALIGN;
4170 kmem_cache_t *cp;
4171
4172 /* if the table has an entry for maxbuf, we're done */
4173 if (size > maxbuf)
4174 break;
4175
4176 /* cache size must be a multiple of the table unit */
4177 ASSERT(P2PHASE(cache_size, table_unit) == 0);
4178
4179 /*
4180 * If they allocate a multiple of the coherency granularity,
4181 * they get a coherency-granularity-aligned address.
4182 */
4183 if (IS_P2ALIGNED(cache_size, 64))
4184 align = 64;
4185 if (IS_P2ALIGNED(cache_size, PAGESIZE))
4186 align = PAGESIZE;
4187 (void) snprintf(name, sizeof (name),
4188 "kmem_alloc_%lu", cache_size);
4189 cp = kmem_cache_create(name, cache_size, align,
4190 NULL, NULL, NULL, NULL, NULL, KMC_KMEM_ALLOC);
4191
4192 while (size <= cache_size) {
4193 alloc_table[(size - 1) >> shift] = cp;
4194 size += table_unit;
4195 }
4196 }
4197
4198 ASSERT(size > maxbuf); /* i.e. maxbuf <= max(cache_size) */
4199 }
4200
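/*
 * Create the magazine, slab, and bufctl metadata caches along with the
 * default caches that back kmem_alloc(). kmem_init() calls this twice:
 * in pass 1, before /etc/system has been read, the kmem_alloc caches are
 * treated as metadata; pass 2 recreates everything with the final
 * kmem_flags and, if requested, a large-page kmem_default_arena.
 */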
4201 static void
4202 kmem_cache_init(int pass, int use_large_pages)
4203 {
4204 int i;
4205 size_t maxbuf;
4206 kmem_magtype_t *mtp;
4207
4208 for (i = 0; i < sizeof (kmem_magtype) / sizeof (*mtp); i++) {
4209 char name[KMEM_CACHE_NAMELEN + 1];
4210
4211 mtp = &kmem_magtype[i];
4212 (void) sprintf(name, "kmem_magazine_%d", mtp->mt_magsize);
4213 mtp->mt_cache = kmem_cache_create(name,
4214 (mtp->mt_magsize + 1) * sizeof (void *),
4215 mtp->mt_align, NULL, NULL, NULL, NULL,
4216 kmem_msb_arena, KMC_NOHASH);
4217 }
4218
4219 kmem_slab_cache = kmem_cache_create("kmem_slab_cache",
4220 sizeof (kmem_slab_t), 0, NULL, NULL, NULL, NULL,
4221 kmem_msb_arena, KMC_NOHASH);
4222
4223 kmem_bufctl_cache = kmem_cache_create("kmem_bufctl_cache",
4224 sizeof (kmem_bufctl_t), 0, NULL, NULL, NULL, NULL,
4225 kmem_msb_arena, KMC_NOHASH);
4226
4227 kmem_bufctl_audit_cache = kmem_cache_create("kmem_bufctl_audit_cache",
4228 sizeof (kmem_bufctl_audit_t), 0, NULL, NULL, NULL, NULL,
4229 kmem_msb_arena, KMC_NOHASH);
4230
4231 if (pass == 2) {
4232 kmem_va_arena = vmem_create("kmem_va",
4233 NULL, 0, PAGESIZE,
4234 vmem_alloc, vmem_free, heap_arena,
4235 8 * PAGESIZE, VM_SLEEP);
4236
4237 if (use_large_pages) {
4238 kmem_default_arena = vmem_xcreate("kmem_default",
4239 NULL, 0, PAGESIZE,
4240 segkmem_alloc_lp, segkmem_free_lp, kmem_va_arena,
4241 0, VMC_DUMPSAFE | VM_SLEEP);
4242 } else {
4243 kmem_default_arena = vmem_create("kmem_default",
4244 NULL, 0, PAGESIZE,
4245 segkmem_alloc, segkmem_free, kmem_va_arena,
4246 0, VMC_DUMPSAFE | VM_SLEEP);
4247 }
4248
4249 /* Figure out what our maximum cache size is */
4250 maxbuf = kmem_max_cached;
4251 if (maxbuf <= KMEM_MAXBUF) {
4252 maxbuf = 0;
4253 kmem_max_cached = KMEM_MAXBUF;
4254 } else {
4255 size_t size = 0;
4256 size_t max =
4257 sizeof (kmem_big_alloc_sizes) / sizeof (int);
4258 /*
4259 * Round maxbuf up to an existing cache size. If maxbuf
4260 * is larger than the largest cache, we truncate it to
4261 * the largest cache's size.
4262 */
4263 for (i = 0; i < max; i++) {
4264 size = kmem_big_alloc_sizes[i];
4265 if (maxbuf <= size)
4266 break;
4267 }
4268 kmem_max_cached = maxbuf = size;
4269 }
4270
4271 /*
4272 * The big alloc table may not be completely overwritten, so
4273 * we clear out any stale cache pointers from the first pass.
4274 */
4275 bzero(kmem_big_alloc_table, sizeof (kmem_big_alloc_table));
4276 } else {
4277 /*
4278 * During the first pass, the kmem_alloc_* caches
4279 * are treated as metadata.
4280 */
4281 kmem_default_arena = kmem_msb_arena;
4282 maxbuf = KMEM_BIG_MAXBUF_32BIT;
4283 }
4284
4285 /*
4286 * Set up the default caches to back kmem_alloc()
4287 */
4288 kmem_alloc_caches_create(
4289 kmem_alloc_sizes, sizeof (kmem_alloc_sizes) / sizeof (int),
4290 kmem_alloc_table, KMEM_MAXBUF, KMEM_ALIGN_SHIFT);
4291
4292 kmem_alloc_caches_create(
4293 kmem_big_alloc_sizes, sizeof (kmem_big_alloc_sizes) / sizeof (int),
4294 kmem_big_alloc_table, maxbuf, KMEM_BIG_SHIFT);
4295
4296 kmem_big_alloc_table_max = maxbuf >> KMEM_BIG_SHIFT;
4297 }
4298
4299 void
4300 kmem_init(void)
4301 {
4302 kmem_cache_t *cp;
4303 int old_kmem_flags = kmem_flags;
4304 int use_large_pages = 0;
4305 size_t maxverify, minfirewall;
4306
4307 kstat_init();
4308
4309 /*
4310 * Don't do firewalled allocations if the heap is less than 1TB
	 * (i.e. on a 32-bit kernel).
4312 * The resulting VM_NEXTFIT allocations would create too much
4313 * fragmentation in a small heap.
4314 */
4315 #if defined(_LP64)
4316 maxverify = minfirewall = PAGESIZE / 2;
4317 #else
4318 maxverify = minfirewall = ULONG_MAX;
4319 #endif
4320
4321 /* LINTED */
4322 ASSERT(sizeof (kmem_cpu_cache_t) == KMEM_CPU_CACHE_SIZE);
4323
4324 list_create(&kmem_caches, sizeof (kmem_cache_t),
4325 offsetof(kmem_cache_t, cache_link));
4326
4327 kmem_metadata_arena = vmem_create("kmem_metadata", NULL, 0, PAGESIZE,
4328 vmem_alloc, vmem_free, heap_arena, 8 * PAGESIZE,
4329 VM_SLEEP | VMC_NO_QCACHE);
4330
4331 kmem_msb_arena = vmem_create("kmem_msb", NULL, 0,
4332 PAGESIZE, segkmem_alloc, segkmem_free, kmem_metadata_arena, 0,
4333 VMC_DUMPSAFE | VM_SLEEP);
4334
4335 kmem_cache_arena = vmem_create("kmem_cache", NULL, 0, KMEM_ALIGN,
4336 segkmem_alloc, segkmem_free, kmem_metadata_arena, 0, VM_SLEEP);
4337
4338 kmem_hash_arena = vmem_create("kmem_hash", NULL, 0, KMEM_ALIGN,
4339 segkmem_alloc, segkmem_free, kmem_metadata_arena, 0, VM_SLEEP);
4340
4341 kmem_log_arena = vmem_create("kmem_log", NULL, 0, KMEM_ALIGN,
4342 segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
4343
4344 kmem_firewall_va_arena = vmem_create("kmem_firewall_va",
4345 NULL, 0, PAGESIZE,
4346 kmem_firewall_va_alloc, kmem_firewall_va_free, heap_arena,
4347 0, VM_SLEEP);
4348
4349 kmem_firewall_arena = vmem_create("kmem_firewall", NULL, 0, PAGESIZE,
4350 segkmem_alloc, segkmem_free, kmem_firewall_va_arena, 0,
4351 VMC_DUMPSAFE | VM_SLEEP);
4352
4353 /* temporary oversize arena for mod_read_system_file */
4354 kmem_oversize_arena = vmem_create("kmem_oversize", NULL, 0, PAGESIZE,
4355 segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
4356
4357 kmem_reap_interval = 15 * hz;
4358
4359 /*
4360 * Read /etc/system. This is a chicken-and-egg problem because
4361 * kmem_flags may be set in /etc/system, but mod_read_system_file()
4362 * needs to use the allocator. The simplest solution is to create
4363 * all the standard kmem caches, read /etc/system, destroy all the
4364 * caches we just created, and then create them all again in light
4365 * of the (possibly) new kmem_flags and other kmem tunables.
4366 */
4367 kmem_cache_init(1, 0);
4368
4369 mod_read_system_file(boothowto & RB_ASKNAME);
4370
4371 while ((cp = list_tail(&kmem_caches)) != NULL)
4372 kmem_cache_destroy(cp);
4373
4374 vmem_destroy(kmem_oversize_arena);
4375
4376 if (old_kmem_flags & KMF_STICKY)
4377 kmem_flags = old_kmem_flags;
4378
4379 if (!(kmem_flags & KMF_AUDIT))
4380 vmem_seg_size = offsetof(vmem_seg_t, vs_thread);
4381
4382 if (kmem_maxverify == 0)
4383 kmem_maxverify = maxverify;
4384
4385 if (kmem_minfirewall == 0)
4386 kmem_minfirewall = minfirewall;
4387
4388 /*
	 * Give segkmem a chance to figure out if we are using large pages
	 * for the kernel heap.
4391 */
4392 use_large_pages = segkmem_lpsetup();
4393
4394 /*
4395 * To protect against corruption, we keep the actual number of callers
	 * KMF_LITE records separate from the tunable. We arbitrarily clamp
4397 * to 16, since the overhead for small buffers quickly gets out of
4398 * hand.
4399 *
4400 * The real limit would depend on the needs of the largest KMC_NOHASH
4401 * cache.
4402 */
4403 kmem_lite_count = MIN(MAX(0, kmem_lite_pcs), 16);
4404 kmem_lite_pcs = kmem_lite_count;
4405
4406 /*
4407 * Normally, we firewall oversized allocations when possible, but
4408 * if we are using large pages for kernel memory, and we don't have
4409 * any non-LITE debugging flags set, we want to allocate oversized
4410 * buffers from large pages, and so skip the firewalling.
4411 */
4412 if (use_large_pages &&
4413 ((kmem_flags & KMF_LITE) || !(kmem_flags & KMF_DEBUG))) {
4414 kmem_oversize_arena = vmem_xcreate("kmem_oversize", NULL, 0,
4415 PAGESIZE, segkmem_alloc_lp, segkmem_free_lp, heap_arena,
4416 0, VMC_DUMPSAFE | VM_SLEEP);
4417 } else {
4418 kmem_oversize_arena = vmem_create("kmem_oversize",
4419 NULL, 0, PAGESIZE,
4420 segkmem_alloc, segkmem_free, kmem_minfirewall < ULONG_MAX?
4421 kmem_firewall_va_arena : heap_arena, 0, VMC_DUMPSAFE |
4422 VM_SLEEP);
4423 }
4424
4425 kmem_cache_init(2, use_large_pages);
4426
4427 if (kmem_flags & (KMF_AUDIT | KMF_RANDOMIZE)) {
4428 if (kmem_transaction_log_size == 0)
4429 kmem_transaction_log_size = kmem_maxavail() / 50;
4430 kmem_transaction_log = kmem_log_init(kmem_transaction_log_size);
4431 }
4432
4433 if (kmem_flags & (KMF_CONTENTS | KMF_RANDOMIZE)) {
4434 if (kmem_content_log_size == 0)
4435 kmem_content_log_size = kmem_maxavail() / 50;
4436 kmem_content_log = kmem_log_init(kmem_content_log_size);
4437 }
4438
4439 kmem_failure_log = kmem_log_init(kmem_failure_log_size);
4440 kmem_slab_log = kmem_log_init(kmem_slab_log_size);
4441 kmem_zerosized_log = kmem_log_init(kmem_zerosized_log_size);
4442
4443 /*
4444 * Initialize STREAMS message caches so allocb() is available.
4445 * This allows us to initialize the logging framework (cmn_err(9F),
4446 * strlog(9F), etc) so we can start recording messages.
4447 */
4448 streams_msg_init();
4449
4450 /*
4451 * Initialize the ZSD framework in Zones so modules loaded henceforth
4452 * can register their callbacks.
4453 */
4454 zone_zsd_init();
4455
4456 log_init();
4457 taskq_init();
4458
4459 /*
4460 * Warn about invalid or dangerous values of kmem_flags.
4461 * Always warn about unsupported values.
4462 */
4463 if (((kmem_flags & ~(KMF_AUDIT | KMF_DEADBEEF | KMF_REDZONE |
4464 KMF_CONTENTS | KMF_LITE)) != 0) ||
4465 ((kmem_flags & KMF_LITE) && kmem_flags != KMF_LITE))
4466 cmn_err(CE_WARN, "kmem_flags set to unsupported value 0x%x.",
4467 kmem_flags);
4468
4469 #ifdef DEBUG
4470 if ((kmem_flags & KMF_DEBUG) == 0)
4471 cmn_err(CE_NOTE, "kmem debugging disabled.");
4472 #else
4473 /*
4474 * For non-debug kernels, the only "normal" flags are 0, KMF_LITE,
4475 * KMF_REDZONE, and KMF_CONTENTS (the last because it is only enabled
4476 * if KMF_AUDIT is set). We should warn the user about the performance
4477 * penalty of KMF_AUDIT or KMF_DEADBEEF if they are set and KMF_LITE
4478 * isn't set (since that disables AUDIT).
4479 */
4480 if (!(kmem_flags & KMF_LITE) &&
4481 (kmem_flags & (KMF_AUDIT | KMF_DEADBEEF)) != 0)
4482 cmn_err(CE_WARN, "High-overhead kmem debugging features "
4483 "enabled (kmem_flags = 0x%x). Performance degradation "
4484 "and large memory overhead possible.", kmem_flags);
4485 #endif /* not DEBUG */
4486
4487 kmem_cache_applyall(kmem_cache_magazine_enable, NULL, TQ_SLEEP);
4488
4489 kmem_ready = 1;
4490
4491 /*
4492 * Initialize the platform-specific aligned/DMA memory allocator.
4493 */
4494 ka_init();
4495
4496 /*
4497 * Initialize 32-bit ID cache.
4498 */
4499 id32_init();
4500
4501 /*
4502 * Initialize the networking stack so modules loaded can
4503 * register their callbacks.
4504 */
4505 netstack_init();
4506 }
4507
4508 static void
4509 kmem_move_init(void)
4510 {
4511 kmem_defrag_cache = kmem_cache_create("kmem_defrag_cache",
4512 sizeof (kmem_defrag_t), 0, NULL, NULL, NULL, NULL,
4513 kmem_msb_arena, KMC_NOHASH);
4514 kmem_move_cache = kmem_cache_create("kmem_move_cache",
4515 sizeof (kmem_move_t), 0, NULL, NULL, NULL, NULL,
4516 kmem_msb_arena, KMC_NOHASH);
4517
4518 /*
4519 * kmem guarantees that move callbacks are sequential and that even
4520 * across multiple caches no two moves ever execute simultaneously.
4521 * Move callbacks are processed on a separate taskq so that client code
4522 * does not interfere with internal maintenance tasks.
4523 */
4524 kmem_move_taskq = taskq_create_instance("kmem_move_taskq", 0, 1,
4525 minclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE);
4526 }
4527
4528 void
4529 kmem_thread_init(void)
4530 {
4531 kmem_move_init();
4532
4533 /*
4534 * This taskq is used for various kmem maintenance functions, including
4535 * kmem_reap(). When maintenance is required on every cache,
4536 * kmem_cache_applyall() dispatches one task per cache onto this queue.
4537 *
4538 * In the case of kmem_reap(), the system may be under increasingly
4539 * dire memory pressure and may not be able to allocate a new task
4540 * entry. The count of entries to prepopulate (below) should cover at
4541 * least as many caches as we generally expect to exist on the system
4542 * so that they may all be scheduled for reaping under those
4543 * conditions.
4544 */
4545 kmem_taskq = taskq_create_instance("kmem_taskq", 0, 1, minclsyspri,
4546 600, INT_MAX, TASKQ_PREPOPULATE);
4547 }
4548
4549 void
4550 kmem_mp_init(void)
4551 {
4552 mutex_enter(&cpu_lock);
4553 register_cpu_setup_func(kmem_cpu_setup, NULL);
4554 mutex_exit(&cpu_lock);
4555
4556 kmem_update_timeout(NULL);
4557
4558 taskq_mp_init();
4559 }
4560
4561 /*
4562 * Return the slab of the allocated buffer, or NULL if the buffer is not
4563 * allocated. This function may be called with a known slab address to determine
4564 * whether or not the buffer is allocated, or with a NULL slab address to obtain
4565 * an allocated buffer's slab.
4566 */
4567 static kmem_slab_t *
4568 kmem_slab_allocated(kmem_cache_t *cp, kmem_slab_t *sp, void *buf)
4569 {
4570 kmem_bufctl_t *bcp, *bufbcp;
4571
4572 ASSERT(MUTEX_HELD(&cp->cache_lock));
4573 ASSERT(sp == NULL || KMEM_SLAB_MEMBER(sp, buf));
4574
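	/*
	 * Hashed caches keep a bufctl in the hash table only while its buffer
	 * is allocated, so finding one there yields the slab directly. For
	 * unhashed caches, free buffers are threaded onto the slab's
	 * freelist, so a buffer found on slab_head is not allocated.
	 */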
4575 if (cp->cache_flags & KMF_HASH) {
4576 for (bcp = *KMEM_HASH(cp, buf);
4577 (bcp != NULL) && (bcp->bc_addr != buf);
4578 bcp = bcp->bc_next) {
4579 continue;
4580 }
4581 ASSERT(sp != NULL && bcp != NULL ? sp == bcp->bc_slab : 1);
4582 return (bcp == NULL ? NULL : bcp->bc_slab);
4583 }
4584
4585 if (sp == NULL) {
4586 sp = KMEM_SLAB(cp, buf);
4587 }
4588 bufbcp = KMEM_BUFCTL(cp, buf);
4589 for (bcp = sp->slab_head;
4590 (bcp != NULL) && (bcp != bufbcp);
4591 bcp = bcp->bc_next) {
4592 continue;
4593 }
4594 return (bcp == NULL ? sp : NULL);
4595 }
4596
4597 static boolean_t
4598 kmem_slab_is_reclaimable(kmem_cache_t *cp, kmem_slab_t *sp, int flags)
4599 {
4600 long refcnt = sp->slab_refcnt;
4601
4602 ASSERT(cp->cache_defrag != NULL);
4603
4604 /*
4605 * For code coverage we want to be able to move an object within the
4606 * same slab (the only partial slab) even if allocating the destination
4607 * buffer resulted in a completely allocated slab.
4608 */
4609 if (flags & KMM_DEBUG) {
4610 return ((flags & KMM_DESPERATE) ||
4611 ((sp->slab_flags & KMEM_SLAB_NOMOVE) == 0));
4612 }
4613
4614 /* If we're desperate, we don't care if the client said NO. */
4615 if (flags & KMM_DESPERATE) {
4616 return (refcnt < sp->slab_chunks); /* any partial */
4617 }
4618
4619 if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4620 return (B_FALSE);
4621 }
4622
4623 if ((refcnt == 1) || kmem_move_any_partial) {
4624 return (refcnt < sp->slab_chunks);
4625 }
4626
4627 /*
4628 * The reclaim threshold is adjusted at each kmem_cache_scan() so that
4629 * slabs with a progressively higher percentage of used buffers can be
4630 * reclaimed until the cache as a whole is no longer fragmented.
4631 *
4632 * sp->slab_refcnt kmd_reclaim_numer
4633 * --------------- < ------------------
4634 * sp->slab_chunks KMEM_VOID_FRACTION
4635 */
4636 return ((refcnt * KMEM_VOID_FRACTION) <
4637 (sp->slab_chunks * cp->cache_defrag->kmd_reclaim_numer));
4638 }
4639
4640 /*
4641 * May be called from the kmem_move_taskq, from kmem_cache_move_notify_task(),
4642 * or when the buffer is freed.
4643 */
4644 static void
4645 kmem_slab_move_yes(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4646 {
4647 ASSERT(MUTEX_HELD(&cp->cache_lock));
4648 ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4649
4650 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4651 return;
4652 }
4653
4654 if (sp->slab_flags & KMEM_SLAB_NOMOVE) {
4655 if (KMEM_SLAB_OFFSET(sp, from_buf) == sp->slab_stuck_offset) {
4656 avl_remove(&cp->cache_partial_slabs, sp);
4657 sp->slab_flags &= ~KMEM_SLAB_NOMOVE;
4658 sp->slab_stuck_offset = (uint32_t)-1;
4659 avl_add(&cp->cache_partial_slabs, sp);
4660 }
4661 } else {
4662 sp->slab_later_count = 0;
4663 sp->slab_stuck_offset = (uint32_t)-1;
4664 }
4665 }
4666
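/*
 * The client refused to move a buffer: mark its slab unfreeable for as long
 * as it remains on the partial-slab list by setting KMEM_SLAB_NOMOVE and
 * recording the stuck buffer's offset, then re-insert the slab since NOMOVE
 * changes its sort order (see kmem_partial_slab_cmp()).
 */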
4667 static void
4668 kmem_slab_move_no(kmem_cache_t *cp, kmem_slab_t *sp, void *from_buf)
4669 {
4670 ASSERT(taskq_member(kmem_move_taskq, curthread));
4671 ASSERT(MUTEX_HELD(&cp->cache_lock));
4672 ASSERT(KMEM_SLAB_MEMBER(sp, from_buf));
4673
4674 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4675 return;
4676 }
4677
4678 avl_remove(&cp->cache_partial_slabs, sp);
4679 sp->slab_later_count = 0;
4680 sp->slab_flags |= KMEM_SLAB_NOMOVE;
4681 sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp, from_buf);
4682 avl_add(&cp->cache_partial_slabs, sp);
4683 }
4684
4685 static void kmem_move_end(kmem_cache_t *, kmem_move_t *);
4686
4687 /*
4688 * The move callback takes two buffer addresses, the buffer to be moved, and a
4689 * newly allocated and constructed buffer selected by kmem as the destination.
4690 * It also takes the size of the buffer and an optional user argument specified
4691 * at cache creation time. kmem guarantees that the buffer to be moved has not
4692 * been unmapped by the virtual memory subsystem. Beyond that, it cannot
4693 * guarantee the present whereabouts of the buffer to be moved, so it is up to
4694 * the client to safely determine whether or not it is still using the buffer.
4695 * The client must not free either of the buffers passed to the move callback,
4696 * since kmem wants to free them directly to the slab layer. The client response
4697 * tells kmem which of the two buffers to free:
4698 *
4699 * YES kmem frees the old buffer (the move was successful)
4700 * NO kmem frees the new buffer, marks the slab of the old buffer
4701 * non-reclaimable to avoid bothering the client again
4702 * LATER kmem frees the new buffer, increments slab_later_count
4703 * DONT_KNOW kmem frees the new buffer
4704 * DONT_NEED kmem frees both the old buffer and the new buffer
4705 *
4706 * The pending callback argument now being processed contains both of the
4707 * buffers (old and new) passed to the move callback function, the slab of the
4708 * old buffer, and flags related to the move request, such as whether or not the
4709 * system was desperate for memory.
4710 *
4711 * Slabs are not freed while there is a pending callback, but instead are kept
4712 * on a deadlist, which is drained after the last callback completes. This means
4713 * that slabs are safe to access until kmem_move_end(), no matter how many of
4714 * their buffers have been freed. Once slab_refcnt reaches zero, it stays at
4715 * zero for as long as the slab remains on the deadlist and until the slab is
4716 * freed.
4717 */
4718 static void
4719 kmem_move_buffer(kmem_move_t *callback)
4720 {
4721 kmem_cbrc_t response;
4722 kmem_slab_t *sp = callback->kmm_from_slab;
4723 kmem_cache_t *cp = sp->slab_cache;
4724 boolean_t free_on_slab;
4725
4726 ASSERT(taskq_member(kmem_move_taskq, curthread));
4727 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4728 ASSERT(KMEM_SLAB_MEMBER(sp, callback->kmm_from_buf));
4729
4730 /*
4731 * The number of allocated buffers on the slab may have changed since we
4732 * last checked the slab's reclaimability (when the pending move was
4733 * enqueued), or the client may have responded NO when asked to move
4734 * another buffer on the same slab.
4735 */
4736 if (!kmem_slab_is_reclaimable(cp, sp, callback->kmm_flags)) {
4737 kmem_slab_free(cp, callback->kmm_to_buf);
4738 kmem_move_end(cp, callback);
4739 return;
4740 }
4741
4742 /*
4743 * Checking the slab layer is easy, so we might as well do that here
4744 * in case we can avoid bothering the client.
4745 */
4746 mutex_enter(&cp->cache_lock);
4747 free_on_slab = (kmem_slab_allocated(cp, sp,
4748 callback->kmm_from_buf) == NULL);
4749 mutex_exit(&cp->cache_lock);
4750
4751 if (free_on_slab) {
4752 kmem_slab_free(cp, callback->kmm_to_buf);
4753 kmem_move_end(cp, callback);
4754 return;
4755 }
4756
4757 if (cp->cache_flags & KMF_BUFTAG) {
4758 /*
4759 * Make kmem_cache_alloc_debug() apply the constructor for us.
4760 */
4761 if (kmem_cache_alloc_debug(cp, callback->kmm_to_buf,
4762 KM_NOSLEEP, 1, caller()) != 0) {
4763 kmem_move_end(cp, callback);
4764 return;
4765 }
4766 } else if (cp->cache_constructor != NULL &&
4767 cp->cache_constructor(callback->kmm_to_buf, cp->cache_private,
4768 KM_NOSLEEP) != 0) {
4769 atomic_inc_64(&cp->cache_alloc_fail);
4770 kmem_slab_free(cp, callback->kmm_to_buf);
4771 kmem_move_end(cp, callback);
4772 return;
4773 }
4774
4775 cp->cache_defrag->kmd_callbacks++;
4776 cp->cache_defrag->kmd_thread = curthread;
4777 cp->cache_defrag->kmd_from_buf = callback->kmm_from_buf;
4778 cp->cache_defrag->kmd_to_buf = callback->kmm_to_buf;
4779 DTRACE_PROBE2(kmem__move__start, kmem_cache_t *, cp, kmem_move_t *,
4780 callback);
4781
4782 response = cp->cache_move(callback->kmm_from_buf,
4783 callback->kmm_to_buf, cp->cache_bufsize, cp->cache_private);
4784
4785 DTRACE_PROBE3(kmem__move__end, kmem_cache_t *, cp, kmem_move_t *,
4786 callback, kmem_cbrc_t, response);
4787 cp->cache_defrag->kmd_thread = NULL;
4788 cp->cache_defrag->kmd_from_buf = NULL;
4789 cp->cache_defrag->kmd_to_buf = NULL;
4790
4791 if (response == KMEM_CBRC_YES) {
4792 cp->cache_defrag->kmd_yes++;
4793 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4794 /* slab safe to access until kmem_move_end() */
4795 if (sp->slab_refcnt == 0)
4796 cp->cache_defrag->kmd_slabs_freed++;
4797 mutex_enter(&cp->cache_lock);
4798 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4799 mutex_exit(&cp->cache_lock);
4800 kmem_move_end(cp, callback);
4801 return;
4802 }
4803
4804 switch (response) {
4805 case KMEM_CBRC_NO:
4806 cp->cache_defrag->kmd_no++;
4807 mutex_enter(&cp->cache_lock);
4808 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4809 mutex_exit(&cp->cache_lock);
4810 break;
4811 case KMEM_CBRC_LATER:
4812 cp->cache_defrag->kmd_later++;
4813 mutex_enter(&cp->cache_lock);
4814 if (!KMEM_SLAB_IS_PARTIAL(sp)) {
4815 mutex_exit(&cp->cache_lock);
4816 break;
4817 }
4818
4819 if (++sp->slab_later_count >= KMEM_DISBELIEF) {
4820 kmem_slab_move_no(cp, sp, callback->kmm_from_buf);
4821 } else if (!(sp->slab_flags & KMEM_SLAB_NOMOVE)) {
4822 sp->slab_stuck_offset = KMEM_SLAB_OFFSET(sp,
4823 callback->kmm_from_buf);
4824 }
4825 mutex_exit(&cp->cache_lock);
4826 break;
4827 case KMEM_CBRC_DONT_NEED:
4828 cp->cache_defrag->kmd_dont_need++;
4829 kmem_slab_free_constructed(cp, callback->kmm_from_buf, B_FALSE);
4830 if (sp->slab_refcnt == 0)
4831 cp->cache_defrag->kmd_slabs_freed++;
4832 mutex_enter(&cp->cache_lock);
4833 kmem_slab_move_yes(cp, sp, callback->kmm_from_buf);
4834 mutex_exit(&cp->cache_lock);
4835 break;
4836 case KMEM_CBRC_DONT_KNOW:
4837 /*
4838 * If we don't know if we can move this buffer or not, we'll
4839 * just assume that we can't: if the buffer is in fact free,
4840 * then it is sitting in one of the per-CPU magazines or in
4841 * a full magazine in the depot layer. Either way, because
4842 * defrag is induced in the same logic that reaps a cache,
4843 * it's likely that full magazines will be returned to the
4844 * system soon (thereby accomplishing what we're trying to
4845 * accomplish here: return those magazines to their slabs).
4846 * Given this, any work that we might do now to locate a buffer
4847 * in a magazine is wasted (and expensive!) work; we bump
4848 * a counter in this case and otherwise assume that we can't
4849 * move it.
4850 */
4851 cp->cache_defrag->kmd_dont_know++;
4852 break;
4853 default:
4854 panic("'%s' (%p) unexpected move callback response %d\n",
4855 cp->cache_name, (void *)cp, response);
4856 }
4857
4858 kmem_slab_free_constructed(cp, callback->kmm_to_buf, B_FALSE);
4859 kmem_move_end(cp, callback);
4860 }
4861
4862 /* Return B_FALSE if there is insufficient memory for the move request. */
4863 static boolean_t
4864 kmem_move_begin(kmem_cache_t *cp, kmem_slab_t *sp, void *buf, int flags)
4865 {
4866 void *to_buf;
4867 avl_index_t index;
4868 kmem_move_t *callback, *pending;
4869 ulong_t n;
4870
4871 ASSERT(taskq_member(kmem_taskq, curthread));
4872 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4873 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
4874
4875 callback = kmem_cache_alloc(kmem_move_cache, KM_NOSLEEP);
4876
4877 if (callback == NULL)
4878 return (B_FALSE);
4879
4880 callback->kmm_from_slab = sp;
4881 callback->kmm_from_buf = buf;
4882 callback->kmm_flags = flags;
4883
4884 mutex_enter(&cp->cache_lock);
4885
4886 n = avl_numnodes(&cp->cache_partial_slabs);
4887 if ((n == 0) || ((n == 1) && !(flags & KMM_DEBUG))) {
4888 mutex_exit(&cp->cache_lock);
4889 kmem_cache_free(kmem_move_cache, callback);
4890 return (B_TRUE); /* there is no need for the move request */
4891 }
4892
4893 pending = avl_find(&cp->cache_defrag->kmd_moves_pending, buf, &index);
4894 if (pending != NULL) {
4895 /*
4896 * If the move is already pending and we're desperate now,
4897 * update the move flags.
4898 */
4899 if (flags & KMM_DESPERATE) {
4900 pending->kmm_flags |= KMM_DESPERATE;
4901 }
4902 mutex_exit(&cp->cache_lock);
4903 kmem_cache_free(kmem_move_cache, callback);
4904 return (B_TRUE);
4905 }
4906
4907 to_buf = kmem_slab_alloc_impl(cp, avl_first(&cp->cache_partial_slabs),
4908 B_FALSE);
4909 callback->kmm_to_buf = to_buf;
4910 avl_insert(&cp->cache_defrag->kmd_moves_pending, callback, index);
4911
4912 mutex_exit(&cp->cache_lock);
4913
4914 if (taskq_dispatch(kmem_move_taskq, (task_func_t *)kmem_move_buffer,
4915 callback, TQ_NOSLEEP) == TASKQID_INVALID) {
4916 mutex_enter(&cp->cache_lock);
4917 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4918 mutex_exit(&cp->cache_lock);
4919 kmem_slab_free(cp, to_buf);
4920 kmem_cache_free(kmem_move_cache, callback);
4921 return (B_FALSE);
4922 }
4923
4924 return (B_TRUE);
4925 }
4926
4927 static void
4928 kmem_move_end(kmem_cache_t *cp, kmem_move_t *callback)
4929 {
4930 avl_index_t index;
4931
4932 ASSERT(cp->cache_defrag != NULL);
4933 ASSERT(taskq_member(kmem_move_taskq, curthread));
4934 ASSERT(MUTEX_NOT_HELD(&cp->cache_lock));
4935
4936 mutex_enter(&cp->cache_lock);
4937 VERIFY(avl_find(&cp->cache_defrag->kmd_moves_pending,
4938 callback->kmm_from_buf, &index) != NULL);
4939 avl_remove(&cp->cache_defrag->kmd_moves_pending, callback);
4940 if (avl_is_empty(&cp->cache_defrag->kmd_moves_pending)) {
4941 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
4942 kmem_slab_t *sp;
4943
4944 /*
4945 * The last pending move completed. Release all slabs from the
4946 * front of the dead list except for any slab at the tail that
4947 * needs to be released from the context of kmem_move_buffers().
4948 * kmem deferred unmapping the buffers on these slabs in order
4949 * to guarantee that buffers passed to the move callback have
4950 * been touched only by kmem or by the client itself.
4951 */
4952 while ((sp = list_remove_head(deadlist)) != NULL) {
4953 if (sp->slab_flags & KMEM_SLAB_MOVE_PENDING) {
4954 list_insert_tail(deadlist, sp);
4955 break;
4956 }
4957 cp->cache_defrag->kmd_deadcount--;
4958 cp->cache_slab_destroy++;
4959 mutex_exit(&cp->cache_lock);
4960 kmem_slab_destroy(cp, sp);
4961 mutex_enter(&cp->cache_lock);
4962 }
4963 }
4964 mutex_exit(&cp->cache_lock);
4965 kmem_cache_free(kmem_move_cache, callback);
4966 }
4967
4968 /*
4969 * Move buffers from least used slabs first by scanning backwards from the end
4970 * of the partial slab list. Scan at most max_scan candidate slabs and move
4971 * buffers from at most max_slabs slabs (0 for all partial slabs in both cases).
4972 * If desperate to reclaim memory, move buffers from any partial slab, otherwise
4973 * skip slabs with a ratio of allocated buffers at or above the current
4974 * threshold. Return the number of unskipped slabs (at most max_slabs, -1 if the
4975 * scan is aborted) so that the caller can adjust the reclaimability threshold
4976 * depending on how many reclaimable slabs it finds.
4977 *
4978 * kmem_move_buffers() drops and reacquires cache_lock every time it issues a
4979 * move request, since it is not valid for kmem_move_begin() to call
4980 * kmem_cache_alloc() or taskq_dispatch() with cache_lock held.
4981 */
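/*
 * For illustration only, the calling convention sketched as a hypothetical
 * caller (the real callers, kmem_cache_defrag() and kmem_cache_scan(), appear
 * later in this file):
 *
 *	mutex_enter(&cp->cache_lock);
 *	slabs_found = kmem_move_buffers(cp, max_scan, max_slabs, 0);
 *	if (slabs_found < 0) {
 *		... scan aborted: draw no conclusion about reclaimability ...
 *	} else if ((slabs_found * 2) < max_slabs) {
 *		... few candidates found: consider relaxing the threshold ...
 *	}
 *	mutex_exit(&cp->cache_lock);
 *
 * cache_lock must be held on entry; because it is dropped and reacquired
 * internally, the caller cannot assume the partial slab list is unchanged
 * across the call.
 */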
4982 static int
4983 kmem_move_buffers(kmem_cache_t *cp, size_t max_scan, size_t max_slabs,
4984 int flags)
4985 {
4986 kmem_slab_t *sp;
4987 void *buf;
4988 int i, j; /* slab index, buffer index */
4989 int s; /* reclaimable slabs */
4990 int b; /* allocated (movable) buffers on reclaimable slab */
4991 boolean_t success;
4992 int refcnt;
4993 int nomove;
4994
4995 ASSERT(taskq_member(kmem_taskq, curthread));
4996 ASSERT(MUTEX_HELD(&cp->cache_lock));
4997 ASSERT(kmem_move_cache != NULL);
4998 ASSERT(cp->cache_move != NULL && cp->cache_defrag != NULL);
4999 ASSERT((flags & KMM_DEBUG) ? !avl_is_empty(&cp->cache_partial_slabs) :
5000 avl_numnodes(&cp->cache_partial_slabs) > 1);
5001
5002 if (kmem_move_blocked) {
5003 return (0);
5004 }
5005
5006 if (kmem_move_fulltilt) {
5007 flags |= KMM_DESPERATE;
5008 }
5009
5010 if (max_scan == 0 || (flags & KMM_DESPERATE)) {
5011 /*
5012 * Scan as many slabs as needed to find the desired number of
5013 * candidate slabs.
5014 */
5015 max_scan = (size_t)-1;
5016 }
5017
5018 if (max_slabs == 0 || (flags & KMM_DESPERATE)) {
5019 /* Find as many candidate slabs as possible. */
5020 max_slabs = (size_t)-1;
5021 }
5022
5023 sp = avl_last(&cp->cache_partial_slabs);
5024 ASSERT(KMEM_SLAB_IS_PARTIAL(sp));
5025 for (i = 0, s = 0; (i < max_scan) && (s < max_slabs) && (sp != NULL) &&
5026 ((sp != avl_first(&cp->cache_partial_slabs)) ||
5027 (flags & KMM_DEBUG));
5028 sp = AVL_PREV(&cp->cache_partial_slabs, sp), i++) {
5029
5030 if (!kmem_slab_is_reclaimable(cp, sp, flags)) {
5031 continue;
5032 }
5033 s++;
5034
5035 /* Look for allocated buffers to move. */
5036 for (j = 0, b = 0, buf = sp->slab_base;
5037 (j < sp->slab_chunks) && (b < sp->slab_refcnt);
5038 buf = (((char *)buf) + cp->cache_chunksize), j++) {
5039
5040 if (kmem_slab_allocated(cp, sp, buf) == NULL) {
5041 continue;
5042 }
5043
5044 b++;
5045
5046 /*
5047 * Prevent the slab from being destroyed while we drop
5048 * cache_lock and while the pending move is not yet
5049 * registered. Flag the pending move while
5050 * kmd_moves_pending may still be empty, since we can't
5051 * yet rely on a non-zero pending move count to prevent
5052 * the slab from being destroyed.
5053 */
5054 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5055 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5056 /*
5057 * Recheck refcnt and nomove after reacquiring the lock,
5058 * since these control the order of partial slabs, and
5059 * we want to know if we can pick up the scan where we
5060 * left off.
5061 */
5062 refcnt = sp->slab_refcnt;
5063 nomove = (sp->slab_flags & KMEM_SLAB_NOMOVE);
5064 mutex_exit(&cp->cache_lock);
5065
5066 success = kmem_move_begin(cp, sp, buf, flags);
5067
5068 /*
5069 * Now, before the lock is reacquired, kmem could
5070 * process all pending move requests and purge the
5071 * deadlist, so that upon reacquiring the lock, sp has
5072 * been remapped. Or, the client may free all the
5073 * objects on the slab while the pending moves are still
5074 * on the taskq. Therefore, the KMEM_SLAB_MOVE_PENDING
5075 * flag causes the slab to be put at the end of the
5076 * deadlist and prevents it from being destroyed, since
5077 * we plan to destroy it here after reacquiring the
5078 * lock.
5079 */
5080 mutex_enter(&cp->cache_lock);
5081 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5082 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5083
5084 if (sp->slab_refcnt == 0) {
5085 list_t *deadlist =
5086 &cp->cache_defrag->kmd_deadlist;
5087 list_remove(deadlist, sp);
5088
5089 if (!avl_is_empty(
5090 &cp->cache_defrag->kmd_moves_pending)) {
5091 /*
5092 * A pending move makes it unsafe to
5093 * destroy the slab, because even though
5094 * the move is no longer needed, the
5095 * context where that is determined
5096 * requires the slab to exist.
5097 * Fortunately, a pending move also
5098 * means we don't need to destroy the
5099 * slab here, since it will get
5100 * destroyed along with any other slabs
5101 * on the deadlist after the last
5102 * pending move completes.
5103 */
5104 list_insert_head(deadlist, sp);
5105 return (-1);
5106 }
5107
5108 /*
5109 * Destroy the slab now if it was completely
5110 * freed while we dropped cache_lock and there
5111 * are no pending moves. Since slab_refcnt
5112 * cannot change once it reaches zero, no new
5113 * pending moves from that slab are possible.
5114 */
5115 cp->cache_defrag->kmd_deadcount--;
5116 cp->cache_slab_destroy++;
5117 mutex_exit(&cp->cache_lock);
5118 kmem_slab_destroy(cp, sp);
5119 mutex_enter(&cp->cache_lock);
5120 /*
5121 * Since we can't pick up the scan where we left
5122 * off, abort the scan and say nothing about the
5123 * number of reclaimable slabs.
5124 */
5125 return (-1);
5126 }
5127
5128 if (!success) {
5129 /*
5130 * Abort the scan if there is not enough memory
5131 * for the request and say nothing about the
5132 * number of reclaimable slabs.
5133 */
5134 return (-1);
5135 }
5136
5137 /*
5138 * If the slab's position changed while the lock was dropped
5139 * (its refcnt or NOMOVE flag differs from what we recorded),
5140 * we no longer know where we are in the scan, so abort.
5141 */
5142 if (sp->slab_refcnt != refcnt) {
5143 /*
5144 * If this is a KMM_DEBUG move, the slab_refcnt
5145 * may have changed because we allocated a
5146 * destination buffer on the same slab. In that
5147 * case, we're not interested in counting it.
5148 */
5149 return (-1);
5150 }
5151 if ((sp->slab_flags & KMEM_SLAB_NOMOVE) != nomove)
5152 return (-1);
5153
5154 /*
5155 * Generating a move request allocates a destination
5156 * buffer from the slab layer, bumping the first partial
5157 * slab if it is completely allocated. If the current
5158 * slab becomes the first partial slab as a result, we
5159 * can't continue to scan backwards.
5160 *
5161 * If this is a KMM_DEBUG move and we allocated the
5162 * destination buffer from the last partial slab, then
5163 * the buffer we're moving is on the same slab and our
5164 * slab_refcnt has changed, causing us to return before
5165 * reaching here if there are no partial slabs left.
5166 */
5167 ASSERT(!avl_is_empty(&cp->cache_partial_slabs));
5168 if (sp == avl_first(&cp->cache_partial_slabs)) {
5169 /*
5170 * We're not interested in a second KMM_DEBUG
5171 * move.
5172 */
5173 goto end_scan;
5174 }
5175 }
5176 }
5177 end_scan:
5178
5179 return (s);
5180 }
5181
5182 typedef struct kmem_move_notify_args {
5183 kmem_cache_t *kmna_cache;
5184 void *kmna_buf;
5185 } kmem_move_notify_args_t;
5186
5187 static void
5188 kmem_cache_move_notify_task(void *arg)
5189 {
5190 kmem_move_notify_args_t *args = arg;
5191 kmem_cache_t *cp = args->kmna_cache;
5192 void *buf = args->kmna_buf;
5193 kmem_slab_t *sp;
5194
5195 ASSERT(taskq_member(kmem_taskq, curthread));
5196 ASSERT(list_link_active(&cp->cache_link));
5197
5198 kmem_free(args, sizeof (kmem_move_notify_args_t));
5199 mutex_enter(&cp->cache_lock);
5200 sp = kmem_slab_allocated(cp, NULL, buf);
5201
5202 /* Ignore the notification if the buffer is no longer allocated. */
5203 if (sp == NULL) {
5204 mutex_exit(&cp->cache_lock);
5205 return;
5206 }
5207
5208 /* Ignore the notification if there's no reason to move the buffer. */
5209 if (avl_numnodes(&cp->cache_partial_slabs) > 1) {
5210 /*
5211 * The notification has not been ignored so far. Also ignore it
5212 * unless the slab was marked by an earlier refusal to move one
5213 * of its buffers (NOMOVE or a nonzero slab_later_count).
5214 */
5215 if (!(sp->slab_flags & KMEM_SLAB_NOMOVE) &&
5216 (sp->slab_later_count == 0)) {
5217 mutex_exit(&cp->cache_lock);
5218 return;
5219 }
5220
5221 kmem_slab_move_yes(cp, sp, buf);
5222 ASSERT(!(sp->slab_flags & KMEM_SLAB_MOVE_PENDING));
5223 sp->slab_flags |= KMEM_SLAB_MOVE_PENDING;
5224 mutex_exit(&cp->cache_lock);
5225 /* see kmem_move_buffers() about dropping the lock */
5226 (void) kmem_move_begin(cp, sp, buf, KMM_NOTIFY);
5227 mutex_enter(&cp->cache_lock);
5228 ASSERT(sp->slab_flags & KMEM_SLAB_MOVE_PENDING);
5229 sp->slab_flags &= ~KMEM_SLAB_MOVE_PENDING;
5230 if (sp->slab_refcnt == 0) {
5231 list_t *deadlist = &cp->cache_defrag->kmd_deadlist;
5232 list_remove(deadlist, sp);
5233
5234 if (!avl_is_empty(
5235 &cp->cache_defrag->kmd_moves_pending)) {
5236 list_insert_head(deadlist, sp);
5237 mutex_exit(&cp->cache_lock);
5238 return;
5239 }
5240
5241 cp->cache_defrag->kmd_deadcount--;
5242 cp->cache_slab_destroy++;
5243 mutex_exit(&cp->cache_lock);
5244 kmem_slab_destroy(cp, sp);
5245 return;
5246 }
5247 } else {
5248 kmem_slab_move_yes(cp, sp, buf);
5249 }
5250 mutex_exit(&cp->cache_lock);
5251 }
5252
5253 void
5254 kmem_cache_move_notify(kmem_cache_t *cp, void *buf)
5255 {
5256 kmem_move_notify_args_t *args;
5257
5258 args = kmem_alloc(sizeof (kmem_move_notify_args_t), KM_NOSLEEP);
5259 if (args != NULL) {
5260 args->kmna_cache = cp;
5261 args->kmna_buf = buf;
5262 if (taskq_dispatch(kmem_taskq,
5263 (task_func_t *)kmem_cache_move_notify_task, args,
5264 TQ_NOSLEEP) == TASKQID_INVALID)
5265 kmem_free(args, sizeof (kmem_move_notify_args_t));
5266 }
5267 }
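
/*
 * For illustration only, a hypothetical client that earlier answered
 * KMEM_CBRC_NO or KMEM_CBRC_LATER for a pinned object might notify kmem as
 * soon as the obstacle is gone (xx_t, xx_cache, and the xx_pinned field are
 * made-up names):
 *
 *	static void
 *	xx_unpin(xx_t *xp)
 *	{
 *		xp->xx_pinned = B_FALSE;
 *		kmem_cache_move_notify(xx_cache, xp);
 *	}
 *
 * The notification is advisory and best-effort: both the argument allocation
 * and the taskq dispatch above are KM_NOSLEEP/TQ_NOSLEEP, so a notification
 * may be dropped under memory pressure, and kmem_cache_move_notify_task()
 * ignores it unless the buffer's slab was marked by an earlier refusal to
 * move one of its buffers.
 */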
5268
5269 static void
5270 kmem_cache_defrag(kmem_cache_t *cp)
5271 {
5272 size_t n;
5273
5274 ASSERT(cp->cache_defrag != NULL);
5275
5276 mutex_enter(&cp->cache_lock);
5277 n = avl_numnodes(&cp->cache_partial_slabs);
5278 if (n > 1) {
5279 /* kmem_move_buffers() drops and reacquires cache_lock */
5280 cp->cache_defrag->kmd_defrags++;
5281 (void) kmem_move_buffers(cp, n, 0, KMM_DESPERATE);
5282 }
5283 mutex_exit(&cp->cache_lock);
5284 }
5285
5286 /* Is this cache above the fragmentation threshold? */
5287 static boolean_t
5288 kmem_cache_frag_threshold(kmem_cache_t *cp, uint64_t nfree)
5289 {
5290 /*
5291 * nfree kmem_frag_numer
5292 * ------------------ > ---------------
5293 * cp->cache_buftotal kmem_frag_denom
5294 */
5295 return ((nfree * kmem_frag_denom) >
5296 (cp->cache_buftotal * kmem_frag_numer));
5297 }
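
/*
 * A worked example with purely illustrative values: if kmem_frag_numer were 1
 * and kmem_frag_denom were 8, a cache would be considered fragmented once more
 * than one eighth of its buffers were free in the slab layer. With
 * cache_buftotal = 1000, nfree = 200 qualifies (200 * 8 = 1600 > 1000 * 1),
 * while nfree = 100 does not (100 * 8 = 800 is not greater than 1000).
 */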
5298
5299 static boolean_t
5300 kmem_cache_is_fragmented(kmem_cache_t *cp, boolean_t *doreap)
5301 {
5302 boolean_t fragmented;
5303 uint64_t nfree;
5304
5305 ASSERT(MUTEX_HELD(&cp->cache_lock));
5306 *doreap = B_FALSE;
5307
5308 if (kmem_move_fulltilt) {
5309 if (avl_numnodes(&cp->cache_partial_slabs) > 1) {
5310 return (B_TRUE);
5311 }
5312 } else {
5313 if ((cp->cache_complete_slab_count + avl_numnodes(
5314 &cp->cache_partial_slabs)) < kmem_frag_minslabs) {
5315 return (B_FALSE);
5316 }
5317 }
5318
5319 nfree = cp->cache_bufslab;
5320 fragmented = ((avl_numnodes(&cp->cache_partial_slabs) > 1) &&
5321 kmem_cache_frag_threshold(cp, nfree));
5322
5323 /*
5324 * Free buffers in the magazine layer appear allocated from the point of
5325 * view of the slab layer. We want to know if the slab layer would
5326 * appear fragmented if we included free buffers from magazines that
5327 * have fallen out of the working set.
5328 */
5329 if (!fragmented) {
5330 long reap;
5331
5332 mutex_enter(&cp->cache_depot_lock);
5333 reap = MIN(cp->cache_full.ml_reaplimit, cp->cache_full.ml_min);
5334 reap = MIN(reap, cp->cache_full.ml_total);
5335 mutex_exit(&cp->cache_depot_lock);
5336
5337 nfree += ((uint64_t)reap * cp->cache_magtype->mt_magsize);
5338 if (kmem_cache_frag_threshold(cp, nfree)) {
5339 *doreap = B_TRUE;
5340 }
5341 }
5342
5343 return (fragmented);
5344 }
5345
5346 /* Called periodically from kmem_taskq */
5347 static void
5348 kmem_cache_scan(kmem_cache_t *cp)
5349 {
5350 boolean_t reap = B_FALSE;
5351 kmem_defrag_t *kmd;
5352
5353 ASSERT(taskq_member(kmem_taskq, curthread));
5354
5355 mutex_enter(&cp->cache_lock);
5356
5357 kmd = cp->cache_defrag;
5358 if (kmd->kmd_consolidate > 0) {
5359 kmd->kmd_consolidate--;
5360 mutex_exit(&cp->cache_lock);
5361 kmem_cache_reap(cp);
5362 return;
5363 }
5364
5365 if (kmem_cache_is_fragmented(cp, &reap)) {
5366 int slabs_found;
5367
5368 /*
5369 * Consolidate reclaimable slabs from the end of the partial
5370 * slab list (scan at most kmem_reclaim_scan_range slabs to find
5371 * reclaimable slabs). Keep track of how many candidate slabs we
5372 * looked for and how many we actually found so we can adjust
5373 * the definition of a candidate slab if we're having trouble
5374 * finding them.
5375 *
5376 * kmem_move_buffers() drops and reacquires cache_lock.
5377 */
5378 kmd->kmd_scans++;
5379 slabs_found = kmem_move_buffers(cp, kmem_reclaim_scan_range,
5380 kmem_reclaim_max_slabs, 0);
5381 if (slabs_found >= 0) {
5382 kmd->kmd_slabs_sought += kmem_reclaim_max_slabs;
5383 kmd->kmd_slabs_found += slabs_found;
5384 }
5385
5386 if (++kmd->kmd_tries >= kmem_reclaim_scan_range) {
5387 kmd->kmd_tries = 0;
5388
5389 /*
5390 * If we had difficulty finding candidate slabs in
5391 * previous scans, adjust the threshold so that
5392 * candidates are easier to find.
5393 */
5394 if (kmd->kmd_slabs_found == kmd->kmd_slabs_sought) {
5395 kmem_adjust_reclaim_threshold(kmd, -1);
5396 } else if ((kmd->kmd_slabs_found * 2) <
5397 kmd->kmd_slabs_sought) {
5398 kmem_adjust_reclaim_threshold(kmd, 1);
5399 }
5400 kmd->kmd_slabs_sought = 0;
5401 kmd->kmd_slabs_found = 0;
5402 }
5403 } else {
5404 kmem_reset_reclaim_threshold(cp->cache_defrag);
5405 #ifdef DEBUG
5406 if (!avl_is_empty(&cp->cache_partial_slabs)) {
5407 /*
5408 * In a debug kernel we want the consolidator to
5409 * run occasionally even when there is plenty of
5410 * memory.
5411 */
5412 uint16_t debug_rand;
5413
5414 (void) random_get_bytes((uint8_t *)&debug_rand, 2);
5415 if (!kmem_move_noreap &&
5416 ((debug_rand % kmem_mtb_reap) == 0)) {
5417 mutex_exit(&cp->cache_lock);
5418 kmem_cache_reap(cp);
5419 return;
5420 } else if ((debug_rand % kmem_mtb_move) == 0) {
5421 kmd->kmd_scans++;
5422 (void) kmem_move_buffers(cp,
5423 kmem_reclaim_scan_range, 1, KMM_DEBUG);
5424 }
5425 }
5426 #endif /* DEBUG */
5427 }
5428
5429 mutex_exit(&cp->cache_lock);
5430
5431 if (reap)
5432 kmem_depot_ws_reap(cp);
5433 }