1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
22 /* All Rights Reserved */
23
24 /*
25 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 *
28 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
29 */
30
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/thread.h>
34 #include <sys/sysmacros.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsubr.h>
38 #include <sys/strsun.h>
39 #include <sys/conf.h>
40 #include <sys/debug.h>
41 #include <sys/cmn_err.h>
42 #include <sys/kmem.h>
43 #include <sys/atomic.h>
44 #include <sys/errno.h>
45 #include <sys/vtrace.h>
46 #include <sys/ftrace.h>
47 #include <sys/ontrap.h>
48 #include <sys/multidata.h>
49 #include <sys/multidata_impl.h>
50 #include <sys/sdt.h>
51 #include <sys/strft.h>
52
53 #if defined(_KERNEL) && defined(DEBUG)
54 #include <sys/kmem_impl.h>
55 #endif
56
57 /*
58 * This file contains those functions from io/stream.c
59 * needed by this library, mostly unmodified.
60 */
61
62 /*
63 * STREAMS message allocator: principles of operation
64 *
65 * The streams message allocator consists of all the routines that
66 * allocate, dup and free streams messages: allocb(), [d]esballoc[a],
67 * dupb(), freeb() and freemsg(). What follows is a high-level view
68 * of how the allocator works.
69 *
70 * Every streams message consists of one or more mblks, a dblk, and data.
71 * All mblks for all types of messages come from a common mblk_cache.
72 * The dblk and data come in several flavors, depending on how the
73 * message is allocated:
74 *
75 * (1) mblks up to DBLK_MAX_CACHE size are allocated from a collection of
76 * fixed-size dblk/data caches. For message sizes that are multiples of
77 * PAGESIZE, dblks are allocated separately from the buffer.
78 * The associated buffer is allocated by the constructor using kmem_alloc().
 *     For all other message sizes, the dblk and its associated data are
 *     allocated as a single contiguous chunk of memory.
81 * Objects in these caches consist of a dblk plus its associated data.
82 * allocb() determines the nearest-size cache by table lookup:
83 * the dblk_cache[] array provides the mapping from size to dblk cache.
84 *
85 * (2) Large messages (size > DBLK_MAX_CACHE) are constructed by
86 * kmem_alloc()'ing a buffer for the data and supplying that
87 * buffer to gesballoc(), described below.
88 *
89 * (3) The four flavors of [d]esballoc[a] are all implemented by a
90 * common routine, gesballoc() ("generic esballoc"). gesballoc()
91 * allocates a dblk from the global dblk_esb_cache and sets db_base,
92 * db_lim and db_frtnp to describe the caller-supplied buffer.
93 *
94 * While there are several routines to allocate messages, there is only
95 * one routine to free messages: freeb(). freeb() simply invokes the
96 * dblk's free method, dbp->db_free(), which is set at allocation time.
97 *
98 * dupb() creates a new reference to a message by allocating a new mblk,
99 * incrementing the dblk reference count and setting the dblk's free
100 * method to dblk_decref(). The dblk's original free method is retained
101 * in db_lastfree. dblk_decref() decrements the reference count on each
102 * freeb(). If this is not the last reference it just frees the mblk;
103 * if this *is* the last reference, it restores db_free to db_lastfree,
104 * sets db_mblk to the current mblk (see below), and invokes db_lastfree.
105 *
106 * The implementation makes aggressive use of kmem object caching for
107 * maximum performance. This makes the code simple and compact, but
108 * also a bit abstruse in some places. The invariants that constitute a
109 * message's constructed state, described below, are more subtle than usual.
110 *
111 * Every dblk has an "attached mblk" as part of its constructed state.
112 * The mblk is allocated by the dblk's constructor and remains attached
113 * until the message is either dup'ed or pulled up. In the dupb() case
114 * the mblk association doesn't matter until the last free, at which time
115 * dblk_decref() attaches the last mblk to the dblk. pullupmsg() affects
116 * the mblk association because it swaps the leading mblks of two messages,
117 * so it is responsible for swapping their db_mblk pointers accordingly.
118 * From a constructed-state viewpoint it doesn't matter that a dblk's
119 * attached mblk can change while the message is allocated; all that
120 * matters is that the dblk has *some* attached mblk when it's freed.
121 *
122 * The sizes of the allocb() small-message caches are not magical.
123 * They represent a good trade-off between internal and external
124 * fragmentation for current workloads. They should be reevaluated
125 * periodically, especially if allocations larger than DBLK_MAX_CACHE
126 * become common. We use 64-byte alignment so that dblks don't
127 * straddle cache lines unnecessarily.
128 */
/*
 * Geometry of the fixed-size dblk caches: the largest cached message size,
 * the alignment of every dblk, and the granularity (DBLK_MIN_SIZE, i.e.
 * 1 << DBLK_SIZE_SHIFT) used to index dblk_cache[] by message size.
 */
#define	DBLK_MAX_CACHE		73728
#define	DBLK_CACHE_ALIGN	64
#define	DBLK_MIN_SIZE		8
#define	DBLK_SIZE_SHIFT		3

/*
 * The code below accesses db_ref, db_type, db_flags and db_struioflag
 * together as one 32-bit word that starts at db_ref (see DBLK_RTFU_WORD).
 * DBLK_RTFU_SHIFT(field) is the bit position of `field' inside that word:
 * eight times its byte distance from the appropriate end of the group,
 * depending on the byte order.
 */
#ifdef _BIG_ENDIAN
#define	DBLK_RTFU_SHIFT(field)	\
	(8 * (&((dblk_t *)0)->db_struioflag - &((dblk_t *)0)->field))
#else
#define	DBLK_RTFU_SHIFT(field)	\
	(8 * (&((dblk_t *)0)->field - &((dblk_t *)0)->db_ref))
#endif

/*
 * Build the combined ref/type/flags/uioflag word.  (ref) - 1 is OR-ed into
 * the flags byte; dblk_decref() later masks that byte with DBLK_REFMIN to
 * recover the minimum reference count.  Every argument is parenthesized so
 * that expression arguments (for example a conditional) bind as intended.
 */
#define	DBLK_RTFU(ref, type, flags, uioflag)	\
	(((ref) << DBLK_RTFU_SHIFT(db_ref)) | \
	((type) << DBLK_RTFU_SHIFT(db_type)) | \
	(((flags) | ((ref) - 1)) << DBLK_RTFU_SHIFT(db_flags)) | \
	((uioflag) << DBLK_RTFU_SHIFT(db_struioflag)))

/* Mask selecting the reference-count byte of the combined word. */
#define	DBLK_RTFU_REF_MASK	(DBLK_REFMAX << DBLK_RTFU_SHIFT(db_ref))

/* The four dblk bytes starting at db_ref, viewed as a single uint32_t. */
#define	DBLK_RTFU_WORD(dbp)	(*((uint32_t *)&(dbp)->db_ref))

/* The mblk bytes starting at b_band, viewed as a single uint32_t. */
#define	MBLK_BAND_FLAG_WORD(mp)	(*((uint32_t *)&(mp)->b_band))
150
151 static size_t dblk_sizes[] = {
152 #ifdef _LP64
153 16, 80, 144, 208, 272, 336, 528, 1040, 1488, 1936, 2576, 3856,
154 8192, 12048, 16384, 20240, 24576, 28432, 32768, 36624,
155 40960, 44816, 49152, 53008, 57344, 61200, 65536, 69392,
156 #else
157 64, 128, 320, 576, 1088, 1536, 1984, 2624, 3904,
158 8192, 12096, 16384, 20288, 24576, 28480, 32768, 36672,
159 40960, 44864, 49152, 53056, 57344, 61248, 65536, 69440,
160 #endif
161 DBLK_MAX_CACHE, 0
162 };
163
164 static struct kmem_cache *dblk_cache[DBLK_MAX_CACHE / DBLK_MIN_SIZE];
165 static struct kmem_cache *mblk_cache;
166 static struct kmem_cache *dblk_esb_cache;
167 #ifdef _KERNEL
168 static struct kmem_cache *fthdr_cache;
169 static struct kmem_cache *ftblk_cache;
170 #endif /* _KERNEL */
171
172 static void dblk_lastfree(mblk_t *mp, dblk_t *dbp);
173 static mblk_t *allocb_oversize(size_t size, int flags);
174 static int allocb_tryhard_fails;
175 static void frnop_func(void *arg);
176 frtn_t frnop = { frnop_func };
177 static void bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp);
178
179 #ifdef _KERNEL
180 static boolean_t rwnext_enter(queue_t *qp);
181 static void rwnext_exit(queue_t *qp);
182 #endif /* _KERNEL */
183
184 /*
185 * Patchable mblk/dblk kmem_cache flags.
186 */
187 int dblk_kmem_flags = 0;
188 int mblk_kmem_flags = 0;
189
190 static int
191 dblk_constructor(void *buf, void *cdrarg, int kmflags)
192 {
193 dblk_t *dbp = buf;
194 ssize_t msg_size = (ssize_t)cdrarg;
195 size_t index;
196
197 ASSERT(msg_size != 0);
198
199 index = (msg_size - 1) >> DBLK_SIZE_SHIFT;
200
201 ASSERT(index < (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT));
202
203 if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
204 return (-1);
205 if ((msg_size & PAGEOFFSET) == 0) {
206 dbp->db_base = kmem_alloc(msg_size, kmflags);
207 if (dbp->db_base == NULL) {
208 kmem_cache_free(mblk_cache, dbp->db_mblk);
209 return (-1);
210 }
211 } else {
212 dbp->db_base = (unsigned char *)&dbp[1];
213 }
214
215 dbp->db_mblk->b_datap = dbp;
216 dbp->db_cache = dblk_cache[index];
217 dbp->db_lim = dbp->db_base + msg_size;
218 dbp->db_free = dbp->db_lastfree = dblk_lastfree;
219 dbp->db_frtnp = NULL;
220 dbp->db_fthdr = NULL;
221 dbp->db_credp = NULL;
222 dbp->db_cpid = -1;
223 dbp->db_struioflag = 0;
224 dbp->db_struioun.cksum.flags = 0;
225 return (0);
226 }
227
228 /*ARGSUSED*/
229 static int
230 dblk_esb_constructor(void *buf, void *cdrarg, int kmflags)
231 {
232 dblk_t *dbp = buf;
233
234 if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
235 return (-1);
236 dbp->db_mblk->b_datap = dbp;
237 dbp->db_cache = dblk_esb_cache;
238 dbp->db_fthdr = NULL;
239 dbp->db_credp = NULL;
240 dbp->db_cpid = -1;
241 dbp->db_struioflag = 0;
242 dbp->db_struioun.cksum.flags = 0;
243 return (0);
244 }
245
246 static int
247 bcache_dblk_constructor(void *buf, void *cdrarg, int kmflags)
248 {
249 dblk_t *dbp = buf;
250 bcache_t *bcp = cdrarg;
251
252 if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
253 return (-1);
254
255 dbp->db_base = kmem_cache_alloc(bcp->buffer_cache, kmflags);
256 if (dbp->db_base == NULL) {
257 kmem_cache_free(mblk_cache, dbp->db_mblk);
258 return (-1);
259 }
260
261 dbp->db_mblk->b_datap = dbp;
262 dbp->db_cache = (void *)bcp;
263 dbp->db_lim = dbp->db_base + bcp->size;
264 dbp->db_free = dbp->db_lastfree = bcache_dblk_lastfree;
265 dbp->db_frtnp = NULL;
266 dbp->db_fthdr = NULL;
267 dbp->db_credp = NULL;
268 dbp->db_cpid = -1;
269 dbp->db_struioflag = 0;
270 dbp->db_struioun.cksum.flags = 0;
271 return (0);
272 }
273
274 /*ARGSUSED*/
275 static void
276 dblk_destructor(void *buf, void *cdrarg)
277 {
278 dblk_t *dbp = buf;
279 ssize_t msg_size = (ssize_t)cdrarg;
280
281 ASSERT(dbp->db_mblk->b_datap == dbp);
282 ASSERT(msg_size != 0);
283 ASSERT(dbp->db_struioflag == 0);
284 ASSERT(dbp->db_struioun.cksum.flags == 0);
285
286 if ((msg_size & PAGEOFFSET) == 0) {
287 kmem_free(dbp->db_base, msg_size);
288 }
289
290 kmem_cache_free(mblk_cache, dbp->db_mblk);
291 }
292
293 static void
294 bcache_dblk_destructor(void *buf, void *cdrarg)
295 {
296 dblk_t *dbp = buf;
297 bcache_t *bcp = cdrarg;
298
299 kmem_cache_free(bcp->buffer_cache, dbp->db_base);
300
301 ASSERT(dbp->db_mblk->b_datap == dbp);
302 ASSERT(dbp->db_struioflag == 0);
303 ASSERT(dbp->db_struioun.cksum.flags == 0);
304
305 kmem_cache_free(mblk_cache, dbp->db_mblk);
306 }
307
308 #ifdef _KERNEL
309
310 /* ARGSUSED */
311 static int
312 ftblk_constructor(void *buf, void *cdrarg, int kmflags)
313 {
314 ftblk_t *fbp = buf;
315 int i;
316
317 bzero(fbp, sizeof (ftblk_t));
318 if (str_ftstack != 0) {
319 for (i = 0; i < FTBLK_EVNTS; i++)
320 fbp->ev[i].stk = kmem_alloc(sizeof (ftstk_t), kmflags);
321 }
322
323 return (0);
324 }
325
326 /* ARGSUSED */
327 static void
328 ftblk_destructor(void *buf, void *cdrarg)
329 {
330 ftblk_t *fbp = buf;
331 int i;
332
333 if (str_ftstack != 0) {
334 for (i = 0; i < FTBLK_EVNTS; i++) {
335 if (fbp->ev[i].stk != NULL) {
336 kmem_free(fbp->ev[i].stk, sizeof (ftstk_t));
337 fbp->ev[i].stk = NULL;
338 }
339 }
340 }
341 }
342
343 static int
344 fthdr_constructor(void *buf, void *cdrarg, int kmflags)
345 {
346 fthdr_t *fhp = buf;
347
348 return (ftblk_constructor(&fhp->first, cdrarg, kmflags));
349 }
350
351 static void
352 fthdr_destructor(void *buf, void *cdrarg)
353 {
354 fthdr_t *fhp = buf;
355
356 ftblk_destructor(&fhp->first, cdrarg);
357 }
358
359 #endif /* _KERNEL */
360
361 /* Needed in the ASSERT below */
362 #ifdef DEBUG
363 #ifdef _KERNEL
364 #define KMEM_SLAB_T_SZ sizeof (kmem_slab_t)
365 #else /* _KERNEL */
366 #define KMEM_SLAB_T_SZ 64 /* fakekernel */
367 #endif /* _KERNEL */
368 #endif /* DEBUG */
369
370 void
371 streams_msg_init(void)
372 {
373 char name[40];
374 size_t size;
375 size_t lastsize = DBLK_MIN_SIZE;
376 size_t *sizep;
377 struct kmem_cache *cp;
378 size_t tot_size;
379 int offset;
380
381 mblk_cache = kmem_cache_create("streams_mblk", sizeof (mblk_t), 32,
382 NULL, NULL, NULL, NULL, NULL, mblk_kmem_flags);
383
384 for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) {
385
386 if ((offset = (size & PAGEOFFSET)) != 0) {
387 /*
388 * We are in the middle of a page, dblk should
389 * be allocated on the same page
390 */
391 tot_size = size + sizeof (dblk_t);
392 ASSERT((offset + sizeof (dblk_t) + KMEM_SLAB_T_SZ)
393 < PAGESIZE);
394 ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0);
395
396 } else {
397
398 /*
399 * buf size is multiple of page size, dblk and
400 * buffer are allocated separately.
401 */
402
403 ASSERT((size & (DBLK_CACHE_ALIGN - 1)) == 0);
404 tot_size = sizeof (dblk_t);
405 }
406
407 (void) sprintf(name, "streams_dblk_%ld", (long)size);
408 cp = kmem_cache_create(name, tot_size, DBLK_CACHE_ALIGN,
409 dblk_constructor, dblk_destructor, NULL, (void *)(size),
410 NULL, dblk_kmem_flags);
411
412 while (lastsize <= size) {
413 dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp;
414 lastsize += DBLK_MIN_SIZE;
415 }
416 }
417
418 dblk_esb_cache = kmem_cache_create("streams_dblk_esb", sizeof (dblk_t),
419 DBLK_CACHE_ALIGN, dblk_esb_constructor, dblk_destructor, NULL,
420 (void *)sizeof (dblk_t), NULL, dblk_kmem_flags);
421 #ifdef _KERNEL
422 fthdr_cache = kmem_cache_create("streams_fthdr", sizeof (fthdr_t), 32,
423 fthdr_constructor, fthdr_destructor, NULL, NULL, NULL, 0);
424 ftblk_cache = kmem_cache_create("streams_ftblk", sizeof (ftblk_t), 32,
425 ftblk_constructor, ftblk_destructor, NULL, NULL, NULL, 0);
426
427 /* Initialize Multidata caches */
428 mmd_init();
429
430 /* initialize throttling queue for esballoc */
431 esballoc_queue_init();
432 #endif /* _KERNEL */
433 }
434
435 /*ARGSUSED*/
436 mblk_t *
437 allocb(size_t size, uint_t pri)
438 {
439 dblk_t *dbp;
440 mblk_t *mp;
441 size_t index;
442
443 index = (size - 1) >> DBLK_SIZE_SHIFT;
444
445 if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
446 if (size != 0) {
447 mp = allocb_oversize(size, KM_NOSLEEP);
448 goto out;
449 }
450 index = 0;
451 }
452
453 if ((dbp = kmem_cache_alloc(dblk_cache[index], KM_NOSLEEP)) == NULL) {
454 mp = NULL;
455 goto out;
456 }
457
458 mp = dbp->db_mblk;
459 DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
460 mp->b_next = mp->b_prev = mp->b_cont = NULL;
461 mp->b_rptr = mp->b_wptr = dbp->db_base;
462 mp->b_queue = NULL;
463 MBLK_BAND_FLAG_WORD(mp) = 0;
464 STR_FTALLOC(&dbp->db_fthdr, FTEV_ALLOCB, size);
465 out:
466 FTRACE_1("allocb(): mp=0x%p", (uintptr_t)mp);
467
468 return (mp);
469 }
470
471 /*
472 * Allocate an mblk taking db_credp and db_cpid from the template.
473 * Allow the cred to be NULL.
474 */
475 mblk_t *
476 allocb_tmpl(size_t size, const mblk_t *tmpl)
477 {
478 mblk_t *mp = allocb(size, 0);
479
480 if (mp != NULL) {
481 dblk_t *src = tmpl->b_datap;
482 dblk_t *dst = mp->b_datap;
483 cred_t *cr;
484 pid_t cpid;
485
486 cr = msg_getcred(tmpl, &cpid);
487 if (cr != NULL)
488 crhold(dst->db_credp = cr);
489 dst->db_cpid = cpid;
490 dst->db_type = src->db_type;
491 }
492 return (mp);
493 }
494
495 mblk_t *
496 allocb_cred(size_t size, cred_t *cr, pid_t cpid)
497 {
498 mblk_t *mp = allocb(size, 0);
499
500 ASSERT(cr != NULL);
501 if (mp != NULL) {
502 dblk_t *dbp = mp->b_datap;
503
504 crhold(dbp->db_credp = cr);
505 dbp->db_cpid = cpid;
506 }
507 return (mp);
508 }
509
510 mblk_t *
511 allocb_cred_wait(size_t size, uint_t flags, int *error, cred_t *cr, pid_t cpid)
512 {
513 mblk_t *mp = allocb_wait(size, 0, flags, error);
514
515 ASSERT(cr != NULL);
516 if (mp != NULL) {
517 dblk_t *dbp = mp->b_datap;
518
519 crhold(dbp->db_credp = cr);
520 dbp->db_cpid = cpid;
521 }
522
523 return (mp);
524 }
525
526 /*
527 * Extract the db_cred (and optionally db_cpid) from a message.
528 * We find the first mblk which has a non-NULL db_cred and use that.
529 * If none found we return NULL.
530 * Does NOT get a hold on the cred.
531 */
532 cred_t *
533 msg_getcred(const mblk_t *mp, pid_t *cpidp)
534 {
535 cred_t *cr = NULL;
536 cred_t *cr2;
537 mblk_t *mp2;
538
539 while (mp != NULL) {
540 dblk_t *dbp = mp->b_datap;
541
542 cr = dbp->db_credp;
543 if (cr == NULL) {
544 mp = mp->b_cont;
545 continue;
546 }
547 if (cpidp != NULL)
548 *cpidp = dbp->db_cpid;
549
550 #ifdef DEBUG
551 /*
552 * Normally there should at most one db_credp in a message.
553 * But if there are multiple (as in the case of some M_IOC*
554 * and some internal messages in TCP/IP bind logic) then
555 * they must be identical in the normal case.
556 * However, a socket can be shared between different uids
557 * in which case data queued in TCP would be from different
558 * creds. Thus we can only assert for the zoneid being the
559 * same. Due to Multi-level Level Ports for TX, some
560 * cred_t can have a NULL cr_zone, and we skip the comparison
561 * in that case.
562 */
563 mp2 = mp->b_cont;
564 while (mp2 != NULL) {
565 cr2 = DB_CRED(mp2);
566 if (cr2 != NULL) {
567 DTRACE_PROBE2(msg__getcred,
568 cred_t *, cr, cred_t *, cr2);
569 ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
570 crgetzone(cr) == NULL ||
571 crgetzone(cr2) == NULL);
572 }
573 mp2 = mp2->b_cont;
574 }
575 #endif
576 return (cr);
577 }
578 if (cpidp != NULL)
579 *cpidp = NOPID;
580 return (NULL);
581 }
582
583 /*
584 * Variant of msg_getcred which, when a cred is found
585 * 1. Returns with a hold on the cred
586 * 2. Clears the first cred in the mblk.
587 * This is more efficient to use than a msg_getcred() + crhold() when
588 * the message is freed after the cred has been extracted.
589 *
590 * The caller is responsible for ensuring that there is no other reference
591 * on the message since db_credp can not be cleared when there are other
592 * references.
593 */
594 cred_t *
595 msg_extractcred(mblk_t *mp, pid_t *cpidp)
596 {
597 cred_t *cr = NULL;
598 cred_t *cr2;
599 mblk_t *mp2;
600
601 while (mp != NULL) {
602 dblk_t *dbp = mp->b_datap;
603
604 cr = dbp->db_credp;
605 if (cr == NULL) {
606 mp = mp->b_cont;
607 continue;
608 }
609 ASSERT(dbp->db_ref == 1);
610 dbp->db_credp = NULL;
611 if (cpidp != NULL)
612 *cpidp = dbp->db_cpid;
613 #ifdef DEBUG
614 /*
615 * Normally there should at most one db_credp in a message.
616 * But if there are multiple (as in the case of some M_IOC*
617 * and some internal messages in TCP/IP bind logic) then
618 * they must be identical in the normal case.
619 * However, a socket can be shared between different uids
620 * in which case data queued in TCP would be from different
621 * creds. Thus we can only assert for the zoneid being the
622 * same. Due to Multi-level Level Ports for TX, some
623 * cred_t can have a NULL cr_zone, and we skip the comparison
624 * in that case.
625 */
626 mp2 = mp->b_cont;
627 while (mp2 != NULL) {
628 cr2 = DB_CRED(mp2);
629 if (cr2 != NULL) {
630 DTRACE_PROBE2(msg__extractcred,
631 cred_t *, cr, cred_t *, cr2);
632 ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
633 crgetzone(cr) == NULL ||
634 crgetzone(cr2) == NULL);
635 }
636 mp2 = mp2->b_cont;
637 }
638 #endif
639 return (cr);
640 }
641 return (NULL);
642 }
643
644 #ifdef _KERNEL
645 /*
646 * Get the label for a message. Uses the first mblk in the message
647 * which has a non-NULL db_credp.
648 * Returns NULL if there is no credp.
649 */
650 extern struct ts_label_s *
651 msg_getlabel(const mblk_t *mp)
652 {
653 cred_t *cr = msg_getcred(mp, NULL);
654
655 if (cr == NULL)
656 return (NULL);
657
658 return (crgetlabel(cr));
659 }
660 #endif /* _KERNEL */
661
662 void
663 freeb(mblk_t *mp)
664 {
665 dblk_t *dbp = mp->b_datap;
666
667 ASSERT(dbp->db_ref > 0);
668 ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
669 FTRACE_1("freeb(): mp=0x%lx", (uintptr_t)mp);
670
671 STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
672
673 dbp->db_free(mp, dbp);
674 }
675
676 void
677 freemsg(mblk_t *mp)
678 {
679 FTRACE_1("freemsg(): mp=0x%lx", (uintptr_t)mp);
680 while (mp) {
681 dblk_t *dbp = mp->b_datap;
682 mblk_t *mp_cont = mp->b_cont;
683
684 ASSERT(dbp->db_ref > 0);
685 ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
686
687 STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
688
689 dbp->db_free(mp, dbp);
690 mp = mp_cont;
691 }
692 }
693
694 /*
695 * Reallocate a block for another use. Try hard to use the old block.
696 * If the old data is wanted (copy), leave b_wptr at the end of the data,
697 * otherwise return b_wptr = b_rptr.
698 *
699 * This routine is private and unstable.
700 */
701 mblk_t *
702 reallocb(mblk_t *mp, size_t size, uint_t copy)
703 {
704 mblk_t *mp1;
705 unsigned char *old_rptr;
706 ptrdiff_t cur_size;
707
708 if (mp == NULL)
709 return (allocb(size, BPRI_HI));
710
711 cur_size = mp->b_wptr - mp->b_rptr;
712 old_rptr = mp->b_rptr;
713
714 ASSERT(mp->b_datap->db_ref != 0);
715
716 if (mp->b_datap->db_ref == 1 && MBLKSIZE(mp) >= size) {
717 /*
718 * If the data is wanted and it will fit where it is, no
719 * work is required.
720 */
721 if (copy && mp->b_datap->db_lim - mp->b_rptr >= size)
722 return (mp);
723
724 mp->b_wptr = mp->b_rptr = mp->b_datap->db_base;
725 mp1 = mp;
726 } else if ((mp1 = allocb_tmpl(size, mp)) != NULL) {
727 /* XXX other mp state could be copied too, db_flags ... ? */
728 mp1->b_cont = mp->b_cont;
729 } else {
730 return (NULL);
731 }
732
733 if (copy) {
734 bcopy(old_rptr, mp1->b_rptr, cur_size);
735 mp1->b_wptr = mp1->b_rptr + cur_size;
736 }
737
738 if (mp != mp1)
739 freeb(mp);
740
741 return (mp1);
742 }
743
744 static void
745 dblk_lastfree(mblk_t *mp, dblk_t *dbp)
746 {
747 ASSERT(dbp->db_mblk == mp);
748 if (dbp->db_fthdr != NULL)
749 str_ftfree(dbp);
750
751 /* set credp and projid to be 'unspecified' before returning to cache */
752 if (dbp->db_credp != NULL) {
753 crfree(dbp->db_credp);
754 dbp->db_credp = NULL;
755 }
756 dbp->db_cpid = -1;
757
758 /* Reset the struioflag and the checksum flag fields */
759 dbp->db_struioflag = 0;
760 dbp->db_struioun.cksum.flags = 0;
761
762 /* and the COOKED and/or UIOA flag(s) */
763 dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA);
764
765 kmem_cache_free(dbp->db_cache, dbp);
766 }
767
768 static void
769 dblk_decref(mblk_t *mp, dblk_t *dbp)
770 {
771 if (dbp->db_ref != 1) {
772 uint32_t rtfu = atomic_add_32_nv(&DBLK_RTFU_WORD(dbp),
773 -(1 << DBLK_RTFU_SHIFT(db_ref)));
774 /*
775 * atomic_add_32_nv() just decremented db_ref, so we no longer
776 * have a reference to the dblk, which means another thread
777 * could free it. Therefore we cannot examine the dblk to
778 * determine whether ours was the last reference. Instead,
779 * we extract the new and minimum reference counts from rtfu.
780 * Note that all we're really saying is "if (ref != refmin)".
781 */
782 if (((rtfu >> DBLK_RTFU_SHIFT(db_ref)) & DBLK_REFMAX) !=
783 ((rtfu >> DBLK_RTFU_SHIFT(db_flags)) & DBLK_REFMIN)) {
784 kmem_cache_free(mblk_cache, mp);
785 return;
786 }
787 }
788 dbp->db_mblk = mp;
789 dbp->db_free = dbp->db_lastfree;
790 dbp->db_lastfree(mp, dbp);
791 }
792
793 mblk_t *
794 dupb(mblk_t *mp)
795 {
796 dblk_t *dbp = mp->b_datap;
797 mblk_t *new_mp;
798 uint32_t oldrtfu, newrtfu;
799
800 if ((new_mp = kmem_cache_alloc(mblk_cache, KM_NOSLEEP)) == NULL)
801 goto out;
802
803 new_mp->b_next = new_mp->b_prev = new_mp->b_cont = NULL;
804 new_mp->b_rptr = mp->b_rptr;
805 new_mp->b_wptr = mp->b_wptr;
806 new_mp->b_datap = dbp;
807 new_mp->b_queue = NULL;
808 MBLK_BAND_FLAG_WORD(new_mp) = MBLK_BAND_FLAG_WORD(mp);
809
810 STR_FTEVENT_MBLK(mp, caller(), FTEV_DUPB, dbp->db_ref);
811
812 dbp->db_free = dblk_decref;
813 do {
814 ASSERT(dbp->db_ref > 0);
815 oldrtfu = DBLK_RTFU_WORD(dbp);
816 newrtfu = oldrtfu + (1 << DBLK_RTFU_SHIFT(db_ref));
817 /*
818 * If db_ref is maxed out we can't dup this message anymore.
819 */
820 if ((oldrtfu & DBLK_RTFU_REF_MASK) == DBLK_RTFU_REF_MASK) {
821 kmem_cache_free(mblk_cache, new_mp);
822 new_mp = NULL;
823 goto out;
824 }
825 } while (atomic_cas_32(&DBLK_RTFU_WORD(dbp), oldrtfu, newrtfu) !=
826 oldrtfu);
827
828 out:
829 FTRACE_1("dupb(): new_mp=0x%lx", (uintptr_t)new_mp);
830 return (new_mp);
831 }
832
833 static void
834 dblk_lastfree_desb(mblk_t *mp, dblk_t *dbp)
835 {
836 frtn_t *frp = dbp->db_frtnp;
837
838 ASSERT(dbp->db_mblk == mp);
839 frp->free_func(frp->free_arg);
840 if (dbp->db_fthdr != NULL)
841 str_ftfree(dbp);
842
843 /* set credp and projid to be 'unspecified' before returning to cache */
844 if (dbp->db_credp != NULL) {
845 crfree(dbp->db_credp);
846 dbp->db_credp = NULL;
847 }
848 dbp->db_cpid = -1;
849 dbp->db_struioflag = 0;
850 dbp->db_struioun.cksum.flags = 0;
851
852 kmem_cache_free(dbp->db_cache, dbp);
853 }
854
/*
 * Free routine that does nothing; it is the callback of the `frnop'
 * descriptor.
 */
/*ARGSUSED*/
static void
frnop_func(void *arg)
{
	/* Intentionally empty. */
}
860
861 /*
862 * Generic esballoc used to implement the four flavors: [d]esballoc[a].
863 */
864 static mblk_t *
865 gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp,
866 void (*lastfree)(mblk_t *, dblk_t *), int kmflags)
867 {
868 dblk_t *dbp;
869 mblk_t *mp;
870
871 ASSERT(base != NULL && frp != NULL);
872
873 if ((dbp = kmem_cache_alloc(dblk_esb_cache, kmflags)) == NULL) {
874 mp = NULL;
875 goto out;
876 }
877
878 mp = dbp->db_mblk;
879 dbp->db_base = base;
880 dbp->db_lim = base + size;
881 dbp->db_free = dbp->db_lastfree = lastfree;
882 dbp->db_frtnp = frp;
883 DBLK_RTFU_WORD(dbp) = db_rtfu;
884 mp->b_next = mp->b_prev = mp->b_cont = NULL;
885 mp->b_rptr = mp->b_wptr = base;
886 mp->b_queue = NULL;
887 MBLK_BAND_FLAG_WORD(mp) = 0;
888
889 out:
890 FTRACE_1("gesballoc(): mp=0x%lx", (uintptr_t)mp);
891 return (mp);
892 }
893
894 /*ARGSUSED*/
895 mblk_t *
896 esballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
897 {
898 mblk_t *mp;
899
900 /*
901 * Note that this is structured to allow the common case (i.e.
902 * STREAMS flowtracing disabled) to call gesballoc() with tail
903 * call optimization.
904 */
905 if (!str_ftnever) {
906 mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
907 frp, freebs_enqueue, KM_NOSLEEP);
908
909 if (mp != NULL)
910 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
911 return (mp);
912 }
913
914 return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
915 frp, freebs_enqueue, KM_NOSLEEP));
916 }
917
918 /*
919 * Same as esballoc() but sleeps waiting for memory.
920 */
921 /*ARGSUSED*/
922 mblk_t *
923 esballoc_wait(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
924 {
925 mblk_t *mp;
926
927 /*
928 * Note that this is structured to allow the common case (i.e.
929 * STREAMS flowtracing disabled) to call gesballoc() with tail
930 * call optimization.
931 */
932 if (!str_ftnever) {
933 mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
934 frp, freebs_enqueue, KM_SLEEP);
935
936 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
937 return (mp);
938 }
939
940 return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
941 frp, freebs_enqueue, KM_SLEEP));
942 }
943
944 /*ARGSUSED*/
945 mblk_t *
946 desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
947 {
948 mblk_t *mp;
949
950 /*
951 * Note that this is structured to allow the common case (i.e.
952 * STREAMS flowtracing disabled) to call gesballoc() with tail
953 * call optimization.
954 */
955 if (!str_ftnever) {
956 mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
957 frp, dblk_lastfree_desb, KM_NOSLEEP);
958
959 if (mp != NULL)
960 STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size);
961 return (mp);
962 }
963
964 return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
965 frp, dblk_lastfree_desb, KM_NOSLEEP));
966 }
967
968 /*ARGSUSED*/
969 mblk_t *
970 esballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
971 {
972 mblk_t *mp;
973
974 /*
975 * Note that this is structured to allow the common case (i.e.
976 * STREAMS flowtracing disabled) to call gesballoc() with tail
977 * call optimization.
978 */
979 if (!str_ftnever) {
980 mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
981 frp, freebs_enqueue, KM_NOSLEEP);
982
983 if (mp != NULL)
984 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOCA, size);
985 return (mp);
986 }
987
988 return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
989 frp, freebs_enqueue, KM_NOSLEEP));
990 }
991
992 /*ARGSUSED*/
993 mblk_t *
994 desballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
995 {
996 mblk_t *mp;
997
998 /*
999 * Note that this is structured to allow the common case (i.e.
1000 * STREAMS flowtracing disabled) to call gesballoc() with tail
1001 * call optimization.
1002 */
1003 if (!str_ftnever) {
1004 mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
1005 frp, dblk_lastfree_desb, KM_NOSLEEP);
1006
1007 if (mp != NULL)
1008 STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOCA, size);
1009 return (mp);
1010 }
1011
1012 return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
1013 frp, dblk_lastfree_desb, KM_NOSLEEP));
1014 }
1015
1016 static void
1017 bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp)
1018 {
1019 bcache_t *bcp = dbp->db_cache;
1020
1021 ASSERT(dbp->db_mblk == mp);
1022 if (dbp->db_fthdr != NULL)
1023 str_ftfree(dbp);
1024
1025 /* set credp and projid to be 'unspecified' before returning to cache */
1026 if (dbp->db_credp != NULL) {
1027 crfree(dbp->db_credp);
1028 dbp->db_credp = NULL;
1029 }
1030 dbp->db_cpid = -1;
1031 dbp->db_struioflag = 0;
1032 dbp->db_struioun.cksum.flags = 0;
1033
1034 mutex_enter(&bcp->mutex);
1035 kmem_cache_free(bcp->dblk_cache, dbp);
1036 bcp->alloc--;
1037
1038 if (bcp->alloc == 0 && bcp->destroy != 0) {
1039 kmem_cache_destroy(bcp->dblk_cache);
1040 kmem_cache_destroy(bcp->buffer_cache);
1041 mutex_exit(&bcp->mutex);
1042 mutex_destroy(&bcp->mutex);
1043 kmem_free(bcp, sizeof (bcache_t));
1044 } else {
1045 mutex_exit(&bcp->mutex);
1046 }
1047 }
1048
1049 bcache_t *
1050 bcache_create(char *name, size_t size, uint_t align)
1051 {
1052 bcache_t *bcp;
1053 char buffer[255];
1054
1055 ASSERT((align & (align - 1)) == 0);
1056
1057 if ((bcp = kmem_alloc(sizeof (bcache_t), KM_NOSLEEP)) == NULL)
1058 return (NULL);
1059
1060 bcp->size = size;
1061 bcp->align = align;
1062 bcp->alloc = 0;
1063 bcp->destroy = 0;
1064
1065 mutex_init(&bcp->mutex, NULL, MUTEX_DRIVER, NULL);
1066
1067 (void) sprintf(buffer, "%s_buffer_cache", name);
1068 bcp->buffer_cache = kmem_cache_create(buffer, size, align, NULL, NULL,
1069 NULL, NULL, NULL, 0);
1070 (void) sprintf(buffer, "%s_dblk_cache", name);
1071 bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t),
1072 DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor,
1073 NULL, (void *)bcp, NULL, 0);
1074
1075 return (bcp);
1076 }
1077
1078 void
1079 bcache_destroy(bcache_t *bcp)
1080 {
1081 ASSERT(bcp != NULL);
1082
1083 mutex_enter(&bcp->mutex);
1084 if (bcp->alloc == 0) {
1085 kmem_cache_destroy(bcp->dblk_cache);
1086 kmem_cache_destroy(bcp->buffer_cache);
1087 mutex_exit(&bcp->mutex);
1088 mutex_destroy(&bcp->mutex);
1089 kmem_free(bcp, sizeof (bcache_t));
1090 } else {
1091 bcp->destroy++;
1092 mutex_exit(&bcp->mutex);
1093 }
1094 }
1095
1096 /*ARGSUSED*/
1097 mblk_t *
1098 bcache_allocb(bcache_t *bcp, uint_t pri)
1099 {
1100 dblk_t *dbp;
1101 mblk_t *mp = NULL;
1102
1103 ASSERT(bcp != NULL);
1104
1105 mutex_enter(&bcp->mutex);
1106 if (bcp->destroy != 0) {
1107 mutex_exit(&bcp->mutex);
1108 goto out;
1109 }
1110
1111 if ((dbp = kmem_cache_alloc(bcp->dblk_cache, KM_NOSLEEP)) == NULL) {
1112 mutex_exit(&bcp->mutex);
1113 goto out;
1114 }
1115 bcp->alloc++;
1116 mutex_exit(&bcp->mutex);
1117
1118 ASSERT(((uintptr_t)(dbp->db_base) & (bcp->align - 1)) == 0);
1119
1120 mp = dbp->db_mblk;
1121 DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
1122 mp->b_next = mp->b_prev = mp->b_cont = NULL;
1123 mp->b_rptr = mp->b_wptr = dbp->db_base;
1124 mp->b_queue = NULL;
1125 MBLK_BAND_FLAG_WORD(mp) = 0;
1126 STR_FTALLOC(&dbp->db_fthdr, FTEV_BCALLOCB, bcp->size);
1127 out:
1128 FTRACE_1("bcache_allocb(): mp=0x%p", (uintptr_t)mp);
1129
1130 return (mp);
1131 }
1132
1133 static void
1134 dblk_lastfree_oversize(mblk_t *mp, dblk_t *dbp)
1135 {
1136 ASSERT(dbp->db_mblk == mp);
1137 if (dbp->db_fthdr != NULL)
1138 str_ftfree(dbp);
1139
1140 /* set credp and projid to be 'unspecified' before returning to cache */
1141 if (dbp->db_credp != NULL) {
1142 crfree(dbp->db_credp);
1143 dbp->db_credp = NULL;
1144 }
1145 dbp->db_cpid = -1;
1146 dbp->db_struioflag = 0;
1147 dbp->db_struioun.cksum.flags = 0;
1148
1149 kmem_free(dbp->db_base, dbp->db_lim - dbp->db_base);
1150 kmem_cache_free(dbp->db_cache, dbp);
1151 }
1152
1153 static mblk_t *
1154 allocb_oversize(size_t size, int kmflags)
1155 {
1156 mblk_t *mp;
1157 void *buf;
1158
1159 size = P2ROUNDUP(size, DBLK_CACHE_ALIGN);
1160 if ((buf = kmem_alloc(size, kmflags)) == NULL)
1161 return (NULL);
1162 if ((mp = gesballoc(buf, size, DBLK_RTFU(1, M_DATA, 0, 0),
1163 &frnop, dblk_lastfree_oversize, kmflags)) == NULL)
1164 kmem_free(buf, size);
1165
1166 if (mp != NULL)
1167 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBIG, size);
1168
1169 return (mp);
1170 }
1171
1172 mblk_t *
1173 allocb_tryhard(size_t target_size)
1174 {
1175 size_t size;
1176 mblk_t *bp;
1177
1178 for (size = target_size; size < target_size + 512;
1179 size += DBLK_CACHE_ALIGN)
1180 if ((bp = allocb(size, BPRI_HI)) != NULL)
1181 return (bp);
1182 allocb_tryhard_fails++;
1183 return (NULL);
1184 }
1185
1186 /*
1187 * This routine is consolidation private for STREAMS internal use
1188 * This routine may only be called from sync routines (i.e., not
1189 * from put or service procedures). It is located here (rather
1190 * than strsubr.c) so that we don't have to expose all of the
1191 * allocb() implementation details in header files.
1192 */
1193 mblk_t *
1194 allocb_wait(size_t size, uint_t pri, uint_t flags, int *error)
1195 {
1196 dblk_t *dbp;
1197 mblk_t *mp;
1198 size_t index;
1199
1200 index = (size -1) >> DBLK_SIZE_SHIFT;
1201
1202 if (flags & STR_NOSIG) {
1203 if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
1204 if (size != 0) {
1205 mp = allocb_oversize(size, KM_SLEEP);
1206 FTRACE_1("allocb_wait (NOSIG): mp=0x%lx",
1207 (uintptr_t)mp);
1208 return (mp);
1209 }
1210 index = 0;
1211 }
1212
1213 dbp = kmem_cache_alloc(dblk_cache[index], KM_SLEEP);
1214 mp = dbp->db_mblk;
1215 DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
1216 mp->b_next = mp->b_prev = mp->b_cont = NULL;
1217 mp->b_rptr = mp->b_wptr = dbp->db_base;
1218 mp->b_queue = NULL;
1219 MBLK_BAND_FLAG_WORD(mp) = 0;
1220 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBW, size);
1221
1222 FTRACE_1("allocb_wait (NOSIG): mp=0x%lx", (uintptr_t)mp);
1223
1224 } else {
1225 while ((mp = allocb(size, pri)) == NULL) {
1226 if ((*error = strwaitbuf(size, BPRI_HI)) != 0)
1227 return (NULL);
1228 }
1229 }
1230
1231 return (mp);
1232 }
1233
1234 /*
1235 * Call function 'func' with 'arg' when a class zero block can
1236 * be allocated with priority 'pri'.
1237 */
1238 bufcall_id_t
1239 esbbcall(uint_t pri, void (*func)(void *), void *arg)
1240 {
1241 return (bufcall(1, pri, func, arg));
1242 }
1243
1244 /*
1245 * Allocates an iocblk (M_IOCTL) block. Properly sets the credentials
1246 * ioc_id, rval and error of the struct ioctl to set up an ioctl call.
1247 * This provides consistency for all internal allocators of ioctl.
1248 */
1249 mblk_t *
1250 mkiocb(uint_t cmd)
1251 {
1252 struct iocblk *ioc;
1253 mblk_t *mp;
1254
1255 /*
1256 * Allocate enough space for any of the ioctl related messages.
1257 */
1258 if ((mp = allocb(sizeof (union ioctypes), BPRI_MED)) == NULL)
1259 return (NULL);
1260
1261 bzero(mp->b_rptr, sizeof (union ioctypes));
1262
1263 /*
1264 * Set the mblk_t information and ptrs correctly.
1265 */
1266 mp->b_wptr += sizeof (struct iocblk);
1267 mp->b_datap->db_type = M_IOCTL;
1268
1269 /*
1270 * Fill in the fields.
1271 */
1272 ioc = (struct iocblk *)mp->b_rptr;
1273 ioc->ioc_cmd = cmd;
1274 ioc->ioc_cr = kcred;
1275 ioc->ioc_id = getiocseqno();
1276 ioc->ioc_flag = IOC_NATIVE;
1277 return (mp);
1278 }
1279
1280 /*
1281 * test if block of given size can be allocated with a request of
1282 * the given priority.
1283 * 'pri' is no longer used, but is retained for compatibility.
1284 */
1285 /* ARGSUSED */
1286 int
1287 testb(size_t size, uint_t pri)
1288 {
1289 return ((size + sizeof (dblk_t)) <= kmem_avail());
1290 }
1291
1292 #ifdef _KERNEL
1293
1294 /*
1295 * Call function 'func' with argument 'arg' when there is a reasonably
1296 * good chance that a block of size 'size' can be allocated.
1297 * 'pri' is no longer used, but is retained for compatibility.
1298 */
1299 /* ARGSUSED */
1300 bufcall_id_t
1301 bufcall(size_t size, uint_t pri, void (*func)(void *), void *arg)
1302 {
1303 static long bid = 1; /* always odd to save checking for zero */
1304 bufcall_id_t bc_id;
1305 struct strbufcall *bcp;
1306
1307 if ((bcp = kmem_alloc(sizeof (strbufcall_t), KM_NOSLEEP)) == NULL)
1308 return (0);
1309
1310 bcp->bc_func = func;
1311 bcp->bc_arg = arg;
1312 bcp->bc_size = size;
1313 bcp->bc_next = NULL;
1314 bcp->bc_executor = NULL;
1315
1316 mutex_enter(&strbcall_lock);
1317 /*
1318 * After bcp is linked into strbcalls and strbcall_lock is dropped there
1319 * should be no references to bcp since it may be freed by
1320 * runbufcalls(). Since bcp_id field is returned, we save its value in
1321 * the local var.
1322 */
1323 bc_id = bcp->bc_id = (bufcall_id_t)(bid += 2); /* keep it odd */
1324
1325 /*
1326 * add newly allocated stream event to existing
1327 * linked list of events.
1328 */
1329 if (strbcalls.bc_head == NULL) {
1330 strbcalls.bc_head = strbcalls.bc_tail = bcp;
1331 } else {
1332 strbcalls.bc_tail->bc_next = bcp;
1333 strbcalls.bc_tail = bcp;
1334 }
1335
1336 cv_signal(&strbcall_cv);
1337 mutex_exit(&strbcall_lock);
1338 return (bc_id);
1339 }
1340
1341 /*
1342 * Cancel a bufcall request.
1343 */
1344 void
1345 unbufcall(bufcall_id_t id)
1346 {
1347 strbufcall_t *bcp, *pbcp;
1348
1349 mutex_enter(&strbcall_lock);
1350 again:
1351 pbcp = NULL;
1352 for (bcp = strbcalls.bc_head; bcp; bcp = bcp->bc_next) {
1353 if (id == bcp->bc_id)
1354 break;
1355 pbcp = bcp;
1356 }
1357 if (bcp) {
1358 if (bcp->bc_executor != NULL) {
1359 if (bcp->bc_executor != curthread) {
1360 cv_wait(&bcall_cv, &strbcall_lock);
1361 goto again;
1362 }
1363 } else {
1364 if (pbcp)
1365 pbcp->bc_next = bcp->bc_next;
1366 else
1367 strbcalls.bc_head = bcp->bc_next;
1368 if (bcp == strbcalls.bc_tail)
1369 strbcalls.bc_tail = pbcp;
1370 kmem_free(bcp, sizeof (strbufcall_t));
1371 }
1372 }
1373 mutex_exit(&strbcall_lock);
1374 }
1375
1376 #endif /* _KERNEL */
1377
1378 /*
1379 * Duplicate a message block by block (uses dupb), returning
1380 * a pointer to the duplicate message.
1381 * Returns a non-NULL value only if the entire message
1382 * was dup'd.
1383 */
1384 mblk_t *
1385 dupmsg(mblk_t *bp)
1386 {
1387 mblk_t *head, *nbp;
1388
1389 if (!bp || !(nbp = head = dupb(bp)))
1390 return (NULL);
1391
1392 while (bp->b_cont) {
1393 if (!(nbp->b_cont = dupb(bp->b_cont))) {
1394 freemsg(head);
1395 return (NULL);
1396 }
1397 nbp = nbp->b_cont;
1398 bp = bp->b_cont;
1399 }
1400 return (head);
1401 }
1402
1403 #define DUPB_NOLOAN(bp) \
1404 ((((bp)->b_datap->db_struioflag & STRUIO_ZC) != 0) ? \
1405 copyb((bp)) : dupb((bp)))
1406
1407 mblk_t *
1408 dupmsg_noloan(mblk_t *bp)
1409 {
1410 mblk_t *head, *nbp;
1411
1412 if (bp == NULL || DB_TYPE(bp) != M_DATA ||
1413 ((nbp = head = DUPB_NOLOAN(bp)) == NULL))
1414 return (NULL);
1415
1416 while (bp->b_cont) {
1417 if ((nbp->b_cont = DUPB_NOLOAN(bp->b_cont)) == NULL) {
1418 freemsg(head);
1419 return (NULL);
1420 }
1421 nbp = nbp->b_cont;
1422 bp = bp->b_cont;
1423 }
1424 return (head);
1425 }
1426
1427 /*
1428 * Copy data from message and data block to newly allocated message and
1429 * data block. Returns new message block pointer, or NULL if error.
1430 * The alignment of rptr (w.r.t. word alignment) will be the same in the copy
1431 * as in the original even when db_base is not word aligned. (bug 1052877)
1432 */
1433 mblk_t *
1434 copyb(mblk_t *bp)
1435 {
1436 mblk_t *nbp;
1437 dblk_t *dp, *ndp;
1438 uchar_t *base;
1439 size_t size;
1440 size_t unaligned;
1441
1442 ASSERT(bp->b_wptr >= bp->b_rptr);
1443
1444 dp = bp->b_datap;
1445 if (dp->db_fthdr != NULL)
1446 STR_FTEVENT_MBLK(bp, caller(), FTEV_COPYB, 0);
1447
1448 /*
1449 * Special handling for Multidata message; this should be
1450 * removed once a copy-callback routine is made available.
1451 */
1452 if (dp->db_type == M_MULTIDATA) {
1453 #ifdef _KERNEL
1454 cred_t *cr;
1455
1456 if ((nbp = mmd_copy(bp, KM_NOSLEEP)) == NULL)
1457 return (NULL);
1458
1459 nbp->b_flag = bp->b_flag;
1460 nbp->b_band = bp->b_band;
1461 ndp = nbp->b_datap;
1462
1463 /* See comments below on potential issues. */
1464 STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);
1465
1466 ASSERT(ndp->db_type == dp->db_type);
1467 cr = dp->db_credp;
1468 if (cr != NULL)
1469 crhold(ndp->db_credp = cr);
1470 ndp->db_cpid = dp->db_cpid;
1471 return (nbp);
1472 #else /* _KERNEL */
1473 return (NULL);
1474 #endif /* _KERNEL */
1475 }
1476
1477 size = dp->db_lim - dp->db_base;
1478 unaligned = P2PHASE((uintptr_t)dp->db_base, sizeof (uint_t));
1479 if ((nbp = allocb_tmpl(size + unaligned, bp)) == NULL)
1480 return (NULL);
1481 nbp->b_flag = bp->b_flag;
1482 nbp->b_band = bp->b_band;
1483 ndp = nbp->b_datap;
1484
1485 /*
1486 * Well, here is a potential issue. If we are trying to
1487 * trace a flow, and we copy the message, we might lose
1488 * information about where this message might have been.
1489 * So we should inherit the FT data. On the other hand,
1490 * a user might be interested only in alloc to free data.
1491 * So I guess the real answer is to provide a tunable.
1492 */
1493 STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);
1494
1495 base = ndp->db_base + unaligned;
1496 bcopy(dp->db_base, ndp->db_base + unaligned, size);
1497
1498 nbp->b_rptr = base + (bp->b_rptr - dp->db_base);
1499 nbp->b_wptr = nbp->b_rptr + MBLKL(bp);
1500
1501 return (nbp);
1502 }
1503
1504 /*
1505 * Copy data from message to newly allocated message using new
1506 * data blocks. Returns a pointer to the new message, or NULL if error.
1507 */
1508 mblk_t *
1509 copymsg(mblk_t *bp)
1510 {
1511 mblk_t *head, *nbp;
1512
1513 if (!bp || !(nbp = head = copyb(bp)))
1514 return (NULL);
1515
1516 while (bp->b_cont) {
1517 if (!(nbp->b_cont = copyb(bp->b_cont))) {
1518 freemsg(head);
1519 return (NULL);
1520 }
1521 nbp = nbp->b_cont;
1522 bp = bp->b_cont;
1523 }
1524 return (head);
1525 }
1526
1527 /*
1528 * link a message block to tail of message
1529 */
1530 void
1531 linkb(mblk_t *mp, mblk_t *bp)
1532 {
1533 ASSERT(mp && bp);
1534
1535 for (; mp->b_cont; mp = mp->b_cont)
1536 ;
1537 mp->b_cont = bp;
1538 }
1539
1540 /*
1541 * unlink a message block from head of message
1542 * return pointer to new message.
1543 * NULL if message becomes empty.
1544 */
1545 mblk_t *
1546 unlinkb(mblk_t *bp)
1547 {
1548 mblk_t *bp1;
1549
1550 bp1 = bp->b_cont;
1551 bp->b_cont = NULL;
1552 return (bp1);
1553 }
1554
1555 /*
1556 * remove a message block "bp" from message "mp"
1557 *
1558 * Return pointer to new message or NULL if no message remains.
1559 * Return -1 if bp is not found in message.
1560 */
1561 mblk_t *
1562 rmvb(mblk_t *mp, mblk_t *bp)
1563 {
1564 mblk_t *tmp;
1565 mblk_t *lastp = NULL;
1566
1567 ASSERT(mp && bp);
1568 for (tmp = mp; tmp; tmp = tmp->b_cont) {
1569 if (tmp == bp) {
1570 if (lastp)
1571 lastp->b_cont = tmp->b_cont;
1572 else
1573 mp = tmp->b_cont;
1574 tmp->b_cont = NULL;
1575 return (mp);
1576 }
1577 lastp = tmp;
1578 }
1579 return ((mblk_t *)-1);
1580 }
1581
1582 /*
1583 * Concatenate and align first len bytes of common
1584 * message type. Len == -1, means concat everything.
1585 * Returns 1 on success, 0 on failure
1586 * After the pullup, mp points to the pulled up data.
1587 */
1588 int
1589 pullupmsg(mblk_t *mp, ssize_t len)
1590 {
1591 mblk_t *bp, *b_cont;
1592 dblk_t *dbp;
1593 ssize_t n;
1594
1595 ASSERT(mp->b_datap->db_ref > 0);
1596 ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
1597
1598 /*
1599 * We won't handle Multidata message, since it contains
1600 * metadata which this function has no knowledge of; we
1601 * assert on DEBUG, and return failure otherwise.
1602 */
1603 ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1604 if (mp->b_datap->db_type == M_MULTIDATA)
1605 return (0);
1606
1607 if (len == -1) {
1608 if (mp->b_cont == NULL && str_aligned(mp->b_rptr))
1609 return (1);
1610 len = xmsgsize(mp);
1611 } else {
1612 ssize_t first_mblk_len = mp->b_wptr - mp->b_rptr;
1613 ASSERT(first_mblk_len >= 0);
1614 /*
1615 * If the length is less than that of the first mblk,
1616 * we want to pull up the message into an aligned mblk.
1617 * Though not part of the spec, some callers assume it.
1618 */
1619 if (len <= first_mblk_len) {
1620 if (str_aligned(mp->b_rptr))
1621 return (1);
1622 len = first_mblk_len;
1623 } else if (xmsgsize(mp) < len)
1624 return (0);
1625 }
1626
1627 if ((bp = allocb_tmpl(len, mp)) == NULL)
1628 return (0);
1629
1630 dbp = bp->b_datap;
1631 *bp = *mp; /* swap mblks so bp heads the old msg... */
1632 mp->b_datap = dbp; /* ... and mp heads the new message */
1633 mp->b_datap->db_mblk = mp;
1634 bp->b_datap->db_mblk = bp;
1635 mp->b_rptr = mp->b_wptr = dbp->db_base;
1636
1637 do {
1638 ASSERT(bp->b_datap->db_ref > 0);
1639 ASSERT(bp->b_wptr >= bp->b_rptr);
1640 n = MIN(bp->b_wptr - bp->b_rptr, len);
1641 ASSERT(n >= 0); /* allow zero-length mblk_t's */
1642 if (n > 0)
1643 bcopy(bp->b_rptr, mp->b_wptr, (size_t)n);
1644 mp->b_wptr += n;
1645 bp->b_rptr += n;
1646 len -= n;
1647 if (bp->b_rptr != bp->b_wptr)
1648 break;
1649 b_cont = bp->b_cont;
1650 freeb(bp);
1651 bp = b_cont;
1652 } while (len && bp);
1653
1654 mp->b_cont = bp; /* tack on whatever wasn't pulled up */
1655
1656 return (1);
1657 }
1658
1659 /*
1660 * Concatenate and align at least the first len bytes of common message
1661 * type. Len == -1 means concatenate everything. The original message is
1662 * unaltered. Returns a pointer to a new message on success, otherwise
1663 * returns NULL.
1664 */
1665 mblk_t *
1666 msgpullup(mblk_t *mp, ssize_t len)
1667 {
1668 mblk_t *newmp;
1669 ssize_t totlen;
1670 ssize_t n;
1671
1672 /*
1673 * We won't handle Multidata message, since it contains
1674 * metadata which this function has no knowledge of; we
1675 * assert on DEBUG, and return failure otherwise.
1676 */
1677 ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1678 if (mp->b_datap->db_type == M_MULTIDATA)
1679 return (NULL);
1680
1681 totlen = xmsgsize(mp);
1682
1683 if ((len > 0) && (len > totlen))
1684 return (NULL);
1685
1686 /*
1687 * Copy all of the first msg type into one new mblk, then dupmsg
1688 * and link the rest onto this.
1689 */
1690
1691 len = totlen;
1692
1693 if ((newmp = allocb_tmpl(len, mp)) == NULL)
1694 return (NULL);
1695
1696 newmp->b_flag = mp->b_flag;
1697 newmp->b_band = mp->b_band;
1698
1699 while (len > 0) {
1700 n = mp->b_wptr - mp->b_rptr;
1701 ASSERT(n >= 0); /* allow zero-length mblk_t's */
1702 if (n > 0)
1703 bcopy(mp->b_rptr, newmp->b_wptr, n);
1704 newmp->b_wptr += n;
1705 len -= n;
1706 mp = mp->b_cont;
1707 }
1708
1709 if (mp != NULL) {
1710 newmp->b_cont = dupmsg(mp);
1711 if (newmp->b_cont == NULL) {
1712 freemsg(newmp);
1713 return (NULL);
1714 }
1715 }
1716
1717 return (newmp);
1718 }
1719
1720 /*
1721 * Trim bytes from message
1722 * len > 0, trim from head
1723 * len < 0, trim from tail
1724 * Returns 1 on success, 0 on failure.
1725 */
1726 int
1727 adjmsg(mblk_t *mp, ssize_t len)
1728 {
1729 mblk_t *bp;
1730 mblk_t *save_bp = NULL;
1731 mblk_t *prev_bp;
1732 mblk_t *bcont;
1733 unsigned char type;
1734 ssize_t n;
1735 int fromhead;
1736 int first;
1737
1738 ASSERT(mp != NULL);
1739 /*
1740 * We won't handle Multidata message, since it contains
1741 * metadata which this function has no knowledge of; we
1742 * assert on DEBUG, and return failure otherwise.
1743 */
1744 ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1745 if (mp->b_datap->db_type == M_MULTIDATA)
1746 return (0);
1747
1748 if (len < 0) {
1749 fromhead = 0;
1750 len = -len;
1751 } else {
1752 fromhead = 1;
1753 }
1754
1755 if (xmsgsize(mp) < len)
1756 return (0);
1757
1758 if (fromhead) {
1759 first = 1;
1760 while (len) {
1761 ASSERT(mp->b_wptr >= mp->b_rptr);
1762 n = MIN(mp->b_wptr - mp->b_rptr, len);
1763 mp->b_rptr += n;
1764 len -= n;
1765
1766 /*
1767 * If this is not the first zero length
1768 * message remove it
1769 */
1770 if (!first && (mp->b_wptr == mp->b_rptr)) {
1771 bcont = mp->b_cont;
1772 freeb(mp);
1773 mp = save_bp->b_cont = bcont;
1774 } else {
1775 save_bp = mp;
1776 mp = mp->b_cont;
1777 }
1778 first = 0;
1779 }
1780 } else {
1781 type = mp->b_datap->db_type;
1782 while (len) {
1783 bp = mp;
1784 save_bp = NULL;
1785
1786 /*
1787 * Find the last message of same type
1788 */
1789 while (bp && bp->b_datap->db_type == type) {
1790 ASSERT(bp->b_wptr >= bp->b_rptr);
1791 prev_bp = save_bp;
1792 save_bp = bp;
1793 bp = bp->b_cont;
1794 }
1795 if (save_bp == NULL)
1796 break;
1797 n = MIN(save_bp->b_wptr - save_bp->b_rptr, len);
1798 save_bp->b_wptr -= n;
1799 len -= n;
1800
1801 /*
1802 * If this is not the first message
1803 * and we have taken away everything
1804 * from this message, remove it
1805 */
1806
1807 if ((save_bp != mp) &&
1808 (save_bp->b_wptr == save_bp->b_rptr)) {
1809 bcont = save_bp->b_cont;
1810 freeb(save_bp);
1811 prev_bp->b_cont = bcont;
1812 }
1813 }
1814 }
1815 return (1);
1816 }
1817
1818 /*
1819 * get number of data bytes in message
1820 */
1821 size_t
1822 msgdsize(mblk_t *bp)
1823 {
1824 size_t count = 0;
1825
1826 for (; bp; bp = bp->b_cont)
1827 if (bp->b_datap->db_type == M_DATA) {
1828 ASSERT(bp->b_wptr >= bp->b_rptr);
1829 count += bp->b_wptr - bp->b_rptr;
1830 }
1831 return (count);
1832 }
1833
1834 /* getq() etc to EOF removed */