1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  22 /*        All Rights Reserved   */
  23 
  24 /*
  25  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  *
  28  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  29  */
  30 
  31 #include <sys/types.h>
  32 #include <sys/param.h>
  33 #include <sys/thread.h>
  34 #include <sys/sysmacros.h>
  35 #include <sys/stropts.h>
  36 #include <sys/stream.h>
  37 #include <sys/strsubr.h>
  38 #include <sys/strsun.h>
  39 #include <sys/conf.h>
  40 #include <sys/debug.h>
  41 #include <sys/cmn_err.h>
  42 #include <sys/kmem.h>
  43 #include <sys/atomic.h>
  44 #include <sys/errno.h>
  45 #include <sys/vtrace.h>
  46 #include <sys/ftrace.h>
  47 #include <sys/ontrap.h>
  48 #include <sys/multidata.h>
  49 #include <sys/multidata_impl.h>
  50 #include <sys/sdt.h>
  51 #include <sys/strft.h>
  52 
  53 #if defined(_KERNEL) && defined(DEBUG)
  54 #include <sys/kmem_impl.h>
  55 #endif
  56 
  57 /*
  58  * This file contains those functions from io/stream.c
  59  * needed by this library, mostly unmodified.
  60  */
  61 
  62 /*
  63  * STREAMS message allocator: principles of operation
  64  *
  65  * The streams message allocator consists of all the routines that
  66  * allocate, dup and free streams messages: allocb(), [d]esballoc[a],
  67  * dupb(), freeb() and freemsg().  What follows is a high-level view
  68  * of how the allocator works.
  69  *
  70  * Every streams message consists of one or more mblks, a dblk, and data.
  71  * All mblks for all types of messages come from a common mblk_cache.
  72  * The dblk and data come in several flavors, depending on how the
  73  * message is allocated:
  74  *
  75  * (1) mblks up to DBLK_MAX_CACHE size are allocated from a collection of
  76  *     fixed-size dblk/data caches. For message sizes that are multiples of
  77  *     PAGESIZE, dblks are allocated separately from the buffer.
  78  *     The associated buffer is allocated by the constructor using kmem_alloc().
  79  *     For all other message sizes, dblk and its associated data is allocated
  80  *     as a single contiguous chunk of memory.
  81  *     Objects in these caches consist of a dblk plus its associated data.
  82  *     allocb() determines the nearest-size cache by table lookup:
  83  *     the dblk_cache[] array provides the mapping from size to dblk cache.
  84  *
  85  * (2) Large messages (size > DBLK_MAX_CACHE) are constructed by
  86  *     kmem_alloc()'ing a buffer for the data and supplying that
  87  *     buffer to gesballoc(), described below.
  88  *
  89  * (3) The four flavors of [d]esballoc[a] are all implemented by a
  90  *     common routine, gesballoc() ("generic esballoc").  gesballoc()
  91  *     allocates a dblk from the global dblk_esb_cache and sets db_base,
  92  *     db_lim and db_frtnp to describe the caller-supplied buffer.
  93  *
  94  * While there are several routines to allocate messages, there is only
  95  * one routine to free messages: freeb().  freeb() simply invokes the
  96  * dblk's free method, dbp->db_free(), which is set at allocation time.
  97  *
  98  * dupb() creates a new reference to a message by allocating a new mblk,
  99  * incrementing the dblk reference count and setting the dblk's free
 100  * method to dblk_decref().  The dblk's original free method is retained
 101  * in db_lastfree.  dblk_decref() decrements the reference count on each
 102  * freeb().  If this is not the last reference it just frees the mblk;
 103  * if this *is* the last reference, it restores db_free to db_lastfree,
 104  * sets db_mblk to the current mblk (see below), and invokes db_lastfree.
 105  *
 106  * The implementation makes aggressive use of kmem object caching for
 107  * maximum performance.  This makes the code simple and compact, but
 108  * also a bit abstruse in some places.  The invariants that constitute a
 109  * message's constructed state, described below, are more subtle than usual.
 110  *
 111  * Every dblk has an "attached mblk" as part of its constructed state.
 112  * The mblk is allocated by the dblk's constructor and remains attached
 113  * until the message is either dup'ed or pulled up.  In the dupb() case
 114  * the mblk association doesn't matter until the last free, at which time
 115  * dblk_decref() attaches the last mblk to the dblk.  pullupmsg() affects
 116  * the mblk association because it swaps the leading mblks of two messages,
 117  * so it is responsible for swapping their db_mblk pointers accordingly.
 118  * From a constructed-state viewpoint it doesn't matter that a dblk's
 119  * attached mblk can change while the message is allocated; all that
 120  * matters is that the dblk has *some* attached mblk when it's freed.
 121  *
 122  * The sizes of the allocb() small-message caches are not magical.
 123  * They represent a good trade-off between internal and external
 124  * fragmentation for current workloads.  They should be reevaluated
 125  * periodically, especially if allocations larger than DBLK_MAX_CACHE
 126  * become common.  We use 64-byte alignment so that dblks don't
 127  * straddle cache lines unnecessarily.
 128  */
 129 #define DBLK_MAX_CACHE          73728
 130 #define DBLK_CACHE_ALIGN        64
 131 #define DBLK_MIN_SIZE           8
 132 #define DBLK_SIZE_SHIFT         3
 133 
 134 #ifdef _BIG_ENDIAN
 135 #define DBLK_RTFU_SHIFT(field)  \
 136         (8 * (&((dblk_t *)0)->db_struioflag - &((dblk_t *)0)->field))
 137 #else
 138 #define DBLK_RTFU_SHIFT(field)  \
 139         (8 * (&((dblk_t *)0)->field - &((dblk_t *)0)->db_ref))
 140 #endif
 141 
 142 #define DBLK_RTFU(ref, type, flags, uioflag)    \
 143         (((ref) << DBLK_RTFU_SHIFT(db_ref)) | \
 144         ((type) << DBLK_RTFU_SHIFT(db_type)) | \
 145         (((flags) | (ref - 1)) << DBLK_RTFU_SHIFT(db_flags)) | \
 146         ((uioflag) << DBLK_RTFU_SHIFT(db_struioflag)))
 147 #define DBLK_RTFU_REF_MASK      (DBLK_REFMAX << DBLK_RTFU_SHIFT(db_ref))
 148 #define DBLK_RTFU_WORD(dbp)     (*((uint32_t *)&(dbp)->db_ref))
 149 #define MBLK_BAND_FLAG_WORD(mp) (*((uint32_t *)&(mp)->b_band))
 150 
 151 static size_t dblk_sizes[] = {
 152 #ifdef _LP64
 153         16, 80, 144, 208, 272, 336, 528, 1040, 1488, 1936, 2576, 3856,
 154         8192, 12048, 16384, 20240, 24576, 28432, 32768, 36624,
 155         40960, 44816, 49152, 53008, 57344, 61200, 65536, 69392,
 156 #else
 157         64, 128, 320, 576, 1088, 1536, 1984, 2624, 3904,
 158         8192, 12096, 16384, 20288, 24576, 28480, 32768, 36672,
 159         40960, 44864, 49152, 53056, 57344, 61248, 65536, 69440,
 160 #endif
 161         DBLK_MAX_CACHE, 0
 162 };
 163 
 164 static struct kmem_cache *dblk_cache[DBLK_MAX_CACHE / DBLK_MIN_SIZE];
 165 static struct kmem_cache *mblk_cache;
 166 static struct kmem_cache *dblk_esb_cache;
 167 #ifdef  _KERNEL
 168 static struct kmem_cache *fthdr_cache;
 169 static struct kmem_cache *ftblk_cache;
 170 #endif  /* _KERNEL */
 171 
 172 static void dblk_lastfree(mblk_t *mp, dblk_t *dbp);
 173 static mblk_t *allocb_oversize(size_t size, int flags);
 174 static int allocb_tryhard_fails;
 175 static void frnop_func(void *arg);
 176 frtn_t frnop = { frnop_func };
 177 static void bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp);
 178 
 179 #ifdef  _KERNEL
 180 static boolean_t rwnext_enter(queue_t *qp);
 181 static void rwnext_exit(queue_t *qp);
 182 #endif  /* _KERNEL */
 183 
 184 /*
 185  * Patchable mblk/dblk kmem_cache flags.
 186  */
 187 int dblk_kmem_flags = 0;
 188 int mblk_kmem_flags = 0;
 189 
 190 static int
 191 dblk_constructor(void *buf, void *cdrarg, int kmflags)
 192 {
 193         dblk_t *dbp = buf;
 194         ssize_t msg_size = (ssize_t)cdrarg;
 195         size_t index;
 196 
 197         ASSERT(msg_size != 0);
 198 
 199         index = (msg_size - 1) >> DBLK_SIZE_SHIFT;
 200 
 201         ASSERT(index < (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT));
 202 
 203         if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
 204                 return (-1);
 205         if ((msg_size & PAGEOFFSET) == 0) {
 206                 dbp->db_base = kmem_alloc(msg_size, kmflags);
 207                 if (dbp->db_base == NULL) {
 208                         kmem_cache_free(mblk_cache, dbp->db_mblk);
 209                         return (-1);
 210                 }
 211         } else {
 212                 dbp->db_base = (unsigned char *)&dbp[1];
 213         }
 214 
 215         dbp->db_mblk->b_datap = dbp;
 216         dbp->db_cache = dblk_cache[index];
 217         dbp->db_lim = dbp->db_base + msg_size;
 218         dbp->db_free = dbp->db_lastfree = dblk_lastfree;
 219         dbp->db_frtnp = NULL;
 220         dbp->db_fthdr = NULL;
 221         dbp->db_credp = NULL;
 222         dbp->db_cpid = -1;
 223         dbp->db_struioflag = 0;
 224         dbp->db_struioun.cksum.flags = 0;
 225         return (0);
 226 }
 227 
 228 /*ARGSUSED*/
 229 static int
 230 dblk_esb_constructor(void *buf, void *cdrarg, int kmflags)
 231 {
 232         dblk_t *dbp = buf;
 233 
 234         if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
 235                 return (-1);
 236         dbp->db_mblk->b_datap = dbp;
 237         dbp->db_cache = dblk_esb_cache;
 238         dbp->db_fthdr = NULL;
 239         dbp->db_credp = NULL;
 240         dbp->db_cpid = -1;
 241         dbp->db_struioflag = 0;
 242         dbp->db_struioun.cksum.flags = 0;
 243         return (0);
 244 }
 245 
 246 static int
 247 bcache_dblk_constructor(void *buf, void *cdrarg, int kmflags)
 248 {
 249         dblk_t *dbp = buf;
 250         bcache_t *bcp = cdrarg;
 251 
 252         if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
 253                 return (-1);
 254 
 255         dbp->db_base = kmem_cache_alloc(bcp->buffer_cache, kmflags);
 256         if (dbp->db_base == NULL) {
 257                 kmem_cache_free(mblk_cache, dbp->db_mblk);
 258                 return (-1);
 259         }
 260 
 261         dbp->db_mblk->b_datap = dbp;
 262         dbp->db_cache = (void *)bcp;
 263         dbp->db_lim = dbp->db_base + bcp->size;
 264         dbp->db_free = dbp->db_lastfree = bcache_dblk_lastfree;
 265         dbp->db_frtnp = NULL;
 266         dbp->db_fthdr = NULL;
 267         dbp->db_credp = NULL;
 268         dbp->db_cpid = -1;
 269         dbp->db_struioflag = 0;
 270         dbp->db_struioun.cksum.flags = 0;
 271         return (0);
 272 }
 273 
 274 /*ARGSUSED*/
 275 static void
 276 dblk_destructor(void *buf, void *cdrarg)
 277 {
 278         dblk_t *dbp = buf;
 279         ssize_t msg_size = (ssize_t)cdrarg;
 280 
 281         ASSERT(dbp->db_mblk->b_datap == dbp);
 282         ASSERT(msg_size != 0);
 283         ASSERT(dbp->db_struioflag == 0);
 284         ASSERT(dbp->db_struioun.cksum.flags == 0);
 285 
 286         if ((msg_size & PAGEOFFSET) == 0) {
 287                 kmem_free(dbp->db_base, msg_size);
 288         }
 289 
 290         kmem_cache_free(mblk_cache, dbp->db_mblk);
 291 }
 292 
 293 static void
 294 bcache_dblk_destructor(void *buf, void *cdrarg)
 295 {
 296         dblk_t *dbp = buf;
 297         bcache_t *bcp = cdrarg;
 298 
 299         kmem_cache_free(bcp->buffer_cache, dbp->db_base);
 300 
 301         ASSERT(dbp->db_mblk->b_datap == dbp);
 302         ASSERT(dbp->db_struioflag == 0);
 303         ASSERT(dbp->db_struioun.cksum.flags == 0);
 304 
 305         kmem_cache_free(mblk_cache, dbp->db_mblk);
 306 }
 307 
 308 #ifdef  _KERNEL
 309 
 310 /* ARGSUSED */
 311 static int
 312 ftblk_constructor(void *buf, void *cdrarg, int kmflags)
 313 {
 314         ftblk_t *fbp = buf;
 315         int i;
 316 
 317         bzero(fbp, sizeof (ftblk_t));
 318         if (str_ftstack != 0) {
 319                 for (i = 0; i < FTBLK_EVNTS; i++)
 320                         fbp->ev[i].stk = kmem_alloc(sizeof (ftstk_t), kmflags);
 321         }
 322 
 323         return (0);
 324 }
 325 
 326 /* ARGSUSED */
 327 static void
 328 ftblk_destructor(void *buf, void *cdrarg)
 329 {
 330         ftblk_t *fbp = buf;
 331         int i;
 332 
 333         if (str_ftstack != 0) {
 334                 for (i = 0; i < FTBLK_EVNTS; i++) {
 335                         if (fbp->ev[i].stk != NULL) {
 336                                 kmem_free(fbp->ev[i].stk, sizeof (ftstk_t));
 337                                 fbp->ev[i].stk = NULL;
 338                         }
 339                 }
 340         }
 341 }
 342 
 343 static int
 344 fthdr_constructor(void *buf, void *cdrarg, int kmflags)
 345 {
 346         fthdr_t *fhp = buf;
 347 
 348         return (ftblk_constructor(&fhp->first, cdrarg, kmflags));
 349 }
 350 
 351 static void
 352 fthdr_destructor(void *buf, void *cdrarg)
 353 {
 354         fthdr_t *fhp = buf;
 355 
 356         ftblk_destructor(&fhp->first, cdrarg);
 357 }
 358 
 359 #endif  /* _KERNEL */
 360 
 361 /* Needed in the ASSERT below */
 362 #ifdef  DEBUG
 363 #ifdef  _KERNEL
 364 #define KMEM_SLAB_T_SZ  sizeof (kmem_slab_t)
 365 #else   /* _KERNEL */
 366 #define KMEM_SLAB_T_SZ  64      /* fakekernel */
 367 #endif  /* _KERNEL */
 368 #endif  /* DEBUG */
 369 
 370 void
 371 streams_msg_init(void)
 372 {
 373         char name[40];
 374         size_t size;
 375         size_t lastsize = DBLK_MIN_SIZE;
 376         size_t *sizep;
 377         struct kmem_cache *cp;
 378         size_t tot_size;
 379         int offset;
 380 
 381         mblk_cache = kmem_cache_create("streams_mblk", sizeof (mblk_t), 32,
 382             NULL, NULL, NULL, NULL, NULL, mblk_kmem_flags);
 383 
 384         for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) {
 385 
 386                 if ((offset = (size & PAGEOFFSET)) != 0) {
 387                         /*
 388                          * We are in the middle of a page, dblk should
 389                          * be allocated on the same page
 390                          */
 391                         tot_size = size + sizeof (dblk_t);
 392                         ASSERT((offset + sizeof (dblk_t) + KMEM_SLAB_T_SZ)
 393                             < PAGESIZE);
 394                         ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0);
 395 
 396                 } else {
 397 
 398                         /*
 399                          * buf size is multiple of page size, dblk and
 400                          * buffer are allocated separately.
 401                          */
 402 
 403                         ASSERT((size & (DBLK_CACHE_ALIGN - 1)) == 0);
 404                         tot_size = sizeof (dblk_t);
 405                 }
 406 
 407                 (void) sprintf(name, "streams_dblk_%ld", (long)size);
 408                 cp = kmem_cache_create(name, tot_size, DBLK_CACHE_ALIGN,
 409                     dblk_constructor, dblk_destructor, NULL, (void *)(size),
 410                     NULL, dblk_kmem_flags);
 411 
 412                 while (lastsize <= size) {
 413                         dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp;
 414                         lastsize += DBLK_MIN_SIZE;
 415                 }
 416         }
 417 
 418         dblk_esb_cache = kmem_cache_create("streams_dblk_esb", sizeof (dblk_t),
 419             DBLK_CACHE_ALIGN, dblk_esb_constructor, dblk_destructor, NULL,
 420             (void *)sizeof (dblk_t), NULL, dblk_kmem_flags);
 421 #ifdef  _KERNEL
 422         fthdr_cache = kmem_cache_create("streams_fthdr", sizeof (fthdr_t), 32,
 423             fthdr_constructor, fthdr_destructor, NULL, NULL, NULL, 0);
 424         ftblk_cache = kmem_cache_create("streams_ftblk", sizeof (ftblk_t), 32,
 425             ftblk_constructor, ftblk_destructor, NULL, NULL, NULL, 0);
 426 
 427         /* Initialize Multidata caches */
 428         mmd_init();
 429 
 430         /* initialize throttling queue for esballoc */
 431         esballoc_queue_init();
 432 #endif  /* _KERNEL */
 433 }
 434 
 435 /*ARGSUSED*/
 436 mblk_t *
 437 allocb(size_t size, uint_t pri)
 438 {
 439         dblk_t *dbp;
 440         mblk_t *mp;
 441         size_t index;
 442 
 443         index =  (size - 1)  >> DBLK_SIZE_SHIFT;
 444 
 445         if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
 446                 if (size != 0) {
 447                         mp = allocb_oversize(size, KM_NOSLEEP);
 448                         goto out;
 449                 }
 450                 index = 0;
 451         }
 452 
 453         if ((dbp = kmem_cache_alloc(dblk_cache[index], KM_NOSLEEP)) == NULL) {
 454                 mp = NULL;
 455                 goto out;
 456         }
 457 
 458         mp = dbp->db_mblk;
 459         DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
 460         mp->b_next = mp->b_prev = mp->b_cont = NULL;
 461         mp->b_rptr = mp->b_wptr = dbp->db_base;
 462         mp->b_queue = NULL;
 463         MBLK_BAND_FLAG_WORD(mp) = 0;
 464         STR_FTALLOC(&dbp->db_fthdr, FTEV_ALLOCB, size);
 465 out:
 466         FTRACE_1("allocb(): mp=0x%p", (uintptr_t)mp);
 467 
 468         return (mp);
 469 }
 470 
 471 /*
 472  * Allocate an mblk taking db_credp and db_cpid from the template.
 473  * Allow the cred to be NULL.
 474  */
 475 mblk_t *
 476 allocb_tmpl(size_t size, const mblk_t *tmpl)
 477 {
 478         mblk_t *mp = allocb(size, 0);
 479 
 480         if (mp != NULL) {
 481                 dblk_t *src = tmpl->b_datap;
 482                 dblk_t *dst = mp->b_datap;
 483                 cred_t *cr;
 484                 pid_t cpid;
 485 
 486                 cr = msg_getcred(tmpl, &cpid);
 487                 if (cr != NULL)
 488                         crhold(dst->db_credp = cr);
 489                 dst->db_cpid = cpid;
 490                 dst->db_type = src->db_type;
 491         }
 492         return (mp);
 493 }
 494 
 495 mblk_t *
 496 allocb_cred(size_t size, cred_t *cr, pid_t cpid)
 497 {
 498         mblk_t *mp = allocb(size, 0);
 499 
 500         ASSERT(cr != NULL);
 501         if (mp != NULL) {
 502                 dblk_t *dbp = mp->b_datap;
 503 
 504                 crhold(dbp->db_credp = cr);
 505                 dbp->db_cpid = cpid;
 506         }
 507         return (mp);
 508 }
 509 
 510 mblk_t *
 511 allocb_cred_wait(size_t size, uint_t flags, int *error, cred_t *cr, pid_t cpid)
 512 {
 513         mblk_t *mp = allocb_wait(size, 0, flags, error);
 514 
 515         ASSERT(cr != NULL);
 516         if (mp != NULL) {
 517                 dblk_t *dbp = mp->b_datap;
 518 
 519                 crhold(dbp->db_credp = cr);
 520                 dbp->db_cpid = cpid;
 521         }
 522 
 523         return (mp);
 524 }
 525 
 526 /*
 527  * Extract the db_cred (and optionally db_cpid) from a message.
 528  * We find the first mblk which has a non-NULL db_cred and use that.
 529  * If none found we return NULL.
 530  * Does NOT get a hold on the cred.
 531  */
 532 cred_t *
 533 msg_getcred(const mblk_t *mp, pid_t *cpidp)
 534 {
 535         cred_t *cr = NULL;
 536         cred_t *cr2;
 537         mblk_t *mp2;
 538 
 539         while (mp != NULL) {
 540                 dblk_t *dbp = mp->b_datap;
 541 
 542                 cr = dbp->db_credp;
 543                 if (cr == NULL) {
 544                         mp = mp->b_cont;
 545                         continue;
 546                 }
 547                 if (cpidp != NULL)
 548                         *cpidp = dbp->db_cpid;
 549 
 550 #ifdef DEBUG
 551                 /*
 552                  * Normally there should at most one db_credp in a message.
 553                  * But if there are multiple (as in the case of some M_IOC*
 554                  * and some internal messages in TCP/IP bind logic) then
 555                  * they must be identical in the normal case.
 556                  * However, a socket can be shared between different uids
 557                  * in which case data queued in TCP would be from different
 558                  * creds. Thus we can only assert for the zoneid being the
 559                  * same. Due to Multi-level Level Ports for TX, some
 560                  * cred_t can have a NULL cr_zone, and we skip the comparison
 561                  * in that case.
 562                  */
 563                 mp2 = mp->b_cont;
 564                 while (mp2 != NULL) {
 565                         cr2 = DB_CRED(mp2);
 566                         if (cr2 != NULL) {
 567                                 DTRACE_PROBE2(msg__getcred,
 568                                     cred_t *, cr, cred_t *, cr2);
 569                                 ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
 570                                     crgetzone(cr) == NULL ||
 571                                     crgetzone(cr2) == NULL);
 572                         }
 573                         mp2 = mp2->b_cont;
 574                 }
 575 #endif
 576                 return (cr);
 577         }
 578         if (cpidp != NULL)
 579                 *cpidp = NOPID;
 580         return (NULL);
 581 }
 582 
 583 /*
 584  * Variant of msg_getcred which, when a cred is found
 585  * 1. Returns with a hold on the cred
 586  * 2. Clears the first cred in the mblk.
 587  * This is more efficient to use than a msg_getcred() + crhold() when
 588  * the message is freed after the cred has been extracted.
 589  *
 590  * The caller is responsible for ensuring that there is no other reference
 591  * on the message since db_credp can not be cleared when there are other
 592  * references.
 593  */
 594 cred_t *
 595 msg_extractcred(mblk_t *mp, pid_t *cpidp)
 596 {
 597         cred_t *cr = NULL;
 598         cred_t *cr2;
 599         mblk_t *mp2;
 600 
 601         while (mp != NULL) {
 602                 dblk_t *dbp = mp->b_datap;
 603 
 604                 cr = dbp->db_credp;
 605                 if (cr == NULL) {
 606                         mp = mp->b_cont;
 607                         continue;
 608                 }
 609                 ASSERT(dbp->db_ref == 1);
 610                 dbp->db_credp = NULL;
 611                 if (cpidp != NULL)
 612                         *cpidp = dbp->db_cpid;
 613 #ifdef DEBUG
 614                 /*
 615                  * Normally there should at most one db_credp in a message.
 616                  * But if there are multiple (as in the case of some M_IOC*
 617                  * and some internal messages in TCP/IP bind logic) then
 618                  * they must be identical in the normal case.
 619                  * However, a socket can be shared between different uids
 620                  * in which case data queued in TCP would be from different
 621                  * creds. Thus we can only assert for the zoneid being the
 622                  * same. Due to Multi-level Level Ports for TX, some
 623                  * cred_t can have a NULL cr_zone, and we skip the comparison
 624                  * in that case.
 625                  */
 626                 mp2 = mp->b_cont;
 627                 while (mp2 != NULL) {
 628                         cr2 = DB_CRED(mp2);
 629                         if (cr2 != NULL) {
 630                                 DTRACE_PROBE2(msg__extractcred,
 631                                     cred_t *, cr, cred_t *, cr2);
 632                                 ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
 633                                     crgetzone(cr) == NULL ||
 634                                     crgetzone(cr2) == NULL);
 635                         }
 636                         mp2 = mp2->b_cont;
 637                 }
 638 #endif
 639                 return (cr);
 640         }
 641         return (NULL);
 642 }
 643 
 644 #ifdef  _KERNEL
 645 /*
 646  * Get the label for a message. Uses the first mblk in the message
 647  * which has a non-NULL db_credp.
 648  * Returns NULL if there is no credp.
 649  */
 650 extern struct ts_label_s *
 651 msg_getlabel(const mblk_t *mp)
 652 {
 653         cred_t *cr = msg_getcred(mp, NULL);
 654 
 655         if (cr == NULL)
 656                 return (NULL);
 657 
 658         return (crgetlabel(cr));
 659 }
 660 #endif  /* _KERNEL */
 661 
 662 void
 663 freeb(mblk_t *mp)
 664 {
 665         dblk_t *dbp = mp->b_datap;
 666 
 667         ASSERT(dbp->db_ref > 0);
 668         ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
 669         FTRACE_1("freeb(): mp=0x%lx", (uintptr_t)mp);
 670 
 671         STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
 672 
 673         dbp->db_free(mp, dbp);
 674 }
 675 
 676 void
 677 freemsg(mblk_t *mp)
 678 {
 679         FTRACE_1("freemsg(): mp=0x%lx", (uintptr_t)mp);
 680         while (mp) {
 681                 dblk_t *dbp = mp->b_datap;
 682                 mblk_t *mp_cont = mp->b_cont;
 683 
 684                 ASSERT(dbp->db_ref > 0);
 685                 ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
 686 
 687                 STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);
 688 
 689                 dbp->db_free(mp, dbp);
 690                 mp = mp_cont;
 691         }
 692 }
 693 
 694 /*
 695  * Reallocate a block for another use.  Try hard to use the old block.
 696  * If the old data is wanted (copy), leave b_wptr at the end of the data,
 697  * otherwise return b_wptr = b_rptr.
 698  *
 699  * This routine is private and unstable.
 700  */
 701 mblk_t  *
 702 reallocb(mblk_t *mp, size_t size, uint_t copy)
 703 {
 704         mblk_t          *mp1;
 705         unsigned char   *old_rptr;
 706         ptrdiff_t       cur_size;
 707 
 708         if (mp == NULL)
 709                 return (allocb(size, BPRI_HI));
 710 
 711         cur_size = mp->b_wptr - mp->b_rptr;
 712         old_rptr = mp->b_rptr;
 713 
 714         ASSERT(mp->b_datap->db_ref != 0);
 715 
 716         if (mp->b_datap->db_ref == 1 && MBLKSIZE(mp) >= size) {
 717                 /*
 718                  * If the data is wanted and it will fit where it is, no
 719                  * work is required.
 720                  */
 721                 if (copy && mp->b_datap->db_lim - mp->b_rptr >= size)
 722                         return (mp);
 723 
 724                 mp->b_wptr = mp->b_rptr = mp->b_datap->db_base;
 725                 mp1 = mp;
 726         } else if ((mp1 = allocb_tmpl(size, mp)) != NULL) {
 727                 /* XXX other mp state could be copied too, db_flags ... ? */
 728                 mp1->b_cont = mp->b_cont;
 729         } else {
 730                 return (NULL);
 731         }
 732 
 733         if (copy) {
 734                 bcopy(old_rptr, mp1->b_rptr, cur_size);
 735                 mp1->b_wptr = mp1->b_rptr + cur_size;
 736         }
 737 
 738         if (mp != mp1)
 739                 freeb(mp);
 740 
 741         return (mp1);
 742 }
 743 
 744 static void
 745 dblk_lastfree(mblk_t *mp, dblk_t *dbp)
 746 {
 747         ASSERT(dbp->db_mblk == mp);
 748         if (dbp->db_fthdr != NULL)
 749                 str_ftfree(dbp);
 750 
 751         /* set credp and projid to be 'unspecified' before returning to cache */
 752         if (dbp->db_credp != NULL) {
 753                 crfree(dbp->db_credp);
 754                 dbp->db_credp = NULL;
 755         }
 756         dbp->db_cpid = -1;
 757 
 758         /* Reset the struioflag and the checksum flag fields */
 759         dbp->db_struioflag = 0;
 760         dbp->db_struioun.cksum.flags = 0;
 761 
 762         /* and the COOKED and/or UIOA flag(s) */
 763         dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA);
 764 
 765         kmem_cache_free(dbp->db_cache, dbp);
 766 }
 767 
 768 static void
 769 dblk_decref(mblk_t *mp, dblk_t *dbp)
 770 {
 771         if (dbp->db_ref != 1) {
 772                 uint32_t rtfu = atomic_add_32_nv(&DBLK_RTFU_WORD(dbp),
 773                     -(1 << DBLK_RTFU_SHIFT(db_ref)));
 774                 /*
 775                  * atomic_add_32_nv() just decremented db_ref, so we no longer
 776                  * have a reference to the dblk, which means another thread
 777                  * could free it.  Therefore we cannot examine the dblk to
 778                  * determine whether ours was the last reference.  Instead,
 779                  * we extract the new and minimum reference counts from rtfu.
 780                  * Note that all we're really saying is "if (ref != refmin)".
 781                  */
 782                 if (((rtfu >> DBLK_RTFU_SHIFT(db_ref)) & DBLK_REFMAX) !=
 783                     ((rtfu >> DBLK_RTFU_SHIFT(db_flags)) & DBLK_REFMIN)) {
 784                         kmem_cache_free(mblk_cache, mp);
 785                         return;
 786                 }
 787         }
 788         dbp->db_mblk = mp;
 789         dbp->db_free = dbp->db_lastfree;
 790         dbp->db_lastfree(mp, dbp);
 791 }
 792 
 793 mblk_t *
 794 dupb(mblk_t *mp)
 795 {
 796         dblk_t *dbp = mp->b_datap;
 797         mblk_t *new_mp;
 798         uint32_t oldrtfu, newrtfu;
 799 
 800         if ((new_mp = kmem_cache_alloc(mblk_cache, KM_NOSLEEP)) == NULL)
 801                 goto out;
 802 
 803         new_mp->b_next = new_mp->b_prev = new_mp->b_cont = NULL;
 804         new_mp->b_rptr = mp->b_rptr;
 805         new_mp->b_wptr = mp->b_wptr;
 806         new_mp->b_datap = dbp;
 807         new_mp->b_queue = NULL;
 808         MBLK_BAND_FLAG_WORD(new_mp) = MBLK_BAND_FLAG_WORD(mp);
 809 
 810         STR_FTEVENT_MBLK(mp, caller(), FTEV_DUPB, dbp->db_ref);
 811 
 812         dbp->db_free = dblk_decref;
 813         do {
 814                 ASSERT(dbp->db_ref > 0);
 815                 oldrtfu = DBLK_RTFU_WORD(dbp);
 816                 newrtfu = oldrtfu + (1 << DBLK_RTFU_SHIFT(db_ref));
 817                 /*
 818                  * If db_ref is maxed out we can't dup this message anymore.
 819                  */
 820                 if ((oldrtfu & DBLK_RTFU_REF_MASK) == DBLK_RTFU_REF_MASK) {
 821                         kmem_cache_free(mblk_cache, new_mp);
 822                         new_mp = NULL;
 823                         goto out;
 824                 }
 825         } while (atomic_cas_32(&DBLK_RTFU_WORD(dbp), oldrtfu, newrtfu) !=
 826             oldrtfu);
 827 
 828 out:
 829         FTRACE_1("dupb(): new_mp=0x%lx", (uintptr_t)new_mp);
 830         return (new_mp);
 831 }
 832 
 833 static void
 834 dblk_lastfree_desb(mblk_t *mp, dblk_t *dbp)
 835 {
 836         frtn_t *frp = dbp->db_frtnp;
 837 
 838         ASSERT(dbp->db_mblk == mp);
 839         frp->free_func(frp->free_arg);
 840         if (dbp->db_fthdr != NULL)
 841                 str_ftfree(dbp);
 842 
 843         /* set credp and projid to be 'unspecified' before returning to cache */
 844         if (dbp->db_credp != NULL) {
 845                 crfree(dbp->db_credp);
 846                 dbp->db_credp = NULL;
 847         }
 848         dbp->db_cpid = -1;
 849         dbp->db_struioflag = 0;
 850         dbp->db_struioun.cksum.flags = 0;
 851 
 852         kmem_cache_free(dbp->db_cache, dbp);
 853 }
 854 
/*
 * Free routine that intentionally does nothing; 'arg' is ignored.
 * NOTE(review): presumably this is the free_func behind the 'frnop'
 * frtn_t that allocb_oversize() passes to gesballoc(); the definition
 * of frnop is not in view here -- confirm.
 */
/*ARGSUSED*/
static void
frnop_func(void *arg)
{
}
 860 
 861 /*
 862  * Generic esballoc used to implement the four flavors: [d]esballoc[a].
 863  */
 864 static mblk_t *
 865 gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp,
 866         void (*lastfree)(mblk_t *, dblk_t *), int kmflags)
 867 {
 868         dblk_t *dbp;
 869         mblk_t *mp;
 870 
 871         ASSERT(base != NULL && frp != NULL);
 872 
 873         if ((dbp = kmem_cache_alloc(dblk_esb_cache, kmflags)) == NULL) {
 874                 mp = NULL;
 875                 goto out;
 876         }
 877 
 878         mp = dbp->db_mblk;
 879         dbp->db_base = base;
 880         dbp->db_lim = base + size;
 881         dbp->db_free = dbp->db_lastfree = lastfree;
 882         dbp->db_frtnp = frp;
 883         DBLK_RTFU_WORD(dbp) = db_rtfu;
 884         mp->b_next = mp->b_prev = mp->b_cont = NULL;
 885         mp->b_rptr = mp->b_wptr = base;
 886         mp->b_queue = NULL;
 887         MBLK_BAND_FLAG_WORD(mp) = 0;
 888 
 889 out:
 890         FTRACE_1("gesballoc(): mp=0x%lx", (uintptr_t)mp);
 891         return (mp);
 892 }
 893 
 894 /*ARGSUSED*/
 895 mblk_t *
 896 esballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
 897 {
 898         mblk_t *mp;
 899 
 900         /*
 901          * Note that this is structured to allow the common case (i.e.
 902          * STREAMS flowtracing disabled) to call gesballoc() with tail
 903          * call optimization.
 904          */
 905         if (!str_ftnever) {
 906                 mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
 907                     frp, freebs_enqueue, KM_NOSLEEP);
 908 
 909                 if (mp != NULL)
 910                         STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
 911                 return (mp);
 912         }
 913 
 914         return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
 915             frp, freebs_enqueue, KM_NOSLEEP));
 916 }
 917 
 918 /*
 919  * Same as esballoc() but sleeps waiting for memory.
 920  */
 921 /*ARGSUSED*/
 922 mblk_t *
 923 esballoc_wait(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
 924 {
 925         mblk_t *mp;
 926 
 927         /*
 928          * Note that this is structured to allow the common case (i.e.
 929          * STREAMS flowtracing disabled) to call gesballoc() with tail
 930          * call optimization.
 931          */
 932         if (!str_ftnever) {
 933                 mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
 934                     frp, freebs_enqueue, KM_SLEEP);
 935 
 936                 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
 937                 return (mp);
 938         }
 939 
 940         return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
 941             frp, freebs_enqueue, KM_SLEEP));
 942 }
 943 
 944 /*ARGSUSED*/
 945 mblk_t *
 946 desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
 947 {
 948         mblk_t *mp;
 949 
 950         /*
 951          * Note that this is structured to allow the common case (i.e.
 952          * STREAMS flowtracing disabled) to call gesballoc() with tail
 953          * call optimization.
 954          */
 955         if (!str_ftnever) {
 956                 mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
 957                     frp, dblk_lastfree_desb, KM_NOSLEEP);
 958 
 959                 if (mp != NULL)
 960                         STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size);
 961                 return (mp);
 962         }
 963 
 964         return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
 965             frp, dblk_lastfree_desb, KM_NOSLEEP));
 966 }
 967 
 968 /*ARGSUSED*/
 969 mblk_t *
 970 esballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
 971 {
 972         mblk_t *mp;
 973 
 974         /*
 975          * Note that this is structured to allow the common case (i.e.
 976          * STREAMS flowtracing disabled) to call gesballoc() with tail
 977          * call optimization.
 978          */
 979         if (!str_ftnever) {
 980                 mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
 981                     frp, freebs_enqueue, KM_NOSLEEP);
 982 
 983                 if (mp != NULL)
 984                         STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOCA, size);
 985                 return (mp);
 986         }
 987 
 988         return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
 989             frp, freebs_enqueue, KM_NOSLEEP));
 990 }
 991 
 992 /*ARGSUSED*/
 993 mblk_t *
 994 desballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
 995 {
 996         mblk_t *mp;
 997 
 998         /*
 999          * Note that this is structured to allow the common case (i.e.
1000          * STREAMS flowtracing disabled) to call gesballoc() with tail
1001          * call optimization.
1002          */
1003         if (!str_ftnever) {
1004                 mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
1005                     frp, dblk_lastfree_desb, KM_NOSLEEP);
1006 
1007                 if (mp != NULL)
1008                         STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOCA, size);
1009                 return (mp);
1010         }
1011 
1012         return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
1013             frp, dblk_lastfree_desb, KM_NOSLEEP));
1014 }
1015 
1016 static void
1017 bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp)
1018 {
1019         bcache_t *bcp = dbp->db_cache;
1020 
1021         ASSERT(dbp->db_mblk == mp);
1022         if (dbp->db_fthdr != NULL)
1023                 str_ftfree(dbp);
1024 
1025         /* set credp and projid to be 'unspecified' before returning to cache */
1026         if (dbp->db_credp != NULL) {
1027                 crfree(dbp->db_credp);
1028                 dbp->db_credp = NULL;
1029         }
1030         dbp->db_cpid = -1;
1031         dbp->db_struioflag = 0;
1032         dbp->db_struioun.cksum.flags = 0;
1033 
1034         mutex_enter(&bcp->mutex);
1035         kmem_cache_free(bcp->dblk_cache, dbp);
1036         bcp->alloc--;
1037 
1038         if (bcp->alloc == 0 && bcp->destroy != 0) {
1039                 kmem_cache_destroy(bcp->dblk_cache);
1040                 kmem_cache_destroy(bcp->buffer_cache);
1041                 mutex_exit(&bcp->mutex);
1042                 mutex_destroy(&bcp->mutex);
1043                 kmem_free(bcp, sizeof (bcache_t));
1044         } else {
1045                 mutex_exit(&bcp->mutex);
1046         }
1047 }
1048 
1049 bcache_t *
1050 bcache_create(char *name, size_t size, uint_t align)
1051 {
1052         bcache_t *bcp;
1053         char buffer[255];
1054 
1055         ASSERT((align & (align - 1)) == 0);
1056 
1057         if ((bcp = kmem_alloc(sizeof (bcache_t), KM_NOSLEEP)) == NULL)
1058                 return (NULL);
1059 
1060         bcp->size = size;
1061         bcp->align = align;
1062         bcp->alloc = 0;
1063         bcp->destroy = 0;
1064 
1065         mutex_init(&bcp->mutex, NULL, MUTEX_DRIVER, NULL);
1066 
1067         (void) sprintf(buffer, "%s_buffer_cache", name);
1068         bcp->buffer_cache = kmem_cache_create(buffer, size, align, NULL, NULL,
1069             NULL, NULL, NULL, 0);
1070         (void) sprintf(buffer, "%s_dblk_cache", name);
1071         bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t),
1072             DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor,
1073             NULL, (void *)bcp, NULL, 0);
1074 
1075         return (bcp);
1076 }
1077 
1078 void
1079 bcache_destroy(bcache_t *bcp)
1080 {
1081         ASSERT(bcp != NULL);
1082 
1083         mutex_enter(&bcp->mutex);
1084         if (bcp->alloc == 0) {
1085                 kmem_cache_destroy(bcp->dblk_cache);
1086                 kmem_cache_destroy(bcp->buffer_cache);
1087                 mutex_exit(&bcp->mutex);
1088                 mutex_destroy(&bcp->mutex);
1089                 kmem_free(bcp, sizeof (bcache_t));
1090         } else {
1091                 bcp->destroy++;
1092                 mutex_exit(&bcp->mutex);
1093         }
1094 }
1095 
1096 /*ARGSUSED*/
1097 mblk_t *
1098 bcache_allocb(bcache_t *bcp, uint_t pri)
1099 {
1100         dblk_t *dbp;
1101         mblk_t *mp = NULL;
1102 
1103         ASSERT(bcp != NULL);
1104 
1105         mutex_enter(&bcp->mutex);
1106         if (bcp->destroy != 0) {
1107                 mutex_exit(&bcp->mutex);
1108                 goto out;
1109         }
1110 
1111         if ((dbp = kmem_cache_alloc(bcp->dblk_cache, KM_NOSLEEP)) == NULL) {
1112                 mutex_exit(&bcp->mutex);
1113                 goto out;
1114         }
1115         bcp->alloc++;
1116         mutex_exit(&bcp->mutex);
1117 
1118         ASSERT(((uintptr_t)(dbp->db_base) & (bcp->align - 1)) == 0);
1119 
1120         mp = dbp->db_mblk;
1121         DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
1122         mp->b_next = mp->b_prev = mp->b_cont = NULL;
1123         mp->b_rptr = mp->b_wptr = dbp->db_base;
1124         mp->b_queue = NULL;
1125         MBLK_BAND_FLAG_WORD(mp) = 0;
1126         STR_FTALLOC(&dbp->db_fthdr, FTEV_BCALLOCB, bcp->size);
1127 out:
1128         FTRACE_1("bcache_allocb(): mp=0x%p", (uintptr_t)mp);
1129 
1130         return (mp);
1131 }
1132 
1133 static void
1134 dblk_lastfree_oversize(mblk_t *mp, dblk_t *dbp)
1135 {
1136         ASSERT(dbp->db_mblk == mp);
1137         if (dbp->db_fthdr != NULL)
1138                 str_ftfree(dbp);
1139 
1140         /* set credp and projid to be 'unspecified' before returning to cache */
1141         if (dbp->db_credp != NULL) {
1142                 crfree(dbp->db_credp);
1143                 dbp->db_credp = NULL;
1144         }
1145         dbp->db_cpid = -1;
1146         dbp->db_struioflag = 0;
1147         dbp->db_struioun.cksum.flags = 0;
1148 
1149         kmem_free(dbp->db_base, dbp->db_lim - dbp->db_base);
1150         kmem_cache_free(dbp->db_cache, dbp);
1151 }
1152 
1153 static mblk_t *
1154 allocb_oversize(size_t size, int kmflags)
1155 {
1156         mblk_t *mp;
1157         void *buf;
1158 
1159         size = P2ROUNDUP(size, DBLK_CACHE_ALIGN);
1160         if ((buf = kmem_alloc(size, kmflags)) == NULL)
1161                 return (NULL);
1162         if ((mp = gesballoc(buf, size, DBLK_RTFU(1, M_DATA, 0, 0),
1163             &frnop, dblk_lastfree_oversize, kmflags)) == NULL)
1164                 kmem_free(buf, size);
1165 
1166         if (mp != NULL)
1167                 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBIG, size);
1168 
1169         return (mp);
1170 }
1171 
1172 mblk_t *
1173 allocb_tryhard(size_t target_size)
1174 {
1175         size_t size;
1176         mblk_t *bp;
1177 
1178         for (size = target_size; size < target_size + 512;
1179             size += DBLK_CACHE_ALIGN)
1180                 if ((bp = allocb(size, BPRI_HI)) != NULL)
1181                         return (bp);
1182         allocb_tryhard_fails++;
1183         return (NULL);
1184 }
1185 
1186 /*
1187  * This routine is consolidation private for STREAMS internal use
1188  * This routine may only be called from sync routines (i.e., not
1189  * from put or service procedures).  It is located here (rather
1190  * than strsubr.c) so that we don't have to expose all of the
1191  * allocb() implementation details in header files.
1192  */
1193 mblk_t *
1194 allocb_wait(size_t size, uint_t pri, uint_t flags, int *error)
1195 {
1196         dblk_t *dbp;
1197         mblk_t *mp;
1198         size_t index;
1199 
1200         index = (size -1) >> DBLK_SIZE_SHIFT;
1201 
1202         if (flags & STR_NOSIG) {
1203                 if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
1204                         if (size != 0) {
1205                                 mp = allocb_oversize(size, KM_SLEEP);
1206                                 FTRACE_1("allocb_wait (NOSIG): mp=0x%lx",
1207                                     (uintptr_t)mp);
1208                                 return (mp);
1209                         }
1210                         index = 0;
1211                 }
1212 
1213                 dbp = kmem_cache_alloc(dblk_cache[index], KM_SLEEP);
1214                 mp = dbp->db_mblk;
1215                 DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
1216                 mp->b_next = mp->b_prev = mp->b_cont = NULL;
1217                 mp->b_rptr = mp->b_wptr = dbp->db_base;
1218                 mp->b_queue = NULL;
1219                 MBLK_BAND_FLAG_WORD(mp) = 0;
1220                 STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBW, size);
1221 
1222                 FTRACE_1("allocb_wait (NOSIG): mp=0x%lx", (uintptr_t)mp);
1223 
1224         } else {
1225                 while ((mp = allocb(size, pri)) == NULL) {
1226                         if ((*error = strwaitbuf(size, BPRI_HI)) != 0)
1227                                 return (NULL);
1228                 }
1229         }
1230 
1231         return (mp);
1232 }
1233 
1234 /*
1235  * Call function 'func' with 'arg' when a class zero block can
1236  * be allocated with priority 'pri'.
1237  */
1238 bufcall_id_t
1239 esbbcall(uint_t pri, void (*func)(void *), void *arg)
1240 {
1241         return (bufcall(1, pri, func, arg));
1242 }
1243 
1244 /*
1245  * Allocates an iocblk (M_IOCTL) block. Properly sets the credentials
1246  * ioc_id, rval and error of the struct ioctl to set up an ioctl call.
1247  * This provides consistency for all internal allocators of ioctl.
1248  */
1249 mblk_t *
1250 mkiocb(uint_t cmd)
1251 {
1252         struct iocblk   *ioc;
1253         mblk_t          *mp;
1254 
1255         /*
1256          * Allocate enough space for any of the ioctl related messages.
1257          */
1258         if ((mp = allocb(sizeof (union ioctypes), BPRI_MED)) == NULL)
1259                 return (NULL);
1260 
1261         bzero(mp->b_rptr, sizeof (union ioctypes));
1262 
1263         /*
1264          * Set the mblk_t information and ptrs correctly.
1265          */
1266         mp->b_wptr += sizeof (struct iocblk);
1267         mp->b_datap->db_type = M_IOCTL;
1268 
1269         /*
1270          * Fill in the fields.
1271          */
1272         ioc             = (struct iocblk *)mp->b_rptr;
1273         ioc->ioc_cmd = cmd;
1274         ioc->ioc_cr  = kcred;
1275         ioc->ioc_id  = getiocseqno();
1276         ioc->ioc_flag        = IOC_NATIVE;
1277         return (mp);
1278 }
1279 
1280 /*
1281  * test if block of given size can be allocated with a request of
1282  * the given priority.
1283  * 'pri' is no longer used, but is retained for compatibility.
1284  */
1285 /* ARGSUSED */
1286 int
1287 testb(size_t size, uint_t pri)
1288 {
1289         return ((size + sizeof (dblk_t)) <= kmem_avail());
1290 }
1291 
1292 #ifdef  _KERNEL
1293 
1294 /*
1295  * Call function 'func' with argument 'arg' when there is a reasonably
1296  * good chance that a block of size 'size' can be allocated.
1297  * 'pri' is no longer used, but is retained for compatibility.
1298  */
1299 /* ARGSUSED */
1300 bufcall_id_t
1301 bufcall(size_t size, uint_t pri, void (*func)(void *), void *arg)
1302 {
1303         static long bid = 1;    /* always odd to save checking for zero */
1304         bufcall_id_t bc_id;
1305         struct strbufcall *bcp;
1306 
1307         if ((bcp = kmem_alloc(sizeof (strbufcall_t), KM_NOSLEEP)) == NULL)
1308                 return (0);
1309 
1310         bcp->bc_func = func;
1311         bcp->bc_arg = arg;
1312         bcp->bc_size = size;
1313         bcp->bc_next = NULL;
1314         bcp->bc_executor = NULL;
1315 
1316         mutex_enter(&strbcall_lock);
1317         /*
1318          * After bcp is linked into strbcalls and strbcall_lock is dropped there
1319          * should be no references to bcp since it may be freed by
1320          * runbufcalls(). Since bcp_id field is returned, we save its value in
1321          * the local var.
1322          */
1323         bc_id = bcp->bc_id = (bufcall_id_t)(bid += 2);       /* keep it odd */
1324 
1325         /*
1326          * add newly allocated stream event to existing
1327          * linked list of events.
1328          */
1329         if (strbcalls.bc_head == NULL) {
1330                 strbcalls.bc_head = strbcalls.bc_tail = bcp;
1331         } else {
1332                 strbcalls.bc_tail->bc_next = bcp;
1333                 strbcalls.bc_tail = bcp;
1334         }
1335 
1336         cv_signal(&strbcall_cv);
1337         mutex_exit(&strbcall_lock);
1338         return (bc_id);
1339 }
1340 
1341 /*
1342  * Cancel a bufcall request.
1343  */
1344 void
1345 unbufcall(bufcall_id_t id)
1346 {
1347         strbufcall_t *bcp, *pbcp;
1348 
1349         mutex_enter(&strbcall_lock);
1350 again:
1351         pbcp = NULL;
1352         for (bcp = strbcalls.bc_head; bcp; bcp = bcp->bc_next) {
1353                 if (id == bcp->bc_id)
1354                         break;
1355                 pbcp = bcp;
1356         }
1357         if (bcp) {
1358                 if (bcp->bc_executor != NULL) {
1359                         if (bcp->bc_executor != curthread) {
1360                                 cv_wait(&bcall_cv, &strbcall_lock);
1361                                 goto again;
1362                         }
1363                 } else {
1364                         if (pbcp)
1365                                 pbcp->bc_next = bcp->bc_next;
1366                         else
1367                                 strbcalls.bc_head = bcp->bc_next;
1368                         if (bcp == strbcalls.bc_tail)
1369                                 strbcalls.bc_tail = pbcp;
1370                         kmem_free(bcp, sizeof (strbufcall_t));
1371                 }
1372         }
1373         mutex_exit(&strbcall_lock);
1374 }
1375 
1376 #endif  /* _KERNEL */
1377 
1378 /*
1379  * Duplicate a message block by block (uses dupb), returning
1380  * a pointer to the duplicate message.
1381  * Returns a non-NULL value only if the entire message
1382  * was dup'd.
1383  */
1384 mblk_t *
1385 dupmsg(mblk_t *bp)
1386 {
1387         mblk_t *head, *nbp;
1388 
1389         if (!bp || !(nbp = head = dupb(bp)))
1390                 return (NULL);
1391 
1392         while (bp->b_cont) {
1393                 if (!(nbp->b_cont = dupb(bp->b_cont))) {
1394                         freemsg(head);
1395                         return (NULL);
1396                 }
1397                 nbp = nbp->b_cont;
1398                 bp = bp->b_cont;
1399         }
1400         return (head);
1401 }
1402 
1403 #define DUPB_NOLOAN(bp) \
1404         ((((bp)->b_datap->db_struioflag & STRUIO_ZC) != 0) ? \
1405         copyb((bp)) : dupb((bp)))
1406 
1407 mblk_t *
1408 dupmsg_noloan(mblk_t *bp)
1409 {
1410         mblk_t *head, *nbp;
1411 
1412         if (bp == NULL || DB_TYPE(bp) != M_DATA ||
1413             ((nbp = head = DUPB_NOLOAN(bp)) == NULL))
1414                 return (NULL);
1415 
1416         while (bp->b_cont) {
1417                 if ((nbp->b_cont = DUPB_NOLOAN(bp->b_cont)) == NULL) {
1418                         freemsg(head);
1419                         return (NULL);
1420                 }
1421                 nbp = nbp->b_cont;
1422                 bp = bp->b_cont;
1423         }
1424         return (head);
1425 }
1426 
/*
 * Copy data from message and data block to newly allocated message and
 * data block. Returns new message block pointer, or NULL if error.
 * The alignment of rptr (w.r.t. word alignment) will be the same in the copy
 * as in the original even when db_base is not word aligned. (bug 1052877)
 *
 * The whole data buffer [db_base, db_lim) is copied, not only the
 * bytes between b_rptr and b_wptr.  b_flag and b_band are carried over
 * to the copy.
 */
mblk_t *
copyb(mblk_t *bp)
{
        mblk_t  *nbp;
        dblk_t  *dp, *ndp;
        uchar_t *base;
        size_t  size;
        size_t  unaligned;

        ASSERT(bp->b_wptr >= bp->b_rptr);

        dp = bp->b_datap;
        /* Record the copy on the source block if it is being traced. */
        if (dp->db_fthdr != NULL)
                STR_FTEVENT_MBLK(bp, caller(), FTEV_COPYB, 0);

        /*
         * Special handling for Multidata message; this should be
         * removed once a copy-callback routine is made available.
         */
        if (dp->db_type == M_MULTIDATA) {
#ifdef  _KERNEL
                cred_t *cr;

                if ((nbp = mmd_copy(bp, KM_NOSLEEP)) == NULL)
                        return (NULL);

                nbp->b_flag = bp->b_flag;
                nbp->b_band = bp->b_band;
                ndp = nbp->b_datap;

                /* See comments below on potential issues. */
                STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);

                ASSERT(ndp->db_type == dp->db_type);
                /* Share the source's credential (taking a hold) and cpid. */
                cr = dp->db_credp;
                if (cr != NULL)
                        crhold(ndp->db_credp = cr);
                ndp->db_cpid = dp->db_cpid;
                return (nbp);
#else   /* _KERNEL */
                /* Multidata messages cannot be copied outside the kernel. */
                return (NULL);
#endif  /* _KERNEL */
        }

        /*
         * 'unaligned' is the offset of the source db_base within a
         * uint_t; the new buffer is allocated that much larger so the
         * copy can start at the same offset.
         */
        size = dp->db_lim - dp->db_base;
        unaligned = P2PHASE((uintptr_t)dp->db_base, sizeof (uint_t));
        if ((nbp = allocb_tmpl(size + unaligned, bp)) == NULL)
                return (NULL);
        nbp->b_flag = bp->b_flag;
        nbp->b_band = bp->b_band;
        ndp = nbp->b_datap;

        /*
         * Well, here is a potential issue.  If we are trying to
         * trace a flow, and we copy the message, we might lose
         * information about where this message might have been.
         * So we should inherit the FT data.  On the other hand,
         * a user might be interested only in alloc to free data.
         * So I guess the real answer is to provide a tunable.
         */
        STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);

        base = ndp->db_base + unaligned;
        bcopy(dp->db_base, ndp->db_base + unaligned, size);

        /* Keep b_rptr at the same offset from the (shifted) base. */
        nbp->b_rptr = base + (bp->b_rptr - dp->db_base);
        nbp->b_wptr = nbp->b_rptr + MBLKL(bp);

        return (nbp);
}
1503 
1504 /*
1505  * Copy data from message to newly allocated message using new
1506  * data blocks.  Returns a pointer to the new message, or NULL if error.
1507  */
1508 mblk_t *
1509 copymsg(mblk_t *bp)
1510 {
1511         mblk_t *head, *nbp;
1512 
1513         if (!bp || !(nbp = head = copyb(bp)))
1514                 return (NULL);
1515 
1516         while (bp->b_cont) {
1517                 if (!(nbp->b_cont = copyb(bp->b_cont))) {
1518                         freemsg(head);
1519                         return (NULL);
1520                 }
1521                 nbp = nbp->b_cont;
1522                 bp = bp->b_cont;
1523         }
1524         return (head);
1525 }
1526 
1527 /*
1528  * link a message block to tail of message
1529  */
1530 void
1531 linkb(mblk_t *mp, mblk_t *bp)
1532 {
1533         ASSERT(mp && bp);
1534 
1535         for (; mp->b_cont; mp = mp->b_cont)
1536                 ;
1537         mp->b_cont = bp;
1538 }
1539 
1540 /*
1541  * unlink a message block from head of message
1542  * return pointer to new message.
1543  * NULL if message becomes empty.
1544  */
1545 mblk_t *
1546 unlinkb(mblk_t *bp)
1547 {
1548         mblk_t *bp1;
1549 
1550         bp1 = bp->b_cont;
1551         bp->b_cont = NULL;
1552         return (bp1);
1553 }
1554 
1555 /*
1556  * remove a message block "bp" from message "mp"
1557  *
1558  * Return pointer to new message or NULL if no message remains.
1559  * Return -1 if bp is not found in message.
1560  */
1561 mblk_t *
1562 rmvb(mblk_t *mp, mblk_t *bp)
1563 {
1564         mblk_t *tmp;
1565         mblk_t *lastp = NULL;
1566 
1567         ASSERT(mp && bp);
1568         for (tmp = mp; tmp; tmp = tmp->b_cont) {
1569                 if (tmp == bp) {
1570                         if (lastp)
1571                                 lastp->b_cont = tmp->b_cont;
1572                         else
1573                                 mp = tmp->b_cont;
1574                         tmp->b_cont = NULL;
1575                         return (mp);
1576                 }
1577                 lastp = tmp;
1578         }
1579         return ((mblk_t *)-1);
1580 }
1581 
/*
 * Concatenate and align first len bytes of common
 * message type.  Len == -1, means concat everything.
 * Returns 1 on success, 0 on failure
 * After the pullup, mp points to the pulled up data.
 *
 * Fails (returns 0) for M_MULTIDATA messages, when xmsgsize(mp) is
 * less than len, or when the new block cannot be allocated.
 */
int
pullupmsg(mblk_t *mp, ssize_t len)
{
        mblk_t *bp, *b_cont;
        dblk_t *dbp;
        ssize_t n;

        ASSERT(mp->b_datap->db_ref > 0);
        ASSERT(mp->b_next == NULL && mp->b_prev == NULL);

        /*
         * We won't handle Multidata message, since it contains
         * metadata which this function has no knowledge of; we
         * assert on DEBUG, and return failure otherwise.
         */
        ASSERT(mp->b_datap->db_type != M_MULTIDATA);
        if (mp->b_datap->db_type == M_MULTIDATA)
                return (0);

        if (len == -1) {
                /* A single, already aligned block needs no work. */
                if (mp->b_cont == NULL && str_aligned(mp->b_rptr))
                        return (1);
                len = xmsgsize(mp);
        } else {
                ssize_t first_mblk_len = mp->b_wptr - mp->b_rptr;
                ASSERT(first_mblk_len >= 0);
                /*
                 * If the length is less than that of the first mblk,
                 * we want to pull up the message into an aligned mblk.
                 * Though not part of the spec, some callers assume it.
                 */
                if (len <= first_mblk_len) {
                        if (str_aligned(mp->b_rptr))
                                return (1);
                        len = first_mblk_len;
                } else if (xmsgsize(mp) < len)
                        return (0);
        }

        if ((bp = allocb_tmpl(len, mp)) == NULL)
                return (0);

        /*
         * Exchange the two mblk structures so that the caller's pointer
         * 'mp' ends up heading the new, empty data block while 'bp'
         * takes over the original chain.  Each dblk's db_mblk back
         * pointer is then updated to match.
         */
        dbp = bp->b_datap;
        *bp = *mp;              /* swap mblks so bp heads the old msg... */
        mp->b_datap = dbp;   /* ... and mp heads the new message */
        mp->b_datap->db_mblk = mp;
        bp->b_datap->db_mblk = bp;
        mp->b_rptr = mp->b_wptr = dbp->db_base;

        /*
         * Copy up to len bytes from the old chain into the new block.
         * Blocks that are consumed completely are freed; the loop stops
         * at the first block that still has data left, when len bytes
         * have been copied, or when the chain ends.
         */
        do {
                ASSERT(bp->b_datap->db_ref > 0);
                ASSERT(bp->b_wptr >= bp->b_rptr);
                n = MIN(bp->b_wptr - bp->b_rptr, len);
                ASSERT(n >= 0);              /* allow zero-length mblk_t's */
                if (n > 0)
                        bcopy(bp->b_rptr, mp->b_wptr, (size_t)n);
                mp->b_wptr += n;
                bp->b_rptr += n;
                len -= n;
                if (bp->b_rptr != bp->b_wptr)
                        break;
                b_cont = bp->b_cont;
                freeb(bp);
                bp = b_cont;
        } while (len && bp);

        mp->b_cont = bp;     /* tack on whatever wasn't pulled up */

        return (1);
}
1658 
1659 /*
1660  * Concatenate and align at least the first len bytes of common message
1661  * type.  Len == -1 means concatenate everything.  The original message is
1662  * unaltered.  Returns a pointer to a new message on success, otherwise
1663  * returns NULL.
1664  */
1665 mblk_t *
1666 msgpullup(mblk_t *mp, ssize_t len)
1667 {
1668         mblk_t  *newmp;
1669         ssize_t totlen;
1670         ssize_t n;
1671 
1672         /*
1673          * We won't handle Multidata message, since it contains
1674          * metadata which this function has no knowledge of; we
1675          * assert on DEBUG, and return failure otherwise.
1676          */
1677         ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1678         if (mp->b_datap->db_type == M_MULTIDATA)
1679                 return (NULL);
1680 
1681         totlen = xmsgsize(mp);
1682 
1683         if ((len > 0) && (len > totlen))
1684                 return (NULL);
1685 
1686         /*
1687          * Copy all of the first msg type into one new mblk, then dupmsg
1688          * and link the rest onto this.
1689          */
1690 
1691         len = totlen;
1692 
1693         if ((newmp = allocb_tmpl(len, mp)) == NULL)
1694                 return (NULL);
1695 
1696         newmp->b_flag = mp->b_flag;
1697         newmp->b_band = mp->b_band;
1698 
1699         while (len > 0) {
1700                 n = mp->b_wptr - mp->b_rptr;
1701                 ASSERT(n >= 0);              /* allow zero-length mblk_t's */
1702                 if (n > 0)
1703                         bcopy(mp->b_rptr, newmp->b_wptr, n);
1704                 newmp->b_wptr += n;
1705                 len -= n;
1706                 mp = mp->b_cont;
1707         }
1708 
1709         if (mp != NULL) {
1710                 newmp->b_cont = dupmsg(mp);
1711                 if (newmp->b_cont == NULL) {
1712                         freemsg(newmp);
1713                         return (NULL);
1714                 }
1715         }
1716 
1717         return (newmp);
1718 }
1719 
/*
 * Trim bytes from message
 *  len > 0, trim from head
 *  len < 0, trim from tail
 * Returns 1 on success, 0 on failure.
 *
 * Fails for M_MULTIDATA messages and when xmsgsize(mp) is smaller than
 * the number of bytes to trim.  Blocks other than the first that are
 * emptied by the trim are unlinked and freed.
 */
int
adjmsg(mblk_t *mp, ssize_t len)
{
        mblk_t *bp;
        mblk_t *save_bp = NULL;
        mblk_t *prev_bp;
        mblk_t *bcont;
        unsigned char type;
        ssize_t n;
        int fromhead;
        int first;

        ASSERT(mp != NULL);
        /*
         * We won't handle Multidata message, since it contains
         * metadata which this function has no knowledge of; we
         * assert on DEBUG, and return failure otherwise.
         */
        ASSERT(mp->b_datap->db_type != M_MULTIDATA);
        if (mp->b_datap->db_type == M_MULTIDATA)
                return (0);

        /* Reduce to a non-negative byte count plus a direction flag. */
        if (len < 0) {
                fromhead = 0;
                len = -len;
        } else {
                fromhead = 1;
        }

        if (xmsgsize(mp) < len)
                return (0);

        if (fromhead) {
                /*
                 * Advance b_rptr block by block.  save_bp trails as the
                 * last block kept, so an emptied block can be unlinked.
                 */
                first = 1;
                while (len) {
                        ASSERT(mp->b_wptr >= mp->b_rptr);
                        n = MIN(mp->b_wptr - mp->b_rptr, len);
                        mp->b_rptr += n;
                        len -= n;

                        /*
                         * If this is not the first zero length
                         * message remove it
                         */
                        if (!first && (mp->b_wptr == mp->b_rptr)) {
                                bcont = mp->b_cont;
                                freeb(mp);
                                mp = save_bp->b_cont = bcont;
                        } else {
                                save_bp = mp;
                                mp = mp->b_cont;
                        }
                        first = 0;
                }
        } else {
                type = mp->b_datap->db_type;
                /*
                 * Each pass rescans from the head for the last block of
                 * the leading type and trims as much as it can from it.
                 */
                while (len) {
                        bp = mp;
                        save_bp = NULL;

                        /*
                         * Find the last message of same type
                         */
                        while (bp && bp->b_datap->db_type == type) {
                                ASSERT(bp->b_wptr >= bp->b_rptr);
                                prev_bp = save_bp;
                                save_bp = bp;
                                bp = bp->b_cont;
                        }
                        if (save_bp == NULL)
                                break;
                        n = MIN(save_bp->b_wptr - save_bp->b_rptr, len);
                        save_bp->b_wptr -= n;
                        len -= n;

                        /*
                         * If this is not the first message
                         * and we have taken away everything
                         * from this message, remove it
                         */

                        /* prev_bp is set whenever save_bp is not mp. */
                        if ((save_bp != mp) &&
                            (save_bp->b_wptr == save_bp->b_rptr)) {
                                bcont = save_bp->b_cont;
                                freeb(save_bp);
                                prev_bp->b_cont = bcont;
                        }
                }
        }
        return (1);
}
1817 
1818 /*
1819  * get number of data bytes in message
1820  */
1821 size_t
1822 msgdsize(mblk_t *bp)
1823 {
1824         size_t count = 0;
1825 
1826         for (; bp; bp = bp->b_cont)
1827                 if (bp->b_datap->db_type == M_DATA) {
1828                         ASSERT(bp->b_wptr >= bp->b_rptr);
1829                         count += bp->b_wptr - bp->b_rptr;
1830                 }
1831         return (count);
1832 }
1833 
1834 /* getq() etc to EOF removed */