1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2011 Joyent, Inc.  All rights reserved.
  25  */
  26 /*
  27  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  28  */
  29 
  30 /*
  31  * Copyright (c) 2016 by Delphix. All rights reserved.
  32  */
  33 
  34 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  35 /*        All Rights Reserved   */
  36 
  37 /*
  38  * University Copyright- Copyright (c) 1982, 1986, 1988
  39  * The Regents of the University of California
  40  * All Rights Reserved
  41  *
  42  * University Acknowledgment- Portions of this document are derived from
  43  * software developed by the University of California, Berkeley, and its
  44  * contributors.
  45  */
  46 
  47 #include <sys/types.h>
  48 #include <sys/t_lock.h>
  49 #include <sys/sysmacros.h>
  50 #include <sys/conf.h>
  51 #include <sys/cpuvar.h>
  52 #include <sys/errno.h>
  53 #include <sys/debug.h>
  54 #include <sys/buf.h>
  55 #include <sys/var.h>
  56 #include <sys/vnode.h>
  57 #include <sys/bitmap.h>
  58 #include <sys/cmn_err.h>
  59 #include <sys/kmem.h>
  60 #include <sys/vmem.h>
  61 #include <sys/atomic.h>
  62 #include <vm/seg_kmem.h>
  63 #include <vm/page.h>
  64 #include <vm/pvn.h>
  65 #include <sys/vtrace.h>
  66 #include <sys/tnf_probe.h>
  67 #include <sys/fs/ufs_inode.h>
  68 #include <sys/fs/ufs_bio.h>
  69 #include <sys/fs/ufs_log.h>
  70 #include <sys/systm.h>
  71 #include <sys/vfs.h>
  72 #include <sys/sdt.h>
  73 
  74 /* Locks */
  75 static  kmutex_t        blist_lock;     /* protects b_list */
  76 static  kmutex_t        bhdr_lock;      /* protects the bhdrlist */
  77 static  kmutex_t        bfree_lock;     /* protects the bfreelist structure */
  78 
  79 struct hbuf     *hbuf;                  /* Hash buckets */
  80 struct dwbuf    *dwbuf;                 /* Delayed write buckets */
  81 static struct buf *bhdrlist;            /* buf header free list */
  82 static int      nbuf;                   /* number of buffer headers allocated */
  83 
  84 static int      lastindex;              /* Reference point on where to start */
  85                                         /* when looking for free buffers */
  86 
  87 #define bio_bhash(dev, bn)      (hash2ints((dev), (int)(bn)) & v.v_hmask)
  88 #define EMPTY_LIST      ((struct buf *)-1)
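/*
 * EMPTY_LIST terminates the private b_list chains built by bflush() and
 * bfinval(); it cannot be NULL because a NULL b_list is how those routines
 * recognize a buffer that is not yet on such a chain.
 */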
  89 
  90 static kcondvar_t       bio_mem_cv;     /* Condition variables */
  91 static kcondvar_t       bio_flushinval_cv;
  92 static int      bio_doingflush;         /* flush in progress */
  93 static int      bio_doinginval;         /* inval in progress */
  94 static int      bio_flinv_cv_wanted;    /* someone waiting for cv */
  95 
  96 /*
  97  * Statistics on the buffer cache
  98  */
  99 struct biostats biostats = {
 100         { "buffer_cache_lookups",               KSTAT_DATA_UINT32 },
 101         { "buffer_cache_hits",                  KSTAT_DATA_UINT32 },
 102         { "new_buffer_requests",                KSTAT_DATA_UINT32 },
 103         { "waits_for_buffer_allocs",            KSTAT_DATA_UINT32 },
 104         { "buffers_locked_by_someone",          KSTAT_DATA_UINT32 },
 105         { "duplicate_buffers_found",            KSTAT_DATA_UINT32 }
 106 };
 107 
 108 /*
 109  * kstat data
 110  */
 111 kstat_named_t   *biostats_ptr = (kstat_named_t *)&biostats;
 112 uint_t          biostats_ndata = (uint_t)(sizeof (biostats) /
 113                                         sizeof (kstat_named_t));
 114 
 115 /*
 116  * Statistics on ufs buffer cache
 117  * Not protected by locks
 118  */
 119 struct ufsbiostats ub = {
 120         { "breads",                     KSTAT_DATA_UINT32 },
 121         { "bwrites",                    KSTAT_DATA_UINT32 },
 122         { "fbiwrites",                  KSTAT_DATA_UINT32 },
 123         { "getpages",                   KSTAT_DATA_UINT32 },
 124         { "getras",                     KSTAT_DATA_UINT32 },
 125         { "putsyncs",                   KSTAT_DATA_UINT32 },
 126         { "putasyncs",                  KSTAT_DATA_UINT32 },
 127         { "putpageios",                 KSTAT_DATA_UINT32 },
 128 };
 129 
 130 /*
 131  * more UFS Logging eccentricities...
 132  *
 133  * required since "#pragma weak ..." doesn't work in reverse order.
 134  * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 135  *        to ufs routines don't get plugged into bio.c calls so
 136  *        we initialize them when setting up the "lufsops" table
 137  *        in "lufs.c:_init()"
 138  */
 139 void (*bio_lufs_strategy)(void *, buf_t *);
 140 void (*bio_snapshot_strategy)(void *, buf_t *);
 141 
 142 
 143 /* Private routines */
 144 static struct buf       *bio_getfreeblk(long);
 145 static void             bio_mem_get(long);
 146 static void             bio_bhdr_free(struct buf *);
 147 static struct buf       *bio_bhdr_alloc(void);
 148 static void             bio_recycle(int, long);
 149 static void             bio_pageio_done(struct buf *);
 150 static int              bio_incore(dev_t, daddr_t);
 151 
 152 /*
 153  * Buffer cache constants
 154  */
 155 #define BIO_BUF_PERCENT (100/2)         /* default: 2% of memory */
 156 #define BIO_MAX_PERCENT (100/20)        /* max is 20% of real memory */
 157 #define BIO_BHDR_POOL   100             /* Default bhdr pool size */
 158 #define BIO_MIN_HDR     10              /* Minimum number of buffer headers */
 159 #define BIO_MIN_HWM     (BIO_MIN_HDR * MAXBSIZE / 1024)
 160 #define BIO_HASHLEN     4               /* Target length of hash chains */
 161 
 162 
 163 /* Flags for bio_recycle() */
 164 #define BIO_HEADER      0x01
 165 #define BIO_MEM         0x02
 166 
 167 extern volatile int bufhwm;     /* User tunable - high water mark for mem  */
 168 extern volatile int bufhwm_pct; /* ditto - given in % of physmem  */
 169 
 170 /*
 171  * The following routines allocate and free
 172  * buffers with various side effects.  In general the
 173  * arguments to an allocate routine are a device and
 174  * a block number, and the value is a pointer to
 175  * the buffer header; the buffer returned is locked with a
 176  * binary semaphore so that no one else can touch it. If the block was
 177  * already in core, no I/O need be done; if it is
 178  * already locked, the process waits until it becomes free.
 179  * The following routines allocate a buffer:
 180  *      getblk
 181  *      bread/BREAD
 182  *      breada
 183  * Eventually the buffer must be released, possibly with the
 184  * side effect of writing it out, by using one of
 185  *      bwrite/BWRITE/brwrite
 186  *      bdwrite/bdrwrite
 187  *      bawrite
 188  *      brelse
 189  *
 190  * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 191  * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 192  * a buffer, and a binary semaphore, b_io, is used for I/O synchronization.
 193  * B_DONE is still used to denote a buffer with I/O complete on it.
 194  *
 195  * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 196  * should not be used where a very accurate count of the free buffers is
 197  * needed.
 198  */
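
/*
 * Illustrative sketch (not compiled into this file): the typical
 * read/inspect/release cycle for a caller of these routines.  The guard
 * macro and the example parameters below are hypothetical.
 */
#ifdef	BIO_USAGE_SKETCH
static int
bio_example_read(dev_t dev, daddr_t blkno, long bsize)
{
        struct buf *bp;
        int error;

        bp = bread(dev, blkno, bsize);  /* returns with bp->b_sem held */
        error = geterror(bp);           /* 0, or an errno if B_ERROR is set */
        /* on success, bsize bytes of data are valid at bp->b_un.b_addr */
        brelse(bp);                     /* release the buffer, no I/O implied */
        return (error);
}
#endif	/* BIO_USAGE_SKETCH */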
 199 
 200 /*
 201  * Read in (if necessary) the block and return a buffer pointer.
 202  *
 203  * This interface is provided for binary compatibility.  Using
 204  * BREAD() directly avoids the extra function call overhead invoked
 205  * by calling this routine.
 206  */
 207 struct buf *
 208 bread(dev_t dev, daddr_t blkno, long bsize)
 209 {
 210         return (BREAD(dev, blkno, bsize));
 211 }
 212 
 213 /*
 214  * Common code for reading a buffer with various options
 215  *
 216  * Read in (if necessary) the block and return a buffer pointer.
 217  */
 218 struct buf *
 219 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
 220 {
 221         struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
 222         struct buf *bp;
 223         klwp_t *lwp = ttolwp(curthread);
 224 
 225         CPU_STATS_ADD_K(sys, lread, 1);
 226         bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
 227         if (bp->b_flags & B_DONE)
 228                 return (bp);
 229         bp->b_flags |= B_READ;
 230         ASSERT(bp->b_bcount == bsize);
 231         if (ufsvfsp == NULL) {                                  /* !ufs */
 232                 (void) bdev_strategy(bp);
 233         } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
 234                                                         /* ufs && logging */
 235                 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
 236         } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
 237                                                         /* ufs && snapshots */
 238                 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
 239         } else {
 240                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
 241                 ub.ub_breads.value.ul++;                /* ufs && !logging */
 242                 (void) bdev_strategy(bp);
 243         }
 244         if (lwp != NULL)
 245                 lwp->lwp_ru.inblock++;
 246         CPU_STATS_ADD_K(sys, bread, 1);
 247         (void) biowait(bp);
 248         return (bp);
 249 }
 250 
 251 /*
 252  * Read in the block, like bread, but also start I/O on the
 253  * read-ahead block (which is not allocated to the caller).
 254  */
 255 struct buf *
 256 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
 257 {
 258         struct buf *bp, *rabp;
 259         klwp_t *lwp = ttolwp(curthread);
 260 
 261         bp = NULL;
 262         if (!bio_incore(dev, blkno)) {
 263                 CPU_STATS_ADD_K(sys, lread, 1);
 264                 bp = GETBLK(dev, blkno, bsize);
 265                 if ((bp->b_flags & B_DONE) == 0) {
 266                         bp->b_flags |= B_READ;
 267                         bp->b_bcount = bsize;
 268                         (void) bdev_strategy(bp);
 269                         if (lwp != NULL)
 270                                 lwp->lwp_ru.inblock++;
 271                         CPU_STATS_ADD_K(sys, bread, 1);
 272                 }
 273         }
 274         if (rablkno && bfreelist.b_bcount > 1 &&
 275             !bio_incore(dev, rablkno)) {
 276                 rabp = GETBLK(dev, rablkno, bsize);
 277                 if (rabp->b_flags & B_DONE)
 278                         brelse(rabp);
 279                 else {
 280                         rabp->b_flags |= B_READ|B_ASYNC;
 281                         rabp->b_bcount = bsize;
 282                         (void) bdev_strategy(rabp);
 283                         if (lwp != NULL)
 284                                 lwp->lwp_ru.inblock++;
 285                         CPU_STATS_ADD_K(sys, bread, 1);
 286                 }
 287         }
 288         if (bp == NULL)
 289                 return (BREAD(dev, blkno, bsize));
 290         (void) biowait(bp);
 291         return (bp);
 292 }
 293 
 294 /*
 295  * Common code for writing a buffer with various options.
 296  *
 297  * force_wait  - wait for write completion regardless of B_ASYNC flag
 298  * do_relse    - release the buffer when we are done
 299  * clear_flags - flags to clear from the buffer
 300  */
 301 void
 302 bwrite_common(void *arg, struct buf *bp, int force_wait,
 303     int do_relse, int clear_flags)
 304 {
 305         register int do_wait;
 306         struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
 307         int flag;
 308         klwp_t *lwp = ttolwp(curthread);
 309         struct cpu *cpup;
 310 
 311         ASSERT(SEMA_HELD(&bp->b_sem));
 312         flag = bp->b_flags;
 313         bp->b_flags &= ~clear_flags;
 314         if (lwp != NULL)
 315                 lwp->lwp_ru.oublock++;
 316         CPU_STATS_ENTER_K();
 317         cpup = CPU;             /* get pointer AFTER preemption is disabled */
 318         CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
 319         CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
 320         do_wait = ((flag & B_ASYNC) == 0 || force_wait);
 321         if (do_wait == 0)
 322                 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
 323         CPU_STATS_EXIT_K();
 324         if (ufsvfsp == NULL) {
 325                 (void) bdev_strategy(bp);
 326         } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
 327                                                         /* ufs && logging */
 328                 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
 329         } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
 330                                                         /* ufs && snapshots */
 331                 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
 332         } else {
 333                 ub.ub_bwrites.value.ul++;               /* ufs && !logging */
 334                 (void) bdev_strategy(bp);
 335         }
 336         if (do_wait) {
 337                 (void) biowait(bp);
 338                 if (do_relse) {
 339                         brelse(bp);
 340                 }
 341         }
 342 }
 343 
 344 /*
 345  * Write the buffer, waiting for completion (unless B_ASYNC is set).
 346  * Then release the buffer.
 347  * This interface is provided for binary compatibility.  Using
 348  * BWRITE() directly avoids the extra function call overhead invoked
 349  * by calling this routine.
 350  */
 351 void
 352 bwrite(struct buf *bp)
 353 {
 354         BWRITE(bp);
 355 }
 356 
 357 /*
 358  * Write the buffer, waiting for completion.
 359  * But don't release the buffer afterwards.
 360  * This interface is provided for binary compatibility.  Using
 361  * BWRITE2() directly avoids the extra function call overhead.
 362  */
 363 void
 364 bwrite2(struct buf *bp)
 365 {
 366         BWRITE2(bp);
 367 }
 368 
 369 /*
 370  * Release the buffer, marking it so that if it is grabbed
 371  * for another purpose it will be written out before being
 372  * given up (e.g. when writing a partial block where it is
 373  * assumed that another write for the same block will soon follow).
 374  * Also save the time that the block is first marked as delayed
 375  * so that it will be written in a reasonable time.
 376  */
 377 void
 378 bdwrite(struct buf *bp)
 379 {
 380         ASSERT(SEMA_HELD(&bp->b_sem));
 381         CPU_STATS_ADD_K(sys, lwrite, 1);
 382         if ((bp->b_flags & B_DELWRI) == 0)
 383                 bp->b_start = ddi_get_lbolt();
 384         /*
 385          * B_DONE allows others to use the buffer, B_DELWRI causes the
 386          * buffer to be written before being reused, and setting b_resid
 387          * to zero says the buffer is complete.
 388          */
 389         bp->b_flags |= B_DELWRI | B_DONE;
 390         bp->b_resid = 0;
 391         brelse(bp);
 392 }
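
/*
 * Illustrative sketch (not compiled into this file) of the partial-block
 * update pattern described above: read the block, modify part of it, and
 * use bdwrite() so a later write to the same block can absorb this one.
 * The guard macro, offset, and length below are hypothetical.
 */
#ifdef	BIO_USAGE_SKETCH
static void
bio_example_partial_update(dev_t dev, daddr_t blkno, long bsize,
    const char *src, size_t off, size_t len)
{
        struct buf *bp;

        bp = bread(dev, blkno, bsize);          /* read-modify-write */
        if (geterror(bp) == 0) {
                bcopy(src, bp->b_un.b_addr + off, len);
                bdwrite(bp);                    /* mark B_DELWRI and release */
        } else {
                brelse(bp);
        }
}
#endif	/* BIO_USAGE_SKETCH */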
 393 
 394 /*
 395  * Release the buffer, start I/O on it, but don't wait for completion.
 396  */
 397 void
 398 bawrite(struct buf *bp)
 399 {
 400         ASSERT(SEMA_HELD(&bp->b_sem));
 401 
 402         /* Use bfreelist.b_bcount as a weird-ass heuristic */
 403         if (bfreelist.b_bcount > 4)
 404                 bp->b_flags |= B_ASYNC;
 405         BWRITE(bp);
 406 }
 407 
 408 /*
 409  * Release the buffer, with no I/O implied.
 410  */
 411 void
 412 brelse(struct buf *bp)
 413 {
 414         struct buf      **backp;
 415         uint_t          index;
 416         kmutex_t        *hmp;
 417         struct  buf     *dp;
 418         struct  hbuf    *hp;
 419 
 420 
 421         ASSERT(SEMA_HELD(&bp->b_sem));
 422 
 423         /*
 424          * Clear the retry write flag if the buffer was written without
 425          * error.  The presence of B_DELWRI means the buffer has not yet
 426          * been written and the presence of B_ERROR means that an error
 427          * is still occurring.
 428          */
 429         if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
 430                 bp->b_flags &= ~B_RETRYWRI;
 431         }
 432 
 433         /* Check for anomalous conditions */
 434         if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
 435                 if (bp->b_flags & B_NOCACHE) {
 436                         /* Don't add to the freelist. Destroy it now */
 437                         kmem_free(bp->b_un.b_addr, bp->b_bufsize);
 438                         sema_destroy(&bp->b_sem);
 439                         sema_destroy(&bp->b_io);
 440                         kmem_free(bp, sizeof (struct buf));
 441                         return;
 442                 }
 443                 /*
 444                  * If a write failed and we are supposed to retry write,
 445                  * don't toss the buffer.  Keep it around and mark it
 446                  * delayed write in the hopes that it will eventually
 447                  * get flushed (and still keep the system running.)
 448                  */
 449                 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
 450                         bp->b_flags |= B_DELWRI;
 451                         /* keep fsflush from trying continuously to flush */
 452                         bp->b_start = ddi_get_lbolt();
 453                 } else
 454                         bp->b_flags |= B_AGE|B_STALE;
 455                 bp->b_flags &= ~B_ERROR;
 456                 bp->b_error = 0;
 457         }
 458 
 459         /*
 460          * If delayed write is set then put it on the delayed
 461          * write list instead of the free buffer list.
 462          */
 463         index = bio_bhash(bp->b_edev, bp->b_blkno);
 464         hmp   = &hbuf[index].b_lock;
 465 
 466         mutex_enter(hmp);
 467         hp = &hbuf[index];
 468         dp = (struct buf *)hp;
 469 
 470         /*
 471          * Make sure that the number of entries on this list is within
 472          * 0 <= count <= total # buffers
 473          */
 474         ASSERT(hp->b_length >= 0);
 475         ASSERT(hp->b_length < nbuf);
 476 
 477         hp->b_length++;              /* We are adding this buffer */
 478 
 479         if (bp->b_flags & B_DELWRI) {
 480                 /*
 481                  * This buffer goes on the delayed write buffer list
 482                  */
 483                 dp = (struct buf *)&dwbuf[index];
 484         }
 485         ASSERT(bp->b_bufsize > 0);
 486         ASSERT(bp->b_bcount > 0);
 487         ASSERT(bp->b_un.b_addr != NULL);
 488 
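        /*
         * B_AGE buffers are inserted at the head of the free list so they
         * are reclaimed first; all others go to the tail, giving roughly
         * least-recently-released reuse order.
         */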
 489         if (bp->b_flags & B_AGE) {
 490                 backp = &dp->av_forw;
 491                 (*backp)->av_back = bp;
 492                 bp->av_forw = *backp;
 493                 *backp = bp;
 494                 bp->av_back = dp;
 495         } else {
 496                 backp = &dp->av_back;
 497                 (*backp)->av_forw = bp;
 498                 bp->av_back = *backp;
 499                 *backp = bp;
 500                 bp->av_forw = dp;
 501         }
 502         mutex_exit(hmp);
 503 
 504         if (bfreelist.b_flags & B_WANTED) {
 505                 /*
 506                  * Should come here very very rarely.
 507                  */
 508                 mutex_enter(&bfree_lock);
 509                 if (bfreelist.b_flags & B_WANTED) {
 510                         bfreelist.b_flags &= ~B_WANTED;
 511                         cv_broadcast(&bio_mem_cv);
 512                 }
 513                 mutex_exit(&bfree_lock);
 514         }
 515 
 516         bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
 517         /*
 518          * Don't let anyone get the buffer off the freelist before we
 519          * release our hold on it.
 520          */
 521         sema_v(&bp->b_sem);
 522 }
 523 
 524 /*
 525  * Return a count of the number of B_BUSY buffers in the system
 526  * Can only be used as a good estimate.  If 'cleanit' is set,
 527  * try to flush all bufs.
 528  */
 529 int
 530 bio_busy(int cleanit)
 531 {
 532         struct buf *bp, *dp;
 533         int busy = 0;
 534         int i;
 535         kmutex_t *hmp;
 536 
 537         for (i = 0; i < v.v_hbuf; i++) {
 538                 dp = (struct buf *)&hbuf[i];
 539                 hmp = &hbuf[i].b_lock;
 540 
 541                 mutex_enter(hmp);
 542                 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 543                         if (bp->b_flags & B_BUSY)
 544                                 busy++;
 545                 }
 546                 mutex_exit(hmp);
 547         }
 548 
 549         if (cleanit && busy != 0) {
 550                 bflush(NODEV);
 551         }
 552 
 553         return (busy);
 554 }
 555 
 556 /*
 557  * This interface is provided for binary compatibility.
 558  *
 559  * Assign a buffer for the given block.  If the appropriate
 560  * block is already associated, return it; otherwise search
 561  * for the oldest non-busy buffer and reassign it.
 562  */
 563 struct buf *
 564 getblk(dev_t dev, daddr_t blkno, long bsize)
 565 {
 566         return (getblk_common(/* ufsvfsp */ NULL, dev,
 567             blkno, bsize, /* errflg */ 0));
 568 }
 569 
 570 /*
 571  * Assign a buffer for the given block.  If the appropriate
 572  * block is already associated, return it; otherwise search
 573  * for the oldest non-busy buffer and reassign it.
 574  */
 575 struct buf *
 576 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
 577 {
 578         ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
 579         struct buf *bp;
 580         struct buf *dp;
 581         struct buf *nbp = NULL;
 582         struct buf *errbp;
 583         uint_t          index;
 584         kmutex_t        *hmp;
 585         struct  hbuf    *hp;
 586 
 587         if (getmajor(dev) >= devcnt)
 588                 cmn_err(CE_PANIC, "blkdev");
 589 
 590         biostats.bio_lookup.value.ui32++;
 591 
 592         index = bio_bhash(dev, blkno);
 593         hp    = &hbuf[index];
 594         dp    = (struct buf *)hp;
 595         hmp   = &hp->b_lock;
 596 
 597         mutex_enter(hmp);
 598 loop:
 599         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 600                 if (bp->b_blkno != blkno || bp->b_edev != dev ||
 601                     (bp->b_flags & B_STALE))
 602                         continue;
 603                 /*
 604                  * Avoid holding the hash lock in the event that
 605                  * the buffer is locked by someone. Since the hash chain
 606                  * may change when we drop the hash lock
 607                  * we have to start at the beginning of the chain if the
 608                  * buffer identity/contents aren't valid.
 609                  */
 610                 if (!sema_tryp(&bp->b_sem)) {
 611                         biostats.bio_bufbusy.value.ui32++;
 612                         mutex_exit(hmp);
 613                         /*
 614                          * OK, we are dealing with a busy buffer.
 615                          * In the case that we are panicking and we
 616                          * got called from bread(), we have some chance
 617                          * for error recovery. So better bail out from
 618                          * here since sema_p() won't block. If we got
 619                          * called directly from ufs routines, there is
 620                          * no way to report an error yet.
 621                          */
 622                         if (panicstr && errflg)
 623                                 goto errout;
 624                         /*
 625                          * For the following line of code to work
 626                          * correctly never kmem_free the buffer "header".
 627                          */
 628                         sema_p(&bp->b_sem);
 629                         if (bp->b_blkno != blkno || bp->b_edev != dev ||
 630                             (bp->b_flags & B_STALE)) {
 631                                 sema_v(&bp->b_sem);
 632                                 mutex_enter(hmp);
 633                                 goto loop;      /* start over */
 634                         }
 635                         mutex_enter(hmp);
 636                 }
 637                 /* Found */
 638                 biostats.bio_hit.value.ui32++;
 639                 bp->b_flags &= ~B_AGE;
 640 
 641                 /*
 642                  * Yank it off the free/delayed write lists
 643                  */
 644                 hp->b_length--;
 645                 notavail(bp);
 646                 mutex_exit(hmp);
 647 
 648                 ASSERT((bp->b_flags & B_NOCACHE) == 0);
 649 
 650                 if (nbp == NULL) {
 651                         /*
 652                          * Make the common path short.
 653                          */
 654                         ASSERT(SEMA_HELD(&bp->b_sem));
 655                         return (bp);
 656                 }
 657 
 658                 biostats.bio_bufdup.value.ui32++;
 659 
 660                 /*
 661                  * The buffer must have entered during the lock upgrade
 662                  * so free the new buffer we allocated and return the
 663                  * found buffer.
 664                  */
 665                 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
 666                 nbp->b_un.b_addr = NULL;
 667 
 668                 /*
 669                  * Account for the memory
 670                  */
 671                 mutex_enter(&bfree_lock);
 672                 bfreelist.b_bufsize += nbp->b_bufsize;
 673                 mutex_exit(&bfree_lock);
 674 
 675                 /*
 676                  * Destroy buf identity, and place on avail list
 677                  */
 678                 nbp->b_dev = (o_dev_t)NODEV;
 679                 nbp->b_edev = NODEV;
 680                 nbp->b_flags = 0;
 681                 nbp->b_file = NULL;
 682                 nbp->b_offset = -1;
 683 
 684                 sema_v(&nbp->b_sem);
 685                 bio_bhdr_free(nbp);
 686 
 687                 ASSERT(SEMA_HELD(&bp->b_sem));
 688                 return (bp);
 689         }
 690 
 691         /*
 692          * bio_getfreeblk may block so check the hash chain again.
 693          */
 694         if (nbp == NULL) {
 695                 mutex_exit(hmp);
 696                 nbp = bio_getfreeblk(bsize);
 697                 mutex_enter(hmp);
 698                 goto loop;
 699         }
 700 
 701         /*
 702          * New buffer. Assign nbp and stick it on the hash.
 703          */
 704         nbp->b_flags = B_BUSY;
 705         nbp->b_edev = dev;
 706         nbp->b_dev = (o_dev_t)cmpdev(dev);
 707         nbp->b_blkno = blkno;
 708         nbp->b_iodone = NULL;
 709         nbp->b_bcount = bsize;
 710         /*
 711          * If we are given a ufsvfsp and the vfs_root field is NULL
 712          * then this must be I/O for a superblock.  A superblock's
 713          * buffer is set up in mountfs() and there is no root vnode
 714          * at that point.
 715          */
 716         if (ufsvfsp && ufsvfsp->vfs_root) {
 717                 nbp->b_vp = ufsvfsp->vfs_root;
 718         } else {
 719                 nbp->b_vp = NULL;
 720         }
 721 
 722         ASSERT((nbp->b_flags & B_NOCACHE) == 0);
 723 
 724         binshash(nbp, dp);
 725         mutex_exit(hmp);
 726 
 727         ASSERT(SEMA_HELD(&nbp->b_sem));
 728 
 729         return (nbp);
 730 
 731 
 732         /*
 733          * Come here in case of an internal error. At this point we couldn't
 734          * get a buffer, but we have to return one. Hence we allocate some
 735          * kind of error reply buffer on the fly. This buffer is marked as
 736          * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
 737          *      - B_ERROR will indicate error to the caller.
 738          *      - B_DONE will prevent us from reading the buffer from
 739          *        the device.
 740          *      - B_NOCACHE will cause this buffer to be freed in
 741          *        brelse().
 742          */
 743 
 744 errout:
 745         errbp = geteblk();
 746         sema_p(&errbp->b_sem);
 747         errbp->b_flags &= ~B_BUSY;
 748         errbp->b_flags |= (B_ERROR | B_DONE);
 749         return (errbp);
 750 }
 751 
 752 /*
 753  * Get an empty block, not assigned to any particular device.
 754  * Returns a locked buffer that is not on any hash or free list.
 755  */
 756 struct buf *
 757 ngeteblk(long bsize)
 758 {
 759         struct buf *bp;
 760 
 761         bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
 762         bioinit(bp);
 763         bp->av_forw = bp->av_back = NULL;
 764         bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
 765         bp->b_bufsize = bsize;
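        /*
         * B_NOCACHE below means brelse() will destroy this buffer rather
         * than caching it on a free list.
         */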
 766         bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
 767         bp->b_dev = (o_dev_t)NODEV;
 768         bp->b_edev = NODEV;
 769         bp->b_lblkno = 0;
 770         bp->b_bcount = bsize;
 771         bp->b_iodone = NULL;
 772         return (bp);
 773 }
 774 
 775 /*
 776  * The interface of geteblk() is kept intact to maintain driver compatibility.
 777  * Use ngeteblk() to allocate a block size other than 1 KB.
 778  */
 779 struct buf *
 780 geteblk(void)
 781 {
 782         return (ngeteblk((long)1024));
 783 }
 784 
 785 /*
 786  * Return a buffer w/o sleeping
 787  */
 788 struct buf *
 789 trygetblk(dev_t dev, daddr_t blkno)
 790 {
 791         struct buf      *bp;
 792         struct buf      *dp;
 793         struct hbuf     *hp;
 794         kmutex_t        *hmp;
 795         uint_t          index;
 796 
 797         index = bio_bhash(dev, blkno);
 798         hp = &hbuf[index];
 799         hmp = &hp->b_lock;
 800 
 801         if (!mutex_tryenter(hmp))
 802                 return (NULL);
 803 
 804         dp = (struct buf *)hp;
 805         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 806                 if (bp->b_blkno != blkno || bp->b_edev != dev ||
 807                     (bp->b_flags & B_STALE))
 808                         continue;
 809                 /*
 810                  * Get access to a valid buffer without sleeping
 811                  */
 812                 if (sema_tryp(&bp->b_sem)) {
 813                         if (bp->b_flags & B_DONE) {
 814                                 hp->b_length--;
 815                                 notavail(bp);
 816                                 mutex_exit(hmp);
 817                                 return (bp);
 818                         } else {
 819                                 sema_v(&bp->b_sem);
 820                                 break;
 821                         }
 822                 }
 823                 break;
 824         }
 825         mutex_exit(hmp);
 826         return (NULL);
 827 }
 828 
 829 /*
 830  * Wait for I/O completion on the buffer; return errors
 831  * to the user.
 832  */
 833 int
 834 iowait(struct buf *bp)
 835 {
 836         ASSERT(SEMA_HELD(&bp->b_sem));
 837         return (biowait(bp));
 838 }
 839 
 840 /*
 841  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 842  * and wake up anyone waiting for it.
 843  */
 844 void
 845 iodone(struct buf *bp)
 846 {
 847         ASSERT(SEMA_HELD(&bp->b_sem));
 848         (void) biodone(bp);
 849 }
 850 
 851 /*
 852  * Zero the core associated with a buffer.
 853  */
 854 void
 855 clrbuf(struct buf *bp)
 856 {
 857         ASSERT(SEMA_HELD(&bp->b_sem));
 858         bzero(bp->b_un.b_addr, bp->b_bcount);
 859         bp->b_resid = 0;
 860 }
 861 
 862 
 863 /*
 864  * Make sure all write-behind blocks on dev (or NODEV for all)
 865  * are flushed out.
 866  */
 867 void
 868 bflush(dev_t dev)
 869 {
 870         struct buf *bp, *dp;
 871         struct hbuf *hp;
 872         struct buf *delwri_list = EMPTY_LIST;
 873         int i, index;
 874         kmutex_t *hmp;
 875 
 876         mutex_enter(&blist_lock);
 877         /*
 878          * Wait for any invalidates or flushes ahead of us to finish.
 879          * We really could split blist_lock up per device for better
 880          * parallelism here.
 881          */
 882         while (bio_doinginval || bio_doingflush) {
 883                 bio_flinv_cv_wanted = 1;
 884                 cv_wait(&bio_flushinval_cv, &blist_lock);
 885         }
 886         bio_doingflush++;
 887         /*
 888          * Gather all B_DELWRI buffer for device.
 889          * Lock ordering is b_sem > hash lock (brelse).
 890          * Since we are finding the buffer via the delayed write list,
 891          * it may be busy and we would block trying to get the
 892          * b_sem lock while holding hash lock. So transfer all the
 893          * candidates on the delwri_list and then drop the hash locks.
 894          */
 895         for (i = 0; i < v.v_hbuf; i++) {
 896                 hmp = &hbuf[i].b_lock;
 897                 dp = (struct buf *)&dwbuf[i];
 898                 mutex_enter(hmp);
 899                 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
 900                         if (dev == NODEV || bp->b_edev == dev) {
 901                                 if (bp->b_list == NULL) {
 902                                         bp->b_list = delwri_list;
 903                                         delwri_list = bp;
 904                                 }
 905                         }
 906                 }
 907                 mutex_exit(hmp);
 908         }
 909         mutex_exit(&blist_lock);
 910 
 911         /*
 912          * Now that the hash locks have been dropped grab the semaphores
 913          * and write back all the buffers that have B_DELWRI set.
 914          */
 915         while (delwri_list != EMPTY_LIST) {
 916                 bp = delwri_list;
 917 
 918                 sema_p(&bp->b_sem);      /* may block */
 919                 if ((dev != bp->b_edev && dev != NODEV) ||
 920                     (panicstr && bp->b_flags & B_BUSY)) {
 921                         sema_v(&bp->b_sem);
 922                         delwri_list = bp->b_list;
 923                         bp->b_list = NULL;
 924                         continue;       /* No longer a candidate */
 925                 }
 926                 if (bp->b_flags & B_DELWRI) {
 927                         index = bio_bhash(bp->b_edev, bp->b_blkno);
 928                         hp = &hbuf[index];
 929                         hmp = &hp->b_lock;
 930                         dp = (struct buf *)hp;
 931 
 932                         bp->b_flags |= B_ASYNC;
 933                         mutex_enter(hmp);
 934                         hp->b_length--;
 935                         notavail(bp);
 936                         mutex_exit(hmp);
 937                         if (bp->b_vp == NULL) {              /* !ufs */
 938                                 BWRITE(bp);
 939                         } else {                        /* ufs */
 940                                 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
 941                         }
 942                 } else {
 943                         sema_v(&bp->b_sem);
 944                 }
 945                 delwri_list = bp->b_list;
 946                 bp->b_list = NULL;
 947         }
 948         mutex_enter(&blist_lock);
 949         bio_doingflush--;
 950         if (bio_flinv_cv_wanted) {
 951                 bio_flinv_cv_wanted = 0;
 952                 cv_broadcast(&bio_flushinval_cv);
 953         }
 954         mutex_exit(&blist_lock);
 955 }
 956 
 957 /*
 958  * Ensure that a specified block is up-to-date on disk.
 959  */
 960 void
 961 blkflush(dev_t dev, daddr_t blkno)
 962 {
 963         struct buf *bp, *dp;
 964         struct hbuf *hp;
 965         struct buf *sbp = NULL;
 966         uint_t index;
 967         kmutex_t *hmp;
 968 
 969         index = bio_bhash(dev, blkno);
 970         hp    = &hbuf[index];
 971         dp    = (struct buf *)hp;
 972         hmp   = &hp->b_lock;
 973 
 974         /*
 975          * Identify the buffer in the cache belonging to
 976          * this device and blkno (if any).
 977          */
 978         mutex_enter(hmp);
 979         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 980                 if (bp->b_blkno != blkno || bp->b_edev != dev ||
 981                     (bp->b_flags & B_STALE))
 982                         continue;
 983                 sbp = bp;
 984                 break;
 985         }
 986         mutex_exit(hmp);
 987         if (sbp == NULL)
 988                 return;
 989         /*
 990          * Now check the buffer we have identified and
 991          * make sure it still belongs to the device and is B_DELWRI
 992          */
 993         sema_p(&sbp->b_sem);
 994         if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
 995             (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
 996                 mutex_enter(hmp);
 997                 hp->b_length--;
 998                 notavail(sbp);
 999                 mutex_exit(hmp);
1000                 /*
1001                  * XXX - There is nothing to guarantee a synchronous
1002                  * write here if the B_ASYNC flag is set.  This needs
1003                  * some investigation.
1004                  */
1005                 if (sbp->b_vp == NULL) {             /* !ufs */
1006                         BWRITE(sbp);    /* synchronous write */
1007                 } else {                                /* ufs */
1008                         UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1009                 }
1010         } else {
1011                 sema_v(&sbp->b_sem);
1012         }
1013 }
1014 
1015 /*
1016  * Same as binval, except it can force-invalidate delayed-write buffers
1017  * (which may not already have been flushed because of device errors).  Also
1018  * makes sure that the retry write flag is cleared.
1019  */
1020 int
1021 bfinval(dev_t dev, int force)
1022 {
1023         struct buf *dp;
1024         struct buf *bp;
1025         struct buf *binval_list = EMPTY_LIST;
1026         int i, error = 0;
1027         kmutex_t *hmp;
1028         uint_t index;
1029         struct buf **backp;
1030 
1031         mutex_enter(&blist_lock);
1032         /*
1033          * Wait for any flushes ahead of us to finish, it's ok to
1034          * do invalidates in parallel.
1035          */
1036         while (bio_doingflush) {
1037                 bio_flinv_cv_wanted = 1;
1038                 cv_wait(&bio_flushinval_cv, &blist_lock);
1039         }
1040         bio_doinginval++;
1041 
1042         /* Gather bp's */
1043         for (i = 0; i < v.v_hbuf; i++) {
1044                 dp = (struct buf *)&hbuf[i];
1045                 hmp = &hbuf[i].b_lock;
1046 
1047                 mutex_enter(hmp);
1048                 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1049                         if (bp->b_edev == dev) {
1050                                 if (bp->b_list == NULL) {
1051                                         bp->b_list = binval_list;
1052                                         binval_list = bp;
1053                                 }
1054                         }
1055                 }
1056                 mutex_exit(hmp);
1057         }
1058         mutex_exit(&blist_lock);
1059 
1060         /* Invalidate all bp's found */
1061         while (binval_list != EMPTY_LIST) {
1062                 bp = binval_list;
1063 
1064                 sema_p(&bp->b_sem);
1065                 if (bp->b_edev == dev) {
1066                         if (force && (bp->b_flags & B_DELWRI)) {
1067                                 /* clear B_DELWRI, move to non-dw freelist */
1068                                 index = bio_bhash(bp->b_edev, bp->b_blkno);
1069                                 hmp = &hbuf[index].b_lock;
1070                                 dp = (struct buf *)&hbuf[index];
1071                                 mutex_enter(hmp);
1072 
1073                                 /* remove from delayed write freelist */
1074                                 notavail(bp);
1075 
1076                                 /* add to B_AGE side of non-dw freelist */
1077                                 backp = &dp->av_forw;
1078                                 (*backp)->av_back = bp;
1079                                 bp->av_forw = *backp;
1080                                 *backp = bp;
1081                                 bp->av_back = dp;
1082 
1083                                 /*
1084                                  * make sure write retries and busy are cleared
1085                                  */
1086                                 bp->b_flags &=
1087                                     ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1088                                 mutex_exit(hmp);
1089                         }
1090                         if ((bp->b_flags & B_DELWRI) == 0)
1091                                 bp->b_flags |= B_STALE|B_AGE;
1092                         else
1093                                 error = EIO;
1094                 }
1095                 sema_v(&bp->b_sem);
1096                 binval_list = bp->b_list;
1097                 bp->b_list = NULL;
1098         }
1099         mutex_enter(&blist_lock);
1100         bio_doinginval--;
1101         if (bio_flinv_cv_wanted) {
1102                 cv_broadcast(&bio_flushinval_cv);
1103                 bio_flinv_cv_wanted = 0;
1104         }
1105         mutex_exit(&blist_lock);
1106         return (error);
1107 }
1108 
1109 /*
1110  * If possible, invalidate blocks for a dev on demand
1111  */
1112 void
1113 binval(dev_t dev)
1114 {
1115         (void) bfinval(dev, 0);
1116 }
1117 
1118 /*
1119  * Initialize the buffer I/O system by freeing
1120  * all buffers and setting all device hash buffer lists to empty.
1121  */
1122 void
1123 binit(void)
1124 {
1125         struct buf *bp;
1126         unsigned int i, pct;
1127         ulong_t bio_max_hwm, bio_default_hwm;
1128 
1129         /*
1130          * Maximum/Default values for bufhwm are set to the smallest of:
1131          *      - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1132          *      - 1/4 of kernel virtual memory
1133          *      - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1134          * Additionally, in order to allow simple tuning by percentage of
1135          * physical memory, bufhwm_pct is used to calculate the default if
1136          * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1137          *
1138          * Since the unit for v.v_bufhwm is kilobytes, this allows for
1139          * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1140          */
1141         bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1142             btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1143         bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
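
        /*
         * Worked example (hypothetical numbers, assuming 4K pages): with
         * 8GB of physical memory, physmem / BIO_MAX_PERCENT is 20% of the
         * pages; multiplying by (PAGESIZE / 1024) expresses that limit in
         * kilobytes (roughly 1.6GB).  The kernel-heap term and the
         * INT32_MAX clamp above can only reduce this.
         */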
1144 
1145         pct = BIO_BUF_PERCENT;
1146         if (bufhwm_pct != 0 &&
1147             ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1148                 pct = BIO_BUF_PERCENT;
1149                 /*
1150                  * Invalid user specified value, emit a warning.
1151                  */
1152                 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1153                     range(1..%d). Using %d as default.",
1154                     bufhwm_pct,
1155                     100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1156         }
1157 
1158         bio_default_hwm = MIN(physmem / pct,
1159             btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1160         bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1161 
1162         if ((v.v_bufhwm = bufhwm) == 0)
1163                 v.v_bufhwm = bio_default_hwm;
1164 
1165         if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1166                 v.v_bufhwm = (int)bio_max_hwm;
1167                 /*
1168                  * Invalid user specified value, emit a warning.
1169                  */
1170                 cmn_err(CE_WARN,
1171                     "binit: bufhwm(%d) out \
1172                     of range(%d..%lu). Using %lu as default",
1173                     bufhwm,
1174                     BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1175         }
1176 
1177         /*
1178          * Determine the number of hash buckets. Default is to
1179          * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1180          * Round up number to the next power of 2.
1181          */
1182         v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1183             BIO_HASHLEN);
1184         v.v_hmask = v.v_hbuf - 1;
1185         v.v_buf = BIO_BHDR_POOL;
1186 
1187         hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1188 
1189         dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1190 
1191         bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1192         bp = &bfreelist;
1193         bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1194 
1195         for (i = 0; i < v.v_hbuf; i++) {
1196                 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1197                 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1198 
1199                 /*
1200                  * Initialize the delayed write buffer list.
1201                  */
1202                 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1203                 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1204         }
1205 }
1206 
1207 /*
1208  * Wait for I/O completion on the buffer; return error code.
1209  * If bp was for synchronous I/O, bp is invalid and associated
1210  * resources are freed on return.
1211  */
1212 int
1213 biowait(struct buf *bp)
1214 {
1215         int error = 0;
1216         struct cpu *cpup;
1217 
1218         ASSERT(SEMA_HELD(&bp->b_sem));
1219 
1220         cpup = CPU;
1221         atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1222         DTRACE_IO1(wait__start, struct buf *, bp);
1223 
1224         /*
1225          * In case of panic, busy wait for completion
1226          */
1227         if (panicstr) {
1228                 while ((bp->b_flags & B_DONE) == 0)
1229                         drv_usecwait(10);
1230         } else
1231                 sema_p(&bp->b_io);
1232 
1233         DTRACE_IO1(wait__done, struct buf *, bp);
1234         atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1235 
1236         error = geterror(bp);
1237         if ((bp->b_flags & B_ASYNC) == 0) {
1238                 if (bp->b_flags & B_REMAPPED)
1239                         bp_mapout(bp);
1240         }
1241         return (error);
1242 }
1243 
1244 static void
1245 biodone_tnf_probe(struct buf *bp)
1246 {
1247         /* Kernel probe */
1248         TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1249             tnf_device,         device,         bp->b_edev,
1250             tnf_diskaddr,       block,          bp->b_lblkno,
1251             tnf_opaque,         buf,            bp);
1252 }
1253 
1254 /*
1255  * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1256  * and wake up anyone waiting for it.
1257  */
1258 void
1259 biodone(struct buf *bp)
1260 {
1261         if (bp->b_flags & B_STARTED) {
1262                 DTRACE_IO1(done, struct buf *, bp);
1263                 bp->b_flags &= ~B_STARTED;
1264         }
1265 
1266         /*
1267          * Call the TNF probe here instead of the inline code
1268          * to force our compiler to use the tail call optimization.
1269          */
1270         biodone_tnf_probe(bp);
1271 
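        /*
         * If a private completion routine was supplied, it takes over from
         * here and is responsible for releasing or reusing the buffer.
         */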
1272         if (bp->b_iodone != NULL) {
1273                 (*(bp->b_iodone))(bp);
1274                 return;
1275         }
1276         ASSERT((bp->b_flags & B_DONE) == 0);
1277         ASSERT(SEMA_HELD(&bp->b_sem));
1278         bp->b_flags |= B_DONE;
1279         if (bp->b_flags & B_ASYNC) {
1280                 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1281                         bio_pageio_done(bp);
1282                 else
1283                         brelse(bp);     /* release bp to freelist */
1284         } else {
1285                 sema_v(&bp->b_io);
1286         }
1287 }
1288 
1289 /*
1290  * Pick up the device's error number and pass it to the user;
1291  * if there is an error but the number is 0, set a generalized code.
1292  */
1293 int
1294 geterror(struct buf *bp)
1295 {
1296         int error = 0;
1297 
1298         ASSERT(SEMA_HELD(&bp->b_sem));
1299         if (bp->b_flags & B_ERROR) {
1300                 error = bp->b_error;
1301                 if (!error)
1302                         error = EIO;
1303         }
1304         return (error);
1305 }
1306 
1307 /*
1308  * Support for pageio buffers.
1309  *
1310  * This stuff should be generalized to provide a general-purpose bp
1311  * header facility that can be used for things other than pageio.
1312  */
1313 
1314 /*
1315  * Allocate and initialize a buf struct for use with pageio.
1316  */
1317 struct buf *
1318 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1319 {
1320         struct buf *bp;
1321         struct cpu *cpup;
1322 
1323         if (flags & B_READ) {
1324                 CPU_STATS_ENTER_K();
1325                 cpup = CPU;     /* get pointer AFTER preemption is disabled */
1326                 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1327                 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1328 
1329                 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1330 
1331                 if ((flags & B_ASYNC) == 0) {
1332                         klwp_t *lwp = ttolwp(curthread);
1333                         if (lwp != NULL)
1334                                 lwp->lwp_ru.majflt++;
1335                         CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1336                         /* Kernel probe */
1337                         TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1338                             tnf_opaque,         vnode,          pp->p_vnode,
1339                             tnf_offset,         offset,         pp->p_offset);
1340                 }
1341                 /*
1342                  * Update statistics for pages being paged in
1343                  */
1344                 if (pp != NULL && pp->p_vnode != NULL) {
1345                         if (IS_SWAPFSVP(pp->p_vnode)) {
1346                                 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1347                                 atomic_add_64(&curzone->zone_anonpgin,
1348                                     btopr(len));
1349                         } else {
1350                                 if (pp->p_vnode->v_flag & VVMEXEC) {
1351                                         CPU_STATS_ADDQ(cpup, vm, execpgin,
1352                                             btopr(len));
1353                                         atomic_add_64(&curzone->zone_execpgin,
1354                                             btopr(len));
1355                                 } else {
1356                                         CPU_STATS_ADDQ(cpup, vm, fspgin,
1357                                             btopr(len));
1358                                         atomic_add_64(&curzone->zone_fspgin,
1359                                             btopr(len));
1360                                 }
1361                         }
1362                 }
1363                 CPU_STATS_EXIT_K();
1364                 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1365                     "page_ws_in:pp %p", pp);
1366                 /* Kernel probe */
1367                 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1368                     tnf_opaque, vnode,  pp->p_vnode,
1369                     tnf_offset, offset, pp->p_offset,
1370                     tnf_size,   size,   len);
1371         }
1372 
1373         bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1374         bp->b_bcount = len;
1375         bp->b_bufsize = len;
1376         bp->b_pages = pp;
1377         bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1378         bp->b_offset = -1;
1379         sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1380 
1381         /* Initialize bp->b_sem in "locked" state */
1382         sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1383 
1384         VN_HOLD(vp);
1385         bp->b_vp = vp;
1386         THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1387 
1388         /*
1389          * Caller sets dev & blkno and can adjust
1390          * b_addr for page offset and can use bp_mapin
1391          * to make pages kernel addressable.
1392          */
1393         return (bp);
1394 }
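
/*
 * Illustrative sketch (not compiled into this file) of what the comment at
 * the end of pageio_setup() describes: the caller fills in the device and
 * block number, maps the pages if it needs a kernel address, and hands the
 * buffer to the driver.  The guard macro and parameters are hypothetical.
 */
#ifdef	BIO_USAGE_SKETCH
static int
bio_example_pageio_write(struct page *pp, size_t len, struct vnode *vp,
    dev_t dev, daddr_t blkno)
{
        struct buf *bp;
        int error;

        bp = pageio_setup(pp, len, vp, B_WRITE);
        bp->b_edev = dev;
        bp->b_dev = (o_dev_t)cmpdev(dev);
        bp->b_blkno = blkno;
        bp_mapin(bp);                   /* make the pages kernel addressable */
        (void) bdev_strategy(bp);
        error = biowait(bp);
        pageio_done(bp);
        return (error);
}
#endif	/* BIO_USAGE_SKETCH */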
1395 
1396 void
1397 pageio_done(struct buf *bp)
1398 {
1399         ASSERT(SEMA_HELD(&bp->b_sem));
1400         if (bp->b_flags & B_REMAPPED)
1401                 bp_mapout(bp);
1402         VN_RELE(bp->b_vp);
1403         bp->b_vp = NULL;
1404         ASSERT((bp->b_flags & B_NOCACHE) != 0);
1405 
1406         /* A sema_v(bp->b_sem) is implied if we are destroying it */
1407         sema_destroy(&bp->b_sem);
1408         sema_destroy(&bp->b_io);
1409         kmem_free(bp, sizeof (struct buf));
1410 }
1411 
1412 /*
1413  * Check to see whether any buffers associated with the device, except
1414  * the one pointed to by sbp, are busy.
1415  * NOTE: This expensive operation shall be improved together with ufs_icheck().
1416  */
1417 int
1418 bcheck(dev_t dev, struct buf *sbp)
1419 {
1420         struct buf      *bp;
1421         struct buf      *dp;
1422         int i;
1423         kmutex_t *hmp;
1424 
1425         /*
1426          * check for busy bufs for this filesystem
1427          */
1428         for (i = 0; i < v.v_hbuf; i++) {
1429                 dp = (struct buf *)&hbuf[i];
1430                 hmp = &hbuf[i].b_lock;
1431 
1432                 mutex_enter(hmp);
1433                 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1434                         /*
1435                          * if buf is busy or dirty, then filesystem is busy
1436                          */
1437                         if ((bp->b_edev == dev) &&
1438                             ((bp->b_flags & B_STALE) == 0) &&
1439                             (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1440                             (bp != sbp)) {
1441                                 mutex_exit(hmp);
1442                                 return (1);
1443                         }
1444                 }
1445                 mutex_exit(hmp);
1446         }
1447         return (0);
1448 }
1449 
1450 /*
1451  * Hash two 32 bit entities.
1452  */
1453 int
1454 hash2ints(int x, int y)
1455 {
1456         int hash = 0;
1457 
1458         hash = x - 1;
1459         hash = ((hash * 7) + (x >> 8)) - 1;
1460         hash = ((hash * 7) + (x >> 16)) - 1;
1461         hash = ((hash * 7) + (x >> 24)) - 1;
1462         hash = ((hash * 7) + y) - 1;
1463         hash = ((hash * 7) + (y >> 8)) - 1;
1464         hash = ((hash * 7) + (y >> 16)) - 1;
1465         hash = ((hash * 7) + (y >> 24)) - 1;
1466 
1467         return (hash);
1468 }
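
/*
 * bio_bhash() (defined near the top of this file) masks this hash with
 * v.v_hmask, so the number of hash buckets must remain a power of two;
 * binit() rounds v.v_hbuf up to a power of two accordingly.
 */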
1469 
1470 
1471 /*
1472  * Return a new buffer struct.
1473  *      Create a new buffer if we haven't gone over our high water
1474  *      mark for memory, otherwise try to get one off the freelist.
1475  *
1476  * Returns a locked buf that has no id and is not on any hash or free
1477  * list.
1478  */
1479 static struct buf *
1480 bio_getfreeblk(long bsize)
1481 {
1482         struct buf *bp, *dp;
1483         struct hbuf *hp;
1484         kmutex_t        *hmp;
1485         uint_t          start, end;
1486 
1487         /*
1488          * bfreelist.b_bufsize represents the amount of memory we are
1489          * allowed to allocate in the cache before we hit our hwm.
1490          * References to bfreelist are protected by bfree_lock
1491          * (mutex_enter(&bfree_lock) / mutex_exit(&bfree_lock)).
1492          */
1493         bio_mem_get(bsize);     /* Account for our memory request */
1494 
1495 again:
1496         bp = bio_bhdr_alloc();  /* Get a buf hdr */
1497         sema_p(&bp->b_sem);      /* Should never fail */
1498 
1499         ASSERT(bp->b_un.b_addr == NULL);
1500         bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1501         if (bp->b_un.b_addr != NULL) {
1502                 /*
1503                  * Make the common path short
1504                  */
1505                 bp->b_bufsize = bsize;
1506                 ASSERT(SEMA_HELD(&bp->b_sem));
1507                 return (bp);
1508         } else {
1509                 struct buf *save;
1510 
1511                 save = bp;      /* Save bp we allocated */
1512                 start = end = lastindex;
1513 
1514                 biostats.bio_bufwant.value.ui32++;
1515 
1516                 /*
1517                  * Memory isn't available from the system right now.  Scan
1518                  * the hash buckets until enough space is found.
1519                  */
1520                 do {
1521                         hp = &hbuf[start];
1522                         hmp = &hp->b_lock;
1523                         dp = (struct buf *)hp;
1524 
1525                         mutex_enter(hmp);
1526                         bp = dp->av_forw;
1527 
1528                         while (bp != dp) {
1529 
1530                                 ASSERT(bp != NULL);
1531 
1532                                 if (!sema_tryp(&bp->b_sem)) {
1533                                         bp = bp->av_forw;
1534                                         continue;
1535                                 }
1536 
1537                                 /*
1538                                  * Since we are going down the freelist
1539                                  * associated with this hash bucket the
1540                                  * B_DELWRI flag should not be set.
1541                                  */
1542                                 ASSERT(!(bp->b_flags & B_DELWRI));
1543 
1544                                 if (bp->b_bufsize == bsize) {
1545                                         hp->b_length--;
1546                                         notavail(bp);
1547                                         bremhash(bp);
1548                                         mutex_exit(hmp);
1549 
1550                                         /*
1551                                          * We didn't kmem_alloc any more memory,
1552                                          * so don't count this space twice.
1553                                          */
1554                                         mutex_enter(&bfree_lock);
1555                                         bfreelist.b_bufsize += bsize;
1556                                         mutex_exit(&bfree_lock);
1557 
1558                                         /*
1559                                          * Update the lastindex value.
1560                                          */
1561                                         lastindex = start;
1562 
1563                                         /*
1564                                          * Put our saved bp back on the list
1565                                          */
1566                                         sema_v(&save->b_sem);
1567                                         bio_bhdr_free(save);
1568                                         ASSERT(SEMA_HELD(&bp->b_sem));
1569                                         return (bp);
1570                                 }
1571                                 sema_v(&bp->b_sem);
1572                                 bp = bp->av_forw;
1573                         }
1574                         mutex_exit(hmp);
1575                         start = ((start + 1) % v.v_hbuf);
1576                 } while (start != end);
1577 
1578                 biostats.bio_bufwait.value.ui32++;
1579                 bp = save;              /* Use original bp */
1580                 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1581         }
1582 
1583         bp->b_bufsize = bsize;
1584         ASSERT(SEMA_HELD(&bp->b_sem));
1585         return (bp);
1586 }
1587 
1588 /*
1589  * Allocate a buffer header. If none currently available, allocate
1590  * a new pool.
1591  */
1592 static struct buf *
1593 bio_bhdr_alloc(void)
1594 {
1595         struct buf *dp, *sdp;
1596         struct buf *bp;
1597         int i;
1598 
1599         for (;;) {
1600                 mutex_enter(&bhdr_lock);
1601                 if (bhdrlist != NULL) {
1602                         bp = bhdrlist;
1603                         bhdrlist = bp->av_forw;
1604                         mutex_exit(&bhdr_lock);
1605                         bp->av_forw = NULL;
1606                         return (bp);
1607                 }
1608                 mutex_exit(&bhdr_lock);
1609 
1610                 /*
1611                  * Need to allocate a new pool. If the system is currently
1612                  * out of memory, then try freeing things on the freelist.
1613                  */
1614                 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1615                 if (dp == NULL) {
1616                         /*
1617                          * System can't give us a pool of headers, try
1618                          * recycling from the free lists.
1619                          */
1620                         bio_recycle(BIO_HEADER, 0);
1621                 } else {
1622                         sdp = dp;
1623                         for (i = 0; i < v.v_buf; i++, dp++) {
1624                                 /*
1625                                  * b_dev and b_edev must be set explicitly
1626                                  * since NODEV is -1, not NULL.
1627                                  */
1628                                 dp->b_dev = (o_dev_t)NODEV;
1629                                 dp->b_edev = NODEV;
1630                                 dp->av_forw = dp + 1;
1631                                 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1632                                     NULL);
1633                                 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1634                                     NULL);
1635                                 dp->b_offset = -1;
1636                         }
1637                         mutex_enter(&bhdr_lock);
1638                         (--dp)->av_forw = bhdrlist;  /* Fix last pointer */
1639                         bhdrlist = sdp;
1640                         nbuf += v.v_buf;
1641                         bp = bhdrlist;
1642                         bhdrlist = bp->av_forw;
1643                         mutex_exit(&bhdr_lock);
1644 
1645                         bp->av_forw = NULL;
1646                         return (bp);
1647                 }
1648         }
1649 }
1650 
1651 static  void
1652 bio_bhdr_free(struct buf *bp)
1653 {
1654         ASSERT(bp->b_back == NULL);
1655         ASSERT(bp->b_forw == NULL);
1656         ASSERT(bp->av_back == NULL);
1657         ASSERT(bp->av_forw == NULL);
1658         ASSERT(bp->b_un.b_addr == NULL);
1659         ASSERT(bp->b_dev == (o_dev_t)NODEV);
1660         ASSERT(bp->b_edev == NODEV);
1661         ASSERT(bp->b_flags == 0);
1662 
1663         mutex_enter(&bhdr_lock);
1664         bp->av_forw = bhdrlist;
1665         bhdrlist = bp;
1666         mutex_exit(&bhdr_lock);
1667 }
1668 
1669 /*
1670  * If we haven't gone over the high water mark, it's OK to allocate
1671  * more buffer space; otherwise recycle buffers from the freelist
1672  * until enough memory is free for a bsize request.
1673  *
1674  * We account for this memory here even though we don't allocate it
1675  * here.
1676  */
1677 static void
1678 bio_mem_get(long bsize)
1679 {
1680         mutex_enter(&bfree_lock);
1681         if (bfreelist.b_bufsize > bsize) {
1682                 bfreelist.b_bufsize -= bsize;
1683                 mutex_exit(&bfree_lock);
1684                 return;
1685         }
1686         mutex_exit(&bfree_lock);
1687         bio_recycle(BIO_MEM, bsize);
1688 }
1689 
1690 /*
1691  * Flush a list of delayed-write buffers.
1692  * (Currently used only by bio_recycle() below.)
1693  */
1694 static void
1695 bio_flushlist(struct buf *delwri_list)
1696 {
1697         struct buf *bp;
1698 
1699         while (delwri_list != EMPTY_LIST) {
1700                 bp = delwri_list;
1701                 bp->b_flags |= B_AGE | B_ASYNC;
1702                 if (bp->b_vp == NULL) {              /* !ufs */
1703                         BWRITE(bp);
1704                 } else {                        /* ufs */
1705                         UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1706                 }
1707                 delwri_list = bp->b_list;
1708                 bp->b_list = NULL;
1709         }
1710 }
1711 
1712 /*
1713  * Start recycling buffers on the freelist for one of two reasons:
1714  *      - we need a buffer header, or
1715  *      - we need to free up memory.
1716  * Once started, we continue to recycle buffers until the B_AGE
1717  * buffers are gone.
1718  */
1719 static void
1720 bio_recycle(int want, long bsize)
1721 {
1722         struct buf *bp, *dp, *dwp, *nbp;
1723         struct hbuf *hp;
1724         int     found = 0;
1725         kmutex_t        *hmp;
1726         int             start, end;
1727         struct buf *delwri_list = EMPTY_LIST;
1728 
1729         /*
1730          * Recycle buffers.
1731          */
1732 top:
1733         start = end = lastindex;
1734         do {
1735                 hp = &hbuf[start];
1736                 hmp = &hp->b_lock;
1737                 dp = (struct buf *)hp;
1738 
1739                 mutex_enter(hmp);
1740                 bp = dp->av_forw;
1741 
1742                 while (bp != dp) {
1743 
1744                         ASSERT(bp != NULL);
1745 
1746                         if (!sema_tryp(&bp->b_sem)) {
1747                                 bp = bp->av_forw;
1748                                 continue;
1749                         }
1750                         /*
1751                          * Do we really want to nuke all of the B_AGE stuff??
1752                          */
1753                         if ((bp->b_flags & B_AGE) == 0 && found) {
1754                                 sema_v(&bp->b_sem);
1755                                 mutex_exit(hmp);
1756                                 lastindex = start;
1757                                 return; /* All done */
1758                         }
1759 
1760                         ASSERT(MUTEX_HELD(&hp->b_lock));
1761                         ASSERT(!(bp->b_flags & B_DELWRI));
1762                         hp->b_length--;
1763                         notavail(bp);
1764 
1765                         /*
1766                          * Remove bhdr from cache, free up memory,
1767                          * and add the hdr to the freelist.
1768                          */
1769                         bremhash(bp);
1770                         mutex_exit(hmp);
1771 
1772                         if (bp->b_bufsize) {
1773                                 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1774                                 bp->b_un.b_addr = NULL;
1775                                 mutex_enter(&bfree_lock);
1776                                 bfreelist.b_bufsize += bp->b_bufsize;
1777                                 mutex_exit(&bfree_lock);
1778                         }
1779 
1780                         bp->b_dev = (o_dev_t)NODEV;
1781                         bp->b_edev = NODEV;
1782                         bp->b_flags = 0;
1783                         sema_v(&bp->b_sem);
1784                         bio_bhdr_free(bp);
1785                         if (want == BIO_HEADER) {
1786                                 found = 1;
1787                         } else {
1788                                 ASSERT(want == BIO_MEM);
1789                                 if (!found && bfreelist.b_bufsize >= bsize) {
1790                                         /* Account for the memory we want */
1791                                         mutex_enter(&bfree_lock);
1792                                         if (bfreelist.b_bufsize >= bsize) {
1793                                                 bfreelist.b_bufsize -= bsize;
1794                                                 found = 1;
1795                                         }
1796                                         mutex_exit(&bfree_lock);
1797                                 }
1798                         }
1799 
1800                         /*
1801                          * Since we dropped hmp, start again from the
1802                          * beginning of this bucket's list.
1803                          */
1804                         mutex_enter(hmp);
1805                         bp = dp->av_forw;
1806                 }
1807                 mutex_exit(hmp);
1808 
1809                 /*
1810                  * Look at the delayed write list.
1811                  * First gather into a private list, then write them.
1812                  */
1813                 dwp = (struct buf *)&dwbuf[start];
1814                 mutex_enter(&blist_lock);
1815                 bio_doingflush++;
1816                 mutex_enter(hmp);
1817                 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1818 
1819                         ASSERT(bp != NULL);
1820                         nbp = bp->av_forw;
1821 
1822                         if (!sema_tryp(&bp->b_sem))
1823                                 continue;
1824                         ASSERT(bp->b_flags & B_DELWRI);
1825                         /*
1826                          * Do we really want to nuke all of the B_AGE stuff??
1827                          */
1829                         if ((bp->b_flags & B_AGE) == 0 && found) {
1830                                 sema_v(&bp->b_sem);
1831                                 mutex_exit(hmp);
1832                                 lastindex = start;
1833                                 mutex_exit(&blist_lock);
1834                                 bio_flushlist(delwri_list);
1835                                 mutex_enter(&blist_lock);
1836                                 bio_doingflush--;
1837                                 if (bio_flinv_cv_wanted) {
1838                                         bio_flinv_cv_wanted = 0;
1839                                         cv_broadcast(&bio_flushinval_cv);
1840                                 }
1841                                 mutex_exit(&blist_lock);
1842                                 return; /* All done */
1843                         }
1844 
1845                         /*
1846                          * If the buffer is already on a flush or
1847                          * invalidate list then just skip it.
1848                          */
1849                         if (bp->b_list != NULL) {
1850                                 sema_v(&bp->b_sem);
1851                                 continue;
1852                         }
1853                         /*
1854                          * We are still on the same bucket.
1855                          */
1856                         hp->b_length--;
1857                         notavail(bp);
1858                         bp->b_list = delwri_list;
1859                         delwri_list = bp;
1860                 }
1861                 mutex_exit(hmp);
1862                 mutex_exit(&blist_lock);
1863                 bio_flushlist(delwri_list);
1864                 delwri_list = EMPTY_LIST;
1865                 mutex_enter(&blist_lock);
1866                 bio_doingflush--;
1867                 if (bio_flinv_cv_wanted) {
1868                         bio_flinv_cv_wanted = 0;
1869                         cv_broadcast(&bio_flushinval_cv);
1870                 }
1871                 mutex_exit(&blist_lock);
1872                 start = (start + 1) % v.v_hbuf;
1873 
1874         } while (start != end);
1875 
1876         if (found)
1877                 return;
1878 
1879         /*
1880          * The free lists are exhausted and we haven't satisfied the request.
1881          * Wait here for more entries to be added to the freelist.
1882          * Because that may have just happened, make the wait timed.
1883          */
1884         mutex_enter(&bfree_lock);
1885         bfreelist.b_flags |= B_WANTED;
1886         (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1887         mutex_exit(&bfree_lock);
1888         goto top;
1889 }
1890 
1891 /*
1892  * See if the block is associated with some buffer
1893  * (mainly to avoid getting hung up on a wait in breada).
1894  */
1895 static int
1896 bio_incore(dev_t dev, daddr_t blkno)
1897 {
1898         struct buf *bp;
1899         struct buf *dp;
1900         uint_t index;
1901         kmutex_t *hmp;
1902 
1903         index = bio_bhash(dev, blkno);
1904         dp = (struct buf *)&hbuf[index];
1905         hmp = &hbuf[index].b_lock;
1906 
1907         mutex_enter(hmp);
1908         for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1909                 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1910                     (bp->b_flags & B_STALE) == 0) {
1911                         mutex_exit(hmp);
1912                         return (1);
1913                 }
1914         }
1915         mutex_exit(hmp);
1916         return (0);
1917 }
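
/*
 * Illustrative sketch (editorial addition): breada()-style read-ahead
 * consults bio_incore() before allocating a buffer for the next block
 * so it never sleeps on a block that is already cached or in transit.
 * The names rablkno and rabp below are hypothetical.
 *
 *	if (rablkno && !bio_incore(dev, rablkno)) {
 *		rabp = getblk(dev, rablkno, bsize);
 *		...start the asynchronous read on rabp...
 *	}
 */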
1918 
1919 static void
1920 bio_pageio_done(struct buf *bp)
1921 {
1922         if (bp->b_flags & B_PAGEIO) {
1923 
1924                 if (bp->b_flags & B_REMAPPED)
1925                         bp_mapout(bp);
1926 
1927                 if (bp->b_flags & B_READ)
1928                         pvn_read_done(bp->b_pages, bp->b_flags);
1929                 else
1930                         pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1931                 pageio_done(bp);
1932         } else {
1933                 ASSERT(bp->b_flags & B_REMAPPED);
1934                 bp_mapout(bp);
1935                 brelse(bp);
1936         }
1937 }
1938 
1939 /*
1940  * bioerror(9F) - indicate error in buffer header
1941  * If 'error' is zero, remove the error indication.
1942  */
1943 void
1944 bioerror(struct buf *bp, int error)
1945 {
1946         ASSERT(bp != NULL);
1947         ASSERT(error >= 0);
1948         ASSERT(SEMA_HELD(&bp->b_sem));
1949 
1950         if (error != 0) {
1951                 bp->b_flags |= B_ERROR;
1952         } else {
1953                 bp->b_flags &= ~B_ERROR;
1954         }
1955         bp->b_error = error;
1956 }
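
/*
 * Illustrative sketch (editorial addition, hypothetical driver code):
 * a driver's completion path normally records a failure with
 * bioerror() before calling biodone(), and passes 0 to clear any stale
 * error indication on success:
 *
 *	if (xfer_failed)
 *		bioerror(bp, EIO);
 *	else
 *		bioerror(bp, 0);
 *	biodone(bp);
 */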
1957 
1958 /*
1959  * bioreset(9F) - reuse a private buffer header after I/O is complete
1960  */
1961 void
1962 bioreset(struct buf *bp)
1963 {
1964         ASSERT(bp != NULL);
1965 
1966         biofini(bp);
1967         bioinit(bp);
1968 }
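
/*
 * Illustrative sketch (editorial addition, hypothetical driver code):
 * a driver that keeps one private buf for a sequence of transfers can
 * call bioreset() between them instead of freeing and reallocating the
 * header:
 *
 *	for (each chunk) {
 *		bioreset(bp);
 *		bp->b_bcount = chunk_len;
 *		bp->b_lblkno = chunk_blkno;
 *		...issue the transfer and wait for completion...
 *	}
 */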
1969 
1970 /*
1971  * biosize(9F) - return size of a buffer header
1972  */
1973 size_t
1974 biosize(void)
1975 {
1976         return (sizeof (struct buf));
1977 }
1978 
1979 /*
1980  * biomodified(9F) - check if buffer is modified
1981  */
1982 int
1983 biomodified(struct buf *bp)
1984 {
1985         int npf;
1986         int ppattr;
1987         struct page *pp;
1988 
1989         ASSERT(bp != NULL);
1990 
1991         if ((bp->b_flags & B_PAGEIO) == 0) {
1992                 return (-1);
1993         }
1994         pp = bp->b_pages;
1995         npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1996 
1997         while (npf > 0) {
1998                 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1999                     HAT_SYNC_STOPON_MOD);
2000                 if (ppattr & P_MOD)
2001                         return (1);
2002                 pp = pp->p_next;
2003                 npf--;
2004         }
2005 
2006         return (0);
2007 }
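
/*
 * Illustrative sketch (editorial addition): a caller must distinguish
 * all three return values, since -1 only means the buf is not being
 * used for paged I/O:
 *
 *	if (biomodified(bp) == 1) {
 *		...the pages were modified while the I/O was pending,
 *		   so arrange for the block to be written again...
 *	}
 */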
2008 
2009 /*
2010  * bioinit(9F) - initialize a buffer structure
2011  */
2012 void
2013 bioinit(struct buf *bp)
2014 {
2015         bzero(bp, sizeof (struct buf));
2016         sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2017         sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2018         bp->b_offset = -1;
2019 }
2020 
2021 /*
2022  * biofini(9F) - uninitialize a buffer structure
2023  */
2024 void
2025 biofini(struct buf *bp)
2026 {
2027         sema_destroy(&bp->b_io);
2028         sema_destroy(&bp->b_sem);
2029 }
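
/*
 * Illustrative sketch (editorial addition, hypothetical driver code):
 * a driver that embeds a buf in a larger private structure sizes the
 * allocation with biosize() and brackets the buf's lifetime with
 * bioinit()/biofini():
 *
 *	xsp = kmem_zalloc(sizeof (struct xx_state) + biosize(), KM_SLEEP);
 *	xsp->xx_bp = (struct buf *)(xsp + 1);
 *	bioinit(xsp->xx_bp);
 *	...use xsp->xx_bp for transfers...
 *	biofini(xsp->xx_bp);
 *	kmem_free(xsp, sizeof (struct xx_state) + biosize());
 */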
2030 
2031 /*
2032  * bioclone(9F) - clone a buffer
2033  */
2034 struct buf *
2035 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2036     int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2037 {
2038         struct buf *bufp;
2039 
2040         ASSERT(bp);
2041         if (bp_mem == NULL) {
2042                 bufp = kmem_alloc(sizeof (struct buf), sleep);
2043                 if (bufp == NULL) {
2044                         return (NULL);
2045                 }
2046                 bioinit(bufp);
2047         } else {
2048                 bufp = bp_mem;
2049                 bioreset(bufp);
2050         }
2051 
2052 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2053         B_ABRWRITE)
2054 
2055         /*
2056          * The cloned buffer does not inherit the B_REMAPPED flag.
2057          */
2058         bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2059         bufp->b_bcount = len;
2060         bufp->b_blkno = blkno;
2061         bufp->b_iodone = iodone;
2062         bufp->b_proc = bp->b_proc;
2063         bufp->b_edev = dev;
2064         bufp->b_file = bp->b_file;
2065         bufp->b_offset = bp->b_offset;
2066 
2067         if (bp->b_flags & B_SHADOW) {
2068                 ASSERT(bp->b_shadow);
2069                 ASSERT(bp->b_flags & B_PHYS);
2070 
2071                 bufp->b_shadow = bp->b_shadow +
2072                     btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2073                 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2074                 if (bp->b_flags & B_REMAPPED)
2075                         bufp->b_proc = NULL;
2076         } else {
2077                 if (bp->b_flags & B_PAGEIO) {
2078                         struct page *pp;
2079                         off_t o;
2080                         int i;
2081 
2082                         pp = bp->b_pages;
2083                         o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2084                         for (i = btop(o); i > 0; i--) {
2085                                 pp = pp->p_next;
2086                         }
2087                         bufp->b_pages = pp;
2088                         bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2089                 } else {
2090                         bufp->b_un.b_addr =
2091                             (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2092                         if (bp->b_flags & B_REMAPPED)
2093                                 bufp->b_proc = NULL;
2094                 }
2095         }
2096         return (bufp);
2097 }
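
/*
 * Illustrative sketch (editorial addition, hypothetical driver code):
 * a layered driver that splits one parent request across two devices
 * can clone each half of the parent buf, letting bioclone() allocate
 * the child headers by passing NULL for bp_mem:
 *
 *	half = pbp->b_bcount / 2;
 *	cb0 = bioclone(pbp, 0, half, dev0, blk0,
 *	    xx_child_done, NULL, KM_SLEEP);
 *	cb1 = bioclone(pbp, half, pbp->b_bcount - half, dev1, blk1,
 *	    xx_child_done, NULL, KM_SLEEP);
 *
 * When both children are done, the parent is completed with
 * biodone(pbp), and each child header is released (freerbuf(9F) is
 * the usual way to free a header that bioclone() allocated).
 */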