re #13613 rb4516 Tunables need volatile keyword
    
      
    
--- old/usr/src/uts/common/os/bio.c
+++ new/usr/src/uts/common/os/bio.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   * Copyright 2011 Joyent, Inc.  All rights reserved.
  25   25   */
       26 +/*
       27 + * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
       28 + */
  26   29  
  27   30  /*
  28   31   * Copyright (c) 2016 by Delphix. All rights reserved.
  29   32   */
  30   33  
  31   34  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  32   35  /*        All Rights Reserved   */
  33   36  
  34   37  /*
  35   38   * University Copyright- Copyright (c) 1982, 1986, 1988
  36   39   * The Regents of the University of California
  37   40   * All Rights Reserved
  38   41   *
  39   42   * University Acknowledgment- Portions of this document are derived from
  40   43   * software developed by the University of California, Berkeley, and its
  41   44   * contributors.
  42   45   */
  43   46  
  44   47  #include <sys/types.h>
  45   48  #include <sys/t_lock.h>
  46   49  #include <sys/sysmacros.h>
  47   50  #include <sys/conf.h>
  48   51  #include <sys/cpuvar.h>
  49   52  #include <sys/errno.h>
  50   53  #include <sys/debug.h>
  51   54  #include <sys/buf.h>
  52   55  #include <sys/var.h>
  53   56  #include <sys/vnode.h>
  54   57  #include <sys/bitmap.h>
  55   58  #include <sys/cmn_err.h>
  56   59  #include <sys/kmem.h>
  57   60  #include <sys/vmem.h>
  58   61  #include <sys/atomic.h>
  59   62  #include <vm/seg_kmem.h>
  60   63  #include <vm/page.h>
  61   64  #include <vm/pvn.h>
  62   65  #include <sys/vtrace.h>
  63   66  #include <sys/tnf_probe.h>
  64   67  #include <sys/fs/ufs_inode.h>
  65   68  #include <sys/fs/ufs_bio.h>
  66   69  #include <sys/fs/ufs_log.h>
  67   70  #include <sys/systm.h>
  68   71  #include <sys/vfs.h>
  69   72  #include <sys/sdt.h>
  70   73  
  71   74  /* Locks */
  72   75  static  kmutex_t        blist_lock;     /* protects b_list */
  73   76  static  kmutex_t        bhdr_lock;      /* protects the bhdrlist */
  74   77  static  kmutex_t        bfree_lock;     /* protects the bfreelist structure */
  75   78  
  76   79  struct hbuf     *hbuf;                  /* Hash buckets */
  77   80  struct dwbuf    *dwbuf;                 /* Delayed write buckets */
  78   81  static struct buf *bhdrlist;            /* buf header free list */
  79   82  static int      nbuf;                   /* number of buffer headers allocated */
  80   83  
  81   84  static int      lastindex;              /* Reference point on where to start */
  82   85                                          /* when looking for free buffers */
  83   86  
  84   87  #define bio_bhash(dev, bn)      (hash2ints((dev), (int)(bn)) & v.v_hmask)
  85   88  #define EMPTY_LIST      ((struct buf *)-1)
  86   89  
  87   90  static kcondvar_t       bio_mem_cv;     /* Condition variables */
  88   91  static kcondvar_t       bio_flushinval_cv;
  89   92  static int      bio_doingflush;         /* flush in progress */
  90   93  static int      bio_doinginval;         /* inval in progress */
  91   94  static int      bio_flinv_cv_wanted;    /* someone waiting for cv */
  92   95  
  93   96  /*
  94   97   * Statistics on the buffer cache
  95   98   */
  96   99  struct biostats biostats = {
  97  100          { "buffer_cache_lookups",               KSTAT_DATA_UINT32 },
  98  101          { "buffer_cache_hits",                  KSTAT_DATA_UINT32 },
  99  102          { "new_buffer_requests",                KSTAT_DATA_UINT32 },
 100  103          { "waits_for_buffer_allocs",            KSTAT_DATA_UINT32 },
 101  104          { "buffers_locked_by_someone",          KSTAT_DATA_UINT32 },
 102  105          { "duplicate_buffers_found",            KSTAT_DATA_UINT32 }
 103  106  };
 104  107  
 105  108  /*
 106  109   * kstat data
 107  110   */
 108  111  kstat_named_t   *biostats_ptr = (kstat_named_t *)&biostats;
 109  112  uint_t          biostats_ndata = (uint_t)(sizeof (biostats) /
 110  113                                          sizeof (kstat_named_t));
 111  114  
 112  115  /*
 113  116   * Statistics on ufs buffer cache
 114  117   * Not protected by locks
 115  118   */
 116  119  struct ufsbiostats ub = {
 117  120          { "breads",                     KSTAT_DATA_UINT32 },
 118  121          { "bwrites",                    KSTAT_DATA_UINT32 },
 119  122          { "fbiwrites",                  KSTAT_DATA_UINT32 },
 120  123          { "getpages",                   KSTAT_DATA_UINT32 },
 121  124          { "getras",                     KSTAT_DATA_UINT32 },
 122  125          { "putsyncs",                   KSTAT_DATA_UINT32 },
 123  126          { "putasyncs",                  KSTAT_DATA_UINT32 },
 124  127          { "putpageios",                 KSTAT_DATA_UINT32 },
 125  128  };
 126  129  
 127  130  /*
 128  131   * more UFS Logging eccentricities...
 129  132   *
 130  133   * required since "#pragma weak ..." doesn't work in reverse order.
 131  134   * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 132  135   *        to ufs routines don't get plugged into bio.c calls so
 133  136   *        we initialize it when setting up the "lufsops" table
 134  137   *        in "lufs.c:_init()"
 135  138   */
 136  139  void (*bio_lufs_strategy)(void *, buf_t *);
 137  140  void (*bio_snapshot_strategy)(void *, buf_t *);
 138  141  
 139  142  
 140  143  /* Private routines */
 141  144  static struct buf       *bio_getfreeblk(long);
 142  145  static void             bio_mem_get(long);
 143  146  static void             bio_bhdr_free(struct buf *);
 144  147  static struct buf       *bio_bhdr_alloc(void);
 145  148  static void             bio_recycle(int, long);
 146  149  static void             bio_pageio_done(struct buf *);
 147  150  static int              bio_incore(dev_t, daddr_t);
 148  151  
 149  152  /*
 150  153   * Buffer cache constants
 151  154   */
 152  155  #define BIO_BUF_PERCENT (100/2)         /* default: 2% of memory */
 153  156  #define BIO_MAX_PERCENT (100/20)        /* max is 20% of real memory */
 154  157  #define BIO_BHDR_POOL   100             /* Default bhdr pool size */
 155  158  #define BIO_MIN_HDR     10              /* Minimum number of buffer headers */
 156  159  #define BIO_MIN_HWM     (BIO_MIN_HDR * MAXBSIZE / 1024)
 157  160  #define BIO_HASHLEN     4               /* Target length of hash chains */
 158  161  
 159  162  
 160  163  /* Flags for bio_recycle() */
 161  164  #define BIO_HEADER      0x01
 162  165  #define BIO_MEM         0x02
 163  166  
 164      -extern  int bufhwm;             /* User tunable - high water mark for mem  */
 165      -extern  int bufhwm_pct;         /* ditto - given in % of physmem  */
      167 +extern volatile int bufhwm;     /* User tunable - high water mark for mem  */
      168 +extern volatile int bufhwm_pct; /* ditto - given in % of physmem  */
 166  169  
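
(The point of this hunk, in a minimal sketch that is not part of the diff:
bufhwm and bufhwm_pct are user tunables, typically set in /etc/system at
boot or patched in a running kernel with mdb -kw. Without the volatile
qualifier the compiler may load such a global once and keep the value in a
register, so a later patch can go unnoticed; volatile forces a fresh load
at every access. The polling loop below is a contrived illustration, not
code from bio.c.)

        extern volatile int bufhwm;     /* as declared in this change */

        static void
        example_poll_tunable(void)
        {
                /*
                 * Each iteration re-reads bufhwm because it is volatile;
                 * a non-volatile extern could legally be hoisted out of
                 * the loop and read exactly once.
                 */
                while (bufhwm == 0)
                        ;
        }
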
 167  170  /*
 168  171   * The following routines allocate and free
 169  172   * buffers with various side effects.  In general the
 170  173   * arguments to an allocate routine are a device and
 171  174   * a block number, and the value is a pointer to
  172  175   * the buffer header; the buffer returned is locked with a
 173  176   * binary semaphore so that no one else can touch it. If the block was
 174  177   * already in core, no I/O need be done; if it is
 175  178   * already locked, the process waits until it becomes free.
 176  179   * The following routines allocate a buffer:
 177  180   *      getblk
 178  181   *      bread/BREAD
 179  182   *      breada
 180  183   * Eventually the buffer must be released, possibly with the
 181  184   * side effect of writing it out, by using one of
 182  185   *      bwrite/BWRITE/brwrite
 183  186   *      bdwrite/bdrwrite
 184  187   *      bawrite
 185  188   *      brelse
 186  189   *
 187  190   * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 188  191   * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 189  192   * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 190  193   * B_DONE is still used to denote a buffer with I/O complete on it.
 191  194   *
  192  195   * The bfreelist.b_bcount field is computed every time fsflush runs. It
 193  196   * should not be used where a very accurate count of the free buffers is
 194  197   * needed.
 195  198   */
 196  199  
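
(A hedged sketch of the lifecycle the comment above describes: allocate a
buffer via bread(), check for errors, release it via brelse(). dev, blkno,
and bsize are caller-supplied placeholders, and example_read_block is not a
bio.c function.)

        static int
        example_read_block(dev_t dev, daddr_t blkno, long bsize)
        {
                /* bread() returns with the buffer locked (b_sem held) */
                struct buf *bp = bread(dev, blkno, bsize);
                int error = geterror(bp);

                if (error == 0) {
                        /* consume bsize bytes at bp->b_un.b_addr */
                }
                brelse(bp);     /* release; contents may stay cached */
                return (error);
        }
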
 197  200  /*
 198  201   * Read in (if necessary) the block and return a buffer pointer.
 199  202   *
 200  203   * This interface is provided for binary compatibility.  Using
 201  204   * BREAD() directly avoids the extra function call overhead invoked
 202  205   * by calling this routine.
 203  206   */
 204  207  struct buf *
 205  208  bread(dev_t dev, daddr_t blkno, long bsize)
 206  209  {
 207  210          return (BREAD(dev, blkno, bsize));
 208  211  }
 209  212  
 210  213  /*
 211  214   * Common code for reading a buffer with various options
 212  215   *
 213  216   * Read in (if necessary) the block and return a buffer pointer.
 214  217   */
 215  218  struct buf *
 216  219  bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
 217  220  {
 218  221          struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
 219  222          struct buf *bp;
 220  223          klwp_t *lwp = ttolwp(curthread);
 221  224  
 222  225          CPU_STATS_ADD_K(sys, lread, 1);
 223  226          bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
 224  227          if (bp->b_flags & B_DONE)
 225  228                  return (bp);
 226  229          bp->b_flags |= B_READ;
 227  230          ASSERT(bp->b_bcount == bsize);
 228  231          if (ufsvfsp == NULL) {                                  /* !ufs */
 229  232                  (void) bdev_strategy(bp);
 230  233          } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
 231  234                                                          /* ufs && logging */
 232  235                  (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
 233  236          } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
 234  237                                                          /* ufs && snapshots */
 235  238                  (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
 236  239          } else {
 237  240                  ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
 238  241                  ub.ub_breads.value.ul++;                /* ufs && !logging */
 239  242                  (void) bdev_strategy(bp);
 240  243          }
 241  244          if (lwp != NULL)
 242  245                  lwp->lwp_ru.inblock++;
 243  246          CPU_STATS_ADD_K(sys, bread, 1);
 244  247          (void) biowait(bp);
 245  248          return (bp);
 246  249  }
 247  250  
 248  251  /*
 249  252   * Read in the block, like bread, but also start I/O on the
 250  253   * read-ahead block (which is not allocated to the caller).
 251  254   */
 252  255  struct buf *
 253  256  breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
 254  257  {
 255  258          struct buf *bp, *rabp;
 256  259          klwp_t *lwp = ttolwp(curthread);
 257  260  
 258  261          bp = NULL;
 259  262          if (!bio_incore(dev, blkno)) {
 260  263                  CPU_STATS_ADD_K(sys, lread, 1);
 261  264                  bp = GETBLK(dev, blkno, bsize);
 262  265                  if ((bp->b_flags & B_DONE) == 0) {
 263  266                          bp->b_flags |= B_READ;
 264  267                          bp->b_bcount = bsize;
 265  268                          (void) bdev_strategy(bp);
 266  269                          if (lwp != NULL)
 267  270                                  lwp->lwp_ru.inblock++;
 268  271                          CPU_STATS_ADD_K(sys, bread, 1);
 269  272                  }
 270  273          }
 271  274          if (rablkno && bfreelist.b_bcount > 1 &&
 272  275              !bio_incore(dev, rablkno)) {
 273  276                  rabp = GETBLK(dev, rablkno, bsize);
 274  277                  if (rabp->b_flags & B_DONE)
 275  278                          brelse(rabp);
 276  279                  else {
 277  280                          rabp->b_flags |= B_READ|B_ASYNC;
 278  281                          rabp->b_bcount = bsize;
 279  282                          (void) bdev_strategy(rabp);
 280  283                          if (lwp != NULL)
 281  284                                  lwp->lwp_ru.inblock++;
 282  285                          CPU_STATS_ADD_K(sys, bread, 1);
 283  286                  }
 284  287          }
 285  288          if (bp == NULL)
 286  289                  return (BREAD(dev, blkno, bsize));
 287  290          (void) biowait(bp);
 288  291          return (bp);
 289  292  }
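
(A sketch of the sequential-read pattern breada() is meant for; using
blkno + 1 as the read-ahead block is an assumption for illustration, not
something this file prescribes.)

        static void
        example_sequential_read(dev_t dev, daddr_t blkno, long bsize)
        {
                /* returns blkno's buffer; I/O on blkno + 1 starts async */
                struct buf *bp = breada(dev, blkno, blkno + 1, bsize);

                /* ... consume bp ... */
                brelse(bp);
        }
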
 290  293  
 291  294  /*
 292  295   * Common code for writing a buffer with various options.
 293  296   *
 294  297   * force_wait  - wait for write completion regardless of B_ASYNC flag
 295  298   * do_relse    - release the buffer when we are done
 296  299   * clear_flags - flags to clear from the buffer
 297  300   */
 298  301  void
 299  302  bwrite_common(void *arg, struct buf *bp, int force_wait,
 300  303      int do_relse, int clear_flags)
 301  304  {
 302  305          register int do_wait;
 303  306          struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
 304  307          int flag;
 305  308          klwp_t *lwp = ttolwp(curthread);
 306  309          struct cpu *cpup;
 307  310  
 308  311          ASSERT(SEMA_HELD(&bp->b_sem));
 309  312          flag = bp->b_flags;
 310  313          bp->b_flags &= ~clear_flags;
 311  314          if (lwp != NULL)
 312  315                  lwp->lwp_ru.oublock++;
 313  316          CPU_STATS_ENTER_K();
 314  317          cpup = CPU;             /* get pointer AFTER preemption is disabled */
 315  318          CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
 316  319          CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
 317  320          do_wait = ((flag & B_ASYNC) == 0 || force_wait);
 318  321          if (do_wait == 0)
 319  322                  CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
 320  323          CPU_STATS_EXIT_K();
 321  324          if (ufsvfsp == NULL) {
 322  325                  (void) bdev_strategy(bp);
 323  326          } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
 324  327                                                          /* ufs && logging */
 325  328                  (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
 326  329          } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
 327  330                                                          /* ufs && snapshots */
 328  331                  (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
 329  332          } else {
 330  333                  ub.ub_bwrites.value.ul++;               /* ufs && !logging */
 331  334                  (void) bdev_strategy(bp);
 332  335          }
 333  336          if (do_wait) {
 334  337                  (void) biowait(bp);
 335  338                  if (do_relse) {
 336  339                          brelse(bp);
 337  340                  }
 338  341          }
 339  342  }
 340  343  
 341  344  /*
 342  345   * Write the buffer, waiting for completion (unless B_ASYNC is set).
 343  346   * Then release the buffer.
 344  347   * This interface is provided for binary compatibility.  Using
 345  348   * BWRITE() directly avoids the extra function call overhead invoked
 346  349   * by calling this routine.
 347  350   */
 348  351  void
 349  352  bwrite(struct buf *bp)
 350  353  {
 351  354          BWRITE(bp);
 352  355  }
 353  356  
 354  357  /*
 355  358   * Write the buffer, waiting for completion.
 356  359   * But don't release the buffer afterwards.
 357  360   * This interface is provided for binary compatibility.  Using
 358  361   * BWRITE2() directly avoids the extra function call overhead.
 359  362   */
 360  363  void
 361  364  bwrite2(struct buf *bp)
 362  365  {
 363  366          BWRITE2(bp);
 364  367  }
 365  368  
 366  369  /*
 367  370   * Release the buffer, marking it so that if it is grabbed
 368  371   * for another purpose it will be written out before being
 369  372   * given up (e.g. when writing a partial block where it is
 370  373   * assumed that another write for the same block will soon follow).
 371  374   * Also save the time that the block is first marked as delayed
 372  375   * so that it will be written in a reasonable time.
 373  376   */
 374  377  void
 375  378  bdwrite(struct buf *bp)
 376  379  {
 377  380          ASSERT(SEMA_HELD(&bp->b_sem));
 378  381          CPU_STATS_ADD_K(sys, lwrite, 1);
 379  382          if ((bp->b_flags & B_DELWRI) == 0)
 380  383                  bp->b_start = ddi_get_lbolt();
 381  384          /*
 382  385           * B_DONE allows others to use the buffer, B_DELWRI causes the
 383  386           * buffer to be written before being reused, and setting b_resid
 384  387           * to zero says the buffer is complete.
 385  388           */
 386  389          bp->b_flags |= B_DELWRI | B_DONE;
 387  390          bp->b_resid = 0;
 388  391          brelse(bp);
 389  392  }
 390  393  
 391  394  /*
 392  395   * Release the buffer, start I/O on it, but don't wait for completion.
 393  396   */
 394  397  void
 395  398  bawrite(struct buf *bp)
 396  399  {
 397  400          ASSERT(SEMA_HELD(&bp->b_sem));
 398  401  
 399  402          /* Use bfreelist.b_bcount as a weird-ass heuristic */
 400  403          if (bfreelist.b_bcount > 4)
 401  404                  bp->b_flags |= B_ASYNC;
 402  405          BWRITE(bp);
 403  406  }
 404  407  
 405  408  /*
 406  409   * Release the buffer, with no I/O implied.
 407  410   */
 408  411  void
 409  412  brelse(struct buf *bp)
 410  413  {
 411  414          struct buf      **backp;
 412  415          uint_t          index;
 413  416          kmutex_t        *hmp;
 414  417          struct  buf     *dp;
 415  418          struct  hbuf    *hp;
 416  419  
 417  420  
 418  421          ASSERT(SEMA_HELD(&bp->b_sem));
 419  422  
 420  423          /*
 421  424           * Clear the retry write flag if the buffer was written without
 422  425           * error.  The presence of B_DELWRI means the buffer has not yet
 423  426           * been written and the presence of B_ERROR means that an error
 424  427           * is still occurring.
 425  428           */
 426  429          if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
 427  430                  bp->b_flags &= ~B_RETRYWRI;
 428  431          }
 429  432  
 430  433          /* Check for anomalous conditions */
 431  434          if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
 432  435                  if (bp->b_flags & B_NOCACHE) {
 433  436                          /* Don't add to the freelist. Destroy it now */
 434  437                          kmem_free(bp->b_un.b_addr, bp->b_bufsize);
 435  438                          sema_destroy(&bp->b_sem);
 436  439                          sema_destroy(&bp->b_io);
 437  440                          kmem_free(bp, sizeof (struct buf));
 438  441                          return;
 439  442                  }
 440  443                  /*
 441  444                   * If a write failed and we are supposed to retry write,
 442  445                   * don't toss the buffer.  Keep it around and mark it
 443  446                   * delayed write in the hopes that it will eventually
 444  447                   * get flushed (and still keep the system running.)
 445  448                   */
 446  449                  if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
 447  450                          bp->b_flags |= B_DELWRI;
 448  451                          /* keep fsflush from trying continuously to flush */
 449  452                          bp->b_start = ddi_get_lbolt();
 450  453                  } else
 451  454                          bp->b_flags |= B_AGE|B_STALE;
 452  455                  bp->b_flags &= ~B_ERROR;
 453  456                  bp->b_error = 0;
 454  457          }
 455  458  
 456  459          /*
  457  460           * If delayed write is set then put it on the delayed
 458  461           * write list instead of the free buffer list.
 459  462           */
 460  463          index = bio_bhash(bp->b_edev, bp->b_blkno);
 461  464          hmp   = &hbuf[index].b_lock;
 462  465  
 463  466          mutex_enter(hmp);
 464  467          hp = &hbuf[index];
 465  468          dp = (struct buf *)hp;
 466  469  
 467  470          /*
  468  471           * Make sure that the number of entries on this list is
 469  472           * Zero <= count <= total # buffers
 470  473           */
 471  474          ASSERT(hp->b_length >= 0);
 472  475          ASSERT(hp->b_length < nbuf);
 473  476  
 474  477          hp->b_length++;         /* We are adding this buffer */
 475  478  
 476  479          if (bp->b_flags & B_DELWRI) {
 477  480                  /*
 478  481                   * This buffer goes on the delayed write buffer list
 479  482                   */
 480  483                  dp = (struct buf *)&dwbuf[index];
 481  484          }
 482  485          ASSERT(bp->b_bufsize > 0);
 483  486          ASSERT(bp->b_bcount > 0);
 484  487          ASSERT(bp->b_un.b_addr != NULL);
 485  488  
 486  489          if (bp->b_flags & B_AGE) {
 487  490                  backp = &dp->av_forw;
 488  491                  (*backp)->av_back = bp;
 489  492                  bp->av_forw = *backp;
 490  493                  *backp = bp;
 491  494                  bp->av_back = dp;
 492  495          } else {
 493  496                  backp = &dp->av_back;
 494  497                  (*backp)->av_forw = bp;
 495  498                  bp->av_back = *backp;
 496  499                  *backp = bp;
 497  500                  bp->av_forw = dp;
 498  501          }
 499  502          mutex_exit(hmp);
 500  503  
 501  504          if (bfreelist.b_flags & B_WANTED) {
 502  505                  /*
 503  506                   * Should come here very very rarely.
 504  507                   */
 505  508                  mutex_enter(&bfree_lock);
 506  509                  if (bfreelist.b_flags & B_WANTED) {
 507  510                          bfreelist.b_flags &= ~B_WANTED;
 508  511                          cv_broadcast(&bio_mem_cv);
 509  512                  }
 510  513                  mutex_exit(&bfree_lock);
 511  514          }
 512  515  
 513  516          bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
 514  517          /*
 515  518           * Don't let anyone get the buffer off the freelist before we
 516  519           * release our hold on it.
 517  520           */
 518  521          sema_v(&bp->b_sem);
 519  522  }
 520  523  
 521  524  /*
 522  525   * Return a count of the number of B_BUSY buffers in the system
 523  526   * Can only be used as a good estimate.  If 'cleanit' is set,
 524  527   * try to flush all bufs.
 525  528   */
 526  529  int
 527  530  bio_busy(int cleanit)
 528  531  {
 529  532          struct buf *bp, *dp;
 530  533          int busy = 0;
 531  534          int i;
 532  535          kmutex_t *hmp;
 533  536  
 534  537          for (i = 0; i < v.v_hbuf; i++) {
 535  538                  dp = (struct buf *)&hbuf[i];
 536  539                  hmp = &hbuf[i].b_lock;
 537  540  
 538  541                  mutex_enter(hmp);
 539  542                  for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 540  543                          if (bp->b_flags & B_BUSY)
 541  544                                  busy++;
 542  545                  }
 543  546                  mutex_exit(hmp);
 544  547          }
 545  548  
 546  549          if (cleanit && busy != 0) {
 547  550                  bflush(NODEV);
 548  551          }
 549  552  
 550  553          return (busy);
 551  554  }
 552  555  
 553  556  /*
 554  557   * this interface is provided for binary compatibility.
 555  558   *
 556  559   * Assign a buffer for the given block.  If the appropriate
 557  560   * block is already associated, return it; otherwise search
 558  561   * for the oldest non-busy buffer and reassign it.
 559  562   */
 560  563  struct buf *
 561  564  getblk(dev_t dev, daddr_t blkno, long bsize)
 562  565  {
 563  566          return (getblk_common(/* ufsvfsp */ NULL, dev,
 564  567              blkno, bsize, /* errflg */ 0));
 565  568  }
 566  569  
 567  570  /*
 568  571   * Assign a buffer for the given block.  If the appropriate
 569  572   * block is already associated, return it; otherwise search
 570  573   * for the oldest non-busy buffer and reassign it.
 571  574   */
 572  575  struct buf *
 573  576  getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
 574  577  {
 575  578          ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
 576  579          struct buf *bp;
 577  580          struct buf *dp;
 578  581          struct buf *nbp = NULL;
 579  582          struct buf *errbp;
 580  583          uint_t          index;
 581  584          kmutex_t        *hmp;
 582  585          struct  hbuf    *hp;
 583  586  
 584  587          if (getmajor(dev) >= devcnt)
 585  588                  cmn_err(CE_PANIC, "blkdev");
 586  589  
 587  590          biostats.bio_lookup.value.ui32++;
 588  591  
 589  592          index = bio_bhash(dev, blkno);
 590  593          hp    = &hbuf[index];
 591  594          dp    = (struct buf *)hp;
 592  595          hmp   = &hp->b_lock;
 593  596  
 594  597          mutex_enter(hmp);
 595  598  loop:
 596  599          for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 597  600                  if (bp->b_blkno != blkno || bp->b_edev != dev ||
 598  601                      (bp->b_flags & B_STALE))
 599  602                          continue;
 600  603                  /*
 601  604                   * Avoid holding the hash lock in the event that
 602  605                   * the buffer is locked by someone. Since the hash chain
 603  606                   * may change when we drop the hash lock
 604  607                   * we have to start at the beginning of the chain if the
 605  608                   * buffer identity/contents aren't valid.
 606  609                   */
 607  610                  if (!sema_tryp(&bp->b_sem)) {
 608  611                          biostats.bio_bufbusy.value.ui32++;
 609  612                          mutex_exit(hmp);
 610  613                          /*
 611  614                           * OK, we are dealing with a busy buffer.
 612  615                           * In the case that we are panicking and we
 613  616                           * got called from bread(), we have some chance
 614  617                           * for error recovery. So better bail out from
 615  618                           * here since sema_p() won't block. If we got
 616  619                           * called directly from ufs routines, there is
 617  620                           * no way to report an error yet.
 618  621                           */
 619  622                          if (panicstr && errflg)
 620  623                                  goto errout;
 621  624                          /*
 622  625                           * For the following line of code to work
 623  626                           * correctly never kmem_free the buffer "header".
 624  627                           */
 625  628                          sema_p(&bp->b_sem);
 626  629                          if (bp->b_blkno != blkno || bp->b_edev != dev ||
 627  630                              (bp->b_flags & B_STALE)) {
 628  631                                  sema_v(&bp->b_sem);
 629  632                                  mutex_enter(hmp);
 630  633                                  goto loop;      /* start over */
 631  634                          }
 632  635                          mutex_enter(hmp);
 633  636                  }
 634  637                  /* Found */
 635  638                  biostats.bio_hit.value.ui32++;
 636  639                  bp->b_flags &= ~B_AGE;
 637  640  
 638  641                  /*
 639  642                   * Yank it off the free/delayed write lists
 640  643                   */
 641  644                  hp->b_length--;
 642  645                  notavail(bp);
 643  646                  mutex_exit(hmp);
 644  647  
 645  648                  ASSERT((bp->b_flags & B_NOCACHE) == NULL);
 646  649  
 647  650                  if (nbp == NULL) {
 648  651                          /*
 649  652                           * Make the common path short.
 650  653                           */
 651  654                          ASSERT(SEMA_HELD(&bp->b_sem));
 652  655                          return (bp);
 653  656                  }
 654  657  
 655  658                  biostats.bio_bufdup.value.ui32++;
 656  659  
 657  660                  /*
 658  661                   * The buffer must have entered during the lock upgrade
 659  662                   * so free the new buffer we allocated and return the
 660  663                   * found buffer.
 661  664                   */
 662  665                  kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
 663  666                  nbp->b_un.b_addr = NULL;
 664  667  
 665  668                  /*
 666  669                   * Account for the memory
 667  670                   */
 668  671                  mutex_enter(&bfree_lock);
 669  672                  bfreelist.b_bufsize += nbp->b_bufsize;
 670  673                  mutex_exit(&bfree_lock);
 671  674  
 672  675                  /*
 673  676                   * Destroy buf identity, and place on avail list
 674  677                   */
 675  678                  nbp->b_dev = (o_dev_t)NODEV;
 676  679                  nbp->b_edev = NODEV;
 677  680                  nbp->b_flags = 0;
 678  681                  nbp->b_file = NULL;
 679  682                  nbp->b_offset = -1;
 680  683  
 681  684                  sema_v(&nbp->b_sem);
 682  685                  bio_bhdr_free(nbp);
 683  686  
 684  687                  ASSERT(SEMA_HELD(&bp->b_sem));
 685  688                  return (bp);
 686  689          }
 687  690  
 688  691          /*
 689  692           * bio_getfreeblk may block so check the hash chain again.
 690  693           */
 691  694          if (nbp == NULL) {
 692  695                  mutex_exit(hmp);
 693  696                  nbp = bio_getfreeblk(bsize);
 694  697                  mutex_enter(hmp);
 695  698                  goto loop;
 696  699          }
 697  700  
 698  701          /*
 699  702           * New buffer. Assign nbp and stick it on the hash.
 700  703           */
 701  704          nbp->b_flags = B_BUSY;
 702  705          nbp->b_edev = dev;
 703  706          nbp->b_dev = (o_dev_t)cmpdev(dev);
 704  707          nbp->b_blkno = blkno;
 705  708          nbp->b_iodone = NULL;
 706  709          nbp->b_bcount = bsize;
 707  710          /*
 708  711           * If we are given a ufsvfsp and the vfs_root field is NULL
 709  712           * then this must be I/O for a superblock.  A superblock's
 710  713           * buffer is set up in mountfs() and there is no root vnode
 711  714           * at that point.
 712  715           */
 713  716          if (ufsvfsp && ufsvfsp->vfs_root) {
 714  717                  nbp->b_vp = ufsvfsp->vfs_root;
 715  718          } else {
 716  719                  nbp->b_vp = NULL;
 717  720          }
 718  721  
 719  722          ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
 720  723  
 721  724          binshash(nbp, dp);
 722  725          mutex_exit(hmp);
 723  726  
 724  727          ASSERT(SEMA_HELD(&nbp->b_sem));
 725  728  
 726  729          return (nbp);
 727  730  
 728  731  
 729  732          /*
 730  733           * Come here in case of an internal error. At this point we couldn't
 731  734           * get a buffer, but we have to return one. Hence we allocate some
 732  735           * kind of error reply buffer on the fly. This buffer is marked as
 733  736           * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
 734  737           *      - B_ERROR will indicate error to the caller.
 735  738           *      - B_DONE will prevent us from reading the buffer from
 736  739           *        the device.
 737  740           *      - B_NOCACHE will cause that this buffer gets free'd in
 738  741           *        brelse().
 739  742           */
 740  743  
 741  744  errout:
 742  745          errbp = geteblk();
 743  746          sema_p(&errbp->b_sem);
 744  747          errbp->b_flags &= ~B_BUSY;
 745  748          errbp->b_flags |= (B_ERROR | B_DONE);
 746  749          return (errbp);
 747  750  }
 748  751  
 749  752  /*
 750  753   * Get an empty block, not assigned to any particular device.
 751  754   * Returns a locked buffer that is not on any hash or free list.
 752  755   */
 753  756  struct buf *
 754  757  ngeteblk(long bsize)
 755  758  {
 756  759          struct buf *bp;
 757  760  
 758  761          bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
 759  762          bioinit(bp);
 760  763          bp->av_forw = bp->av_back = NULL;
 761  764          bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
 762  765          bp->b_bufsize = bsize;
 763  766          bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
 764  767          bp->b_dev = (o_dev_t)NODEV;
 765  768          bp->b_edev = NODEV;
 766  769          bp->b_lblkno = 0;
 767  770          bp->b_bcount = bsize;
 768  771          bp->b_iodone = NULL;
 769  772          return (bp);
 770  773  }
 771  774  
 772  775  /*
 773  776   * Interface of geteblk() is kept intact to maintain driver compatibility.
 774  777   * Use ngeteblk() to allocate block size other than 1 KB.
 775  778   */
 776  779  struct buf *
 777  780  geteblk(void)
 778  781  {
 779  782          return (ngeteblk((long)1024));
 780  783  }
 781  784  
 782  785  /*
 783  786   * Return a buffer w/o sleeping
 784  787   */
 785  788  struct buf *
 786  789  trygetblk(dev_t dev, daddr_t blkno)
 787  790  {
 788  791          struct buf      *bp;
 789  792          struct buf      *dp;
 790  793          struct hbuf     *hp;
 791  794          kmutex_t        *hmp;
 792  795          uint_t          index;
 793  796  
 794  797          index = bio_bhash(dev, blkno);
 795  798          hp = &hbuf[index];
 796  799          hmp = &hp->b_lock;
 797  800  
 798  801          if (!mutex_tryenter(hmp))
 799  802                  return (NULL);
 800  803  
 801  804          dp = (struct buf *)hp;
 802  805          for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 803  806                  if (bp->b_blkno != blkno || bp->b_edev != dev ||
 804  807                      (bp->b_flags & B_STALE))
 805  808                          continue;
 806  809                  /*
 807  810                   * Get access to a valid buffer without sleeping
 808  811                   */
 809  812                  if (sema_tryp(&bp->b_sem)) {
 810  813                          if (bp->b_flags & B_DONE) {
 811  814                                  hp->b_length--;
 812  815                                  notavail(bp);
 813  816                                  mutex_exit(hmp);
 814  817                                  return (bp);
 815  818                          } else {
 816  819                                  sema_v(&bp->b_sem);
 817  820                                  break;
 818  821                          }
 819  822                  }
 820  823                  break;
 821  824          }
 822  825          mutex_exit(hmp);
 823  826          return (NULL);
 824  827  }
 825  828  
 826  829  /*
 827  830   * Wait for I/O completion on the buffer; return errors
 828  831   * to the user.
 829  832   */
 830  833  int
 831  834  iowait(struct buf *bp)
 832  835  {
 833  836          ASSERT(SEMA_HELD(&bp->b_sem));
 834  837          return (biowait(bp));
 835  838  }
 836  839  
 837  840  /*
 838  841   * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 839  842   * and wake up anyone waiting for it.
 840  843   */
 841  844  void
 842  845  iodone(struct buf *bp)
 843  846  {
 844  847          ASSERT(SEMA_HELD(&bp->b_sem));
 845  848          (void) biodone(bp);
 846  849  }
 847  850  
 848  851  /*
 849  852   * Zero the core associated with a buffer.
 850  853   */
 851  854  void
 852  855  clrbuf(struct buf *bp)
 853  856  {
 854  857          ASSERT(SEMA_HELD(&bp->b_sem));
 855  858          bzero(bp->b_un.b_addr, bp->b_bcount);
 856  859          bp->b_resid = 0;
 857  860  }
 858  861  
 859  862  
 860  863  /*
 861  864   * Make sure all write-behind blocks on dev (or NODEV for all)
 862  865   * are flushed out.
 863  866   */
 864  867  void
 865  868  bflush(dev_t dev)
 866  869  {
 867  870          struct buf *bp, *dp;
 868  871          struct hbuf *hp;
 869  872          struct buf *delwri_list = EMPTY_LIST;
 870  873          int i, index;
 871  874          kmutex_t *hmp;
 872  875  
 873  876          mutex_enter(&blist_lock);
 874  877          /*
 875  878           * Wait for any invalidates or flushes ahead of us to finish.
 876  879           * We really could split blist_lock up per device for better
 877  880           * parallelism here.
 878  881           */
 879  882          while (bio_doinginval || bio_doingflush) {
 880  883                  bio_flinv_cv_wanted = 1;
 881  884                  cv_wait(&bio_flushinval_cv, &blist_lock);
 882  885          }
 883  886          bio_doingflush++;
 884  887          /*
 885  888           * Gather all B_DELWRI buffer for device.
 886  889           * Lock ordering is b_sem > hash lock (brelse).
 887  890           * Since we are finding the buffer via the delayed write list,
 888  891           * it may be busy and we would block trying to get the
 889  892           * b_sem lock while holding hash lock. So transfer all the
 890  893           * candidates on the delwri_list and then drop the hash locks.
 891  894           */
 892  895          for (i = 0; i < v.v_hbuf; i++) {
 893  896                  hmp = &hbuf[i].b_lock;
 894  897                  dp = (struct buf *)&dwbuf[i];
 895  898                  mutex_enter(hmp);
 896  899                  for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
 897  900                          if (dev == NODEV || bp->b_edev == dev) {
 898  901                                  if (bp->b_list == NULL) {
 899  902                                          bp->b_list = delwri_list;
 900  903                                          delwri_list = bp;
 901  904                                  }
 902  905                          }
 903  906                  }
 904  907                  mutex_exit(hmp);
 905  908          }
 906  909          mutex_exit(&blist_lock);
 907  910  
 908  911          /*
 909  912           * Now that the hash locks have been dropped grab the semaphores
 910  913           * and write back all the buffers that have B_DELWRI set.
 911  914           */
 912  915          while (delwri_list != EMPTY_LIST) {
 913  916                  bp = delwri_list;
 914  917  
 915  918                  sema_p(&bp->b_sem);     /* may block */
 916  919                  if ((dev != bp->b_edev && dev != NODEV) ||
 917  920                      (panicstr && bp->b_flags & B_BUSY)) {
 918  921                          sema_v(&bp->b_sem);
 919  922                          delwri_list = bp->b_list;
 920  923                          bp->b_list = NULL;
 921  924                          continue;       /* No longer a candidate */
 922  925                  }
 923  926                  if (bp->b_flags & B_DELWRI) {
 924  927                          index = bio_bhash(bp->b_edev, bp->b_blkno);
 925  928                          hp = &hbuf[index];
 926  929                          hmp = &hp->b_lock;
 927  930                          dp = (struct buf *)hp;
 928  931  
 929  932                          bp->b_flags |= B_ASYNC;
 930  933                          mutex_enter(hmp);
 931  934                          hp->b_length--;
 932  935                          notavail(bp);
 933  936                          mutex_exit(hmp);
 934  937                          if (bp->b_vp == NULL) {         /* !ufs */
 935  938                                  BWRITE(bp);
 936  939                          } else {                        /* ufs */
 937  940                                  UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
 938  941                          }
 939  942                  } else {
 940  943                          sema_v(&bp->b_sem);
 941  944                  }
 942  945                  delwri_list = bp->b_list;
 943  946                  bp->b_list = NULL;
 944  947          }
 945  948          mutex_enter(&blist_lock);
 946  949          bio_doingflush--;
 947  950          if (bio_flinv_cv_wanted) {
 948  951                  bio_flinv_cv_wanted = 0;
 949  952                  cv_broadcast(&bio_flushinval_cv);
 950  953          }
 951  954          mutex_exit(&blist_lock);
 952  955  }
 953  956  
 954  957  /*
 955  958   * Ensure that a specified block is up-to-date on disk.
 956  959   */
 957  960  void
 958  961  blkflush(dev_t dev, daddr_t blkno)
 959  962  {
 960  963          struct buf *bp, *dp;
 961  964          struct hbuf *hp;
 962  965          struct buf *sbp = NULL;
 963  966          uint_t index;
 964  967          kmutex_t *hmp;
 965  968  
 966  969          index = bio_bhash(dev, blkno);
 967  970          hp    = &hbuf[index];
 968  971          dp    = (struct buf *)hp;
 969  972          hmp   = &hp->b_lock;
 970  973  
 971  974          /*
 972  975           * Identify the buffer in the cache belonging to
 973  976           * this device and blkno (if any).
 974  977           */
 975  978          mutex_enter(hmp);
 976  979          for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
 977  980                  if (bp->b_blkno != blkno || bp->b_edev != dev ||
 978  981                      (bp->b_flags & B_STALE))
 979  982                          continue;
 980  983                  sbp = bp;
 981  984                  break;
 982  985          }
 983  986          mutex_exit(hmp);
 984  987          if (sbp == NULL)
 985  988                  return;
 986  989          /*
 987  990           * Now check the buffer we have identified and
 988  991           * make sure it still belongs to the device and is B_DELWRI
 989  992           */
 990  993          sema_p(&sbp->b_sem);
 991  994          if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
 992  995              (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
 993  996                  mutex_enter(hmp);
 994  997                  hp->b_length--;
 995  998                  notavail(sbp);
 996  999                  mutex_exit(hmp);
 997 1000                  /*
 998 1001                   * XXX - There is nothing to guarantee a synchronous
 999 1002                   * write here if the B_ASYNC flag is set.  This needs
1000 1003                   * some investigation.
1001 1004                   */
1002 1005                  if (sbp->b_vp == NULL) {                /* !ufs */
1003 1006                          BWRITE(sbp);    /* synchronous write */
1004 1007                  } else {                                /* ufs */
1005 1008                          UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1006 1009                  }
1007 1010          } else {
1008 1011                  sema_v(&sbp->b_sem);
1009 1012          }
1010 1013  }
1011 1014  
1012 1015  /*
1013 1016   * Same as binval, except can force-invalidate delayed-write buffers
 1014 1017   * (which may not already be flushed because of device errors).  Also
1015 1018   * makes sure that the retry write flag is cleared.
1016 1019   */
1017 1020  int
1018 1021  bfinval(dev_t dev, int force)
1019 1022  {
1020 1023          struct buf *dp;
1021 1024          struct buf *bp;
1022 1025          struct buf *binval_list = EMPTY_LIST;
1023 1026          int i, error = 0;
1024 1027          kmutex_t *hmp;
1025 1028          uint_t index;
1026 1029          struct buf **backp;
1027 1030  
1028 1031          mutex_enter(&blist_lock);
1029 1032          /*
1030 1033           * Wait for any flushes ahead of us to finish, it's ok to
1031 1034           * do invalidates in parallel.
1032 1035           */
1033 1036          while (bio_doingflush) {
1034 1037                  bio_flinv_cv_wanted = 1;
1035 1038                  cv_wait(&bio_flushinval_cv, &blist_lock);
1036 1039          }
1037 1040          bio_doinginval++;
1038 1041  
1039 1042          /* Gather bp's */
1040 1043          for (i = 0; i < v.v_hbuf; i++) {
1041 1044                  dp = (struct buf *)&hbuf[i];
1042 1045                  hmp = &hbuf[i].b_lock;
1043 1046  
1044 1047                  mutex_enter(hmp);
1045 1048                  for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 1049                          if (bp->b_edev == dev) {
1047 1050                                  if (bp->b_list == NULL) {
1048 1051                                          bp->b_list = binval_list;
1049 1052                                          binval_list = bp;
1050 1053                                  }
1051 1054                          }
1052 1055                  }
1053 1056                  mutex_exit(hmp);
1054 1057          }
1055 1058          mutex_exit(&blist_lock);
1056 1059  
1057 1060          /* Invalidate all bp's found */
1058 1061          while (binval_list != EMPTY_LIST) {
1059 1062                  bp = binval_list;
1060 1063  
1061 1064                  sema_p(&bp->b_sem);
1062 1065                  if (bp->b_edev == dev) {
1063 1066                          if (force && (bp->b_flags & B_DELWRI)) {
1064 1067                                  /* clear B_DELWRI, move to non-dw freelist */
1065 1068                                  index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 1069                                  hmp = &hbuf[index].b_lock;
1067 1070                                  dp = (struct buf *)&hbuf[index];
1068 1071                                  mutex_enter(hmp);
1069 1072  
1070 1073                                  /* remove from delayed write freelist */
1071 1074                                  notavail(bp);
1072 1075  
1073 1076                                  /* add to B_AGE side of non-dw freelist */
1074 1077                                  backp = &dp->av_forw;
1075 1078                                  (*backp)->av_back = bp;
1076 1079                                  bp->av_forw = *backp;
1077 1080                                  *backp = bp;
1078 1081                                  bp->av_back = dp;
1079 1082  
1080 1083                                  /*
1081 1084                                   * make sure write retries and busy are cleared
1082 1085                                   */
1083 1086                                  bp->b_flags &=
1084 1087                                      ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 1088                                  mutex_exit(hmp);
1086 1089                          }
1087 1090                          if ((bp->b_flags & B_DELWRI) == 0)
1088 1091                                  bp->b_flags |= B_STALE|B_AGE;
1089 1092                          else
1090 1093                                  error = EIO;
1091 1094                  }
1092 1095                  sema_v(&bp->b_sem);
1093 1096                  binval_list = bp->b_list;
1094 1097                  bp->b_list = NULL;
1095 1098          }
1096 1099          mutex_enter(&blist_lock);
1097 1100          bio_doinginval--;
1098 1101          if (bio_flinv_cv_wanted) {
1099 1102                  cv_broadcast(&bio_flushinval_cv);
1100 1103                  bio_flinv_cv_wanted = 0;
1101 1104          }
1102 1105          mutex_exit(&blist_lock);
1103 1106          return (error);
1104 1107  }
1105 1108  
1106 1109  /*
1107 1110   * If possible, invalidate blocks for a dev on demand
1108 1111   */
1109 1112  void
1110 1113  binval(dev_t dev)
1111 1114  {
1112 1115          (void) bfinval(dev, 0);
1113 1116  }
1114 1117  
1115 1118  /*
1116 1119   * Initialize the buffer I/O system by freeing
1117 1120   * all buffers and setting all device hash buffer lists to empty.
1118 1121   */
1119 1122  void
1120 1123  binit(void)
1121 1124  {
1122 1125          struct buf *bp;
1123 1126          unsigned int i, pct;
1124 1127          ulong_t bio_max_hwm, bio_default_hwm;
1125 1128  
1126 1129          /*
1127 1130           * Maximum/Default values for bufhwm are set to the smallest of:
1128 1131           *      - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 1132           *      - 1/4 of kernel virtual memory
1130 1133           *      - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 1134           * Additionally, in order to allow simple tuning by percentage of
1132 1135           * physical memory, bufhwm_pct is used to calculate the default if
1133 1136           * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1134 1137           *
1135 1138           * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 1139           * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1137 1140           */
1138 1141          bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 1142              btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 1143          bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1141 1144  
1142 1145          pct = BIO_BUF_PERCENT;
1143 1146          if (bufhwm_pct != 0 &&
1144 1147              ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 1148                  pct = BIO_BUF_PERCENT;
1146 1149                  /*
1147 1150                   * Invalid user specified value, emit a warning.
1148 1151                   */
1149 1152                  cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 1153                      range(1..%d). Using %d as default.",
1151 1154                      bufhwm_pct,
1152 1155                      100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1153 1156          }
1154 1157  
1155 1158          bio_default_hwm = MIN(physmem / pct,
1156 1159              btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 1160          bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1158 1161  
1159 1162          if ((v.v_bufhwm = bufhwm) == 0)
1160 1163                  v.v_bufhwm = bio_default_hwm;
1161 1164  
1162 1165          if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 1166                  v.v_bufhwm = (int)bio_max_hwm;
1164 1167                  /*
1165 1168                   * Invalid user specified value, emit a warning.
1166 1169                   */
1167 1170                  cmn_err(CE_WARN,
1168 1171                      "binit: bufhwm(%d) out \
1169 1172                      of range(%d..%lu). Using %lu as default",
1170 1173                      bufhwm,
1171 1174                      BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1172 1175          }
1173 1176  
1174 1177          /*
1175 1178           * Determine the number of hash buckets. Default is to
1176 1179           * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 1180           * Round up number to the next power of 2.
1178 1181           */
1179 1182          v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 1183              BIO_HASHLEN);
1181 1184          v.v_hmask = v.v_hbuf - 1;
1182 1185          v.v_buf = BIO_BHDR_POOL;
1183 1186  
1184 1187          hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1185 1188  
1186 1189          dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1187 1190  
1188 1191          bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 1192          bp = &bfreelist;
1190 1193          bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1191 1194  
1192 1195          for (i = 0; i < v.v_hbuf; i++) {
1193 1196                  hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 1197                  hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1195 1198  
1196 1199                  /*
1197 1200                   * Initialize the delayed write buffer list.
1198 1201                   */
1199 1202                  dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 1203                  dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1201 1204          }
1202 1205  }
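
(A worked example of the sizing arithmetic above, assuming 8 GB of physical
memory with 4 KB pages, i.e. physmem == 2097152, and ignoring the
heap_arena clamp; the numbers are illustrative, not from the webrev.)

        /*
         * The default uses BIO_BUF_PERCENT == 100/2 == 50, i.e. 2%:
         *      bio_default_hwm = (2097152 / 50) * (4096 / 1024)
         *                      = 41943 * 4 = 167772 KB, about 164 MB
         *
         * The cap uses BIO_MAX_PERCENT == 100/20 == 5, i.e. 20%:
         *      bio_max_hwm = (2097152 / 5) * 4 = 1677720 KB, about 1.6 GB
         *
         * Setting bufhwm_pct = 10 in /etc/system yields pct = 100/10 = 10:
         *      (2097152 / 10) * 4 = 838860 KB, about 819 MB
         */
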
1203 1206  
1204 1207  /*
1205 1208   * Wait for I/O completion on the buffer; return error code.
1206 1209   * If bp was for synchronous I/O, bp is invalid and associated
1207 1210   * resources are freed on return.
1208 1211   */
1209 1212  int
1210 1213  biowait(struct buf *bp)
1211 1214  {
1212 1215          int error = 0;
1213 1216          struct cpu *cpup;
1214 1217  
1215 1218          ASSERT(SEMA_HELD(&bp->b_sem));
1216 1219  
1217 1220          cpup = CPU;
1218 1221          atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219 1222          DTRACE_IO1(wait__start, struct buf *, bp);
1220 1223  
1221 1224          /*
1222 1225           * In case of panic, busy wait for completion
1223 1226           */
1224 1227          if (panicstr) {
1225 1228                  while ((bp->b_flags & B_DONE) == 0)
1226 1229                          drv_usecwait(10);
1227 1230          } else
1228 1231                  sema_p(&bp->b_io);
1229 1232  
1230 1233          DTRACE_IO1(wait__done, struct buf *, bp);
1231 1234          atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1232 1235  
1233 1236          error = geterror(bp);
1234 1237          if ((bp->b_flags & B_ASYNC) == 0) {
1235 1238                  if (bp->b_flags & B_REMAPPED)
1236 1239                          bp_mapout(bp);
1237 1240          }
1238 1241          return (error);
1239 1242  }
1240 1243  
1241 1244  static void
1242 1245  biodone_tnf_probe(struct buf *bp)
1243 1246  {
1244 1247          /* Kernel probe */
1245 1248          TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 1249              tnf_device,         device,         bp->b_edev,
1247 1250              tnf_diskaddr,       block,          bp->b_lblkno,
1248 1251              tnf_opaque,         buf,            bp);
1249 1252  }
1250 1253  
1251 1254  /*
1252 1255   * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253 1256   * and wake up anyone waiting for it.
1254 1257   */
1255 1258  void
1256 1259  biodone(struct buf *bp)
1257 1260  {
1258 1261          if (bp->b_flags & B_STARTED) {
1259 1262                  DTRACE_IO1(done, struct buf *, bp);
1260 1263                  bp->b_flags &= ~B_STARTED;
1261 1264          }
1262 1265  
1263 1266          /*
1264 1267           * Call the TNF probe here instead of the inline code
1265 1268           * to force our compiler to use the tail call optimization.
1266 1269           */
1267 1270          biodone_tnf_probe(bp);
1268 1271  
1269 1272          if (bp->b_iodone != NULL) {
1270 1273                  (*(bp->b_iodone))(bp);
1271 1274                  return;
1272 1275          }
1273 1276          ASSERT((bp->b_flags & B_DONE) == 0);
1274 1277          ASSERT(SEMA_HELD(&bp->b_sem));
1275 1278          bp->b_flags |= B_DONE;
1276 1279          if (bp->b_flags & B_ASYNC) {
1277 1280                  if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 1281                          bio_pageio_done(bp);
1279 1282                  else
1280 1283                          brelse(bp);     /* release bp to freelist */
1281 1284          } else {
1282 1285                  sema_v(&bp->b_io);
1283 1286          }
1284 1287  }
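
On the producing side, a driver's completion path (interrupt handler or soft callback) records any failure with bioerror(9F) and then calls biodone(); B_ASYNC requests are released here, while synchronous ones wake their biowait(). A hedged sketch, with a hypothetical per-command structure:

        /* Hypothetical per-command state a driver might keep. */
        struct xx_cmd {
                struct buf      *xc_bp;         /* the request being serviced */
                int             xc_hw_err;      /* nonzero on hardware failure */
        };

        /*
         * Hedged sketch of a completion path: flag errors, then hand the
         * buf back through biodone().
         */
        static void
        xx_cmd_done(struct xx_cmd *xc)
        {
                struct buf *bp = xc->xc_bp;

                if (xc->xc_hw_err) {
                        bp->b_resid = bp->b_bcount;     /* nothing transferred */
                        bioerror(bp, EIO);              /* B_ERROR + b_error */
                }
                biodone(bp);
        }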
1285 1288  
1286 1289  /*
1287 1290   * Pick up the device's error number and pass it to the user;
1288 1291   * if there is an error but the number is 0 set a generalized code.
1289 1292   */
1290 1293  int
1291 1294  geterror(struct buf *bp)
1292 1295  {
1293 1296          int error = 0;
1294 1297  
1295 1298          ASSERT(SEMA_HELD(&bp->b_sem));
1296 1299          if (bp->b_flags & B_ERROR) {
1297 1300                  error = bp->b_error;
1298 1301                  if (!error)
1299 1302                          error = EIO;
1300 1303          }
1301 1304          return (error);
1302 1305  }
1303 1306  
1304 1307  /*
1305 1308   * Support for pageio buffers.
1306 1309   *
1307 1310   * This should be generalized into a generic bp header facility
1308 1311   * that can be used for things other than pageio.
1309 1312   */
1310 1313  
1311 1314  /*
1312 1315   * Allocate and initialize a buf struct for use with pageio.
1313 1316   */
1314 1317  struct buf *
1315 1318  pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1316 1319  {
1317 1320          struct buf *bp;
1318 1321          struct cpu *cpup;
1319 1322  
1320 1323          if (flags & B_READ) {
1321 1324                  CPU_STATS_ENTER_K();
1322 1325                  cpup = CPU;     /* get pointer AFTER preemption is disabled */
1323 1326                  CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 1327                  CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1325 1328  
1326 1329                  atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1327 1330  
1328 1331                  if ((flags & B_ASYNC) == 0) {
1329 1332                          klwp_t *lwp = ttolwp(curthread);
1330 1333                          if (lwp != NULL)
1331 1334                                  lwp->lwp_ru.majflt++;
1332 1335                          CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333 1336                          /* Kernel probe */
1334 1337                          TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335 1338                              tnf_opaque,         vnode,          pp->p_vnode,
1336 1339                              tnf_offset,         offset,         pp->p_offset);
1337 1340                  }
1338 1341                  /*
1339 1342                   * Update statistics for pages being paged in
1340 1343                   */
1341 1344                  if (pp != NULL && pp->p_vnode != NULL) {
1342 1345                          if (IS_SWAPFSVP(pp->p_vnode)) {
1343 1346                                  CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344 1347                                  atomic_add_64(&curzone->zone_anonpgin,
1345 1348                                      btopr(len));
1346 1349                          } else {
1347 1350                                  if (pp->p_vnode->v_flag & VVMEXEC) {
1348 1351                                          CPU_STATS_ADDQ(cpup, vm, execpgin,
1349 1352                                              btopr(len));
1350 1353                                          atomic_add_64(&curzone->zone_execpgin,
1351 1354                                              btopr(len));
1352 1355                                  } else {
1353 1356                                          CPU_STATS_ADDQ(cpup, vm, fspgin,
1354 1357                                              btopr(len));
1355 1358                                          atomic_add_64(&curzone->zone_fspgin,
1356 1359                                              btopr(len));
1357 1360                                  }
1358 1361                          }
1359 1362                  }
1360 1363                  CPU_STATS_EXIT_K();
1361 1364                  TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362 1365                      "page_ws_in:pp %p", pp);
1363 1366                  /* Kernel probe */
1364 1367                  TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365 1368                      tnf_opaque, vnode,  pp->p_vnode,
1366 1369                      tnf_offset, offset, pp->p_offset,
1367 1370                      tnf_size,   size,   len);
1368 1371          }
1369 1372  
1370 1373          bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371 1374          bp->b_bcount = len;
1372 1375          bp->b_bufsize = len;
1373 1376          bp->b_pages = pp;
1374 1377          bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375 1378          bp->b_offset = -1;
1376 1379          sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1377 1380  
1378 1381          /* Initialize bp->b_sem in "locked" state */
1379 1382          sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1380 1383  
1381 1384          VN_HOLD(vp);
1382 1385          bp->b_vp = vp;
1383 1386          THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1384 1387  
1385 1388          /*
1386 1389           * Caller sets dev & blkno and can adjust
1387 1390           * b_addr for page offset and can use bp_mapin
1388 1391           * to make pages kernel addressable.
1389 1392           */
1390 1393          return (bp);
1391 1394  }
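
As the comment above notes, the caller finishes the setup: it fills in the device and block number (and may bp_mapin(9F) the pages) before issuing the I/O. A hedged sketch of a synchronous pagein built on pageio_setup()/pageio_done(), assuming the page list is already locked and eliding error paths:

        /*
         * Hedged sketch: synchronous pagein of `len' bytes backed by the
         * locked page list `pp'.
         */
        static int
        xx_pagein(struct page *pp, size_t len, struct vnode *vp,
            dev_t dev, daddr_t blkno)
        {
                struct buf *bp;
                int error;

                bp = pageio_setup(pp, len, vp, B_READ); /* B_PAGEIO|B_BUSY set */
                bp->b_edev = dev;
                bp->b_dev = cmpdev(dev);                /* old-style device no. */
                bp->b_lblkno = blkno;

                (void) bdev_strategy(bp);
                error = biowait(bp);            /* sync: bp remains valid here */

                pageio_done(bp);                /* drop vnode hold, free the buf */
                return (error);
        }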
1392 1395  
1393 1396  void
1394 1397  pageio_done(struct buf *bp)
1395 1398  {
1396 1399          ASSERT(SEMA_HELD(&bp->b_sem));
1397 1400          if (bp->b_flags & B_REMAPPED)
1398 1401                  bp_mapout(bp);
1399 1402          VN_RELE(bp->b_vp);
1400 1403          bp->b_vp = NULL;
1401 1404          ASSERT((bp->b_flags & B_NOCACHE) != 0);
1402 1405  
1403 1406          /* A sema_v(bp->b_sem) is implied if we are destroying it */
1404 1407          sema_destroy(&bp->b_sem);
1405 1408          sema_destroy(&bp->b_io);
1406 1409          kmem_free(bp, sizeof (struct buf));
1407 1410  }
1408 1411  
1409 1412  /*
1410 1413   * Check to see whether the buffers, except the one pointed by sbp,
1411 1414   * associated with the device are busy.
1412 1415   * NOTE: This expensive operation should be improved together with ufs_icheck().
1413 1416   */
1414 1417  int
1415 1418  bcheck(dev_t dev, struct buf *sbp)
1416 1419  {
1417 1420          struct buf      *bp;
1418 1421          struct buf      *dp;
1419 1422          int i;
1420 1423          kmutex_t *hmp;
1421 1424  
1422 1425          /*
1423 1426           * check for busy bufs for this filesystem
1424 1427           */
1425 1428          for (i = 0; i < v.v_hbuf; i++) {
1426 1429                  dp = (struct buf *)&hbuf[i];
1427 1430                  hmp = &hbuf[i].b_lock;
1428 1431  
1429 1432                  mutex_enter(hmp);
1430 1433                  for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1431 1434                          /*
1432 1435                           * if buf is busy or dirty, then filesystem is busy
1433 1436                           */
1434 1437                          if ((bp->b_edev == dev) &&
1435 1438                              ((bp->b_flags & B_STALE) == 0) &&
1436 1439                              (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1437 1440                              (bp != sbp)) {
1438 1441                                  mutex_exit(hmp);
1439 1442                                  return (1);
1440 1443                          }
1441 1444                  }
1442 1445                  mutex_exit(hmp);
1443 1446          }
1444 1447          return (0);
1445 1448  }
1446 1449  
1447 1450  /*
1448 1451   * Hash two 32 bit entities.
1449 1452   */
1450 1453  int
1451 1454  hash2ints(int x, int y)
1452 1455  {
1453 1456          int hash = 0;
1454 1457  
1455 1458          hash = x - 1;
1456 1459          hash = ((hash * 7) + (x >> 8)) - 1;
1457 1460          hash = ((hash * 7) + (x >> 16)) - 1;
1458 1461          hash = ((hash * 7) + (x >> 24)) - 1;
1459 1462          hash = ((hash * 7) + y) - 1;
1460 1463          hash = ((hash * 7) + (y >> 8)) - 1;
1461 1464          hash = ((hash * 7) + (y >> 16)) - 1;
1462 1465          hash = ((hash * 7) + (y >> 24)) - 1;
1463 1466  
1464 1467          return (hash);
1465 1468  }
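
Each step folds another byte of x or y into a small multiplicative hash; callers typically mask or mod the result down to a table index. A hedged usage sketch (the table size and names are hypothetical):

        /*
         * Hedged sketch: reduce hash2ints() to an index into a
         * hypothetical power-of-two hash table.
         */
        #define XX_NHASH        256     /* hypothetical size, power of two */

        static uint_t
        xx_hash_index(dev_t dev, daddr_t blkno)
        {
                return ((uint_t)hash2ints((int)dev, (int)blkno) &
                    (XX_NHASH - 1));
        }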
1466 1469  
1467 1470  
1468 1471  /*
1469 1472   * Return a new buffer struct.
1470 1473   *      Create a new buffer if we haven't gone over our high water
1471 1474   *      mark for memory, otherwise try to get one off the freelist.
1472 1475   *
1473 1476   * Returns a locked buf that has no id and is not on any hash or free
1474 1477   * list.
1475 1478   */
1476 1479  static struct buf *
1477 1480  bio_getfreeblk(long bsize)
1478 1481  {
1479 1482          struct buf *bp, *dp;
1480 1483          struct hbuf *hp;
1481 1484          kmutex_t        *hmp;
1482 1485          uint_t          start, end;
1483 1486  
1484 1487          /*
1485 1488           * bfreelist.b_bufsize represents the amount of memory we are
1486 1489           * allowed to allocate in the cache before we hit our hwm.
1487 1490           * References to bfreelist are protected by bfree_lock
1488 1491           * (mutex_enter/mutex_exit around each access).
1489 1492           */
1490 1493          bio_mem_get(bsize);     /* Account for our memory request */
1491 1494  
1492 1495  again:
1493 1496          bp = bio_bhdr_alloc();  /* Get a buf hdr */
1494 1497          sema_p(&bp->b_sem);     /* Should never fail */
1495 1498  
1496 1499          ASSERT(bp->b_un.b_addr == NULL);
1497 1500          bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1498 1501          if (bp->b_un.b_addr != NULL) {
1499 1502                  /*
1500 1503                   * Make the common path short
1501 1504                   */
1502 1505                  bp->b_bufsize = bsize;
1503 1506                  ASSERT(SEMA_HELD(&bp->b_sem));
1504 1507                  return (bp);
1505 1508          } else {
1506 1509                  struct buf *save;
1507 1510  
1508 1511                  save = bp;      /* Save bp we allocated */
1509 1512                  start = end = lastindex;
1510 1513  
1511 1514                  biostats.bio_bufwant.value.ui32++;
1512 1515  
1513 1516                  /*
1514 1517                   * Memory isn't available from the system now. Scan
1515 1518                   * the hash buckets until enough space is found.
1516 1519                   */
1517 1520                  do {
1518 1521                          hp = &hbuf[start];
1519 1522                          hmp = &hp->b_lock;
1520 1523                          dp = (struct buf *)hp;
1521 1524  
1522 1525                          mutex_enter(hmp);
1523 1526                          bp = dp->av_forw;
1524 1527  
1525 1528                          while (bp != dp) {
1526 1529  
1527 1530                                  ASSERT(bp != NULL);
1528 1531  
1529 1532                                  if (!sema_tryp(&bp->b_sem)) {
1530 1533                                          bp = bp->av_forw;
1531 1534                                          continue;
1532 1535                                  }
1533 1536  
1534 1537                                  /*
1535 1538                                   * Since we are going down the freelist
1536 1539                                   * associated with this hash bucket the
1537 1540                                   * B_DELWRI flag should not be set.
1538 1541                                   */
1539 1542                                  ASSERT(!(bp->b_flags & B_DELWRI));
1540 1543  
1541 1544                                  if (bp->b_bufsize == bsize) {
1542 1545                                          hp->b_length--;
1543 1546                                          notavail(bp);
1544 1547                                          bremhash(bp);
1545 1548                                          mutex_exit(hmp);
1546 1549  
1547 1550                                          /*
1548 1551                                           * Didn't kmem_alloc any more, so don't
1549 1552                                           * count it twice.
1550 1553                                           */
1551 1554                                          mutex_enter(&bfree_lock);
1552 1555                                          bfreelist.b_bufsize += bsize;
1553 1556                                          mutex_exit(&bfree_lock);
1554 1557  
1555 1558                                          /*
1556 1559                                           * Update the lastindex value.
1557 1560                                           */
1558 1561                                          lastindex = start;
1559 1562  
1560 1563                                          /*
1561 1564                                           * Put our saved bp back on the list
1562 1565                                           */
1563 1566                                          sema_v(&save->b_sem);
1564 1567                                          bio_bhdr_free(save);
1565 1568                                          ASSERT(SEMA_HELD(&bp->b_sem));
1566 1569                                          return (bp);
1567 1570                                  }
1568 1571                                  sema_v(&bp->b_sem);
1569 1572                                  bp = bp->av_forw;
1570 1573                          }
1571 1574                          mutex_exit(hmp);
1572 1575                          start = ((start + 1) % v.v_hbuf);
1573 1576                  } while (start != end);
1574 1577  
1575 1578                  biostats.bio_bufwait.value.ui32++;
1576 1579                  bp = save;              /* Use original bp */
1577 1580                  bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1578 1581          }
1579 1582  
1580 1583          bp->b_bufsize = bsize;
1581 1584          ASSERT(SEMA_HELD(&bp->b_sem));
1582 1585          return (bp);
1583 1586  }
1584 1587  
1585 1588  /*
1586 1589   * Allocate a buffer header. If none currently available, allocate
1587 1590   * a new pool.
1588 1591   */
1589 1592  static struct buf *
1590 1593  bio_bhdr_alloc(void)
1591 1594  {
1592 1595          struct buf *dp, *sdp;
1593 1596          struct buf *bp;
1594 1597          int i;
1595 1598  
1596 1599          for (;;) {
1597 1600                  mutex_enter(&bhdr_lock);
1598 1601                  if (bhdrlist != NULL) {
1599 1602                          bp = bhdrlist;
1600 1603                          bhdrlist = bp->av_forw;
1601 1604                          mutex_exit(&bhdr_lock);
1602 1605                          bp->av_forw = NULL;
1603 1606                          return (bp);
1604 1607                  }
1605 1608                  mutex_exit(&bhdr_lock);
1606 1609  
1607 1610                  /*
1608 1611                   * Need to allocate a new pool. If the system is currently
1609 1612                   * out of memory, then try freeing things on the freelist.
1610 1613                   */
1611 1614                  dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1612 1615                  if (dp == NULL) {
1613 1616                          /*
1614 1617                           * System can't give us a pool of headers, try
1615 1618                           * recycling from the free lists.
1616 1619                           */
1617 1620                          bio_recycle(BIO_HEADER, 0);
1618 1621                  } else {
1619 1622                          sdp = dp;
1620 1623                          for (i = 0; i < v.v_buf; i++, dp++) {
1621 1624                                  /*
1622 1625                                   * The next two lines are needed since NODEV
1623 1626                                   * is -1 and not NULL
1624 1627                                   */
1625 1628                                  dp->b_dev = (o_dev_t)NODEV;
1626 1629                                  dp->b_edev = NODEV;
1627 1630                                  dp->av_forw = dp + 1;
1628 1631                                  sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1629 1632                                      NULL);
1630 1633                                  sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1631 1634                                      NULL);
1632 1635                                  dp->b_offset = -1;
1633 1636                          }
1634 1637                          mutex_enter(&bhdr_lock);
1635 1638                          (--dp)->av_forw = bhdrlist;     /* Fix last pointer */
1636 1639                          bhdrlist = sdp;
1637 1640                          nbuf += v.v_buf;
1638 1641                          bp = bhdrlist;
1639 1642                          bhdrlist = bp->av_forw;
1640 1643                          mutex_exit(&bhdr_lock);
1641 1644  
1642 1645                          bp->av_forw = NULL;
1643 1646                          return (bp);
1644 1647                  }
1645 1648          }
1646 1649  }
1647 1650  
1648 1651  static  void
1649 1652  bio_bhdr_free(struct buf *bp)
1650 1653  {
1651 1654          ASSERT(bp->b_back == NULL);
1652 1655          ASSERT(bp->b_forw == NULL);
1653 1656          ASSERT(bp->av_back == NULL);
1654 1657          ASSERT(bp->av_forw == NULL);
1655 1658          ASSERT(bp->b_un.b_addr == NULL);
1656 1659          ASSERT(bp->b_dev == (o_dev_t)NODEV);
1657 1660          ASSERT(bp->b_edev == NODEV);
1658 1661          ASSERT(bp->b_flags == 0);
1659 1662  
1660 1663          mutex_enter(&bhdr_lock);
1661 1664          bp->av_forw = bhdrlist;
1662 1665          bhdrlist = bp;
1663 1666          mutex_exit(&bhdr_lock);
1664 1667  }
1665 1668  
1666 1669  /*
1667 1670   * If we haven't gone over the high water mark, it's o.k. to
1668 1671   * allocate more buffer space, otherwise recycle buffers
1669 1672   * from the freelist until enough memory is free for a bsize request.
1670 1673   *
1671 1674   * We account for this memory, even though
1672 1675   * we don't allocate it here.
1673 1676   */
1674 1677  static void
1675 1678  bio_mem_get(long bsize)
1676 1679  {
1677 1680          mutex_enter(&bfree_lock);
1678 1681          if (bfreelist.b_bufsize > bsize) {
1679 1682                  bfreelist.b_bufsize -= bsize;
1680 1683                  mutex_exit(&bfree_lock);
1681 1684                  return;
1682 1685          }
1683 1686          mutex_exit(&bfree_lock);
1684 1687          bio_recycle(BIO_MEM, bsize);
1685 1688  }
1686 1689  
1687 1690  /*
1688 1691   * Flush a list of delayed write buffers
1689 1692   * (currently used only by bio_recycle below).
1690 1693   */
1691 1694  static void
1692 1695  bio_flushlist(struct buf *delwri_list)
1693 1696  {
1694 1697          struct buf *bp;
1695 1698  
1696 1699          while (delwri_list != EMPTY_LIST) {
1697 1700                  bp = delwri_list;
1698 1701                  bp->b_flags |= B_AGE | B_ASYNC;
1699 1702                  if (bp->b_vp == NULL) {         /* !ufs */
1700 1703                          BWRITE(bp);
1701 1704                  } else {                        /* ufs */
1702 1705                          UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1703 1706                  }
1704 1707                  delwri_list = bp->b_list;
1705 1708                  bp->b_list = NULL;
1706 1709          }
1707 1710  }
1708 1711  
1709 1712  /*
1710 1713   * Start recycling buffers on the freelist for one of 2 reasons:
1711 1714   *      - we need a buffer header
1712 1715   *      - we need to free up memory
1713 1716   * Once started we continue to recycle buffers until the B_AGE
1714 1717   * buffers are gone.
1715 1718   */
1716 1719  static void
1717 1720  bio_recycle(int want, long bsize)
1718 1721  {
1719 1722          struct buf *bp, *dp, *dwp, *nbp;
1720 1723          struct hbuf *hp;
1721 1724          int     found = 0;
1722 1725          kmutex_t        *hmp;
1723 1726          int             start, end;
1724 1727          struct buf *delwri_list = EMPTY_LIST;
1725 1728  
1726 1729          /*
1727 1730           * Recycle buffers.
1728 1731           */
1729 1732  top:
1730 1733          start = end = lastindex;
1731 1734          do {
1732 1735                  hp = &hbuf[start];
1733 1736                  hmp = &hp->b_lock;
1734 1737                  dp = (struct buf *)hp;
1735 1738  
1736 1739                  mutex_enter(hmp);
1737 1740                  bp = dp->av_forw;
1738 1741  
1739 1742                  while (bp != dp) {
1740 1743  
1741 1744                          ASSERT(bp != NULL);
1742 1745  
1743 1746                          if (!sema_tryp(&bp->b_sem)) {
1744 1747                                  bp = bp->av_forw;
1745 1748                                  continue;
1746 1749                          }
1747 1750                          /*
1748 1751                           * Do we really want to nuke all of the B_AGE stuff??
1749 1752                           */
1750 1753                          if ((bp->b_flags & B_AGE) == 0 && found) {
1751 1754                                  sema_v(&bp->b_sem);
1752 1755                                  mutex_exit(hmp);
1753 1756                                  lastindex = start;
1754 1757                                  return; /* All done */
1755 1758                          }
1756 1759  
1757 1760                          ASSERT(MUTEX_HELD(&hp->b_lock));
1758 1761                          ASSERT(!(bp->b_flags & B_DELWRI));
1759 1762                          hp->b_length--;
1760 1763                          notavail(bp);
1761 1764  
1762 1765                          /*
1763 1766                           * Remove bhdr from cache, free up memory,
1764 1767                           * and add the hdr to the freelist.
1765 1768                           */
1766 1769                          bremhash(bp);
1767 1770                          mutex_exit(hmp);
1768 1771  
1769 1772                          if (bp->b_bufsize) {
1770 1773                                  kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1771 1774                                  bp->b_un.b_addr = NULL;
1772 1775                                  mutex_enter(&bfree_lock);
1773 1776                                  bfreelist.b_bufsize += bp->b_bufsize;
1774 1777                                  mutex_exit(&bfree_lock);
1775 1778                          }
1776 1779  
1777 1780                          bp->b_dev = (o_dev_t)NODEV;
1778 1781                          bp->b_edev = NODEV;
1779 1782                          bp->b_flags = 0;
1780 1783                          sema_v(&bp->b_sem);
1781 1784                          bio_bhdr_free(bp);
1782 1785                          if (want == BIO_HEADER) {
1783 1786                                  found = 1;
1784 1787                          } else {
1785 1788                                  ASSERT(want == BIO_MEM);
1786 1789                                  if (!found && bfreelist.b_bufsize >= bsize) {
1787 1790                                          /* Account for the memory we want */
1788 1791                                          mutex_enter(&bfree_lock);
1789 1792                                          if (bfreelist.b_bufsize >= bsize) {
1790 1793                                                  bfreelist.b_bufsize -= bsize;
1791 1794                                                  found = 1;
1792 1795                                          }
1793 1796                                          mutex_exit(&bfree_lock);
1794 1797                                  }
1795 1798                          }
1796 1799  
1797 1800                          /*
1798 1801                           * Since we dropped hmp, start from the
1799 1802                           * beginning.
1800 1803                           */
1801 1804                          mutex_enter(hmp);
1802 1805                          bp = dp->av_forw;
1803 1806                  }
1804 1807                  mutex_exit(hmp);
1805 1808  
1806 1809                  /*
1807 1810                   * Look at the delayed write list.
1808 1811                   * First gather into a private list, then write them.
1809 1812                   */
1810 1813                  dwp = (struct buf *)&dwbuf[start];
1811 1814                  mutex_enter(&blist_lock);
1812 1815                  bio_doingflush++;
1813 1816                  mutex_enter(hmp);
1814 1817                  for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1815 1818  
1816 1819                          ASSERT(bp != NULL);
1817 1820                          nbp = bp->av_forw;
1818 1821  
1819 1822                          if (!sema_tryp(&bp->b_sem))
1820 1823                                  continue;
1821 1824                          ASSERT(bp->b_flags & B_DELWRI);
1822 1825                          /*
1823 1826                           * Do we really want to nuke all of the B_AGE stuff??
1824 1827                           */
1825 1828  
1826 1829                          if ((bp->b_flags & B_AGE) == 0 && found) {
1827 1830                                  sema_v(&bp->b_sem);
1828 1831                                  mutex_exit(hmp);
1829 1832                                  lastindex = start;
1830 1833                                  mutex_exit(&blist_lock);
1831 1834                                  bio_flushlist(delwri_list);
1832 1835                                  mutex_enter(&blist_lock);
1833 1836                                  bio_doingflush--;
1834 1837                                  if (bio_flinv_cv_wanted) {
1835 1838                                          bio_flinv_cv_wanted = 0;
1836 1839                                          cv_broadcast(&bio_flushinval_cv);
1837 1840                                  }
1838 1841                                  mutex_exit(&blist_lock);
1839 1842                                  return; /* All done */
1840 1843                          }
1841 1844  
1842 1845                          /*
1843 1846                           * If the buffer is already on a flush or
1844 1847                           * invalidate list then just skip it.
1845 1848                           */
1846 1849                          if (bp->b_list != NULL) {
1847 1850                                  sema_v(&bp->b_sem);
1848 1851                                  continue;
1849 1852                          }
1850 1853                          /*
1851 1854                           * We are still on the same bucket.
1852 1855                           */
1853 1856                          hp->b_length--;
1854 1857                          notavail(bp);
1855 1858                          bp->b_list = delwri_list;
1856 1859                          delwri_list = bp;
1857 1860                  }
1858 1861                  mutex_exit(hmp);
1859 1862                  mutex_exit(&blist_lock);
1860 1863                  bio_flushlist(delwri_list);
1861 1864                  delwri_list = EMPTY_LIST;
1862 1865                  mutex_enter(&blist_lock);
1863 1866                  bio_doingflush--;
1864 1867                  if (bio_flinv_cv_wanted) {
1865 1868                          bio_flinv_cv_wanted = 0;
1866 1869                          cv_broadcast(&bio_flushinval_cv);
1867 1870                  }
1868 1871                  mutex_exit(&blist_lock);
1869 1872                  start = (start + 1) % v.v_hbuf;
1870 1873  
1871 1874          } while (start != end);
1872 1875  
1873 1876          if (found)
1874 1877                  return;
1875 1878  
1876 1879          /*
1877 1880           * Free lists exhausted and we haven't satisfied the request.
1878 1881           * Wait here for more entries to be added to freelist.
1879 1882           * Because this might have just happened, make it timed.
1880 1883           */
1881 1884          mutex_enter(&bfree_lock);
1882 1885          bfreelist.b_flags |= B_WANTED;
1883 1886          (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1884 1887          mutex_exit(&bfree_lock);
1885 1888          goto top;
1886 1889  }
1887 1890  
1888 1891  /*
1889 1892   * See if the block is associated with some buffer
1890 1893   * (mainly to avoid getting hung up on a wait in breada).
1891 1894   */
1892 1895  static int
1893 1896  bio_incore(dev_t dev, daddr_t blkno)
1894 1897  {
1895 1898          struct buf *bp;
1896 1899          struct buf *dp;
1897 1900          uint_t index;
1898 1901          kmutex_t *hmp;
1899 1902  
1900 1903          index = bio_bhash(dev, blkno);
1901 1904          dp = (struct buf *)&hbuf[index];
1902 1905          hmp = &hbuf[index].b_lock;
1903 1906  
1904 1907          mutex_enter(hmp);
1905 1908          for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1906 1909                  if (bp->b_blkno == blkno && bp->b_edev == dev &&
1907 1910                      (bp->b_flags & B_STALE) == 0) {
1908 1911                          mutex_exit(hmp);
1909 1912                          return (1);
1910 1913                  }
1911 1914          }
1912 1915          mutex_exit(hmp);
1913 1916          return (0);
1914 1917  }
1915 1918  
1916 1919  static void
1917 1920  bio_pageio_done(struct buf *bp)
1918 1921  {
1919 1922          if (bp->b_flags & B_PAGEIO) {
1920 1923  
1921 1924                  if (bp->b_flags & B_REMAPPED)
1922 1925                          bp_mapout(bp);
1923 1926  
1924 1927                  if (bp->b_flags & B_READ)
1925 1928                          pvn_read_done(bp->b_pages, bp->b_flags);
1926 1929                  else
1927 1930                          pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1928 1931                  pageio_done(bp);
1929 1932          } else {
1930 1933                  ASSERT(bp->b_flags & B_REMAPPED);
1931 1934                  bp_mapout(bp);
1932 1935                  brelse(bp);
1933 1936          }
1934 1937  }
1935 1938  
1936 1939  /*
1937 1940   * bioerror(9F) - indicate error in buffer header
1938 1941   * If 'error' is zero, remove the error indication.
1939 1942   */
1940 1943  void
1941 1944  bioerror(struct buf *bp, int error)
1942 1945  {
1943 1946          ASSERT(bp != NULL);
1944 1947          ASSERT(error >= 0);
1945 1948          ASSERT(SEMA_HELD(&bp->b_sem));
1946 1949  
1947 1950          if (error != 0) {
1948 1951                  bp->b_flags |= B_ERROR;
1949 1952          } else {
1950 1953                  bp->b_flags &= ~B_ERROR;
1951 1954          }
1952 1955          bp->b_error = error;
1953 1956  }
1954 1957  
1955 1958  /*
1956 1959   * bioreset(9F) - reuse a private buffer header after I/O is complete
1957 1960   */
1958 1961  void
1959 1962  bioreset(struct buf *bp)
1960 1963  {
1961 1964          ASSERT(bp != NULL);
1962 1965  
1963 1966          biofini(bp);
1964 1967          bioinit(bp);
1965 1968  }
1966 1969  
1967 1970  /*
1968 1971   * biosize(9F) - return size of a buffer header
1969 1972   */
1970 1973  size_t
1971 1974  biosize(void)
1972 1975  {
1973 1976          return (sizeof (struct buf));
1974 1977  }
1975 1978  
1976 1979  /*
1977 1980   * biomodified(9F) - check if buffer is modified
1978 1981   */
1979 1982  int
1980 1983  biomodified(struct buf *bp)
1981 1984  {
1982 1985          int npf;
1983 1986          int ppattr;
1984 1987          struct page *pp;
1985 1988  
1986 1989          ASSERT(bp != NULL);
1987 1990  
1988 1991          if ((bp->b_flags & B_PAGEIO) == 0) {
1989 1992                  return (-1);
1990 1993          }
1991 1994          pp = bp->b_pages;
1992 1995          npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1993 1996  
1994 1997          while (npf > 0) {
1995 1998                  ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1996 1999                      HAT_SYNC_STOPON_MOD);
1997 2000                  if (ppattr & P_MOD)
1998 2001                          return (1);
1999 2002                  pp = pp->p_next;
2000 2003                  npf--;
2001 2004          }
2002 2005  
2003 2006          return (0);
2004 2007  }
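
A driver can use this to short-circuit writes whose pages were never dirtied; note the -1 return for buffers that are not page I/O, which must still be issued. A hedged sketch with a hypothetical hardware-submit routine:

        extern void xx_submit_to_hw(struct buf *);      /* hypothetical */

        /*
         * Hedged sketch: complete clean page-I/O writes immediately and
         * only send modified (or non-pageio) buffers to the hardware.
         */
        static void
        xx_start_write(struct buf *bp)
        {
                if (biomodified(bp) == 0) {
                        bp->b_resid = 0;        /* clean pages: nothing to write */
                        biodone(bp);
                        return;
                }
                xx_submit_to_hw(bp);    /* 1 (modified) or -1 (not B_PAGEIO) */
        }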
2005 2008  
2006 2009  /*
2007 2010   * bioinit(9F) - initialize a buffer structure
2008 2011   */
2009 2012  void
2010 2013  bioinit(struct buf *bp)
2011 2014  {
2012 2015          bzero(bp, sizeof (struct buf));
2013 2016          sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2014 2017          sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2015 2018          bp->b_offset = -1;
2016 2019  }
2017 2020  
2018 2021  /*
2019 2022   * biofini(9F) - uninitialize a buffer structure
2020 2023   */
2021 2024  void
2022 2025  biofini(struct buf *bp)
2023 2026  {
2024 2027          sema_destroy(&bp->b_io);
2025 2028          sema_destroy(&bp->b_sem);
2026 2029  }
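
Together, biosize(), bioinit(), bioreset() and biofini() let a driver manage private buf headers without depending on the structure's layout. A hedged lifecycle sketch:

        /*
         * Hedged sketch of the alloc/init/teardown lifecycle: biosize()
         * sizes the allocation, bioinit() prepares it, bioreset()
         * recycles it between transfers, biofini() precedes the free.
         */
        static struct buf *
        xx_buf_create(void)
        {
                struct buf *bp = kmem_alloc(biosize(), KM_SLEEP);

                bioinit(bp);
                return (bp);
        }

        static void
        xx_buf_destroy(struct buf *bp)
        {
                biofini(bp);            /* destroy b_sem and b_io */
                kmem_free(bp, biosize());
        }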
2027 2030  
2028 2031  /*
2029 2032   * bioclone(9F) - clone a buffer
2030 2033   */
2031 2034  struct buf *
2032 2035  bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2033 2036      int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2034 2037  {
2035 2038          struct buf *bufp;
2036 2039  
2037 2040          ASSERT(bp);
2038 2041          if (bp_mem == NULL) {
2039 2042                  bufp = kmem_alloc(sizeof (struct buf), sleep);
2040 2043                  if (bufp == NULL) {
2041 2044                          return (NULL);
2042 2045                  }
2043 2046                  bioinit(bufp);
2044 2047          } else {
2045 2048                  bufp = bp_mem;
2046 2049                  bioreset(bufp);
2047 2050          }
2048 2051  
2049 2052  #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2050 2053          B_ABRWRITE)
2051 2054  
2052 2055          /*
2053 2056           * The cloned buffer does not inherit the B_REMAPPED flag.
2054 2057           */
2055 2058          bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2056 2059          bufp->b_bcount = len;
2057 2060          bufp->b_blkno = blkno;
2058 2061          bufp->b_iodone = iodone;
2059 2062          bufp->b_proc = bp->b_proc;
2060 2063          bufp->b_edev = dev;
2061 2064          bufp->b_file = bp->b_file;
2062 2065          bufp->b_offset = bp->b_offset;
2063 2066  
2064 2067          if (bp->b_flags & B_SHADOW) {
2065 2068                  ASSERT(bp->b_shadow);
2066 2069                  ASSERT(bp->b_flags & B_PHYS);
2067 2070  
2068 2071                  bufp->b_shadow = bp->b_shadow +
2069 2072                      btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2070 2073                  bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2071 2074                  if (bp->b_flags & B_REMAPPED)
2072 2075                          bufp->b_proc = NULL;
2073 2076          } else {
2074 2077                  if (bp->b_flags & B_PAGEIO) {
2075 2078                          struct page *pp;
2076 2079                          off_t o;
2077 2080                          int i;
2078 2081  
2079 2082                          pp = bp->b_pages;
2080 2083                          o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2081 2084                          for (i = btop(o); i > 0; i--) {
2082 2085                                  pp = pp->p_next;
2083 2086                          }
2084 2087                          bufp->b_pages = pp;
2085 2088                          bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2086 2089                  } else {
2087 2090                          bufp->b_un.b_addr =
2088 2091                              (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2089 2092                          if (bp->b_flags & B_REMAPPED)
2090 2093                                  bufp->b_proc = NULL;
2091 2094                  }
2092 2095          }
2093 2096          return (bufp);
2094 2097  }
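
bioclone() is typically used by layered drivers to carve a parent request into per-device children; each child gets its own iodone callback, and the parent is completed once every child has finished. A hedged sketch of carving off the first half of a request (the component device, callback, and parent linkage are all hypothetical):

        static int xx_child_done(struct buf *); /* hypothetical completion */

        /*
         * Hedged sketch: clone the first half of `pbp' onto component
         * device `cdev'.  Real code would also clone the second half,
         * track outstanding children, and biodone() the parent last.
         */
        static void
        xx_issue_first_half(struct buf *pbp, dev_t cdev)
        {
                struct buf *cb;

                cb = bioclone(pbp, 0, pbp->b_bcount / 2, cdev, pbp->b_blkno,
                    xx_child_done, NULL, KM_SLEEP);
                cb->b_private = (void *)pbp;    /* hypothetical parent linkage */
                (void) bdev_strategy(cb);
        }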
  