1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2012 Joyent, Inc.  All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 #ifndef _SYS_BUF_H
  41 #define _SYS_BUF_H
  42 
  43 #include <sys/types32.h>
  44 #include <sys/t_lock.h>
  45 #include <sys/kstat.h>
  46 
  47 #ifdef  __cplusplus
  48 extern "C" {
  49 #endif
  50 
  51 /*
  52  *      Each buffer in the pool is usually doubly linked into 2 lists:
  53  *      the device with which it is currently associated (always)
  54  *      and also on a list of blocks available for allocation
  55  *      for other use (usually).
  56  *      The latter list is kept in last-used order, and the two
  57  *      lists are doubly linked to make it easy to remove
  58  *      a buffer from one list when it was found by
  59  *      looking through the other.
  60  *      A buffer is on the available list, and is liable
  61  *      to be reassigned to another disk block, if and only
  62  *      if it is not marked BUSY.  When a buffer is busy, the
  63  *      available-list pointers can be used for other purposes.
  64  *      Most drivers use the forward ptr as a link in their I/O active queue.
  65  *      A buffer header contains all the information required to perform I/O.
  66  *      Most of the routines which manipulate these things are in bio.c.
  67  *
  68  *      There are a number of locks associated with the buffer management
  69  *      system.
  70  *      hbuf.b_lock:    protects hash chains, buffer hdr freelists
  71  *                      and delayed write freelist
 *      bfree_lock:     protects the bfreelist structure
  73  *      bhdr_lock:      protects the free header list
  74  *      blist_lock:     protects b_list fields
  75  *      buf.b_sem:      protects all remaining members in the buf struct
  76  *      buf.b_io:       I/O synchronization variable
  77  *
  78  *      A buffer header is never "locked" (b_sem) when it is on
  79  *      a "freelist" (bhdrlist or bfreelist avail lists).
  80  */
  81 typedef struct  buf {
  82         int     b_flags;                /* see defines below */
  83         struct buf *b_forw;             /* headed by d_tab of conf.c */
  84         struct buf *b_back;             /*  "  */
  85         struct buf *av_forw;            /* position on free list, */
  86         struct buf *av_back;            /* if not BUSY */
  87         o_dev_t b_dev;                  /* OLD major+minor device name */
  88         size_t b_bcount;                /* transfer count */
  89         union {
  90                 caddr_t b_addr;         /* low order core address */
  91                 struct fs *b_fs;        /* superblocks */
  92                 struct cg *b_cg;        /* UFS cylinder group block */
  93                 struct dinode *b_dino;  /* UFS ilist */
  94                 daddr32_t *b_daddr;     /* disk blocks */
  95         } b_un;
  96 
  97         lldaddr_t       _b_blkno;       /* block # on device (union) */
  98 #define b_lblkno        _b_blkno._f
  99 #ifdef _LP64
 100 #define b_blkno         _b_blkno._f
 101 #else
 102 #define b_blkno         _b_blkno._p._l
 103 #endif /* _LP64 */
 104 
 105         char    b_obs1;                 /* obsolete */
 106         size_t  b_resid;                /* words not transferred after error */
 107         clock_t b_start;                /* request start time */
 108         struct  proc  *b_proc;          /* process doing physical or swap I/O */
 109         struct  page  *b_pages;         /* page list for PAGEIO */
 110         clock_t b_obs2;                 /* obsolete */
 111         /* Begin new stuff */
 112 #define b_actf  av_forw
 113 #define b_actl  av_back
 114 #define b_active b_bcount
 115 #define b_errcnt b_resid
 116         size_t  b_bufsize;              /* size of allocated buffer */
 117         int     (*b_iodone)(struct buf *);      /* function called by iodone */
 118         struct  vnode *b_vp;            /* vnode associated with block */
 119         struct  buf *b_chain;           /* chain together all buffers here */
 120         int     b_obs3;                 /* obsolete */
 121         int     b_error;                /* expanded error field */
 122         void    *b_private;             /* "opaque" driver private area */
 123         dev_t   b_edev;                 /* expanded dev field */
 124         ksema_t b_sem;                  /* Exclusive access to buf */
 125         ksema_t b_io;                   /* I/O Synchronization */
 126         struct buf *b_list;             /* List of potential B_DELWRI bufs */
 127         struct page **b_shadow;         /* shadow page list */
 128         void    *b_dip;                 /* device info pointer */
 129         struct vnode *b_file;           /* file associated with this buffer */
 130         offset_t b_offset;              /* offset in file assoc. with buffer */
 131 } buf_t;
 132 
 133 /*
 134  * Bufhd structures used at the head of the hashed buffer queues.
 135  * We only need seven words for this, so this abbreviated
 136  * definition saves some space.
 137  */
 138 struct diskhd {
 139         int     b_flags;                /* not used, needed for consistency */
 140         struct buf *b_forw, *b_back;    /* queue of unit queues */
 141         struct buf *av_forw, *av_back;  /* queue of bufs for this unit */
 142         o_dev_t b_dev;                  /* OLD major+minor device name */
 143         size_t b_bcount;                /* transfer count */
 144 };
 145 
 146 
 147 /*
 148  * Statistics on the buffer cache
 149  */
 150 struct biostats {
 151         kstat_named_t   bio_lookup;     /* requests to assign buffer */
 152         kstat_named_t   bio_hit;        /* buffer already associated with blk */
 153         kstat_named_t   bio_bufwant;    /* kmem_allocs NOSLEEP failed new buf */
 154         kstat_named_t   bio_bufwait;    /* kmem_allocs with KM_SLEEP for buf */
 155         kstat_named_t   bio_bufbusy;    /* buffer locked by someone else */
 156         kstat_named_t   bio_bufdup;     /* duplicate buffer found for block */
 157 };
 158 
 159 /*
 160  * These flags are kept in b_flags.
 161  * The first group is part of the DDI
 162  */
 163 #define B_BUSY          0x0001  /* not on av_forw/back list */
 164 #define B_DONE          0x0002  /* transaction finished */
 165 #define B_ERROR         0x0004  /* transaction aborted */
 166 #define B_PAGEIO        0x0010  /* do I/O to pages on bp->p_pages */
 167 #define B_PHYS          0x0020  /* Physical IO potentially using UNIBUS map */
 168 #define B_READ          0x0040  /* read when I/O occurs */
 169 #define B_WRITE         0x0100  /* non-read pseudo-flag */
 170 
 171 /* Not part of the DDI */
 172 #define B_WANTED        0x0080          /* issue wakeup when BUSY goes off */
 173 #define B_AGE           0x000200        /* delayed write for correct aging */
 174 #define B_ASYNC         0x000400        /* don't wait for I/O completion */
 175 #define B_DELWRI        0x000800        /* delayed write-wait til buf needed */
 176 #define B_STALE         0x001000        /* on av_* list; invalid contents */
 177 #define B_DONTNEED      0x002000        /* after write, need not be cached */
 178 #define B_REMAPPED      0x004000        /* buffer is kernel addressable */
 179 #define B_FREE          0x008000        /* free page when done */
 180 #define B_INVAL         0x010000        /* destroy page when done */
 181 #define B_FORCE         0x020000        /* semi-permanent removal from cache */
 182 #define B_NOCACHE       0x080000        /* don't cache block when released */
 183 #define B_TRUNC         0x100000        /* truncate page without I/O */
 184 #define B_SHADOW        0x200000        /* is b_shadow field valid? */
 185 #define B_RETRYWRI      0x400000        /* retry write til works or bfinval */
 186 #define B_FAILFAST      0x1000000       /* Fail promptly if device goes away */
 187 #define B_STARTED       0x2000000       /* io:::start probe called for buf */
 188 #define B_ABRWRITE      0x4000000       /* Application based recovery active */
 189 #define B_PAGE_NOWAIT   0x8000000       /* Skip the page if it is locked */
 190 #define B_INVALCURONLY  0x10000000      /* invalidate only for curproc */
 191 
 192 /*
 193  * There is some confusion over the meaning of B_FREE and B_INVAL and what
 194  * the use of one over the other implies.
 195  *
 196  * In both cases, when we are done with the page (buffer) we want to free
 197  * up the page.  In the case of B_FREE, the page will go to the cachelist.
 * In the case of B_INVAL, the page will be destroyed (hashed out of its
 199  * vnode) and placed on the freelist.  Beyond this, there is no difference
 200  * between the sole use of these two flags.  In both cases, IO will be done
 201  * if the page is not yet committed to storage.
 202  *
 203  * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is
 204  * intended to be used in conjunction with B_INVAL.  B_INVALCURONLY has no
 205  * meaning on its own.  When both B_INVALCURONLY and B_INVAL are set, then
 206  * the mapping for the page is only invalidated for the current process.
 207  * In this case, the page is not destroyed unless this was the final mapping.
 208  *
 209  * In order to discard pages without writing them back, (B_INVAL | B_TRUNC)
 210  * should be used.
 211  *
 212  * Use (B_INVAL | B_FORCE) to force the page to be destroyed even if we
 * could not successfully write out the page.
 214  */
 215 
 216 /*
 217  * Insq/Remq for the buffer hash lists.
 218  */
 219 #define bremhash(bp) { \
 220         ASSERT((bp)->b_forw != NULL); \
 221         ASSERT((bp)->b_back != NULL); \
 222         (bp)->b_back->b_forw = (bp)->b_forw; \
 223         (bp)->b_forw->b_back = (bp)->b_back; \
 224         (bp)->b_forw = (bp)->b_back = NULL; \
 225 }
 226 #define binshash(bp, dp) { \
 227         ASSERT((bp)->b_forw == NULL); \
 228         ASSERT((bp)->b_back == NULL); \
 229         ASSERT((dp)->b_forw != NULL); \
 230         ASSERT((dp)->b_back != NULL); \
 231         (bp)->b_forw = (dp)->b_forw; \
 232         (bp)->b_back = (dp); \
 233         (dp)->b_forw->b_back = (bp); \
 234         (dp)->b_forw = (bp); \
 235 }
 236 
 237 
 238 /*
 239  * The hash structure maintains two lists:
 240  *
 241  *      1) The hash list of buffers (b_forw & b_back)
 242  *      2) The LRU free list of buffers on this hash bucket (av_forw & av_back)
 243  *
 244  * The dwbuf structure keeps a list of delayed write buffers per hash bucket
 245  * hence there are exactly the same number of dwbuf structures as there are
 246  * the hash buckets (hbuf structures) in the system.
 247  *
 248  * The number of buffers on the freelist may not be equal to the number of
 249  * buffers on the hash list. That is because when buffers are busy they are
 250  * taken off the freelist but not off the hash list. "b_length" field keeps
 251  * track of the number of free buffers (including delayed writes ones) on
 252  * the hash bucket. The "b_lock" mutex protects the free list as well as
 253  * the hash list. It also protects the counter "b_length".
 254  *
 * Entries b_forw, b_back, av_forw & av_back must be at the same offset
 * as the ones in buf structure.
 257  */
 258 struct  hbuf {
 259         int     b_flags;
 260 
 261         struct  buf     *b_forw;        /* hash list forw pointer */
 262         struct  buf     *b_back;        /* hash list back pointer */
 263 
 264         struct  buf     *av_forw;       /* free list forw pointer */
 265         struct  buf     *av_back;       /* free list back pointer */
 266 
 267         int             b_length;       /* # of entries on free list */
 268         kmutex_t        b_lock;         /* lock to protect this structure */
 269 };
 270 
 271 
 272 /*
 * The delayed list pointer entries should match with the buf structure.
 274  */
 275 struct  dwbuf {
 276         int     b_flags;                /* not used */
 277 
 278         struct  buf     *b_forw;        /* not used */
 279         struct  buf     *b_back;        /* not used */
 280 
 281         struct  buf     *av_forw;       /* delayed write forw pointer */
 282         struct  buf     *av_back;       /* delayed write back pointer */
 283 };
 284 
 285 
 286 /*
 287  * Unlink a buffer from the available (free or delayed write) list and mark
 288  * it busy (internal interface).
 289  */
 290 #define notavail(bp) \
 291 {\
 292         ASSERT(SEMA_HELD(&bp->b_sem)); \
 293         ASSERT((bp)->av_forw != NULL); \
 294         ASSERT((bp)->av_back != NULL); \
 295         ASSERT((bp)->av_forw != (bp)); \
 296         ASSERT((bp)->av_back != (bp)); \
 297         (bp)->av_back->av_forw = (bp)->av_forw; \
 298         (bp)->av_forw->av_back = (bp)->av_back; \
 299         (bp)->b_flags |= B_BUSY; \
 300         (bp)->av_forw = (bp)->av_back = NULL; \
 301 }
 302 
 303 #if defined(_KERNEL)
 304 /*
 305  * Macros to avoid the extra function call needed for binary compat.
 306  *
 307  * B_RETRYWRI is not included in clear_flags for BWRITE(), BWRITE2(),
 308  * or brwrite() so that the retry operation is persistent until the
 309  * write either succeeds or the buffer is bfinval()'d.
 310  *
 311  */
 312 #define BREAD(dev, blkno, bsize) \
 313         bread_common(/* ufsvfsp */ NULL, dev, blkno, bsize)
 314 
 315 #define BWRITE(bp) \
 316         bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 0, \
 317                 /* do_relse */ 1, \
 318                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
 319 
 320 #define BWRITE2(bp) \
 321         bwrite_common(/* ufsvfsp */ NULL, bp, /* force_wait */ 1, \
 322                 /* do_relse */ 0, \
 323                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI))
 324 
 325 #define GETBLK(dev, blkno, bsize) \
 326         getblk_common(/* ufsvfsp */ NULL, dev, blkno, bsize, /* errflg */ 0)
 327 
 328 
 329 /*
 330  * Macros for new retry write interfaces.
 331  */
 332 
 333 /*
 334  * Same as bdwrite() except write failures are retried.
 335  */
 336 #define bdrwrite(bp) { \
 337         (bp)->b_flags |= B_RETRYWRI; \
 338         bdwrite((bp)); \
 339 }
 340 
 341 /*
 342  * Same as bwrite() except write failures are retried.
 343  */
 344 #define brwrite(bp) { \
 345         (bp)->b_flags |= B_RETRYWRI; \
 346         bwrite_common((bp), /* force_wait */ 0, /* do_relse */ 1, \
 347                 /* clear_flags */ (B_READ | B_DONE | B_ERROR | B_DELWRI)); \
 348 }
 349 
 350 extern struct hbuf      *hbuf;          /* Hash table */
 351 extern struct dwbuf     *dwbuf;         /* delayed write hash table */
 352 extern struct buf       *buf;           /* The buffer pool itself */
 353 extern struct buf       bfreelist;      /* head of available list */
 354 
 355 extern void (*bio_lufs_strategy)(void *, buf_t *);      /* UFS Logging */
 356 extern void (*bio_snapshot_strategy)(void *, buf_t *);  /* UFS snapshots */
 357 
 358 int     bcheck(dev_t, struct buf *);
 359 int     iowait(struct buf *);
 360 int     hash2ints(int x, int y);
 361 int     bio_busy(int);
 362 int     biowait(struct buf *);
 363 int     biomodified(struct buf *);
 364 int     geterror(struct buf *);
 365 void    minphys(struct buf *);
 366 /*
 367  * ufsvfsp is declared as a void * to avoid having everyone that uses
 368  * this header file include sys/fs/ufs_inode.h.
 369  */
 370 void    bwrite_common(void *ufsvfsp, struct buf *, int force_wait,
 371         int do_relse, int clear_flags);
 372 void    bwrite(struct buf *);
 373 void    bwrite2(struct buf *);
 374 void    bdwrite(struct buf *);
 375 void    bawrite(struct buf *);
 376 void    brelse(struct buf *);
 377 void    iodone(struct buf *);
 378 void    clrbuf(struct buf *);
 379 void    bflush(dev_t);
 380 void    blkflush(dev_t, daddr_t);
 381 void    binval(dev_t);
 382 int     bfinval(dev_t, int);
 383 void    binit(void);
 384 void    biodone(struct buf *);
 385 void    bioinit(struct buf *);
 386 void    biofini(struct buf *);
 387 void    bp_mapin(struct buf *);
 388 void    *bp_mapin_common(struct buf *, int);
 389 void    bp_mapout(struct buf *);
 390 int     bp_copyin(struct buf *, void *, offset_t, size_t);
 391 int     bp_copyout(void *, struct buf *, offset_t, size_t);
 392 void    bp_init(size_t, uint_t);
 393 int     bp_color(struct buf *);
 394 void    pageio_done(struct buf *);
 395 struct buf *bread(dev_t, daddr_t, long);
 396 struct buf *bread_common(void *, dev_t, daddr_t, long);
 397 struct buf *breada(dev_t, daddr_t, daddr_t, long);
 398 struct buf *getblk(dev_t, daddr_t, long);
 399 struct buf *getblk_common(void *, dev_t, daddr_t, long, int);
 400 struct buf *ngeteblk(long);
 401 struct buf *geteblk(void);
 402 struct buf *pageio_setup(struct page *, size_t, struct vnode *, int);
 403 void bioerror(struct buf *bp, int error);
 404 void bioreset(struct buf *bp);
 405 struct buf *bioclone(struct buf *, off_t, size_t, dev_t, daddr_t,
 406         int (*)(struct buf *), struct buf *, int);
 407 size_t  biosize(void);
 408 #endif  /* defined(_KERNEL) */
 409 
 410 #ifdef  __cplusplus
 411 }
 412 #endif
 413 
 414 #endif  /* _SYS_BUF_H */