1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2011 Joyent, Inc. All rights reserved.
25 */
26 /*
27 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
28 */
29
30 /*
31 * Copyright (c) 2016 by Delphix. All rights reserved.
32 */
33
34 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
35 /* All Rights Reserved */
36
37 /*
38 * University Copyright- Copyright (c) 1982, 1986, 1988
39 * The Regents of the University of California
40 * All Rights Reserved
41 *
42 * University Acknowledgment- Portions of this document are derived from
43 * software developed by the University of California, Berkeley, and its
44 * contributors.
45 */
46
47 #include <sys/types.h>
48 #include <sys/t_lock.h>
49 #include <sys/sysmacros.h>
50 #include <sys/conf.h>
51 #include <sys/cpuvar.h>
52 #include <sys/errno.h>
53 #include <sys/debug.h>
54 #include <sys/buf.h>
55 #include <sys/var.h>
56 #include <sys/vnode.h>
57 #include <sys/bitmap.h>
58 #include <sys/cmn_err.h>
59 #include <sys/kmem.h>
60 #include <sys/vmem.h>
61 #include <sys/atomic.h>
62 #include <vm/seg_kmem.h>
63 #include <vm/page.h>
64 #include <vm/pvn.h>
65 #include <sys/vtrace.h>
66 #include <sys/tnf_probe.h>
67 #include <sys/fs/ufs_inode.h>
68 #include <sys/fs/ufs_bio.h>
69 #include <sys/fs/ufs_log.h>
70 #include <sys/systm.h>
71 #include <sys/vfs.h>
72 #include <sys/sdt.h>
73
74 /* Locks */
75 static kmutex_t blist_lock; /* protects b_list */
76 static kmutex_t bhdr_lock; /* protects the bhdrlist */
77 static kmutex_t bfree_lock; /* protects the bfreelist structure */
78
79 struct hbuf *hbuf; /* Hash buckets */
80 struct dwbuf *dwbuf; /* Delayed write buckets */
81 static struct buf *bhdrlist; /* buf header free list */
82 static int nbuf; /* number of buffer headers allocated */
83
84 static int lastindex; /* Reference point on where to start */
85 /* when looking for free buffers */
86
87 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
88 #define EMPTY_LIST ((struct buf *)-1)
89
90 static kcondvar_t bio_mem_cv; /* Condition variables */
91 static kcondvar_t bio_flushinval_cv;
92 static int bio_doingflush; /* flush in progress */
93 static int bio_doinginval; /* inval in progress */
94 static int bio_flinv_cv_wanted; /* someone waiting for cv */
95
96 /*
97 * Statistics on the buffer cache
98 */
99 struct biostats biostats = {
100 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
101 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
102 { "new_buffer_requests", KSTAT_DATA_UINT32 },
103 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
104 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
105 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
106 };
107
108 /*
109 * kstat data
110 */
111 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
112 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
113 sizeof (kstat_named_t));
114
115 /*
116  * Statistics on the ufs buffer cache.
117  * Not protected by locks.
118 */
119 struct ufsbiostats ub = {
120 { "breads", KSTAT_DATA_UINT32 },
121 { "bwrites", KSTAT_DATA_UINT32 },
122 { "fbiwrites", KSTAT_DATA_UINT32 },
123 { "getpages", KSTAT_DATA_UINT32 },
124 { "getras", KSTAT_DATA_UINT32 },
125 { "putsyncs", KSTAT_DATA_UINT32 },
126 { "putasyncs", KSTAT_DATA_UINT32 },
127 { "putpageios", KSTAT_DATA_UINT32 },
128 };
129
130 /*
131  * More UFS logging eccentricities...
132  *
133  * These pointers are required since "#pragma weak ..." doesn't work in
134  * reverse order: genunix (bio.c) is loaded before the ufs modules, so
135  * pointers to the ufs routines can't be plugged into bio.c at link time.
136  * Instead they are initialized when the "lufsops" table is set up in
137  * "lufs.c:_init()".
138 */
139 void (*bio_lufs_strategy)(void *, buf_t *);
140 void (*bio_snapshot_strategy)(void *, buf_t *);
141
142
143 /* Private routines */
144 static struct buf *bio_getfreeblk(long);
145 static void bio_mem_get(long);
146 static void bio_bhdr_free(struct buf *);
147 static struct buf *bio_bhdr_alloc(void);
148 static void bio_recycle(int, long);
149 static void bio_pageio_done(struct buf *);
150 static int bio_incore(dev_t, daddr_t);
151
152 /*
153 * Buffer cache constants
154 */
155 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
156 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
157 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
158 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
159 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
160 #define BIO_HASHLEN 4 /* Target length of hash chains */
161
162
163 /* Flags for bio_recycle() */
164 #define BIO_HEADER 0x01
165 #define BIO_MEM 0x02
166
167 extern volatile int bufhwm; /* User tunable - high water mark for mem */
168 extern volatile int bufhwm_pct; /* ditto - given in % of physmem */
169
170 /*
171 * The following routines allocate and free
172 * buffers with various side effects. In general the
173 * arguments to an allocate routine are a device and
174  * a block number, and the value is a pointer
175  * to the buffer header; the buffer returned is locked with a
176 * binary semaphore so that no one else can touch it. If the block was
177 * already in core, no I/O need be done; if it is
178 * already locked, the process waits until it becomes free.
179 * The following routines allocate a buffer:
180 * getblk
181 * bread/BREAD
182 * breada
183 * Eventually the buffer must be released, possibly with the
184 * side effect of writing it out, by using one of
185 * bwrite/BWRITE/brwrite
186 * bdwrite/bdrwrite
187 * bawrite
188 * brelse
189 *
190 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
191 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
192 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
193 * B_DONE is still used to denote a buffer with I/O complete on it.
194 *
195  * The bfreelist.b_bcount field is computed every time fsflush runs. It
196  * should not be used where a very accurate count of the free buffers is
197 * needed.
198 */
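/*
 * Illustrative usage sketch (not code used by this file): a typical
 * consumer of these interfaces reads a block, inspects or updates it,
 * and then releases it.  A partial-block update uses bdwrite() so a
 * later write to the same block can be coalesced.  The device, block
 * number, source pointer and byte count below are hypothetical.
 *
 *	struct buf *bp;
 *	int err;
 *
 *	bp = bread(dev, blkno, DEV_BSIZE);
 *	if ((err = geterror(bp)) != 0) {
 *		brelse(bp);			(release, no further I/O)
 *		return (err);
 *	}
 *	bcopy(src, bp->b_un.b_addr, nbytes);	(modify part of the block)
 *	bdwrite(bp);				(mark delayed-write, release)
 */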
199
200 /*
201 * Read in (if necessary) the block and return a buffer pointer.
202 *
203 * This interface is provided for binary compatibility. Using
204 * BREAD() directly avoids the extra function call overhead invoked
205 * by calling this routine.
206 */
207 struct buf *
208 bread(dev_t dev, daddr_t blkno, long bsize)
209 {
210 return (BREAD(dev, blkno, bsize));
211 }
212
213 /*
214 * Common code for reading a buffer with various options
215 *
216 * Read in (if necessary) the block and return a buffer pointer.
217 */
218 struct buf *
219 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
220 {
221 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
222 struct buf *bp;
223 klwp_t *lwp = ttolwp(curthread);
224
225 CPU_STATS_ADD_K(sys, lread, 1);
226 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
227 if (bp->b_flags & B_DONE)
228 return (bp);
229 bp->b_flags |= B_READ;
230 ASSERT(bp->b_bcount == bsize);
231 if (ufsvfsp == NULL) { /* !ufs */
232 (void) bdev_strategy(bp);
233 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
234 /* ufs && logging */
235 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
236 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
237 /* ufs && snapshots */
238 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
239 } else {
240 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
241 ub.ub_breads.value.ul++; /* ufs && !logging */
242 (void) bdev_strategy(bp);
243 }
244 if (lwp != NULL)
245 lwp->lwp_ru.inblock++;
246 CPU_STATS_ADD_K(sys, bread, 1);
247 (void) biowait(bp);
248 return (bp);
249 }
250
251 /*
252 * Read in the block, like bread, but also start I/O on the
253 * read-ahead block (which is not allocated to the caller).
254 */
255 struct buf *
256 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
257 {
258 struct buf *bp, *rabp;
259 klwp_t *lwp = ttolwp(curthread);
260
261 bp = NULL;
262 if (!bio_incore(dev, blkno)) {
263 CPU_STATS_ADD_K(sys, lread, 1);
264 bp = GETBLK(dev, blkno, bsize);
265 if ((bp->b_flags & B_DONE) == 0) {
266 bp->b_flags |= B_READ;
267 bp->b_bcount = bsize;
268 (void) bdev_strategy(bp);
269 if (lwp != NULL)
270 lwp->lwp_ru.inblock++;
271 CPU_STATS_ADD_K(sys, bread, 1);
272 }
273 }
274 if (rablkno && bfreelist.b_bcount > 1 &&
275 !bio_incore(dev, rablkno)) {
276 rabp = GETBLK(dev, rablkno, bsize);
277 if (rabp->b_flags & B_DONE)
278 brelse(rabp);
279 else {
280 rabp->b_flags |= B_READ|B_ASYNC;
281 rabp->b_bcount = bsize;
282 (void) bdev_strategy(rabp);
283 if (lwp != NULL)
284 lwp->lwp_ru.inblock++;
285 CPU_STATS_ADD_K(sys, bread, 1);
286 }
287 }
288 if (bp == NULL)
289 return (BREAD(dev, blkno, bsize));
290 (void) biowait(bp);
291 return (bp);
292 }
293
294 /*
295 * Common code for writing a buffer with various options.
296 *
297 * force_wait - wait for write completion regardless of B_ASYNC flag
298 * do_relse - release the buffer when we are done
299 * clear_flags - flags to clear from the buffer
300 */
301 void
302 bwrite_common(void *arg, struct buf *bp, int force_wait,
303 int do_relse, int clear_flags)
304 {
305 register int do_wait;
306 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
307 int flag;
308 klwp_t *lwp = ttolwp(curthread);
309 struct cpu *cpup;
310
311 ASSERT(SEMA_HELD(&bp->b_sem));
312 flag = bp->b_flags;
313 bp->b_flags &= ~clear_flags;
314 if (lwp != NULL)
315 lwp->lwp_ru.oublock++;
316 CPU_STATS_ENTER_K();
317 cpup = CPU; /* get pointer AFTER preemption is disabled */
318 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
319 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
320 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
321 if (do_wait == 0)
322 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
323 CPU_STATS_EXIT_K();
324 if (ufsvfsp == NULL) {
325 (void) bdev_strategy(bp);
326 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
327 /* ufs && logging */
328 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
329 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
330 /* ufs && snapshots */
331 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
332 } else {
333 ub.ub_bwrites.value.ul++; /* ufs && !logging */
334 (void) bdev_strategy(bp);
335 }
336 if (do_wait) {
337 (void) biowait(bp);
338 if (do_relse) {
339 brelse(bp);
340 }
341 }
342 }
343
344 /*
345 * Write the buffer, waiting for completion (unless B_ASYNC is set).
346 * Then release the buffer.
347 * This interface is provided for binary compatibility. Using
348 * BWRITE() directly avoids the extra function call overhead invoked
349 * by calling this routine.
350 */
351 void
352 bwrite(struct buf *bp)
353 {
354 BWRITE(bp);
355 }
356
357 /*
358 * Write the buffer, waiting for completion.
359 * But don't release the buffer afterwards.
360 * This interface is provided for binary compatibility. Using
361 * BWRITE2() directly avoids the extra function call overhead.
362 */
363 void
364 bwrite2(struct buf *bp)
365 {
366 BWRITE2(bp);
367 }
368
369 /*
370 * Release the buffer, marking it so that if it is grabbed
371 * for another purpose it will be written out before being
372 * given up (e.g. when writing a partial block where it is
373 * assumed that another write for the same block will soon follow).
374 * Also save the time that the block is first marked as delayed
375 * so that it will be written in a reasonable time.
376 */
377 void
378 bdwrite(struct buf *bp)
379 {
380 ASSERT(SEMA_HELD(&bp->b_sem));
381 CPU_STATS_ADD_K(sys, lwrite, 1);
382 if ((bp->b_flags & B_DELWRI) == 0)
383 bp->b_start = ddi_get_lbolt();
384 /*
385 * B_DONE allows others to use the buffer, B_DELWRI causes the
386 * buffer to be written before being reused, and setting b_resid
387 * to zero says the buffer is complete.
388 */
389 bp->b_flags |= B_DELWRI | B_DONE;
390 bp->b_resid = 0;
391 brelse(bp);
392 }
393
394 /*
395 * Release the buffer, start I/O on it, but don't wait for completion.
396 */
397 void
398 bawrite(struct buf *bp)
399 {
400 ASSERT(SEMA_HELD(&bp->b_sem));
401
402 /* Use bfreelist.b_bcount as a weird-ass heuristic */
403 if (bfreelist.b_bcount > 4)
404 bp->b_flags |= B_ASYNC;
405 BWRITE(bp);
406 }
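/*
 * Illustrative sketch (not code used by this file) of choosing among
 * the release variants for a dirty buffer; bp and the policy tests are
 * hypothetical:
 *
 *	if (must_reach_disk_before_returning)
 *		bwrite(bp);	(synchronous write, then release)
 *	else if (should_start_write_now)
 *		bawrite(bp);	(asynchronous write, no wait)
 *	else
 *		bdwrite(bp);	(delayed write; fsflush writes it later)
 */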
407
408 /*
409 * Release the buffer, with no I/O implied.
410 */
411 void
412 brelse(struct buf *bp)
413 {
414 struct buf **backp;
415 uint_t index;
416 kmutex_t *hmp;
417 struct buf *dp;
418 struct hbuf *hp;
419
420
421 ASSERT(SEMA_HELD(&bp->b_sem));
422
423 /*
424 * Clear the retry write flag if the buffer was written without
425 * error. The presence of B_DELWRI means the buffer has not yet
426 * been written and the presence of B_ERROR means that an error
427 * is still occurring.
428 */
429 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
430 bp->b_flags &= ~B_RETRYWRI;
431 }
432
433 /* Check for anomalous conditions */
434 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
435 if (bp->b_flags & B_NOCACHE) {
436 /* Don't add to the freelist. Destroy it now */
437 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
438 sema_destroy(&bp->b_sem);
439 sema_destroy(&bp->b_io);
440 kmem_free(bp, sizeof (struct buf));
441 return;
442 }
443 /*
444 * If a write failed and we are supposed to retry write,
445 * don't toss the buffer. Keep it around and mark it
446 * delayed write in the hopes that it will eventually
447 * get flushed (and still keep the system running.)
448 */
449 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
450 bp->b_flags |= B_DELWRI;
451 /* keep fsflush from trying continuously to flush */
452 bp->b_start = ddi_get_lbolt();
453 } else
454 bp->b_flags |= B_AGE|B_STALE;
455 bp->b_flags &= ~B_ERROR;
456 bp->b_error = 0;
457 }
458
459 /*
460  * If delayed write is set then put it on the delayed
461 * write list instead of the free buffer list.
462 */
463 index = bio_bhash(bp->b_edev, bp->b_blkno);
464 hmp = &hbuf[index].b_lock;
465
466 mutex_enter(hmp);
467 hp = &hbuf[index];
468 dp = (struct buf *)hp;
469
470 /*
471 	 * Make sure that the number of entries on this list is
472 	 * within 0 <= count <= total # buffers
473 */
474 ASSERT(hp->b_length >= 0);
475 ASSERT(hp->b_length < nbuf);
476
477 hp->b_length++; /* We are adding this buffer */
478
479 if (bp->b_flags & B_DELWRI) {
480 /*
481 * This buffer goes on the delayed write buffer list
482 */
483 dp = (struct buf *)&dwbuf[index];
484 }
485 ASSERT(bp->b_bufsize > 0);
486 ASSERT(bp->b_bcount > 0);
487 ASSERT(bp->b_un.b_addr != NULL);
488
489 if (bp->b_flags & B_AGE) {
490 backp = &dp->av_forw;
491 (*backp)->av_back = bp;
492 bp->av_forw = *backp;
493 *backp = bp;
494 bp->av_back = dp;
495 } else {
496 backp = &dp->av_back;
497 (*backp)->av_forw = bp;
498 bp->av_back = *backp;
499 *backp = bp;
500 bp->av_forw = dp;
501 }
502 mutex_exit(hmp);
503
504 if (bfreelist.b_flags & B_WANTED) {
505 /*
506 		 * Should come here very rarely.
507 */
508 mutex_enter(&bfree_lock);
509 if (bfreelist.b_flags & B_WANTED) {
510 bfreelist.b_flags &= ~B_WANTED;
511 cv_broadcast(&bio_mem_cv);
512 }
513 mutex_exit(&bfree_lock);
514 }
515
516 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
517 /*
518 * Don't let anyone get the buffer off the freelist before we
519 * release our hold on it.
520 */
521 sema_v(&bp->b_sem);
522 }
523
524 /*
525  * Return a count of the number of B_BUSY buffers in the system.
526  * Can only be used as a rough estimate.  If 'cleanit' is set,
527 * try to flush all bufs.
528 */
529 int
530 bio_busy(int cleanit)
531 {
532 struct buf *bp, *dp;
533 int busy = 0;
534 int i;
535 kmutex_t *hmp;
536
537 for (i = 0; i < v.v_hbuf; i++) {
538 dp = (struct buf *)&hbuf[i];
539 hmp = &hbuf[i].b_lock;
540
541 mutex_enter(hmp);
542 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
543 if (bp->b_flags & B_BUSY)
544 busy++;
545 }
546 mutex_exit(hmp);
547 }
548
549 if (cleanit && busy != 0) {
550 bflush(NODEV);
551 }
552
553 return (busy);
554 }
555
556 /*
557  * This interface is provided for binary compatibility.
558 *
559 * Assign a buffer for the given block. If the appropriate
560 * block is already associated, return it; otherwise search
561 * for the oldest non-busy buffer and reassign it.
562 */
563 struct buf *
564 getblk(dev_t dev, daddr_t blkno, long bsize)
565 {
566 return (getblk_common(/* ufsvfsp */ NULL, dev,
567 blkno, bsize, /* errflg */ 0));
568 }
569
570 /*
571 * Assign a buffer for the given block. If the appropriate
572 * block is already associated, return it; otherwise search
573 * for the oldest non-busy buffer and reassign it.
574 */
575 struct buf *
576 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
577 {
578 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
579 struct buf *bp;
580 struct buf *dp;
581 struct buf *nbp = NULL;
582 struct buf *errbp;
583 uint_t index;
584 kmutex_t *hmp;
585 struct hbuf *hp;
586
587 if (getmajor(dev) >= devcnt)
588 cmn_err(CE_PANIC, "blkdev");
589
590 biostats.bio_lookup.value.ui32++;
591
592 index = bio_bhash(dev, blkno);
593 hp = &hbuf[index];
594 dp = (struct buf *)hp;
595 hmp = &hp->b_lock;
596
597 mutex_enter(hmp);
598 loop:
599 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
600 if (bp->b_blkno != blkno || bp->b_edev != dev ||
601 (bp->b_flags & B_STALE))
602 continue;
603 /*
604 * Avoid holding the hash lock in the event that
605 * the buffer is locked by someone. Since the hash chain
606 * may change when we drop the hash lock
607 * we have to start at the beginning of the chain if the
608 * buffer identity/contents aren't valid.
609 */
610 if (!sema_tryp(&bp->b_sem)) {
611 biostats.bio_bufbusy.value.ui32++;
612 mutex_exit(hmp);
613 /*
614 * OK, we are dealing with a busy buffer.
615 * In the case that we are panicking and we
616 * got called from bread(), we have some chance
617 * for error recovery. So better bail out from
618 * here since sema_p() won't block. If we got
619 * called directly from ufs routines, there is
620 * no way to report an error yet.
621 */
622 if (panicstr && errflg)
623 goto errout;
624 /*
625 * For the following line of code to work
626 			 * correctly, never kmem_free the buffer "header".
627 */
628 sema_p(&bp->b_sem);
629 if (bp->b_blkno != blkno || bp->b_edev != dev ||
630 (bp->b_flags & B_STALE)) {
631 sema_v(&bp->b_sem);
632 mutex_enter(hmp);
633 goto loop; /* start over */
634 }
635 mutex_enter(hmp);
636 }
637 /* Found */
638 biostats.bio_hit.value.ui32++;
639 bp->b_flags &= ~B_AGE;
640
641 /*
642 * Yank it off the free/delayed write lists
643 */
644 hp->b_length--;
645 notavail(bp);
646 mutex_exit(hmp);
647
648 		ASSERT((bp->b_flags & B_NOCACHE) == 0);
649
650 if (nbp == NULL) {
651 /*
652 * Make the common path short.
653 */
654 ASSERT(SEMA_HELD(&bp->b_sem));
655 return (bp);
656 }
657
658 biostats.bio_bufdup.value.ui32++;
659
660 /*
661 		 * The buffer must have entered the cache during the lock
662 		 * upgrade, so free the new buffer we allocated and return the
663 * found buffer.
664 */
665 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
666 nbp->b_un.b_addr = NULL;
667
668 /*
669 * Account for the memory
670 */
671 mutex_enter(&bfree_lock);
672 bfreelist.b_bufsize += nbp->b_bufsize;
673 mutex_exit(&bfree_lock);
674
675 /*
676 * Destroy buf identity, and place on avail list
677 */
678 nbp->b_dev = (o_dev_t)NODEV;
679 nbp->b_edev = NODEV;
680 nbp->b_flags = 0;
681 nbp->b_file = NULL;
682 nbp->b_offset = -1;
683
684 sema_v(&nbp->b_sem);
685 bio_bhdr_free(nbp);
686
687 ASSERT(SEMA_HELD(&bp->b_sem));
688 return (bp);
689 }
690
691 /*
692 * bio_getfreeblk may block so check the hash chain again.
693 */
694 if (nbp == NULL) {
695 mutex_exit(hmp);
696 nbp = bio_getfreeblk(bsize);
697 mutex_enter(hmp);
698 goto loop;
699 }
700
701 /*
702 * New buffer. Assign nbp and stick it on the hash.
703 */
704 nbp->b_flags = B_BUSY;
705 nbp->b_edev = dev;
706 nbp->b_dev = (o_dev_t)cmpdev(dev);
707 nbp->b_blkno = blkno;
708 nbp->b_iodone = NULL;
709 nbp->b_bcount = bsize;
710 /*
711 * If we are given a ufsvfsp and the vfs_root field is NULL
712 * then this must be I/O for a superblock. A superblock's
713 * buffer is set up in mountfs() and there is no root vnode
714 * at that point.
715 */
716 if (ufsvfsp && ufsvfsp->vfs_root) {
717 nbp->b_vp = ufsvfsp->vfs_root;
718 } else {
719 nbp->b_vp = NULL;
720 }
721
722 	ASSERT((nbp->b_flags & B_NOCACHE) == 0);
723
724 binshash(nbp, dp);
725 mutex_exit(hmp);
726
727 ASSERT(SEMA_HELD(&nbp->b_sem));
728
729 return (nbp);
730
731
732 /*
733 * Come here in case of an internal error. At this point we couldn't
734 * get a buffer, but we have to return one. Hence we allocate some
735 * kind of error reply buffer on the fly. This buffer is marked as
736 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
737 * - B_ERROR will indicate error to the caller.
738 * - B_DONE will prevent us from reading the buffer from
739 * the device.
740  *	- B_NOCACHE will cause the buffer to be freed in
741 * brelse().
742 */
743
744 errout:
745 errbp = geteblk();
746 sema_p(&errbp->b_sem);
747 errbp->b_flags &= ~B_BUSY;
748 errbp->b_flags |= (B_ERROR | B_DONE);
749 return (errbp);
750 }
751
752 /*
753 * Get an empty block, not assigned to any particular device.
754 * Returns a locked buffer that is not on any hash or free list.
755 */
756 struct buf *
757 ngeteblk(long bsize)
758 {
759 struct buf *bp;
760
761 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
762 bioinit(bp);
763 bp->av_forw = bp->av_back = NULL;
764 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
765 bp->b_bufsize = bsize;
766 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
767 bp->b_dev = (o_dev_t)NODEV;
768 bp->b_edev = NODEV;
769 bp->b_lblkno = 0;
770 bp->b_bcount = bsize;
771 bp->b_iodone = NULL;
772 return (bp);
773 }
774
775 /*
776  * The geteblk() interface is kept intact to maintain driver compatibility.
777  * Use ngeteblk() to allocate a block size other than 1 KB.
778 */
779 struct buf *
780 geteblk(void)
781 {
782 return (ngeteblk((long)1024));
783 }
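/*
 * Illustrative sketch (an assumption about typical caller usage, not
 * code used by this file): a caller needing a throw-away buffer for raw
 * device I/O can use ngeteblk()/geteblk(), fill in the device and block
 * number itself, and rely on brelse() destroying the B_NOCACHE buffer
 * when it is done.  dev and blkno are hypothetical.
 *
 *	struct buf *bp;
 *	int err;
 *
 *	bp = ngeteblk(MAXBSIZE);
 *	bp->b_edev = dev;
 *	bp->b_dev = (o_dev_t)cmpdev(dev);
 *	bp->b_blkno = blkno;
 *	bp->b_flags |= B_READ;
 *	(void) bdev_strategy(bp);
 *	err = biowait(bp);
 *	...
 *	brelse(bp);		(B_NOCACHE: the buffer is freed here)
 */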
784
785 /*
786 * Return a buffer w/o sleeping
787 */
788 struct buf *
789 trygetblk(dev_t dev, daddr_t blkno)
790 {
791 struct buf *bp;
792 struct buf *dp;
793 struct hbuf *hp;
794 kmutex_t *hmp;
795 uint_t index;
796
797 index = bio_bhash(dev, blkno);
798 hp = &hbuf[index];
799 hmp = &hp->b_lock;
800
801 if (!mutex_tryenter(hmp))
802 return (NULL);
803
804 dp = (struct buf *)hp;
805 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
806 if (bp->b_blkno != blkno || bp->b_edev != dev ||
807 (bp->b_flags & B_STALE))
808 continue;
809 /*
810 * Get access to a valid buffer without sleeping
811 */
812 if (sema_tryp(&bp->b_sem)) {
813 if (bp->b_flags & B_DONE) {
814 hp->b_length--;
815 notavail(bp);
816 mutex_exit(hmp);
817 return (bp);
818 } else {
819 sema_v(&bp->b_sem);
820 break;
821 }
822 }
823 break;
824 }
825 mutex_exit(hmp);
826 return (NULL);
827 }
828
829 /*
830 * Wait for I/O completion on the buffer; return errors
831 * to the user.
832 */
833 int
834 iowait(struct buf *bp)
835 {
836 ASSERT(SEMA_HELD(&bp->b_sem));
837 return (biowait(bp));
838 }
839
840 /*
841 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
842 * and wake up anyone waiting for it.
843 */
844 void
845 iodone(struct buf *bp)
846 {
847 ASSERT(SEMA_HELD(&bp->b_sem));
848 (void) biodone(bp);
849 }
850
851 /*
852 * Zero the core associated with a buffer.
853 */
854 void
855 clrbuf(struct buf *bp)
856 {
857 ASSERT(SEMA_HELD(&bp->b_sem));
858 bzero(bp->b_un.b_addr, bp->b_bcount);
859 bp->b_resid = 0;
860 }
861
862
863 /*
864 * Make sure all write-behind blocks on dev (or NODEV for all)
865 * are flushed out.
866 */
867 void
868 bflush(dev_t dev)
869 {
870 struct buf *bp, *dp;
871 struct hbuf *hp;
872 struct buf *delwri_list = EMPTY_LIST;
873 int i, index;
874 kmutex_t *hmp;
875
876 mutex_enter(&blist_lock);
877 /*
878 * Wait for any invalidates or flushes ahead of us to finish.
879 * We really could split blist_lock up per device for better
880 * parallelism here.
881 */
882 while (bio_doinginval || bio_doingflush) {
883 bio_flinv_cv_wanted = 1;
884 cv_wait(&bio_flushinval_cv, &blist_lock);
885 }
886 bio_doingflush++;
887 /*
888 	 * Gather all B_DELWRI buffers for the device.
889 	 * Lock ordering is b_sem > hash lock (brelse).
890 	 * Since we are finding the buffers via the delayed write list,
891 	 * they may be busy and we would block trying to get the
892 	 * b_sem lock while holding the hash lock.  So transfer all the
893 	 * candidates onto the delwri_list and then drop the hash locks.
894 */
895 for (i = 0; i < v.v_hbuf; i++) {
896 hmp = &hbuf[i].b_lock;
897 dp = (struct buf *)&dwbuf[i];
898 mutex_enter(hmp);
899 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
900 if (dev == NODEV || bp->b_edev == dev) {
901 if (bp->b_list == NULL) {
902 bp->b_list = delwri_list;
903 delwri_list = bp;
904 }
905 }
906 }
907 mutex_exit(hmp);
908 }
909 mutex_exit(&blist_lock);
910
911 /*
912 * Now that the hash locks have been dropped grab the semaphores
913 * and write back all the buffers that have B_DELWRI set.
914 */
915 while (delwri_list != EMPTY_LIST) {
916 bp = delwri_list;
917
918 sema_p(&bp->b_sem); /* may block */
919 if ((dev != bp->b_edev && dev != NODEV) ||
920 (panicstr && bp->b_flags & B_BUSY)) {
921 sema_v(&bp->b_sem);
922 delwri_list = bp->b_list;
923 bp->b_list = NULL;
924 continue; /* No longer a candidate */
925 }
926 if (bp->b_flags & B_DELWRI) {
927 index = bio_bhash(bp->b_edev, bp->b_blkno);
928 hp = &hbuf[index];
929 hmp = &hp->b_lock;
930 dp = (struct buf *)hp;
931
932 bp->b_flags |= B_ASYNC;
933 mutex_enter(hmp);
934 hp->b_length--;
935 notavail(bp);
936 mutex_exit(hmp);
937 if (bp->b_vp == NULL) { /* !ufs */
938 BWRITE(bp);
939 } else { /* ufs */
940 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
941 }
942 } else {
943 sema_v(&bp->b_sem);
944 }
945 delwri_list = bp->b_list;
946 bp->b_list = NULL;
947 }
948 mutex_enter(&blist_lock);
949 bio_doingflush--;
950 if (bio_flinv_cv_wanted) {
951 bio_flinv_cv_wanted = 0;
952 cv_broadcast(&bio_flushinval_cv);
953 }
954 mutex_exit(&blist_lock);
955 }
956
957 /*
958 * Ensure that a specified block is up-to-date on disk.
959 */
960 void
961 blkflush(dev_t dev, daddr_t blkno)
962 {
963 struct buf *bp, *dp;
964 struct hbuf *hp;
965 struct buf *sbp = NULL;
966 uint_t index;
967 kmutex_t *hmp;
968
969 index = bio_bhash(dev, blkno);
970 hp = &hbuf[index];
971 dp = (struct buf *)hp;
972 hmp = &hp->b_lock;
973
974 /*
975 * Identify the buffer in the cache belonging to
976 * this device and blkno (if any).
977 */
978 mutex_enter(hmp);
979 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
980 if (bp->b_blkno != blkno || bp->b_edev != dev ||
981 (bp->b_flags & B_STALE))
982 continue;
983 sbp = bp;
984 break;
985 }
986 mutex_exit(hmp);
987 if (sbp == NULL)
988 return;
989 /*
990 * Now check the buffer we have identified and
991 * make sure it still belongs to the device and is B_DELWRI
992 */
993 sema_p(&sbp->b_sem);
994 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
995 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
996 mutex_enter(hmp);
997 hp->b_length--;
998 notavail(sbp);
999 mutex_exit(hmp);
1000 /*
1001 * XXX - There is nothing to guarantee a synchronous
1002 * write here if the B_ASYNC flag is set. This needs
1003 * some investigation.
1004 */
1005 if (sbp->b_vp == NULL) { /* !ufs */
1006 BWRITE(sbp); /* synchronous write */
1007 } else { /* ufs */
1008 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1009 }
1010 } else {
1011 sema_v(&sbp->b_sem);
1012 }
1013 }
1014
1015 /*
1016  * Same as binval, except it can force-invalidate delayed-write buffers
1017  * (which cannot be flushed because of device errors).  Also
1018 * makes sure that the retry write flag is cleared.
1019 */
1020 int
1021 bfinval(dev_t dev, int force)
1022 {
1023 struct buf *dp;
1024 struct buf *bp;
1025 struct buf *binval_list = EMPTY_LIST;
1026 int i, error = 0;
1027 kmutex_t *hmp;
1028 uint_t index;
1029 struct buf **backp;
1030
1031 mutex_enter(&blist_lock);
1032 /*
1033 	 * Wait for any flushes ahead of us to finish; it's ok to
1034 * do invalidates in parallel.
1035 */
1036 while (bio_doingflush) {
1037 bio_flinv_cv_wanted = 1;
1038 cv_wait(&bio_flushinval_cv, &blist_lock);
1039 }
1040 bio_doinginval++;
1041
1042 /* Gather bp's */
1043 for (i = 0; i < v.v_hbuf; i++) {
1044 dp = (struct buf *)&hbuf[i];
1045 hmp = &hbuf[i].b_lock;
1046
1047 mutex_enter(hmp);
1048 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1049 if (bp->b_edev == dev) {
1050 if (bp->b_list == NULL) {
1051 bp->b_list = binval_list;
1052 binval_list = bp;
1053 }
1054 }
1055 }
1056 mutex_exit(hmp);
1057 }
1058 mutex_exit(&blist_lock);
1059
1060 /* Invalidate all bp's found */
1061 while (binval_list != EMPTY_LIST) {
1062 bp = binval_list;
1063
1064 sema_p(&bp->b_sem);
1065 if (bp->b_edev == dev) {
1066 if (force && (bp->b_flags & B_DELWRI)) {
1067 /* clear B_DELWRI, move to non-dw freelist */
1068 index = bio_bhash(bp->b_edev, bp->b_blkno);
1069 hmp = &hbuf[index].b_lock;
1070 dp = (struct buf *)&hbuf[index];
1071 mutex_enter(hmp);
1072
1073 /* remove from delayed write freelist */
1074 notavail(bp);
1075
1076 /* add to B_AGE side of non-dw freelist */
1077 backp = &dp->av_forw;
1078 (*backp)->av_back = bp;
1079 bp->av_forw = *backp;
1080 *backp = bp;
1081 bp->av_back = dp;
1082
1083 /*
1084 * make sure write retries and busy are cleared
1085 */
1086 bp->b_flags &=
1087 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1088 mutex_exit(hmp);
1089 }
1090 if ((bp->b_flags & B_DELWRI) == 0)
1091 bp->b_flags |= B_STALE|B_AGE;
1092 else
1093 error = EIO;
1094 }
1095 sema_v(&bp->b_sem);
1096 binval_list = bp->b_list;
1097 bp->b_list = NULL;
1098 }
1099 mutex_enter(&blist_lock);
1100 bio_doinginval--;
1101 if (bio_flinv_cv_wanted) {
1102 cv_broadcast(&bio_flushinval_cv);
1103 bio_flinv_cv_wanted = 0;
1104 }
1105 mutex_exit(&blist_lock);
1106 return (error);
1107 }
1108
1109 /*
1110 * If possible, invalidate blocks for a dev on demand
1111 */
1112 void
1113 binval(dev_t dev)
1114 {
1115 (void) bfinval(dev, 0);
1116 }
1117
1118 /*
1119 * Initialize the buffer I/O system by freeing
1120 * all buffers and setting all device hash buffer lists to empty.
1121 */
1122 void
1123 binit(void)
1124 {
1125 struct buf *bp;
1126 unsigned int i, pct;
1127 ulong_t bio_max_hwm, bio_default_hwm;
1128
1129 /*
1130 * Maximum/Default values for bufhwm are set to the smallest of:
1131 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1132 * - 1/4 of kernel virtual memory
1133 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1134 * Additionally, in order to allow simple tuning by percentage of
1135 * physical memory, bufhwm_pct is used to calculate the default if
1136 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1137 *
1138 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1139 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1140 */
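	/*
	 * Worked example (hypothetical machine, for illustration only):
	 * with 4 GB of physical memory and 4 KB pages, physmem is
	 * 1048576 pages, so if the heap term is not the smaller limit
	 * the default hwm is
	 *	(1048576 / (100/2)) * (4096/1024) = 83884 KB  (~2%)
	 * and the maximum hwm is
	 *	(1048576 / (100/20)) * (4096/1024) = 838860 KB (~20%).
	 */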
1141 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1142 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1143 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1144
1145 pct = BIO_BUF_PERCENT;
1146 if (bufhwm_pct != 0 &&
1147 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1148 pct = BIO_BUF_PERCENT;
1149 /*
1150 * Invalid user specified value, emit a warning.
1151 */
1152 		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of "
1153 		    "range(1..%d). Using %d as default.",
1154 bufhwm_pct,
1155 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1156 }
1157
1158 bio_default_hwm = MIN(physmem / pct,
1159 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1160 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1161
1162 if ((v.v_bufhwm = bufhwm) == 0)
1163 v.v_bufhwm = bio_default_hwm;
1164
1165 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1166 v.v_bufhwm = (int)bio_max_hwm;
1167 /*
1168 * Invalid user specified value, emit a warning.
1169 */
1170 cmn_err(CE_WARN,
1171 		    "binit: bufhwm(%d) out "
1172 		    "of range(%d..%lu). Using %lu as default",
1173 bufhwm,
1174 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1175 }
1176
1177 /*
1178 * Determine the number of hash buckets. Default is to
1179 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1180 * Round up number to the next power of 2.
1181 */
1182 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1183 BIO_HASHLEN);
1184 v.v_hmask = v.v_hbuf - 1;
1185 v.v_buf = BIO_BHDR_POOL;
1186
1187 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1188
1189 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1190
1191 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1192 bp = &bfreelist;
1193 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1194
1195 for (i = 0; i < v.v_hbuf; i++) {
1196 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1197 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1198
1199 /*
1200 * Initialize the delayed write buffer list.
1201 */
1202 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1203 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1204 }
1205 }
1206
1207 /*
1208 * Wait for I/O completion on the buffer; return error code.
1209 * If bp was for synchronous I/O, bp is invalid and associated
1210 * resources are freed on return.
1211 */
1212 int
1213 biowait(struct buf *bp)
1214 {
1215 int error = 0;
1216 struct cpu *cpup;
1217
1218 ASSERT(SEMA_HELD(&bp->b_sem));
1219
1220 cpup = CPU;
1221 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1222 DTRACE_IO1(wait__start, struct buf *, bp);
1223
1224 /*
1225 * In case of panic, busy wait for completion
1226 */
1227 if (panicstr) {
1228 while ((bp->b_flags & B_DONE) == 0)
1229 drv_usecwait(10);
1230 } else
1231 sema_p(&bp->b_io);
1232
1233 DTRACE_IO1(wait__done, struct buf *, bp);
1234 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1235
1236 error = geterror(bp);
1237 if ((bp->b_flags & B_ASYNC) == 0) {
1238 if (bp->b_flags & B_REMAPPED)
1239 bp_mapout(bp);
1240 }
1241 return (error);
1242 }
1243
1244 static void
1245 biodone_tnf_probe(struct buf *bp)
1246 {
1247 /* Kernel probe */
1248 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1249 tnf_device, device, bp->b_edev,
1250 tnf_diskaddr, block, bp->b_lblkno,
1251 tnf_opaque, buf, bp);
1252 }
1253
1254 /*
1255 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1256 * and wake up anyone waiting for it.
1257 */
1258 void
1259 biodone(struct buf *bp)
1260 {
1261 if (bp->b_flags & B_STARTED) {
1262 DTRACE_IO1(done, struct buf *, bp);
1263 bp->b_flags &= ~B_STARTED;
1264 }
1265
1266 /*
1267 * Call the TNF probe here instead of the inline code
1268 * to force our compiler to use the tail call optimization.
1269 */
1270 biodone_tnf_probe(bp);
1271
1272 if (bp->b_iodone != NULL) {
1273 (*(bp->b_iodone))(bp);
1274 return;
1275 }
1276 ASSERT((bp->b_flags & B_DONE) == 0);
1277 ASSERT(SEMA_HELD(&bp->b_sem));
1278 bp->b_flags |= B_DONE;
1279 if (bp->b_flags & B_ASYNC) {
1280 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1281 bio_pageio_done(bp);
1282 else
1283 brelse(bp); /* release bp to freelist */
1284 } else {
1285 sema_v(&bp->b_io);
1286 }
1287 }
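/*
 * Illustrative sketch (an assumption about a typical driver completion
 * path, not code used by this file): the transfer-complete handler
 * records any error and the residual count, then calls biodone()
 * exactly once; for B_ASYNC buffers biodone() also releases the buffer.
 * The hardware status and residual variables are hypothetical.
 *
 *	if (hw_status != HW_OK)
 *		bioerror(bp, EIO);
 *	bp->b_resid = bytes_not_transferred;
 *	biodone(bp);
 */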
1288
1289 /*
1290 * Pick up the device's error number and pass it to the user;
1291  * if there is an error but the number is 0, set a generalized code.
1292 */
1293 int
1294 geterror(struct buf *bp)
1295 {
1296 int error = 0;
1297
1298 ASSERT(SEMA_HELD(&bp->b_sem));
1299 if (bp->b_flags & B_ERROR) {
1300 error = bp->b_error;
1301 if (!error)
1302 error = EIO;
1303 }
1304 return (error);
1305 }
1306
1307 /*
1308 * Support for pageio buffers.
1309 *
1310  * This stuff should be generalized to provide a general-purpose bp
1311 * header facility that can be used for things other than pageio.
1312 */
1313
1314 /*
1315 * Allocate and initialize a buf struct for use with pageio.
1316 */
1317 struct buf *
1318 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1319 {
1320 struct buf *bp;
1321 struct cpu *cpup;
1322
1323 if (flags & B_READ) {
1324 CPU_STATS_ENTER_K();
1325 cpup = CPU; /* get pointer AFTER preemption is disabled */
1326 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1327 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1328
1329 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1330
1331 if ((flags & B_ASYNC) == 0) {
1332 klwp_t *lwp = ttolwp(curthread);
1333 if (lwp != NULL)
1334 lwp->lwp_ru.majflt++;
1335 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1336 /* Kernel probe */
1337 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1338 tnf_opaque, vnode, pp->p_vnode,
1339 tnf_offset, offset, pp->p_offset);
1340 }
1341 /*
1342 * Update statistics for pages being paged in
1343 */
1344 if (pp != NULL && pp->p_vnode != NULL) {
1345 if (IS_SWAPFSVP(pp->p_vnode)) {
1346 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1347 atomic_add_64(&curzone->zone_anonpgin,
1348 btopr(len));
1349 } else {
1350 if (pp->p_vnode->v_flag & VVMEXEC) {
1351 CPU_STATS_ADDQ(cpup, vm, execpgin,
1352 btopr(len));
1353 atomic_add_64(&curzone->zone_execpgin,
1354 btopr(len));
1355 } else {
1356 CPU_STATS_ADDQ(cpup, vm, fspgin,
1357 btopr(len));
1358 atomic_add_64(&curzone->zone_fspgin,
1359 btopr(len));
1360 }
1361 }
1362 }
1363 CPU_STATS_EXIT_K();
1364 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1365 "page_ws_in:pp %p", pp);
1366 /* Kernel probe */
1367 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1368 tnf_opaque, vnode, pp->p_vnode,
1369 tnf_offset, offset, pp->p_offset,
1370 tnf_size, size, len);
1371 }
1372
1373 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1374 bp->b_bcount = len;
1375 bp->b_bufsize = len;
1376 bp->b_pages = pp;
1377 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1378 bp->b_offset = -1;
1379 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1380
1381 /* Initialize bp->b_sem in "locked" state */
1382 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1383
1384 VN_HOLD(vp);
1385 bp->b_vp = vp;
1386 THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1387
1388 /*
1389 * Caller sets dev & blkno and can adjust
1390 * b_addr for page offset and can use bp_mapin
1391 * to make pages kernel addressable.
1392 */
1393 return (bp);
1394 }
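/*
 * Illustrative sketch (an assumption modeled on how file systems
 * typically drive page I/O, not code used by this file): set up a buf
 * over the page list, fill in the device and disk address, issue it,
 * and tear it down with pageio_done() after the synchronous I/O
 * completes.  dev, bn and off are hypothetical.
 *
 *	struct buf *bp;
 *	int err;
 *
 *	bp = pageio_setup(pp, len, vp, B_WRITE | flags);
 *	bp->b_edev = dev;
 *	bp->b_dev = (o_dev_t)cmpdev(dev);
 *	bp->b_blkno = bn;
 *	bp->b_file = vp;
 *	bp->b_offset = (offset_t)off;
 *	(void) bdev_strategy(bp);
 *	err = biowait(bp);
 *	pageio_done(bp);
 */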
1395
1396 void
1397 pageio_done(struct buf *bp)
1398 {
1399 ASSERT(SEMA_HELD(&bp->b_sem));
1400 if (bp->b_flags & B_REMAPPED)
1401 bp_mapout(bp);
1402 VN_RELE(bp->b_vp);
1403 bp->b_vp = NULL;
1404 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1405
1406 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1407 sema_destroy(&bp->b_sem);
1408 sema_destroy(&bp->b_io);
1409 kmem_free(bp, sizeof (struct buf));
1410 }
1411
1412 /*
1413  * Check to see whether the buffers associated with the device,
1414  * except the one pointed to by sbp, are busy.
1415  * NOTE: This expensive operation should be improved together with ufs_icheck().
1416 */
1417 int
1418 bcheck(dev_t dev, struct buf *sbp)
1419 {
1420 struct buf *bp;
1421 struct buf *dp;
1422 int i;
1423 kmutex_t *hmp;
1424
1425 /*
1426 * check for busy bufs for this filesystem
1427 */
1428 for (i = 0; i < v.v_hbuf; i++) {
1429 dp = (struct buf *)&hbuf[i];
1430 hmp = &hbuf[i].b_lock;
1431
1432 mutex_enter(hmp);
1433 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1434 /*
1435 * if buf is busy or dirty, then filesystem is busy
1436 */
1437 if ((bp->b_edev == dev) &&
1438 ((bp->b_flags & B_STALE) == 0) &&
1439 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1440 (bp != sbp)) {
1441 mutex_exit(hmp);
1442 return (1);
1443 }
1444 }
1445 mutex_exit(hmp);
1446 }
1447 return (0);
1448 }
1449
1450 /*
1451  * Hash two 32-bit entities.
1452 */
1453 int
1454 hash2ints(int x, int y)
1455 {
1456 int hash = 0;
1457
1458 hash = x - 1;
1459 hash = ((hash * 7) + (x >> 8)) - 1;
1460 hash = ((hash * 7) + (x >> 16)) - 1;
1461 hash = ((hash * 7) + (x >> 24)) - 1;
1462 hash = ((hash * 7) + y) - 1;
1463 hash = ((hash * 7) + (y >> 8)) - 1;
1464 hash = ((hash * 7) + (y >> 16)) - 1;
1465 hash = ((hash * 7) + (y >> 24)) - 1;
1466
1467 return (hash);
1468 }
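/*
 * For illustration: bio_bhash() folds this hash into the bucket array,
 * which binit() sizes to a power of two, so any 32-bit result maps to
 * a valid bucket (dev and blkno below are hypothetical):
 *
 *	index = hash2ints((int)dev, (int)blkno) & v.v_hmask;
 *	hp = &hbuf[index];
 */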
1469
1470
1471 /*
1472 * Return a new buffer struct.
1473 * Create a new buffer if we haven't gone over our high water
1474 * mark for memory, otherwise try to get one off the freelist.
1475 *
1476 * Returns a locked buf that has no id and is not on any hash or free
1477 * list.
1478 */
1479 static struct buf *
1480 bio_getfreeblk(long bsize)
1481 {
1482 struct buf *bp, *dp;
1483 struct hbuf *hp;
1484 kmutex_t *hmp;
1485 uint_t start, end;
1486
1487 /*
1488 	 * bfreelist.b_bufsize represents the amount of memory
1489 	 * we are allowed to allocate in the cache before we hit our hwm.
1490 	 * References to bfreelist are protected by bfree_lock
1491 	 * (mutex_enter(&bfree_lock) / mutex_exit(&bfree_lock) around each use).
1492 */
1493 bio_mem_get(bsize); /* Account for our memory request */
1494
1495 again:
1496 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1497 sema_p(&bp->b_sem); /* Should never fail */
1498
1499 ASSERT(bp->b_un.b_addr == NULL);
1500 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1501 if (bp->b_un.b_addr != NULL) {
1502 /*
1503 * Make the common path short
1504 */
1505 bp->b_bufsize = bsize;
1506 ASSERT(SEMA_HELD(&bp->b_sem));
1507 return (bp);
1508 } else {
1509 struct buf *save;
1510
1511 save = bp; /* Save bp we allocated */
1512 start = end = lastindex;
1513
1514 biostats.bio_bufwant.value.ui32++;
1515
1516 /*
1517 * Memory isn't available from the system now. Scan
1518 * the hash buckets till enough space is found.
1519 */
1520 do {
1521 hp = &hbuf[start];
1522 hmp = &hp->b_lock;
1523 dp = (struct buf *)hp;
1524
1525 mutex_enter(hmp);
1526 bp = dp->av_forw;
1527
1528 while (bp != dp) {
1529
1530 ASSERT(bp != NULL);
1531
1532 if (!sema_tryp(&bp->b_sem)) {
1533 bp = bp->av_forw;
1534 continue;
1535 }
1536
1537 /*
1538 * Since we are going down the freelist
1539 * associated with this hash bucket the
1540 * B_DELWRI flag should not be set.
1541 */
1542 ASSERT(!(bp->b_flags & B_DELWRI));
1543
1544 if (bp->b_bufsize == bsize) {
1545 hp->b_length--;
1546 notavail(bp);
1547 bremhash(bp);
1548 mutex_exit(hmp);
1549
1550 /*
1551 * Didn't kmem_alloc any more, so don't
1552 * count it twice.
1553 */
1554 mutex_enter(&bfree_lock);
1555 bfreelist.b_bufsize += bsize;
1556 mutex_exit(&bfree_lock);
1557
1558 /*
1559 * Update the lastindex value.
1560 */
1561 lastindex = start;
1562
1563 /*
1564 * Put our saved bp back on the list
1565 */
1566 sema_v(&save->b_sem);
1567 bio_bhdr_free(save);
1568 ASSERT(SEMA_HELD(&bp->b_sem));
1569 return (bp);
1570 }
1571 sema_v(&bp->b_sem);
1572 bp = bp->av_forw;
1573 }
1574 mutex_exit(hmp);
1575 start = ((start + 1) % v.v_hbuf);
1576 } while (start != end);
1577
1578 biostats.bio_bufwait.value.ui32++;
1579 bp = save; /* Use original bp */
1580 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1581 }
1582
1583 bp->b_bufsize = bsize;
1584 ASSERT(SEMA_HELD(&bp->b_sem));
1585 return (bp);
1586 }
1587
1588 /*
1589 * Allocate a buffer header. If none currently available, allocate
1590 * a new pool.
1591 */
1592 static struct buf *
1593 bio_bhdr_alloc(void)
1594 {
1595 struct buf *dp, *sdp;
1596 struct buf *bp;
1597 int i;
1598
1599 for (;;) {
1600 mutex_enter(&bhdr_lock);
1601 if (bhdrlist != NULL) {
1602 bp = bhdrlist;
1603 bhdrlist = bp->av_forw;
1604 mutex_exit(&bhdr_lock);
1605 bp->av_forw = NULL;
1606 return (bp);
1607 }
1608 mutex_exit(&bhdr_lock);
1609
1610 /*
1611 * Need to allocate a new pool. If the system is currently
1612 * out of memory, then try freeing things on the freelist.
1613 */
1614 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1615 if (dp == NULL) {
1616 /*
1617 * System can't give us a pool of headers, try
1618 * recycling from the free lists.
1619 */
1620 bio_recycle(BIO_HEADER, 0);
1621 } else {
1622 sdp = dp;
1623 for (i = 0; i < v.v_buf; i++, dp++) {
1624 /*
1625 * The next two lines are needed since NODEV
1626 * is -1 and not NULL
1627 */
1628 dp->b_dev = (o_dev_t)NODEV;
1629 dp->b_edev = NODEV;
1630 dp->av_forw = dp + 1;
1631 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1632 NULL);
1633 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1634 NULL);
1635 dp->b_offset = -1;
1636 }
1637 mutex_enter(&bhdr_lock);
1638 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1639 bhdrlist = sdp;
1640 nbuf += v.v_buf;
1641 bp = bhdrlist;
1642 bhdrlist = bp->av_forw;
1643 mutex_exit(&bhdr_lock);
1644
1645 bp->av_forw = NULL;
1646 return (bp);
1647 }
1648 }
1649 }
1650
1651 static void
1652 bio_bhdr_free(struct buf *bp)
1653 {
1654 ASSERT(bp->b_back == NULL);
1655 ASSERT(bp->b_forw == NULL);
1656 ASSERT(bp->av_back == NULL);
1657 ASSERT(bp->av_forw == NULL);
1658 ASSERT(bp->b_un.b_addr == NULL);
1659 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1660 ASSERT(bp->b_edev == NODEV);
1661 ASSERT(bp->b_flags == 0);
1662
1663 mutex_enter(&bhdr_lock);
1664 bp->av_forw = bhdrlist;
1665 bhdrlist = bp;
1666 mutex_exit(&bhdr_lock);
1667 }
1668
1669 /*
1670 * If we haven't gone over the high water mark, it's o.k. to
1671  * allocate more buffer space; otherwise recycle buffers
1672 * from the freelist until enough memory is free for a bsize request.
1673 *
1674 * We account for this memory, even though
1675 * we don't allocate it here.
1676 */
1677 static void
1678 bio_mem_get(long bsize)
1679 {
1680 mutex_enter(&bfree_lock);
1681 if (bfreelist.b_bufsize > bsize) {
1682 bfreelist.b_bufsize -= bsize;
1683 mutex_exit(&bfree_lock);
1684 return;
1685 }
1686 mutex_exit(&bfree_lock);
1687 bio_recycle(BIO_MEM, bsize);
1688 }
1689
1690 /*
1691  * Flush a list of delayed write buffers.
1692 * (currently used only by bio_recycle below.)
1693 */
1694 static void
1695 bio_flushlist(struct buf *delwri_list)
1696 {
1697 struct buf *bp;
1698
1699 while (delwri_list != EMPTY_LIST) {
1700 bp = delwri_list;
1701 bp->b_flags |= B_AGE | B_ASYNC;
1702 if (bp->b_vp == NULL) { /* !ufs */
1703 BWRITE(bp);
1704 } else { /* ufs */
1705 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1706 }
1707 delwri_list = bp->b_list;
1708 bp->b_list = NULL;
1709 }
1710 }
1711
1712 /*
1713 * Start recycling buffers on the freelist for one of 2 reasons:
1714 * - we need a buffer header
1715 * - we need to free up memory
1716 * Once started we continue to recycle buffers until the B_AGE
1717 * buffers are gone.
1718 */
1719 static void
1720 bio_recycle(int want, long bsize)
1721 {
1722 struct buf *bp, *dp, *dwp, *nbp;
1723 struct hbuf *hp;
1724 int found = 0;
1725 kmutex_t *hmp;
1726 int start, end;
1727 struct buf *delwri_list = EMPTY_LIST;
1728
1729 /*
1730 * Recycle buffers.
1731 */
1732 top:
1733 start = end = lastindex;
1734 do {
1735 hp = &hbuf[start];
1736 hmp = &hp->b_lock;
1737 dp = (struct buf *)hp;
1738
1739 mutex_enter(hmp);
1740 bp = dp->av_forw;
1741
1742 while (bp != dp) {
1743
1744 ASSERT(bp != NULL);
1745
1746 if (!sema_tryp(&bp->b_sem)) {
1747 bp = bp->av_forw;
1748 continue;
1749 }
1750 /*
1751 * Do we really want to nuke all of the B_AGE stuff??
1752 */
1753 if ((bp->b_flags & B_AGE) == 0 && found) {
1754 sema_v(&bp->b_sem);
1755 mutex_exit(hmp);
1756 lastindex = start;
1757 return; /* All done */
1758 }
1759
1760 ASSERT(MUTEX_HELD(&hp->b_lock));
1761 ASSERT(!(bp->b_flags & B_DELWRI));
1762 hp->b_length--;
1763 notavail(bp);
1764
1765 /*
1766 * Remove bhdr from cache, free up memory,
1767 * and add the hdr to the freelist.
1768 */
1769 bremhash(bp);
1770 mutex_exit(hmp);
1771
1772 if (bp->b_bufsize) {
1773 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1774 bp->b_un.b_addr = NULL;
1775 mutex_enter(&bfree_lock);
1776 bfreelist.b_bufsize += bp->b_bufsize;
1777 mutex_exit(&bfree_lock);
1778 }
1779
1780 bp->b_dev = (o_dev_t)NODEV;
1781 bp->b_edev = NODEV;
1782 bp->b_flags = 0;
1783 sema_v(&bp->b_sem);
1784 bio_bhdr_free(bp);
1785 if (want == BIO_HEADER) {
1786 found = 1;
1787 } else {
1788 ASSERT(want == BIO_MEM);
1789 if (!found && bfreelist.b_bufsize >= bsize) {
1790 /* Account for the memory we want */
1791 mutex_enter(&bfree_lock);
1792 if (bfreelist.b_bufsize >= bsize) {
1793 bfreelist.b_bufsize -= bsize;
1794 found = 1;
1795 }
1796 mutex_exit(&bfree_lock);
1797 }
1798 }
1799
1800 /*
1801 			 * Since we dropped hmp, start from the
1802 			 * beginning.
1803 */
1804 mutex_enter(hmp);
1805 bp = dp->av_forw;
1806 }
1807 mutex_exit(hmp);
1808
1809 /*
1810 * Look at the delayed write list.
1811 * First gather into a private list, then write them.
1812 */
1813 dwp = (struct buf *)&dwbuf[start];
1814 mutex_enter(&blist_lock);
1815 bio_doingflush++;
1816 mutex_enter(hmp);
1817 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1818
1819 ASSERT(bp != NULL);
1820 nbp = bp->av_forw;
1821
1822 if (!sema_tryp(&bp->b_sem))
1823 continue;
1824 ASSERT(bp->b_flags & B_DELWRI);
1825 /*
1826 * Do we really want to nuke all of the B_AGE stuff??
1827 */
1828
1829 if ((bp->b_flags & B_AGE) == 0 && found) {
1830 sema_v(&bp->b_sem);
1831 mutex_exit(hmp);
1832 lastindex = start;
1833 mutex_exit(&blist_lock);
1834 bio_flushlist(delwri_list);
1835 mutex_enter(&blist_lock);
1836 bio_doingflush--;
1837 if (bio_flinv_cv_wanted) {
1838 bio_flinv_cv_wanted = 0;
1839 cv_broadcast(&bio_flushinval_cv);
1840 }
1841 mutex_exit(&blist_lock);
1842 return; /* All done */
1843 }
1844
1845 /*
1846 * If the buffer is already on a flush or
1847 * invalidate list then just skip it.
1848 */
1849 if (bp->b_list != NULL) {
1850 sema_v(&bp->b_sem);
1851 continue;
1852 }
1853 /*
1854 * We are still on the same bucket.
1855 */
1856 hp->b_length--;
1857 notavail(bp);
1858 bp->b_list = delwri_list;
1859 delwri_list = bp;
1860 }
1861 mutex_exit(hmp);
1862 mutex_exit(&blist_lock);
1863 bio_flushlist(delwri_list);
1864 delwri_list = EMPTY_LIST;
1865 mutex_enter(&blist_lock);
1866 bio_doingflush--;
1867 if (bio_flinv_cv_wanted) {
1868 bio_flinv_cv_wanted = 0;
1869 cv_broadcast(&bio_flushinval_cv);
1870 }
1871 mutex_exit(&blist_lock);
1872 start = (start + 1) % v.v_hbuf;
1873
1874 } while (start != end);
1875
1876 if (found)
1877 return;
1878
1879 /*
1880 * Free lists exhausted and we haven't satisfied the request.
1881 * Wait here for more entries to be added to freelist.
1882 * Because this might have just happened, make it timed.
1883 */
1884 mutex_enter(&bfree_lock);
1885 bfreelist.b_flags |= B_WANTED;
1886 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1887 mutex_exit(&bfree_lock);
1888 goto top;
1889 }
1890
1891 /*
1892 * See if the block is associated with some buffer
1893 * (mainly to avoid getting hung up on a wait in breada).
1894 */
1895 static int
1896 bio_incore(dev_t dev, daddr_t blkno)
1897 {
1898 struct buf *bp;
1899 struct buf *dp;
1900 uint_t index;
1901 kmutex_t *hmp;
1902
1903 index = bio_bhash(dev, blkno);
1904 dp = (struct buf *)&hbuf[index];
1905 hmp = &hbuf[index].b_lock;
1906
1907 mutex_enter(hmp);
1908 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1909 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1910 (bp->b_flags & B_STALE) == 0) {
1911 mutex_exit(hmp);
1912 return (1);
1913 }
1914 }
1915 mutex_exit(hmp);
1916 return (0);
1917 }
1918
1919 static void
1920 bio_pageio_done(struct buf *bp)
1921 {
1922 if (bp->b_flags & B_PAGEIO) {
1923
1924 if (bp->b_flags & B_REMAPPED)
1925 bp_mapout(bp);
1926
1927 if (bp->b_flags & B_READ)
1928 pvn_read_done(bp->b_pages, bp->b_flags);
1929 else
1930 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1931 pageio_done(bp);
1932 } else {
1933 ASSERT(bp->b_flags & B_REMAPPED);
1934 bp_mapout(bp);
1935 brelse(bp);
1936 }
1937 }
1938
1939 /*
1940 * bioerror(9F) - indicate error in buffer header
1941 * If 'error' is zero, remove the error indication.
1942 */
1943 void
1944 bioerror(struct buf *bp, int error)
1945 {
1946 ASSERT(bp != NULL);
1947 ASSERT(error >= 0);
1948 ASSERT(SEMA_HELD(&bp->b_sem));
1949
1950 if (error != 0) {
1951 bp->b_flags |= B_ERROR;
1952 } else {
1953 bp->b_flags &= ~B_ERROR;
1954 }
1955 bp->b_error = error;
1956 }
1957
1958 /*
1959 * bioreset(9F) - reuse a private buffer header after I/O is complete
1960 */
1961 void
1962 bioreset(struct buf *bp)
1963 {
1964 ASSERT(bp != NULL);
1965
1966 biofini(bp);
1967 bioinit(bp);
1968 }
1969
1970 /*
1971 * biosize(9F) - return size of a buffer header
1972 */
1973 size_t
1974 biosize(void)
1975 {
1976 return (sizeof (struct buf));
1977 }
1978
1979 /*
1980 * biomodified(9F) - check if buffer is modified
1981 */
1982 int
1983 biomodified(struct buf *bp)
1984 {
1985 int npf;
1986 int ppattr;
1987 struct page *pp;
1988
1989 ASSERT(bp != NULL);
1990
1991 if ((bp->b_flags & B_PAGEIO) == 0) {
1992 return (-1);
1993 }
1994 pp = bp->b_pages;
1995 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1996
1997 while (npf > 0) {
1998 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1999 HAT_SYNC_STOPON_MOD);
2000 if (ppattr & P_MOD)
2001 return (1);
2002 pp = pp->p_next;
2003 npf--;
2004 }
2005
2006 return (0);
2007 }
2008
2009 /*
2010 * bioinit(9F) - initialize a buffer structure
2011 */
2012 void
2013 bioinit(struct buf *bp)
2014 {
2015 bzero(bp, sizeof (struct buf));
2016 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2017 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2018 bp->b_offset = -1;
2019 }
2020
2021 /*
2022 * biofini(9F) - uninitialize a buffer structure
2023 */
2024 void
2025 biofini(struct buf *bp)
2026 {
2027 sema_destroy(&bp->b_io);
2028 sema_destroy(&bp->b_sem);
2029 }
2030
2031 /*
2032 * bioclone(9F) - clone a buffer
2033 */
2034 struct buf *
2035 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2036 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2037 {
2038 struct buf *bufp;
2039
2040 ASSERT(bp);
2041 if (bp_mem == NULL) {
2042 bufp = kmem_alloc(sizeof (struct buf), sleep);
2043 if (bufp == NULL) {
2044 return (NULL);
2045 }
2046 bioinit(bufp);
2047 } else {
2048 bufp = bp_mem;
2049 bioreset(bufp);
2050 }
2051
2052 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2053 B_ABRWRITE)
2054
2055 /*
2056 * The cloned buffer does not inherit the B_REMAPPED flag.
2057 */
2058 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2059 bufp->b_bcount = len;
2060 bufp->b_blkno = blkno;
2061 bufp->b_iodone = iodone;
2062 bufp->b_proc = bp->b_proc;
2063 bufp->b_edev = dev;
2064 bufp->b_file = bp->b_file;
2065 bufp->b_offset = bp->b_offset;
2066
2067 if (bp->b_flags & B_SHADOW) {
2068 ASSERT(bp->b_shadow);
2069 ASSERT(bp->b_flags & B_PHYS);
2070
2071 bufp->b_shadow = bp->b_shadow +
2072 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2073 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2074 if (bp->b_flags & B_REMAPPED)
2075 bufp->b_proc = NULL;
2076 } else {
2077 if (bp->b_flags & B_PAGEIO) {
2078 struct page *pp;
2079 off_t o;
2080 int i;
2081
2082 pp = bp->b_pages;
2083 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2084 for (i = btop(o); i > 0; i--) {
2085 pp = pp->p_next;
2086 }
2087 bufp->b_pages = pp;
2088 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2089 } else {
2090 bufp->b_un.b_addr =
2091 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2092 if (bp->b_flags & B_REMAPPED)
2093 bufp->b_proc = NULL;
2094 }
2095 }
2096 return (bufp);
2097 }
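/*
 * Illustrative sketch (an assumption about typical layered-driver
 * usage, not code used by this file): a driver that must split a
 * request can clone the leading part of a parent buf onto a child
 * device and hand it to that device's strategy routine.  The child's
 * iodone routine would then propagate b_resid/errors back to the
 * parent.  xx_child_done, cdev and cblkno are hypothetical.
 *
 *	struct buf *cbp;
 *
 *	cbp = bioclone(pbp, 0, pbp->b_bcount / 2, cdev, cblkno,
 *	    xx_child_done, NULL, KM_SLEEP);
 *	(void) bdev_strategy(cbp);
 */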