re #13613 rb4516 Tunables needs volatile keyword
--- old/usr/src/uts/common/os/bio.c
+++ new/usr/src/uts/common/os/bio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2011 Joyent, Inc. All rights reserved.
25 25 */
26 +/*
27 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
28 + */
26 29
27 30 /*
28 31 * Copyright (c) 2016 by Delphix. All rights reserved.
29 32 */
30 33
31 34 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 35 /* All Rights Reserved */
33 36
34 37 /*
35 38 * University Copyright- Copyright (c) 1982, 1986, 1988
36 39 * The Regents of the University of California
37 40 * All Rights Reserved
38 41 *
39 42 * University Acknowledgment- Portions of this document are derived from
40 43 * software developed by the University of California, Berkeley, and its
41 44 * contributors.
42 45 */
43 46
44 47 #include <sys/types.h>
45 48 #include <sys/t_lock.h>
46 49 #include <sys/sysmacros.h>
47 50 #include <sys/conf.h>
48 51 #include <sys/cpuvar.h>
49 52 #include <sys/errno.h>
50 53 #include <sys/debug.h>
51 54 #include <sys/buf.h>
52 55 #include <sys/var.h>
53 56 #include <sys/vnode.h>
54 57 #include <sys/bitmap.h>
55 58 #include <sys/cmn_err.h>
56 59 #include <sys/kmem.h>
57 60 #include <sys/vmem.h>
58 61 #include <sys/atomic.h>
59 62 #include <vm/seg_kmem.h>
60 63 #include <vm/page.h>
61 64 #include <vm/pvn.h>
62 65 #include <sys/vtrace.h>
63 66 #include <sys/tnf_probe.h>
64 67 #include <sys/fs/ufs_inode.h>
65 68 #include <sys/fs/ufs_bio.h>
66 69 #include <sys/fs/ufs_log.h>
67 70 #include <sys/systm.h>
68 71 #include <sys/vfs.h>
69 72 #include <sys/sdt.h>
70 73
71 74 /* Locks */
72 75 static kmutex_t blist_lock; /* protects b_list */
73 76 static kmutex_t bhdr_lock; /* protects the bhdrlist */
74 77 static kmutex_t bfree_lock; /* protects the bfreelist structure */
75 78
76 79 struct hbuf *hbuf; /* Hash buckets */
77 80 struct dwbuf *dwbuf; /* Delayed write buckets */
78 81 static struct buf *bhdrlist; /* buf header free list */
79 82 static int nbuf; /* number of buffer headers allocated */
80 83
81 84 static int lastindex; /* Reference point on where to start */
82 85 /* when looking for free buffers */
83 86
84 87 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
85 88 #define EMPTY_LIST ((struct buf *)-1)
86 89
87 90 static kcondvar_t bio_mem_cv; /* Condition variables */
88 91 static kcondvar_t bio_flushinval_cv;
89 92 static int bio_doingflush; /* flush in progress */
90 93 static int bio_doinginval; /* inval in progress */
91 94 static int bio_flinv_cv_wanted; /* someone waiting for cv */
92 95
93 96 /*
94 97 * Statistics on the buffer cache
95 98 */
96 99 struct biostats biostats = {
97 100 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
98 101 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
99 102 { "new_buffer_requests", KSTAT_DATA_UINT32 },
100 103 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
101 104 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
102 105 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
103 106 };
104 107
105 108 /*
106 109 * kstat data
107 110 */
108 111 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
109 112 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
110 113 sizeof (kstat_named_t));
111 114
112 115 /*
113 116 * Statistics on ufs buffer cache
114 117 * Not protected by locks
115 118 */
116 119 struct ufsbiostats ub = {
117 120 { "breads", KSTAT_DATA_UINT32 },
118 121 { "bwrites", KSTAT_DATA_UINT32 },
119 122 { "fbiwrites", KSTAT_DATA_UINT32 },
120 123 { "getpages", KSTAT_DATA_UINT32 },
121 124 { "getras", KSTAT_DATA_UINT32 },
122 125 { "putsyncs", KSTAT_DATA_UINT32 },
123 126 { "putasyncs", KSTAT_DATA_UINT32 },
124 127 { "putpageios", KSTAT_DATA_UINT32 },
125 128 };
126 129
127 130 /*
128 131 * more UFS Logging eccentricities...
129 132 *
130 133 * required since "#pragma weak ..." doesn't work in reverse order.
131 134 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
132 135 * to ufs routines don't get plugged into bio.c calls so
133 136 * we initialize it when setting up the "lufsops" table
134 137 * in "lufs.c:_init()"
135 138 */
136 139 void (*bio_lufs_strategy)(void *, buf_t *);
137 140 void (*bio_snapshot_strategy)(void *, buf_t *);
138 141
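
Editorial sketch (not part of the webrev): the comment above describes a hook that genunix exports and the UFS logging module fills in at load time. Conceptually, with hypothetical names (the real wiring lives in lufs.c:_init()):

        extern void (*bio_lufs_strategy)(void *, buf_t *);      /* from bio.c */
        extern struct modlinkage modlinkage;            /* module boilerplate, assumed */

        static void
        lufs_strategy_impl(void *log, buf_t *bp)        /* hypothetical routine */
        {
                /* route bp through the on-disk log before issuing the I/O */
        }

        int
        _init(void)
        {
                bio_lufs_strategy = lufs_strategy_impl; /* plug the genunix hook */
                return (mod_install(&modlinkage));
        }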
139 142
140 143 /* Private routines */
141 144 static struct buf *bio_getfreeblk(long);
142 145 static void bio_mem_get(long);
143 146 static void bio_bhdr_free(struct buf *);
144 147 static struct buf *bio_bhdr_alloc(void);
145 148 static void bio_recycle(int, long);
146 149 static void bio_pageio_done(struct buf *);
147 150 static int bio_incore(dev_t, daddr_t);
148 151
149 152 /*
150 153 * Buffer cache constants
151 154 */
152 155 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
153 156 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
154 157 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
155 158 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
156 159 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
157 160 #define BIO_HASHLEN 4 /* Target length of hash chains */
158 161
159 162
160 163 /* Flags for bio_recycle() */
161 164 #define BIO_HEADER 0x01
162 165 #define BIO_MEM 0x02
163 166
164 -extern int bufhwm; /* User tunable - high water mark for mem */
165 -extern int bufhwm_pct; /* ditto - given in % of physmem */
167 +extern volatile int bufhwm; /* User tunable - high water mark for mem */
168 +extern volatile int bufhwm_pct; /* ditto - given in % of physmem */
166 169
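
Editorial note (not part of the webrev): the two declarations above are the substance of this change -- bufhwm and bufhwm_pct are administrator tunables, normally set in /etc/system or patched into a running kernel with mdb -kw, so the compiler must not treat them as compile-time constants. A minimal sketch of the hazard, using a hypothetical tunable name:

        /*
         * Hypothetical sketch only.  If no writer of my_tunable is
         * visible to the compiler (likely, since only the debugger or
         * /etc/system ever changes it), a whole-program optimizer may
         * fold the initial value into the generated code.  The volatile
         * qualifier forces a real memory load on every access, so a
         * patched value is actually observed.
         */
        extern volatile int my_tunable;

        static int
        my_tunable_is_set(void)
        {
                return (my_tunable != 0);
        }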
167 170 /*
168 171 * The following routines allocate and free
169 172 * buffers with various side effects. In general the
170 173 * arguments to an allocate routine are a device and
171 174 * a block number, and the value is a pointer to
172 175 * the buffer header; the buffer returned is locked with a
173 176 * binary semaphore so that no one else can touch it. If the block was
174 177 * already in core, no I/O need be done; if it is
175 178 * already locked, the process waits until it becomes free.
176 179 * The following routines allocate a buffer:
177 180 * getblk
178 181 * bread/BREAD
179 182 * breada
180 183 * Eventually the buffer must be released, possibly with the
181 184 * side effect of writing it out, by using one of
182 185 * bwrite/BWRITE/brwrite
183 186 * bdwrite/bdrwrite
184 187 * bawrite
185 188 * brelse
186 189 *
187 190 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
188 191 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
189 192 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
190 193 * B_DONE is still used to denote a buffer with I/O complete on it.
191 194 *
192 195 * The bfreelist.b_bcount field is computed every time fsflush runs. It
193 196 * should not be used where a very accurate count of the free buffers is
194 197 * needed.
195 198 */
196 199
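
Editorial sketch (not part of the webrev): a minimal, hypothetical caller of the routines described in the comment above, assuming dev, blkno and bsize are supplied by the file system and that anything beyond b_sem handling is elided:

        static int
        update_block(dev_t dev, daddr_t blkno, long bsize)
        {
                struct buf *bp;
                int err;

                bp = bread(dev, blkno, bsize);  /* returns with b_sem held */
                if ((err = geterror(bp)) != 0) {
                        brelse(bp);             /* give it back, no I/O implied */
                        return (err);
                }
                /* ... examine or modify bp->b_un.b_addr here ... */
                bdwrite(bp);            /* mark delayed-write and release */
                return (0);
        }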
197 200 /*
198 201 * Read in (if necessary) the block and return a buffer pointer.
199 202 *
200 203 * This interface is provided for binary compatibility. Using
201 204 * BREAD() directly avoids the extra function call overhead invoked
202 205 * by calling this routine.
203 206 */
204 207 struct buf *
205 208 bread(dev_t dev, daddr_t blkno, long bsize)
206 209 {
207 210 return (BREAD(dev, blkno, bsize));
208 211 }
209 212
210 213 /*
211 214 * Common code for reading a buffer with various options
212 215 *
213 216 * Read in (if necessary) the block and return a buffer pointer.
214 217 */
215 218 struct buf *
216 219 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
217 220 {
218 221 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
219 222 struct buf *bp;
220 223 klwp_t *lwp = ttolwp(curthread);
221 224
222 225 CPU_STATS_ADD_K(sys, lread, 1);
223 226 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
224 227 if (bp->b_flags & B_DONE)
225 228 return (bp);
226 229 bp->b_flags |= B_READ;
227 230 ASSERT(bp->b_bcount == bsize);
228 231 if (ufsvfsp == NULL) { /* !ufs */
229 232 (void) bdev_strategy(bp);
230 233 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
231 234 /* ufs && logging */
232 235 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
233 236 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
234 237 /* ufs && snapshots */
235 238 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
236 239 } else {
237 240 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
238 241 ub.ub_breads.value.ul++; /* ufs && !logging */
239 242 (void) bdev_strategy(bp);
240 243 }
241 244 if (lwp != NULL)
242 245 lwp->lwp_ru.inblock++;
243 246 CPU_STATS_ADD_K(sys, bread, 1);
244 247 (void) biowait(bp);
245 248 return (bp);
246 249 }
247 250
248 251 /*
249 252 * Read in the block, like bread, but also start I/O on the
250 253 * read-ahead block (which is not allocated to the caller).
251 254 */
252 255 struct buf *
253 256 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
254 257 {
255 258 struct buf *bp, *rabp;
256 259 klwp_t *lwp = ttolwp(curthread);
257 260
258 261 bp = NULL;
259 262 if (!bio_incore(dev, blkno)) {
260 263 CPU_STATS_ADD_K(sys, lread, 1);
261 264 bp = GETBLK(dev, blkno, bsize);
262 265 if ((bp->b_flags & B_DONE) == 0) {
263 266 bp->b_flags |= B_READ;
264 267 bp->b_bcount = bsize;
265 268 (void) bdev_strategy(bp);
266 269 if (lwp != NULL)
267 270 lwp->lwp_ru.inblock++;
268 271 CPU_STATS_ADD_K(sys, bread, 1);
269 272 }
270 273 }
271 274 if (rablkno && bfreelist.b_bcount > 1 &&
272 275 !bio_incore(dev, rablkno)) {
273 276 rabp = GETBLK(dev, rablkno, bsize);
274 277 if (rabp->b_flags & B_DONE)
275 278 brelse(rabp);
276 279 else {
277 280 rabp->b_flags |= B_READ|B_ASYNC;
278 281 rabp->b_bcount = bsize;
279 282 (void) bdev_strategy(rabp);
280 283 if (lwp != NULL)
281 284 lwp->lwp_ru.inblock++;
282 285 CPU_STATS_ADD_K(sys, bread, 1);
283 286 }
284 287 }
285 288 if (bp == NULL)
286 289 return (BREAD(dev, blkno, bsize));
287 290 (void) biowait(bp);
288 291 return (bp);
289 292 }
290 293
291 294 /*
292 295 * Common code for writing a buffer with various options.
293 296 *
294 297 * force_wait - wait for write completion regardless of B_ASYNC flag
295 298 * do_relse - release the buffer when we are done
296 299 * clear_flags - flags to clear from the buffer
297 300 */
298 301 void
299 302 bwrite_common(void *arg, struct buf *bp, int force_wait,
300 303 int do_relse, int clear_flags)
301 304 {
302 305 register int do_wait;
303 306 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
304 307 int flag;
305 308 klwp_t *lwp = ttolwp(curthread);
306 309 struct cpu *cpup;
307 310
308 311 ASSERT(SEMA_HELD(&bp->b_sem));
309 312 flag = bp->b_flags;
310 313 bp->b_flags &= ~clear_flags;
311 314 if (lwp != NULL)
312 315 lwp->lwp_ru.oublock++;
313 316 CPU_STATS_ENTER_K();
314 317 cpup = CPU; /* get pointer AFTER preemption is disabled */
315 318 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
316 319 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
317 320 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
318 321 if (do_wait == 0)
319 322 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
320 323 CPU_STATS_EXIT_K();
321 324 if (ufsvfsp == NULL) {
322 325 (void) bdev_strategy(bp);
323 326 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
324 327 /* ufs && logging */
325 328 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
326 329 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
327 330 /* ufs && snapshots */
328 331 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
329 332 } else {
330 333 ub.ub_bwrites.value.ul++; /* ufs && !logging */
331 334 (void) bdev_strategy(bp);
332 335 }
333 336 if (do_wait) {
334 337 (void) biowait(bp);
335 338 if (do_relse) {
336 339 brelse(bp);
337 340 }
338 341 }
339 342 }
340 343
341 344 /*
342 345 * Write the buffer, waiting for completion (unless B_ASYNC is set).
343 346 * Then release the buffer.
344 347 * This interface is provided for binary compatibility. Using
345 348 * BWRITE() directly avoids the extra function call overhead invoked
346 349 * by calling this routine.
347 350 */
348 351 void
349 352 bwrite(struct buf *bp)
350 353 {
351 354 BWRITE(bp);
352 355 }
353 356
354 357 /*
355 358 * Write the buffer, waiting for completion.
356 359 * But don't release the buffer afterwards.
357 360 * This interface is provided for binary compatibility. Using
358 361 * BWRITE2() directly avoids the extra function call overhead.
359 362 */
360 363 void
361 364 bwrite2(struct buf *bp)
362 365 {
363 366 BWRITE2(bp);
364 367 }
365 368
366 369 /*
367 370 * Release the buffer, marking it so that if it is grabbed
368 371 * for another purpose it will be written out before being
369 372 * given up (e.g. when writing a partial block where it is
370 373 * assumed that another write for the same block will soon follow).
371 374 * Also save the time that the block is first marked as delayed
372 375 * so that it will be written in a reasonable time.
373 376 */
374 377 void
375 378 bdwrite(struct buf *bp)
376 379 {
377 380 ASSERT(SEMA_HELD(&bp->b_sem));
378 381 CPU_STATS_ADD_K(sys, lwrite, 1);
379 382 if ((bp->b_flags & B_DELWRI) == 0)
380 383 bp->b_start = ddi_get_lbolt();
381 384 /*
382 385 * B_DONE allows others to use the buffer, B_DELWRI causes the
383 386 * buffer to be written before being reused, and setting b_resid
384 387 * to zero says the buffer is complete.
385 388 */
386 389 bp->b_flags |= B_DELWRI | B_DONE;
387 390 bp->b_resid = 0;
388 391 brelse(bp);
389 392 }
390 393
391 394 /*
392 395 * Release the buffer, start I/O on it, but don't wait for completion.
393 396 */
394 397 void
395 398 bawrite(struct buf *bp)
396 399 {
397 400 ASSERT(SEMA_HELD(&bp->b_sem));
398 401
399 402 /* Use bfreelist.b_bcount as a weird-ass heuristic */
400 403 if (bfreelist.b_bcount > 4)
401 404 bp->b_flags |= B_ASYNC;
402 405 BWRITE(bp);
403 406 }
404 407
405 408 /*
406 409 * Release the buffer, with no I/O implied.
407 410 */
408 411 void
409 412 brelse(struct buf *bp)
410 413 {
411 414 struct buf **backp;
412 415 uint_t index;
413 416 kmutex_t *hmp;
414 417 struct buf *dp;
415 418 struct hbuf *hp;
416 419
417 420
418 421 ASSERT(SEMA_HELD(&bp->b_sem));
419 422
420 423 /*
421 424 * Clear the retry write flag if the buffer was written without
422 425 * error. The presence of B_DELWRI means the buffer has not yet
423 426 * been written and the presence of B_ERROR means that an error
424 427 * is still occurring.
425 428 */
426 429 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
427 430 bp->b_flags &= ~B_RETRYWRI;
428 431 }
429 432
430 433 /* Check for anomalous conditions */
431 434 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
432 435 if (bp->b_flags & B_NOCACHE) {
433 436 /* Don't add to the freelist. Destroy it now */
434 437 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
435 438 sema_destroy(&bp->b_sem);
436 439 sema_destroy(&bp->b_io);
437 440 kmem_free(bp, sizeof (struct buf));
438 441 return;
439 442 }
440 443 /*
441 444 * If a write failed and we are supposed to retry write,
442 445 * don't toss the buffer. Keep it around and mark it
443 446 * delayed write in the hopes that it will eventually
444 447 * get flushed (and still keep the system running.)
445 448 */
446 449 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
447 450 bp->b_flags |= B_DELWRI;
448 451 /* keep fsflush from trying continuously to flush */
449 452 bp->b_start = ddi_get_lbolt();
450 453 } else
451 454 bp->b_flags |= B_AGE|B_STALE;
452 455 bp->b_flags &= ~B_ERROR;
453 456 bp->b_error = 0;
454 457 }
455 458
456 459 /*
457 460 * If delayed write is set then put in on the delayed
458 461 * write list instead of the free buffer list.
459 462 */
460 463 index = bio_bhash(bp->b_edev, bp->b_blkno);
461 464 hmp = &hbuf[index].b_lock;
462 465
463 466 mutex_enter(hmp);
464 467 hp = &hbuf[index];
465 468 dp = (struct buf *)hp;
466 469
467 470 /*
468 471 * Make sure that the number of entries on this list are
469 472 * Zero <= count <= total # buffers
470 473 */
471 474 ASSERT(hp->b_length >= 0);
472 475 ASSERT(hp->b_length < nbuf);
473 476
474 477 hp->b_length++; /* We are adding this buffer */
475 478
476 479 if (bp->b_flags & B_DELWRI) {
477 480 /*
478 481 * This buffer goes on the delayed write buffer list
479 482 */
480 483 dp = (struct buf *)&dwbuf[index];
481 484 }
482 485 ASSERT(bp->b_bufsize > 0);
483 486 ASSERT(bp->b_bcount > 0);
484 487 ASSERT(bp->b_un.b_addr != NULL);
485 488
486 489 if (bp->b_flags & B_AGE) {
487 490 backp = &dp->av_forw;
488 491 (*backp)->av_back = bp;
489 492 bp->av_forw = *backp;
490 493 *backp = bp;
491 494 bp->av_back = dp;
492 495 } else {
493 496 backp = &dp->av_back;
494 497 (*backp)->av_forw = bp;
495 498 bp->av_back = *backp;
496 499 *backp = bp;
497 500 bp->av_forw = dp;
498 501 }
499 502 mutex_exit(hmp);
500 503
501 504 if (bfreelist.b_flags & B_WANTED) {
502 505 /*
503 506 * Should come here very very rarely.
504 507 */
505 508 mutex_enter(&bfree_lock);
506 509 if (bfreelist.b_flags & B_WANTED) {
507 510 bfreelist.b_flags &= ~B_WANTED;
508 511 cv_broadcast(&bio_mem_cv);
509 512 }
510 513 mutex_exit(&bfree_lock);
511 514 }
512 515
513 516 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
514 517 /*
515 518 * Don't let anyone get the buffer off the freelist before we
516 519 * release our hold on it.
517 520 */
518 521 sema_v(&bp->b_sem);
519 522 }
520 523
521 524 /*
522 525 * Return a count of the number of B_BUSY buffers in the system
523 526 * Can only be used as a good estimate. If 'cleanit' is set,
524 527 * try to flush all bufs.
525 528 */
526 529 int
527 530 bio_busy(int cleanit)
528 531 {
529 532 struct buf *bp, *dp;
530 533 int busy = 0;
531 534 int i;
532 535 kmutex_t *hmp;
533 536
534 537 for (i = 0; i < v.v_hbuf; i++) {
535 538 dp = (struct buf *)&hbuf[i];
536 539 hmp = &hbuf[i].b_lock;
537 540
538 541 mutex_enter(hmp);
539 542 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
540 543 if (bp->b_flags & B_BUSY)
541 544 busy++;
542 545 }
543 546 mutex_exit(hmp);
544 547 }
545 548
546 549 if (cleanit && busy != 0) {
547 550 bflush(NODEV);
548 551 }
549 552
550 553 return (busy);
551 554 }
552 555
553 556 /*
554 557 * this interface is provided for binary compatibility.
555 558 *
556 559 * Assign a buffer for the given block. If the appropriate
557 560 * block is already associated, return it; otherwise search
558 561 * for the oldest non-busy buffer and reassign it.
559 562 */
560 563 struct buf *
561 564 getblk(dev_t dev, daddr_t blkno, long bsize)
562 565 {
563 566 return (getblk_common(/* ufsvfsp */ NULL, dev,
564 567 blkno, bsize, /* errflg */ 0));
565 568 }
566 569
567 570 /*
568 571 * Assign a buffer for the given block. If the appropriate
569 572 * block is already associated, return it; otherwise search
570 573 * for the oldest non-busy buffer and reassign it.
571 574 */
572 575 struct buf *
573 576 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
574 577 {
575 578 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
576 579 struct buf *bp;
577 580 struct buf *dp;
578 581 struct buf *nbp = NULL;
579 582 struct buf *errbp;
580 583 uint_t index;
581 584 kmutex_t *hmp;
582 585 struct hbuf *hp;
583 586
584 587 if (getmajor(dev) >= devcnt)
585 588 cmn_err(CE_PANIC, "blkdev");
586 589
587 590 biostats.bio_lookup.value.ui32++;
588 591
589 592 index = bio_bhash(dev, blkno);
590 593 hp = &hbuf[index];
591 594 dp = (struct buf *)hp;
592 595 hmp = &hp->b_lock;
593 596
594 597 mutex_enter(hmp);
595 598 loop:
596 599 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
597 600 if (bp->b_blkno != blkno || bp->b_edev != dev ||
598 601 (bp->b_flags & B_STALE))
599 602 continue;
600 603 /*
601 604 * Avoid holding the hash lock in the event that
602 605 * the buffer is locked by someone. Since the hash chain
603 606 * may change when we drop the hash lock
604 607 * we have to start at the beginning of the chain if the
605 608 * buffer identity/contents aren't valid.
606 609 */
607 610 if (!sema_tryp(&bp->b_sem)) {
608 611 biostats.bio_bufbusy.value.ui32++;
609 612 mutex_exit(hmp);
610 613 /*
611 614 * OK, we are dealing with a busy buffer.
612 615 * In the case that we are panicking and we
613 616 * got called from bread(), we have some chance
614 617 * for error recovery. So better bail out from
615 618 * here since sema_p() won't block. If we got
616 619 * called directly from ufs routines, there is
617 620 * no way to report an error yet.
618 621 */
619 622 if (panicstr && errflg)
620 623 goto errout;
621 624 /*
622 625 * For the following line of code to work
623 626 * correctly never kmem_free the buffer "header".
624 627 */
625 628 sema_p(&bp->b_sem);
626 629 if (bp->b_blkno != blkno || bp->b_edev != dev ||
627 630 (bp->b_flags & B_STALE)) {
628 631 sema_v(&bp->b_sem);
629 632 mutex_enter(hmp);
630 633 goto loop; /* start over */
631 634 }
632 635 mutex_enter(hmp);
633 636 }
634 637 /* Found */
635 638 biostats.bio_hit.value.ui32++;
636 639 bp->b_flags &= ~B_AGE;
637 640
638 641 /*
639 642 * Yank it off the free/delayed write lists
640 643 */
641 644 hp->b_length--;
642 645 notavail(bp);
643 646 mutex_exit(hmp);
644 647
645 648 ASSERT((bp->b_flags & B_NOCACHE) == NULL);
646 649
647 650 if (nbp == NULL) {
648 651 /*
649 652 * Make the common path short.
650 653 */
651 654 ASSERT(SEMA_HELD(&bp->b_sem));
652 655 return (bp);
653 656 }
654 657
655 658 biostats.bio_bufdup.value.ui32++;
656 659
657 660 /*
658 661 * The buffer must have entered during the lock upgrade
659 662 * so free the new buffer we allocated and return the
660 663 * found buffer.
661 664 */
662 665 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
663 666 nbp->b_un.b_addr = NULL;
664 667
665 668 /*
666 669 * Account for the memory
667 670 */
668 671 mutex_enter(&bfree_lock);
669 672 bfreelist.b_bufsize += nbp->b_bufsize;
670 673 mutex_exit(&bfree_lock);
671 674
672 675 /*
673 676 * Destroy buf identity, and place on avail list
674 677 */
675 678 nbp->b_dev = (o_dev_t)NODEV;
676 679 nbp->b_edev = NODEV;
677 680 nbp->b_flags = 0;
678 681 nbp->b_file = NULL;
679 682 nbp->b_offset = -1;
680 683
681 684 sema_v(&nbp->b_sem);
682 685 bio_bhdr_free(nbp);
683 686
684 687 ASSERT(SEMA_HELD(&bp->b_sem));
685 688 return (bp);
686 689 }
687 690
688 691 /*
689 692 * bio_getfreeblk may block so check the hash chain again.
690 693 */
691 694 if (nbp == NULL) {
692 695 mutex_exit(hmp);
693 696 nbp = bio_getfreeblk(bsize);
694 697 mutex_enter(hmp);
695 698 goto loop;
696 699 }
697 700
698 701 /*
699 702 * New buffer. Assign nbp and stick it on the hash.
700 703 */
701 704 nbp->b_flags = B_BUSY;
702 705 nbp->b_edev = dev;
703 706 nbp->b_dev = (o_dev_t)cmpdev(dev);
704 707 nbp->b_blkno = blkno;
705 708 nbp->b_iodone = NULL;
706 709 nbp->b_bcount = bsize;
707 710 /*
708 711 * If we are given a ufsvfsp and the vfs_root field is NULL
709 712 * then this must be I/O for a superblock. A superblock's
710 713 * buffer is set up in mountfs() and there is no root vnode
711 714 * at that point.
712 715 */
713 716 if (ufsvfsp && ufsvfsp->vfs_root) {
714 717 nbp->b_vp = ufsvfsp->vfs_root;
715 718 } else {
716 719 nbp->b_vp = NULL;
717 720 }
718 721
719 722 ASSERT((nbp->b_flags & B_NOCACHE) == NULL);
720 723
721 724 binshash(nbp, dp);
722 725 mutex_exit(hmp);
723 726
724 727 ASSERT(SEMA_HELD(&nbp->b_sem));
725 728
726 729 return (nbp);
727 730
728 731
729 732 /*
730 733 * Come here in case of an internal error. At this point we couldn't
731 734 * get a buffer, but we have to return one. Hence we allocate some
732 735 * kind of error reply buffer on the fly. This buffer is marked as
733 736 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
734 737 * - B_ERROR will indicate error to the caller.
735 738 * - B_DONE will prevent us from reading the buffer from
736 739 * the device.
737 740 * - B_NOCACHE will cause this buffer to be freed in
738 741 * brelse().
739 742 */
740 743
741 744 errout:
742 745 errbp = geteblk();
743 746 sema_p(&errbp->b_sem);
744 747 errbp->b_flags &= ~B_BUSY;
745 748 errbp->b_flags |= (B_ERROR | B_DONE);
746 749 return (errbp);
747 750 }
748 751
749 752 /*
750 753 * Get an empty block, not assigned to any particular device.
751 754 * Returns a locked buffer that is not on any hash or free list.
752 755 */
753 756 struct buf *
754 757 ngeteblk(long bsize)
755 758 {
756 759 struct buf *bp;
757 760
758 761 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
759 762 bioinit(bp);
760 763 bp->av_forw = bp->av_back = NULL;
761 764 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
762 765 bp->b_bufsize = bsize;
763 766 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
764 767 bp->b_dev = (o_dev_t)NODEV;
765 768 bp->b_edev = NODEV;
766 769 bp->b_lblkno = 0;
767 770 bp->b_bcount = bsize;
768 771 bp->b_iodone = NULL;
769 772 return (bp);
770 773 }
771 774
772 775 /*
773 776 * Interface of geteblk() is kept intact to maintain driver compatibility.
774 777 * Use ngeteblk() to allocate block size other than 1 KB.
775 778 */
776 779 struct buf *
777 780 geteblk(void)
778 781 {
779 782 return (ngeteblk((long)1024));
780 783 }
781 784
782 785 /*
783 786 * Return a buffer w/o sleeping
784 787 */
785 788 struct buf *
786 789 trygetblk(dev_t dev, daddr_t blkno)
787 790 {
788 791 struct buf *bp;
789 792 struct buf *dp;
790 793 struct hbuf *hp;
791 794 kmutex_t *hmp;
792 795 uint_t index;
793 796
794 797 index = bio_bhash(dev, blkno);
795 798 hp = &hbuf[index];
796 799 hmp = &hp->b_lock;
797 800
798 801 if (!mutex_tryenter(hmp))
799 802 return (NULL);
800 803
801 804 dp = (struct buf *)hp;
802 805 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
803 806 if (bp->b_blkno != blkno || bp->b_edev != dev ||
804 807 (bp->b_flags & B_STALE))
805 808 continue;
806 809 /*
807 810 * Get access to a valid buffer without sleeping
808 811 */
809 812 if (sema_tryp(&bp->b_sem)) {
810 813 if (bp->b_flags & B_DONE) {
811 814 hp->b_length--;
812 815 notavail(bp);
813 816 mutex_exit(hmp);
814 817 return (bp);
815 818 } else {
816 819 sema_v(&bp->b_sem);
817 820 break;
818 821 }
819 822 }
820 823 break;
821 824 }
822 825 mutex_exit(hmp);
823 826 return (NULL);
824 827 }
825 828
826 829 /*
827 830 * Wait for I/O completion on the buffer; return errors
828 831 * to the user.
829 832 */
830 833 int
831 834 iowait(struct buf *bp)
832 835 {
833 836 ASSERT(SEMA_HELD(&bp->b_sem));
834 837 return (biowait(bp));
835 838 }
836 839
837 840 /*
838 841 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
839 842 * and wake up anyone waiting for it.
840 843 */
841 844 void
842 845 iodone(struct buf *bp)
843 846 {
844 847 ASSERT(SEMA_HELD(&bp->b_sem));
845 848 (void) biodone(bp);
846 849 }
847 850
848 851 /*
849 852 * Zero the core associated with a buffer.
850 853 */
851 854 void
852 855 clrbuf(struct buf *bp)
853 856 {
854 857 ASSERT(SEMA_HELD(&bp->b_sem));
855 858 bzero(bp->b_un.b_addr, bp->b_bcount);
856 859 bp->b_resid = 0;
857 860 }
858 861
859 862
860 863 /*
861 864 * Make sure all write-behind blocks on dev (or NODEV for all)
862 865 * are flushed out.
863 866 */
864 867 void
865 868 bflush(dev_t dev)
866 869 {
867 870 struct buf *bp, *dp;
868 871 struct hbuf *hp;
869 872 struct buf *delwri_list = EMPTY_LIST;
870 873 int i, index;
871 874 kmutex_t *hmp;
872 875
873 876 mutex_enter(&blist_lock);
874 877 /*
875 878 * Wait for any invalidates or flushes ahead of us to finish.
876 879 * We really could split blist_lock up per device for better
877 880 * parallelism here.
878 881 */
879 882 while (bio_doinginval || bio_doingflush) {
880 883 bio_flinv_cv_wanted = 1;
881 884 cv_wait(&bio_flushinval_cv, &blist_lock);
882 885 }
883 886 bio_doingflush++;
884 887 /*
885 888 * Gather all B_DELWRI buffer for device.
886 889 * Lock ordering is b_sem > hash lock (brelse).
887 890 * Since we are finding the buffer via the delayed write list,
888 891 * it may be busy and we would block trying to get the
889 892 * b_sem lock while holding hash lock. So transfer all the
890 893 * candidates on the delwri_list and then drop the hash locks.
891 894 */
892 895 for (i = 0; i < v.v_hbuf; i++) {
893 896 hmp = &hbuf[i].b_lock;
894 897 dp = (struct buf *)&dwbuf[i];
895 898 mutex_enter(hmp);
896 899 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897 900 if (dev == NODEV || bp->b_edev == dev) {
898 901 if (bp->b_list == NULL) {
899 902 bp->b_list = delwri_list;
900 903 delwri_list = bp;
901 904 }
902 905 }
903 906 }
904 907 mutex_exit(hmp);
905 908 }
906 909 mutex_exit(&blist_lock);
907 910
908 911 /*
909 912 * Now that the hash locks have been dropped grab the semaphores
910 913 * and write back all the buffers that have B_DELWRI set.
911 914 */
912 915 while (delwri_list != EMPTY_LIST) {
913 916 bp = delwri_list;
914 917
915 918 sema_p(&bp->b_sem); /* may block */
916 919 if ((dev != bp->b_edev && dev != NODEV) ||
917 920 (panicstr && bp->b_flags & B_BUSY)) {
918 921 sema_v(&bp->b_sem);
919 922 delwri_list = bp->b_list;
920 923 bp->b_list = NULL;
921 924 continue; /* No longer a candidate */
922 925 }
923 926 if (bp->b_flags & B_DELWRI) {
924 927 index = bio_bhash(bp->b_edev, bp->b_blkno);
925 928 hp = &hbuf[index];
926 929 hmp = &hp->b_lock;
927 930 dp = (struct buf *)hp;
928 931
929 932 bp->b_flags |= B_ASYNC;
930 933 mutex_enter(hmp);
931 934 hp->b_length--;
932 935 notavail(bp);
933 936 mutex_exit(hmp);
934 937 if (bp->b_vp == NULL) { /* !ufs */
935 938 BWRITE(bp);
936 939 } else { /* ufs */
937 940 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
938 941 }
939 942 } else {
940 943 sema_v(&bp->b_sem);
941 944 }
942 945 delwri_list = bp->b_list;
943 946 bp->b_list = NULL;
944 947 }
945 948 mutex_enter(&blist_lock);
946 949 bio_doingflush--;
947 950 if (bio_flinv_cv_wanted) {
948 951 bio_flinv_cv_wanted = 0;
949 952 cv_broadcast(&bio_flushinval_cv);
950 953 }
951 954 mutex_exit(&blist_lock);
952 955 }
953 956
954 957 /*
955 958 * Ensure that a specified block is up-to-date on disk.
956 959 */
957 960 void
958 961 blkflush(dev_t dev, daddr_t blkno)
959 962 {
960 963 struct buf *bp, *dp;
961 964 struct hbuf *hp;
962 965 struct buf *sbp = NULL;
963 966 uint_t index;
964 967 kmutex_t *hmp;
965 968
966 969 index = bio_bhash(dev, blkno);
967 970 hp = &hbuf[index];
968 971 dp = (struct buf *)hp;
969 972 hmp = &hp->b_lock;
970 973
971 974 /*
972 975 * Identify the buffer in the cache belonging to
973 976 * this device and blkno (if any).
974 977 */
975 978 mutex_enter(hmp);
976 979 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
977 980 if (bp->b_blkno != blkno || bp->b_edev != dev ||
978 981 (bp->b_flags & B_STALE))
979 982 continue;
980 983 sbp = bp;
981 984 break;
982 985 }
983 986 mutex_exit(hmp);
984 987 if (sbp == NULL)
985 988 return;
986 989 /*
987 990 * Now check the buffer we have identified and
988 991 * make sure it still belongs to the device and is B_DELWRI
989 992 */
990 993 sema_p(&sbp->b_sem);
991 994 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
992 995 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
993 996 mutex_enter(hmp);
994 997 hp->b_length--;
995 998 notavail(sbp);
996 999 mutex_exit(hmp);
997 1000 /*
998 1001 * XXX - There is nothing to guarantee a synchronous
999 1002 * write here if the B_ASYNC flag is set. This needs
1000 1003 * some investigation.
1001 1004 */
1002 1005 if (sbp->b_vp == NULL) { /* !ufs */
1003 1006 BWRITE(sbp); /* synchronous write */
1004 1007 } else { /* ufs */
1005 1008 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1006 1009 }
1007 1010 } else {
1008 1011 sema_v(&sbp->b_sem);
1009 1012 }
1010 1013 }
1011 1014
1012 1015 /*
1013 1016 * Same as binval, except can force-invalidate delayed-write buffers
1014 1017 * (which may not already be flushed because of device errors). Also
1015 1018 * makes sure that the retry write flag is cleared.
1016 1019 */
1017 1020 int
1018 1021 bfinval(dev_t dev, int force)
1019 1022 {
1020 1023 struct buf *dp;
1021 1024 struct buf *bp;
1022 1025 struct buf *binval_list = EMPTY_LIST;
1023 1026 int i, error = 0;
1024 1027 kmutex_t *hmp;
1025 1028 uint_t index;
1026 1029 struct buf **backp;
1027 1030
1028 1031 mutex_enter(&blist_lock);
1029 1032 /*
1030 1033 * Wait for any flushes ahead of us to finish, it's ok to
1031 1034 * do invalidates in parallel.
1032 1035 */
1033 1036 while (bio_doingflush) {
1034 1037 bio_flinv_cv_wanted = 1;
1035 1038 cv_wait(&bio_flushinval_cv, &blist_lock);
1036 1039 }
1037 1040 bio_doinginval++;
1038 1041
1039 1042 /* Gather bp's */
1040 1043 for (i = 0; i < v.v_hbuf; i++) {
1041 1044 dp = (struct buf *)&hbuf[i];
1042 1045 hmp = &hbuf[i].b_lock;
1043 1046
1044 1047 mutex_enter(hmp);
1045 1048 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 1049 if (bp->b_edev == dev) {
1047 1050 if (bp->b_list == NULL) {
1048 1051 bp->b_list = binval_list;
1049 1052 binval_list = bp;
1050 1053 }
1051 1054 }
1052 1055 }
1053 1056 mutex_exit(hmp);
1054 1057 }
1055 1058 mutex_exit(&blist_lock);
1056 1059
1057 1060 /* Invalidate all bp's found */
1058 1061 while (binval_list != EMPTY_LIST) {
1059 1062 bp = binval_list;
1060 1063
1061 1064 sema_p(&bp->b_sem);
1062 1065 if (bp->b_edev == dev) {
1063 1066 if (force && (bp->b_flags & B_DELWRI)) {
1064 1067 /* clear B_DELWRI, move to non-dw freelist */
1065 1068 index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 1069 hmp = &hbuf[index].b_lock;
1067 1070 dp = (struct buf *)&hbuf[index];
1068 1071 mutex_enter(hmp);
1069 1072
1070 1073 /* remove from delayed write freelist */
1071 1074 notavail(bp);
1072 1075
1073 1076 /* add to B_AGE side of non-dw freelist */
1074 1077 backp = &dp->av_forw;
1075 1078 (*backp)->av_back = bp;
1076 1079 bp->av_forw = *backp;
1077 1080 *backp = bp;
1078 1081 bp->av_back = dp;
1079 1082
1080 1083 /*
1081 1084 * make sure write retries and busy are cleared
1082 1085 */
1083 1086 bp->b_flags &=
1084 1087 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 1088 mutex_exit(hmp);
1086 1089 }
1087 1090 if ((bp->b_flags & B_DELWRI) == 0)
1088 1091 bp->b_flags |= B_STALE|B_AGE;
1089 1092 else
1090 1093 error = EIO;
1091 1094 }
1092 1095 sema_v(&bp->b_sem);
1093 1096 binval_list = bp->b_list;
1094 1097 bp->b_list = NULL;
1095 1098 }
1096 1099 mutex_enter(&blist_lock);
1097 1100 bio_doinginval--;
1098 1101 if (bio_flinv_cv_wanted) {
1099 1102 cv_broadcast(&bio_flushinval_cv);
1100 1103 bio_flinv_cv_wanted = 0;
1101 1104 }
1102 1105 mutex_exit(&blist_lock);
1103 1106 return (error);
1104 1107 }
1105 1108
1106 1109 /*
1107 1110 * If possible, invalidate blocks for a dev on demand
1108 1111 */
1109 1112 void
1110 1113 binval(dev_t dev)
1111 1114 {
1112 1115 (void) bfinval(dev, 0);
1113 1116 }
1114 1117
1115 1118 /*
1116 1119 * Initialize the buffer I/O system by freeing
1117 1120 * all buffers and setting all device hash buffer lists to empty.
1118 1121 */
1119 1122 void
1120 1123 binit(void)
1121 1124 {
1122 1125 struct buf *bp;
1123 1126 unsigned int i, pct;
1124 1127 ulong_t bio_max_hwm, bio_default_hwm;
1125 1128
1126 1129 /*
1127 1130 * Maximum/Default values for bufhwm are set to the smallest of:
1128 1131 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 1132 * - 1/4 of kernel virtual memory
1130 1133 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 1134 * Additionally, in order to allow simple tuning by percentage of
1132 1135 * physical memory, bufhwm_pct is used to calculate the default if
1133 1136 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1134 1137 *
1135 1138 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 1139 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1137 1140 */
1138 1141 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 1142 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 1143 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1141 1144
1142 1145 pct = BIO_BUF_PERCENT;
1143 1146 if (bufhwm_pct != 0 &&
1144 1147 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 1148 pct = BIO_BUF_PERCENT;
1146 1149 /*
1147 1150 * Invalid user specified value, emit a warning.
1148 1151 */
1149 1152 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 1153 range(1..%d). Using %d as default.",
1151 1154 bufhwm_pct,
1152 1155 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1153 1156 }
1154 1157
1155 1158 bio_default_hwm = MIN(physmem / pct,
1156 1159 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 1160 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1158 1161
1159 1162 if ((v.v_bufhwm = bufhwm) == 0)
1160 1163 v.v_bufhwm = bio_default_hwm;
1161 1164
1162 1165 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 1166 v.v_bufhwm = (int)bio_max_hwm;
1164 1167 /*
1165 1168 * Invalid user specified value, emit a warning.
1166 1169 */
1167 1170 cmn_err(CE_WARN,
1168 1171 "binit: bufhwm(%d) out \
1169 1172 of range(%d..%lu). Using %lu as default",
1170 1173 bufhwm,
1171 1174 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1172 1175 }
1173 1176
1174 1177 /*
1175 1178 * Determine the number of hash buckets. Default is to
1176 1179 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 1180 * Round up number to the next power of 2.
1178 1181 */
1179 1182 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 1183 BIO_HASHLEN);
1181 1184 v.v_hmask = v.v_hbuf - 1;
1182 1185 v.v_buf = BIO_BHDR_POOL;
1183 1186
1184 1187 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1185 1188
1186 1189 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1187 1190
1188 1191 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 1192 bp = &bfreelist;
1190 1193 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1191 1194
1192 1195 for (i = 0; i < v.v_hbuf; i++) {
1193 1196 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 1197 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1195 1198
1196 1199 /*
1197 1200 * Initialize the delayed write buffer list.
1198 1201 */
1199 1202 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 1203 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1201 1204 }
1202 1205 }
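
Editorial note (not part of the webrev): to make the sizing comment near the top of binit() concrete, assume a machine with 4 GB of physical memory and 4 KB pages, so physmem is roughly 1,048,576 pages. With bufhwm and bufhwm_pct both left at 0, pct stays at BIO_BUF_PERCENT (100/2 = 50), so bio_default_hwm works out to about 1,048,576 / 50 * 4 ~= 84,000 KB, i.e. roughly 2% of memory; bio_max_hwm uses BIO_MAX_PERCENT (100/20 = 5) and comes to about 839,000 KB, roughly 20%. Both values remain subject to the kernel-heap and INT32_MAX clamps applied above.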
1203 1206
1204 1207 /*
1205 1208 * Wait for I/O completion on the buffer; return error code.
1206 1209 * If bp was for synchronous I/O, bp is invalid and associated
1207 1210 * resources are freed on return.
1208 1211 */
1209 1212 int
1210 1213 biowait(struct buf *bp)
1211 1214 {
1212 1215 int error = 0;
1213 1216 struct cpu *cpup;
1214 1217
1215 1218 ASSERT(SEMA_HELD(&bp->b_sem));
1216 1219
1217 1220 cpup = CPU;
1218 1221 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219 1222 DTRACE_IO1(wait__start, struct buf *, bp);
1220 1223
1221 1224 /*
1222 1225 * In case of panic, busy wait for completion
1223 1226 */
1224 1227 if (panicstr) {
1225 1228 while ((bp->b_flags & B_DONE) == 0)
1226 1229 drv_usecwait(10);
1227 1230 } else
1228 1231 sema_p(&bp->b_io);
1229 1232
1230 1233 DTRACE_IO1(wait__done, struct buf *, bp);
1231 1234 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1232 1235
1233 1236 error = geterror(bp);
1234 1237 if ((bp->b_flags & B_ASYNC) == 0) {
1235 1238 if (bp->b_flags & B_REMAPPED)
1236 1239 bp_mapout(bp);
1237 1240 }
1238 1241 return (error);
1239 1242 }
1240 1243
1241 1244 static void
1242 1245 biodone_tnf_probe(struct buf *bp)
1243 1246 {
1244 1247 /* Kernel probe */
1245 1248 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 1249 tnf_device, device, bp->b_edev,
1247 1250 tnf_diskaddr, block, bp->b_lblkno,
1248 1251 tnf_opaque, buf, bp);
1249 1252 }
1250 1253
1251 1254 /*
1252 1255 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253 1256 * and wake up anyone waiting for it.
1254 1257 */
1255 1258 void
1256 1259 biodone(struct buf *bp)
1257 1260 {
1258 1261 if (bp->b_flags & B_STARTED) {
1259 1262 DTRACE_IO1(done, struct buf *, bp);
1260 1263 bp->b_flags &= ~B_STARTED;
1261 1264 }
1262 1265
1263 1266 /*
1264 1267 * Call the TNF probe here instead of the inline code
1265 1268 * to force our compiler to use the tail call optimization.
1266 1269 */
1267 1270 biodone_tnf_probe(bp);
1268 1271
1269 1272 if (bp->b_iodone != NULL) {
1270 1273 (*(bp->b_iodone))(bp);
1271 1274 return;
1272 1275 }
1273 1276 ASSERT((bp->b_flags & B_DONE) == 0);
1274 1277 ASSERT(SEMA_HELD(&bp->b_sem));
1275 1278 bp->b_flags |= B_DONE;
1276 1279 if (bp->b_flags & B_ASYNC) {
1277 1280 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 1281 bio_pageio_done(bp);
1279 1282 else
1280 1283 brelse(bp); /* release bp to freelist */
1281 1284 } else {
1282 1285 sema_v(&bp->b_io);
1283 1286 }
1284 1287 }
1285 1288
1286 1289 /*
1287 1290 * Pick up the device's error number and pass it to the user;
1288 1291 * if there is an error but the number is 0 set a generalized code.
1289 1292 */
1290 1293 int
1291 1294 geterror(struct buf *bp)
1292 1295 {
1293 1296 int error = 0;
1294 1297
1295 1298 ASSERT(SEMA_HELD(&bp->b_sem));
1296 1299 if (bp->b_flags & B_ERROR) {
1297 1300 error = bp->b_error;
1298 1301 if (!error)
1299 1302 error = EIO;
1300 1303 }
1301 1304 return (error);
1302 1305 }
1303 1306
1304 1307 /*
1305 1308 * Support for pageio buffers.
1306 1309 *
1307 1310 * This stuff should be generalized to provide a generalized bp
1308 1311 * header facility that can be used for things other than pageio.
1309 1312 */
1310 1313
1311 1314 /*
1312 1315 * Allocate and initialize a buf struct for use with pageio.
1313 1316 */
1314 1317 struct buf *
1315 1318 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1316 1319 {
1317 1320 struct buf *bp;
1318 1321 struct cpu *cpup;
1319 1322
1320 1323 if (flags & B_READ) {
1321 1324 CPU_STATS_ENTER_K();
1322 1325 cpup = CPU; /* get pointer AFTER preemption is disabled */
1323 1326 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 1327 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1325 1328
1326 1329 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1327 1330
1328 1331 if ((flags & B_ASYNC) == 0) {
1329 1332 klwp_t *lwp = ttolwp(curthread);
1330 1333 if (lwp != NULL)
1331 1334 lwp->lwp_ru.majflt++;
1332 1335 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333 1336 /* Kernel probe */
1334 1337 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335 1338 tnf_opaque, vnode, pp->p_vnode,
1336 1339 tnf_offset, offset, pp->p_offset);
1337 1340 }
1338 1341 /*
1339 1342 * Update statistics for pages being paged in
1340 1343 */
1341 1344 if (pp != NULL && pp->p_vnode != NULL) {
1342 1345 if (IS_SWAPFSVP(pp->p_vnode)) {
1343 1346 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344 1347 atomic_add_64(&curzone->zone_anonpgin,
1345 1348 btopr(len));
1346 1349 } else {
1347 1350 if (pp->p_vnode->v_flag & VVMEXEC) {
1348 1351 CPU_STATS_ADDQ(cpup, vm, execpgin,
1349 1352 btopr(len));
1350 1353 atomic_add_64(&curzone->zone_execpgin,
1351 1354 btopr(len));
1352 1355 } else {
1353 1356 CPU_STATS_ADDQ(cpup, vm, fspgin,
1354 1357 btopr(len));
1355 1358 atomic_add_64(&curzone->zone_fspgin,
1356 1359 btopr(len));
1357 1360 }
1358 1361 }
1359 1362 }
1360 1363 CPU_STATS_EXIT_K();
1361 1364 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362 1365 "page_ws_in:pp %p", pp);
1363 1366 /* Kernel probe */
1364 1367 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365 1368 tnf_opaque, vnode, pp->p_vnode,
1366 1369 tnf_offset, offset, pp->p_offset,
1367 1370 tnf_size, size, len);
1368 1371 }
1369 1372
1370 1373 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371 1374 bp->b_bcount = len;
1372 1375 bp->b_bufsize = len;
1373 1376 bp->b_pages = pp;
1374 1377 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375 1378 bp->b_offset = -1;
1376 1379 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1377 1380
1378 1381 /* Initialize bp->b_sem in "locked" state */
1379 1382 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1380 1383
1381 1384 VN_HOLD(vp);
1382 1385 bp->b_vp = vp;
1383 1386 THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1384 1387
1385 1388 /*
1386 1389 * Caller sets dev & blkno and can adjust
1387 1390 * b_addr for page offset and can use bp_mapin
1388 1391 * to make pages kernel addressable.
1389 1392 */
1390 1393 return (bp);
1391 1394 }
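
Editorial sketch (not part of the webrev): a hypothetical synchronous caller of pageio_setup(), following the comment above that the caller supplies the device and block number; pp, len, vp, dev and blkno are assumed to come from the file system:

        static int
        read_pages(struct page *pp, size_t len, struct vnode *vp,
            dev_t dev, daddr_t blkno)
        {
                struct buf *bp;
                int err;

                bp = pageio_setup(pp, len, vp, B_READ); /* b_sem held, B_PAGEIO set */
                bp->b_edev = dev;
                bp->b_dev = (o_dev_t)cmpdev(dev);
                bp->b_blkno = blkno;
                bp_mapin(bp);                   /* optional: map pages into the kernel */
                (void) bdev_strategy(bp);       /* start the I/O */
                err = biowait(bp);              /* wait; unmaps on the sync path */
                pageio_done(bp);                /* drop the vnode hold, free the buf */
                return (err);
        }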
1392 1395
1393 1396 void
1394 1397 pageio_done(struct buf *bp)
1395 1398 {
1396 1399 ASSERT(SEMA_HELD(&bp->b_sem));
1397 1400 if (bp->b_flags & B_REMAPPED)
1398 1401 bp_mapout(bp);
1399 1402 VN_RELE(bp->b_vp);
1400 1403 bp->b_vp = NULL;
1401 1404 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1402 1405
1403 1406 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1404 1407 sema_destroy(&bp->b_sem);
1405 1408 sema_destroy(&bp->b_io);
1406 1409 kmem_free(bp, sizeof (struct buf));
1407 1410 }
1408 1411
1409 1412 /*
1410 1413 * Check to see whether the buffers, except the one pointed by sbp,
1411 1414 * associated with the device are busy.
1412 1415 * NOTE: This expensive operation shall be improved together with ufs_icheck().
1413 1416 */
1414 1417 int
1415 1418 bcheck(dev_t dev, struct buf *sbp)
1416 1419 {
1417 1420 struct buf *bp;
1418 1421 struct buf *dp;
1419 1422 int i;
1420 1423 kmutex_t *hmp;
1421 1424
1422 1425 /*
1423 1426 * check for busy bufs for this filesystem
1424 1427 */
1425 1428 for (i = 0; i < v.v_hbuf; i++) {
1426 1429 dp = (struct buf *)&hbuf[i];
1427 1430 hmp = &hbuf[i].b_lock;
1428 1431
1429 1432 mutex_enter(hmp);
1430 1433 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1431 1434 /*
1432 1435 * if buf is busy or dirty, then filesystem is busy
1433 1436 */
1434 1437 if ((bp->b_edev == dev) &&
1435 1438 ((bp->b_flags & B_STALE) == 0) &&
1436 1439 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1437 1440 (bp != sbp)) {
1438 1441 mutex_exit(hmp);
1439 1442 return (1);
1440 1443 }
1441 1444 }
1442 1445 mutex_exit(hmp);
1443 1446 }
1444 1447 return (0);
1445 1448 }
1446 1449
1447 1450 /*
1448 1451 * Hash two 32 bit entities.
1449 1452 */
1450 1453 int
1451 1454 hash2ints(int x, int y)
1452 1455 {
1453 1456 int hash = 0;
1454 1457
1455 1458 hash = x - 1;
1456 1459 hash = ((hash * 7) + (x >> 8)) - 1;
1457 1460 hash = ((hash * 7) + (x >> 16)) - 1;
1458 1461 hash = ((hash * 7) + (x >> 24)) - 1;
1459 1462 hash = ((hash * 7) + y) - 1;
1460 1463 hash = ((hash * 7) + (y >> 8)) - 1;
1461 1464 hash = ((hash * 7) + (y >> 16)) - 1;
1462 1465 hash = ((hash * 7) + (y >> 24)) - 1;
1463 1466
1464 1467 return (hash);
1465 1468 }
1466 1469
1467 1470
1468 1471 /*
1469 1472 * Return a new buffer struct.
1470 1473 * Create a new buffer if we haven't gone over our high water
1471 1474 * mark for memory, otherwise try to get one off the freelist.
1472 1475 *
1473 1476 * Returns a locked buf that has no id and is not on any hash or free
1474 1477 * list.
1475 1478 */
1476 1479 static struct buf *
1477 1480 bio_getfreeblk(long bsize)
1478 1481 {
1479 1482 struct buf *bp, *dp;
1480 1483 struct hbuf *hp;
1481 1484 kmutex_t *hmp;
1482 1485 uint_t start, end;
1483 1486
1484 1487 /*
1485 1488 * mutex_enter(&bfree_lock);
1486 1489 * bfreelist.b_bufsize represents the amount of memory
1487 1490 * mutex_exit(&bfree_lock); protect ref to bfreelist
1488 1491 * we are allowed to allocate in the cache before we hit our hwm.
1489 1492 */
1490 1493 bio_mem_get(bsize); /* Account for our memory request */
1491 1494
1492 1495 again:
1493 1496 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1494 1497 sema_p(&bp->b_sem); /* Should never fail */
1495 1498
1496 1499 ASSERT(bp->b_un.b_addr == NULL);
1497 1500 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1498 1501 if (bp->b_un.b_addr != NULL) {
1499 1502 /*
1500 1503 * Make the common path short
1501 1504 */
1502 1505 bp->b_bufsize = bsize;
1503 1506 ASSERT(SEMA_HELD(&bp->b_sem));
1504 1507 return (bp);
1505 1508 } else {
1506 1509 struct buf *save;
1507 1510
1508 1511 save = bp; /* Save bp we allocated */
1509 1512 start = end = lastindex;
1510 1513
1511 1514 biostats.bio_bufwant.value.ui32++;
1512 1515
1513 1516 /*
1514 1517 * Memory isn't available from the system now. Scan
1515 1518 * the hash buckets till enough space is found.
1516 1519 */
1517 1520 do {
1518 1521 hp = &hbuf[start];
1519 1522 hmp = &hp->b_lock;
1520 1523 dp = (struct buf *)hp;
1521 1524
1522 1525 mutex_enter(hmp);
1523 1526 bp = dp->av_forw;
1524 1527
1525 1528 while (bp != dp) {
1526 1529
1527 1530 ASSERT(bp != NULL);
1528 1531
1529 1532 if (!sema_tryp(&bp->b_sem)) {
1530 1533 bp = bp->av_forw;
1531 1534 continue;
1532 1535 }
1533 1536
1534 1537 /*
1535 1538 * Since we are going down the freelist
1536 1539 * associated with this hash bucket the
1537 1540 * B_DELWRI flag should not be set.
1538 1541 */
1539 1542 ASSERT(!(bp->b_flags & B_DELWRI));
1540 1543
1541 1544 if (bp->b_bufsize == bsize) {
1542 1545 hp->b_length--;
1543 1546 notavail(bp);
1544 1547 bremhash(bp);
1545 1548 mutex_exit(hmp);
1546 1549
1547 1550 /*
1548 1551 * Didn't kmem_alloc any more, so don't
1549 1552 * count it twice.
1550 1553 */
1551 1554 mutex_enter(&bfree_lock);
1552 1555 bfreelist.b_bufsize += bsize;
1553 1556 mutex_exit(&bfree_lock);
1554 1557
1555 1558 /*
1556 1559 * Update the lastindex value.
1557 1560 */
1558 1561 lastindex = start;
1559 1562
1560 1563 /*
1561 1564 * Put our saved bp back on the list
1562 1565 */
1563 1566 sema_v(&save->b_sem);
1564 1567 bio_bhdr_free(save);
1565 1568 ASSERT(SEMA_HELD(&bp->b_sem));
1566 1569 return (bp);
1567 1570 }
1568 1571 sema_v(&bp->b_sem);
1569 1572 bp = bp->av_forw;
1570 1573 }
1571 1574 mutex_exit(hmp);
1572 1575 start = ((start + 1) % v.v_hbuf);
1573 1576 } while (start != end);
1574 1577
1575 1578 biostats.bio_bufwait.value.ui32++;
1576 1579 bp = save; /* Use original bp */
1577 1580 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1578 1581 }
1579 1582
1580 1583 bp->b_bufsize = bsize;
1581 1584 ASSERT(SEMA_HELD(&bp->b_sem));
1582 1585 return (bp);
1583 1586 }
1584 1587
1585 1588 /*
1586 1589 * Allocate a buffer header. If none currently available, allocate
1587 1590 * a new pool.
1588 1591 */
1589 1592 static struct buf *
1590 1593 bio_bhdr_alloc(void)
1591 1594 {
1592 1595 struct buf *dp, *sdp;
1593 1596 struct buf *bp;
1594 1597 int i;
1595 1598
1596 1599 for (;;) {
1597 1600 mutex_enter(&bhdr_lock);
1598 1601 if (bhdrlist != NULL) {
1599 1602 bp = bhdrlist;
1600 1603 bhdrlist = bp->av_forw;
1601 1604 mutex_exit(&bhdr_lock);
1602 1605 bp->av_forw = NULL;
1603 1606 return (bp);
1604 1607 }
1605 1608 mutex_exit(&bhdr_lock);
1606 1609
1607 1610 /*
1608 1611 * Need to allocate a new pool. If the system is currently
1609 1612 * out of memory, then try freeing things on the freelist.
1610 1613 */
1611 1614 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1612 1615 if (dp == NULL) {
1613 1616 /*
1614 1617 * System can't give us a pool of headers, try
1615 1618 * recycling from the free lists.
1616 1619 */
1617 1620 bio_recycle(BIO_HEADER, 0);
1618 1621 } else {
1619 1622 sdp = dp;
1620 1623 for (i = 0; i < v.v_buf; i++, dp++) {
1621 1624 /*
1622 1625 * The next two lines are needed since NODEV
1623 1626 * is -1 and not NULL
1624 1627 */
1625 1628 dp->b_dev = (o_dev_t)NODEV;
1626 1629 dp->b_edev = NODEV;
1627 1630 dp->av_forw = dp + 1;
1628 1631 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1629 1632 NULL);
1630 1633 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1631 1634 NULL);
1632 1635 dp->b_offset = -1;
1633 1636 }
1634 1637 mutex_enter(&bhdr_lock);
1635 1638 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1636 1639 bhdrlist = sdp;
1637 1640 nbuf += v.v_buf;
1638 1641 bp = bhdrlist;
1639 1642 bhdrlist = bp->av_forw;
1640 1643 mutex_exit(&bhdr_lock);
1641 1644
1642 1645 bp->av_forw = NULL;
1643 1646 return (bp);
1644 1647 }
1645 1648 }
1646 1649 }
1647 1650
1648 1651 static void
1649 1652 bio_bhdr_free(struct buf *bp)
1650 1653 {
1651 1654 ASSERT(bp->b_back == NULL);
1652 1655 ASSERT(bp->b_forw == NULL);
1653 1656 ASSERT(bp->av_back == NULL);
1654 1657 ASSERT(bp->av_forw == NULL);
1655 1658 ASSERT(bp->b_un.b_addr == NULL);
1656 1659 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1657 1660 ASSERT(bp->b_edev == NODEV);
1658 1661 ASSERT(bp->b_flags == 0);
1659 1662
1660 1663 mutex_enter(&bhdr_lock);
1661 1664 bp->av_forw = bhdrlist;
1662 1665 bhdrlist = bp;
1663 1666 mutex_exit(&bhdr_lock);
1664 1667 }
1665 1668
1666 1669 /*
1667 1670 * If we haven't gone over the high water mark, it's o.k. to
1668 1671 * allocate more buffer space, otherwise recycle buffers
1669 1672 * from the freelist until enough memory is free for a bsize request.
1670 1673 *
1671 1674 * We account for this memory, even though
1672 1675 * we don't allocate it here.
1673 1676 */
1674 1677 static void
1675 1678 bio_mem_get(long bsize)
1676 1679 {
1677 1680 mutex_enter(&bfree_lock);
1678 1681 if (bfreelist.b_bufsize > bsize) {
1679 1682 bfreelist.b_bufsize -= bsize;
1680 1683 mutex_exit(&bfree_lock);
1681 1684 return;
1682 1685 }
1683 1686 mutex_exit(&bfree_lock);
1684 1687 bio_recycle(BIO_MEM, bsize);
1685 1688 }
1686 1689
1687 1690 /*
1688 1691 * flush a list of delayed write buffers.
1689 1692 * (currently used only by bio_recycle below.)
1690 1693 */
1691 1694 static void
1692 1695 bio_flushlist(struct buf *delwri_list)
1693 1696 {
1694 1697 struct buf *bp;
1695 1698
1696 1699 while (delwri_list != EMPTY_LIST) {
1697 1700 bp = delwri_list;
1698 1701 bp->b_flags |= B_AGE | B_ASYNC;
1699 1702 if (bp->b_vp == NULL) { /* !ufs */
1700 1703 BWRITE(bp);
1701 1704 } else { /* ufs */
1702 1705 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1703 1706 }
1704 1707 delwri_list = bp->b_list;
1705 1708 bp->b_list = NULL;
1706 1709 }
1707 1710 }
1708 1711
1709 1712 /*
1710 1713 * Start recycling buffers on the freelist for one of 2 reasons:
1711 1714 * - we need a buffer header
1712 1715 * - we need to free up memory
1713 1716 * Once started we continue to recycle buffers until the B_AGE
1714 1717 * buffers are gone.
1715 1718 */
1716 1719 static void
1717 1720 bio_recycle(int want, long bsize)
1718 1721 {
1719 1722 struct buf *bp, *dp, *dwp, *nbp;
1720 1723 struct hbuf *hp;
1721 1724 int found = 0;
1722 1725 kmutex_t *hmp;
1723 1726 int start, end;
1724 1727 struct buf *delwri_list = EMPTY_LIST;
1725 1728
1726 1729 /*
1727 1730 * Recycle buffers.
1728 1731 */
1729 1732 top:
1730 1733 start = end = lastindex;
1731 1734 do {
1732 1735 hp = &hbuf[start];
1733 1736 hmp = &hp->b_lock;
1734 1737 dp = (struct buf *)hp;
1735 1738
1736 1739 mutex_enter(hmp);
1737 1740 bp = dp->av_forw;
1738 1741
1739 1742 while (bp != dp) {
1740 1743
1741 1744 ASSERT(bp != NULL);
1742 1745
1743 1746 if (!sema_tryp(&bp->b_sem)) {
1744 1747 bp = bp->av_forw;
1745 1748 continue;
1746 1749 }
1747 1750 /*
1748 1751 * Do we really want to nuke all of the B_AGE stuff??
1749 1752 */
1750 1753 if ((bp->b_flags & B_AGE) == 0 && found) {
1751 1754 sema_v(&bp->b_sem);
1752 1755 mutex_exit(hmp);
1753 1756 lastindex = start;
1754 1757 return; /* All done */
1755 1758 }
1756 1759
1757 1760 ASSERT(MUTEX_HELD(&hp->b_lock));
1758 1761 ASSERT(!(bp->b_flags & B_DELWRI));
1759 1762 hp->b_length--;
1760 1763 notavail(bp);
1761 1764
1762 1765 /*
1763 1766 * Remove bhdr from cache, free up memory,
1764 1767 * and add the hdr to the freelist.
1765 1768 */
1766 1769 bremhash(bp);
1767 1770 mutex_exit(hmp);
1768 1771
1769 1772 if (bp->b_bufsize) {
1770 1773 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1771 1774 bp->b_un.b_addr = NULL;
1772 1775 mutex_enter(&bfree_lock);
1773 1776 bfreelist.b_bufsize += bp->b_bufsize;
1774 1777 mutex_exit(&bfree_lock);
1775 1778 }
1776 1779
1777 1780 bp->b_dev = (o_dev_t)NODEV;
1778 1781 bp->b_edev = NODEV;
1779 1782 bp->b_flags = 0;
1780 1783 sema_v(&bp->b_sem);
1781 1784 bio_bhdr_free(bp);
1782 1785 if (want == BIO_HEADER) {
1783 1786 found = 1;
1784 1787 } else {
1785 1788 ASSERT(want == BIO_MEM);
1786 1789 if (!found && bfreelist.b_bufsize >= bsize) {
1787 1790 /* Account for the memory we want */
1788 1791 mutex_enter(&bfree_lock);
1789 1792 if (bfreelist.b_bufsize >= bsize) {
1790 1793 bfreelist.b_bufsize -= bsize;
1791 1794 found = 1;
1792 1795 }
1793 1796 mutex_exit(&bfree_lock);
1794 1797 }
1795 1798 }
1796 1799
1797 1800 /*
1798 1801 * Since we dropped hmp, start from the
1799 1802 * beginning.
1800 1803 */
1801 1804 mutex_enter(hmp);
1802 1805 bp = dp->av_forw;
1803 1806 }
1804 1807 mutex_exit(hmp);
1805 1808
1806 1809 /*
1807 1810 * Look at the delayed write list.
1808 1811 * First gather into a private list, then write them.
1809 1812 */
1810 1813 dwp = (struct buf *)&dwbuf[start];
1811 1814 mutex_enter(&blist_lock);
1812 1815 bio_doingflush++;
1813 1816 mutex_enter(hmp);
1814 1817 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1815 1818
1816 1819 ASSERT(bp != NULL);
1817 1820 nbp = bp->av_forw;
1818 1821
1819 1822 if (!sema_tryp(&bp->b_sem))
1820 1823 continue;
1821 1824 ASSERT(bp->b_flags & B_DELWRI);
1822 1825 /*
1823 1826 * Do we really want to nuke all of the B_AGE stuff??
1824 1827 */
1825 1828
1826 1829 if ((bp->b_flags & B_AGE) == 0 && found) {
1827 1830 sema_v(&bp->b_sem);
1828 1831 mutex_exit(hmp);
1829 1832 lastindex = start;
1830 1833 mutex_exit(&blist_lock);
1831 1834 bio_flushlist(delwri_list);
1832 1835 mutex_enter(&blist_lock);
1833 1836 bio_doingflush--;
1834 1837 if (bio_flinv_cv_wanted) {
1835 1838 bio_flinv_cv_wanted = 0;
1836 1839 cv_broadcast(&bio_flushinval_cv);
1837 1840 }
1838 1841 mutex_exit(&blist_lock);
1839 1842 return; /* All done */
1840 1843 }
1841 1844
1842 1845 /*
1843 1846 * If the buffer is already on a flush or
1844 1847 * invalidate list then just skip it.
1845 1848 */
1846 1849 if (bp->b_list != NULL) {
1847 1850 sema_v(&bp->b_sem);
1848 1851 continue;
1849 1852 }
1850 1853 /*
1851 1854 * We are still on the same bucket.
1852 1855 */
1853 1856 hp->b_length--;
1854 1857 notavail(bp);
1855 1858 bp->b_list = delwri_list;
1856 1859 delwri_list = bp;
1857 1860 }
1858 1861 mutex_exit(hmp);
1859 1862 mutex_exit(&blist_lock);
1860 1863 bio_flushlist(delwri_list);
1861 1864 delwri_list = EMPTY_LIST;
1862 1865 mutex_enter(&blist_lock);
1863 1866 bio_doingflush--;
1864 1867 if (bio_flinv_cv_wanted) {
1865 1868 bio_flinv_cv_wanted = 0;
1866 1869 cv_broadcast(&bio_flushinval_cv);
1867 1870 }
1868 1871 mutex_exit(&blist_lock);
1869 1872 start = (start + 1) % v.v_hbuf;
1870 1873
1871 1874 } while (start != end);
1872 1875
1873 1876 if (found)
1874 1877 return;
1875 1878
1876 1879 /*
1877 1880 * Free lists exhausted and we haven't satisfied the request.
1878 1881 * Wait here for more entries to be added to freelist.
1879 1882 * Because this might have just happened, make it timed.
1880 1883 */
1881 1884 mutex_enter(&bfree_lock);
1882 1885 bfreelist.b_flags |= B_WANTED;
1883 1886 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1884 1887 mutex_exit(&bfree_lock);
1885 1888 goto top;
1886 1889 }
1887 1890
1888 1891 /*
1889 1892 * See if the block is associated with some buffer
1890 1893 * (mainly to avoid getting hung up on a wait in breada).
1891 1894 */
1892 1895 static int
1893 1896 bio_incore(dev_t dev, daddr_t blkno)
1894 1897 {
1895 1898 struct buf *bp;
1896 1899 struct buf *dp;
1897 1900 uint_t index;
1898 1901 kmutex_t *hmp;
1899 1902
1900 1903 index = bio_bhash(dev, blkno);
1901 1904 dp = (struct buf *)&hbuf[index];
1902 1905 hmp = &hbuf[index].b_lock;
1903 1906
1904 1907 mutex_enter(hmp);
1905 1908 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1906 1909 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1907 1910 (bp->b_flags & B_STALE) == 0) {
1908 1911 mutex_exit(hmp);
1909 1912 return (1);
1910 1913 }
1911 1914 }
1912 1915 mutex_exit(hmp);
1913 1916 return (0);
1914 1917 }
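As the comment above notes, bio_incore() exists so that read-ahead logic such as breada() earlier in this file can skip blocks that are already hashed instead of blocking behind another thread's I/O. A hedged sketch of that kind of gate; xx_maybe_readahead() and xx_start_readahead() are purely hypothetical names, not functions in this file.

extern void xx_start_readahead(dev_t, daddr_t);	/* hypothetical async prefetch */

static void
xx_maybe_readahead(dev_t dev, daddr_t rablkno)	/* hypothetical, illustrative only */
{
	/* Skip the prefetch if the block is already hashed and not stale. */
	if (bio_incore(dev, rablkno) == 0)
		xx_start_readahead(dev, rablkno);
}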
1915 1918
1916 1919 static void
1917 1920 bio_pageio_done(struct buf *bp)
1918 1921 {
1919 1922 if (bp->b_flags & B_PAGEIO) {
1920 1923
1921 1924 if (bp->b_flags & B_REMAPPED)
1922 1925 bp_mapout(bp);
1923 1926
1924 1927 if (bp->b_flags & B_READ)
1925 1928 pvn_read_done(bp->b_pages, bp->b_flags);
1926 1929 else
1927 1930 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1928 1931 pageio_done(bp);
1929 1932 } else {
1930 1933 ASSERT(bp->b_flags & B_REMAPPED);
1931 1934 bp_mapout(bp);
1932 1935 brelse(bp);
1933 1936 }
1934 1937 }
1935 1938
1936 1939 /*
1937 1940 * bioerror(9F) - indicate error in buffer header
1938 1941 * If 'error' is zero, remove the error indication.
1939 1942 */
1940 1943 void
1941 1944 bioerror(struct buf *bp, int error)
1942 1945 {
1943 1946 ASSERT(bp != NULL);
1944 1947 ASSERT(error >= 0);
1945 1948 ASSERT(SEMA_HELD(&bp->b_sem));
1946 1949
1947 1950 if (error != 0) {
1948 1951 bp->b_flags |= B_ERROR;
1949 1952 } else {
1950 1953 bp->b_flags &= ~B_ERROR;
1951 1954 }
1952 1955 bp->b_error = error;
1953 1956 }
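For reference, the usual consumer of this interface is a driver's strategy(9E) routine, which marks the failure and completes the buffer without starting the transfer. A minimal sketch under that assumption; xx_strategy(), xx_nblocks and xx_start() are hypothetical names, and the headers this file already includes supply the declarations used.

extern daddr_t xx_nblocks;			/* hypothetical device size in blocks */
extern void xx_start(struct buf *);		/* hypothetical transfer start routine */

static int
xx_strategy(struct buf *bp)
{
	if (bp->b_blkno >= xx_nblocks) {
		bioerror(bp, ENXIO);		/* sets B_ERROR and b_error */
		bp->b_resid = bp->b_bcount;	/* nothing was transferred */
		biodone(bp);
		return (0);
	}
	xx_start(bp);
	return (0);
}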
1954 1957
1955 1958 /*
1956 1959 * bioreset(9F) - reuse a private buffer header after I/O is complete
1957 1960 */
1958 1961 void
1959 1962 bioreset(struct buf *bp)
1960 1963 {
1961 1964 ASSERT(bp != NULL);
1962 1965
1963 1966 biofini(bp);
1964 1967 bioinit(bp);
1965 1968 }
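Since bioreset() is simply biofini() followed by bioinit(), its natural use is a driver that keeps one private header and recycles it between transfers. A small sketch, assuming a hypothetical synchronous helper xx_do_transfer():

extern void xx_do_transfer(struct buf *);	/* hypothetical, runs the I/O to completion */

static void
xx_back_to_back(struct buf *bp)			/* bp was previously set up with bioinit() */
{
	xx_do_transfer(bp);			/* first transfer */
	bioreset(bp);				/* scrub flags, error, semaphores */
	xx_do_transfer(bp);			/* header is clean for the next one */
}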
1966 1969
1967 1970 /*
1968 1971 * biosize(9F) - return size of a buffer header
1969 1972 */
1970 1973 size_t
1971 1974 biosize(void)
1972 1975 {
1973 1976 return (sizeof (struct buf));
1974 1977 }
1975 1978
1976 1979 /*
1977 1980 * biomodified(9F) - check if buffer is modified
1978 1981 */
1979 1982 int
1980 1983 biomodified(struct buf *bp)
1981 1984 {
1982 1985 int npf;
1983 1986 int ppattr;
1984 1987 struct page *pp;
1985 1988
1986 1989 ASSERT(bp != NULL);
1987 1990
1988 1991 if ((bp->b_flags & B_PAGEIO) == 0) {
1989 1992 return (-1);
1990 1993 }
1991 1994 pp = bp->b_pages;
1992 1995 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1993 1996
1994 1997 while (npf > 0) {
1995 1998 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1996 1999 HAT_SYNC_STOPON_MOD);
1997 2000 if (ppattr & P_MOD)
1998 2001 return (1);
1999 2002 pp = pp->p_next;
2000 2003 npf--;
2001 2004 }
2002 2005
2003 2006 return (0);
2004 2007 }
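The return values are worth spelling out: 1 means at least one page behind a B_PAGEIO buffer has the P_MOD bit set, 0 means none do, and -1 means the buffer is not paged I/O, so the question does not apply. A hedged usage sketch; xx_maybe_rewrite() and xx_requeue_write() are hypothetical names.

extern void xx_requeue_write(struct buf *);	/* hypothetical */

static void
xx_maybe_rewrite(struct buf *bp)
{
	switch (biomodified(bp)) {
	case 1:					/* at least one page has P_MOD set */
		xx_requeue_write(bp);
		break;
	case 0:					/* pages are still clean */
		break;
	default:				/* -1: not a B_PAGEIO buffer */
		break;
	}
}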
2005 2008
2006 2009 /*
2007 2010 * bioinit(9F) - initialize a buffer structure
2008 2011 */
2009 2012 void
2010 2013 bioinit(struct buf *bp)
2011 2014 {
2012 2015 bzero(bp, sizeof (struct buf));
2013 2016 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2014 2017 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2015 2018 bp->b_offset = -1;
2016 2019 }
2017 2020
2018 2021 /*
2019 2022 * biofini(9F) - uninitialize a buffer structure
2020 2023 */
2021 2024 void
2022 2025 biofini(struct buf *bp)
2023 2026 {
2024 2027 sema_destroy(&bp->b_io);
2025 2028 sema_destroy(&bp->b_sem);
2026 2029 }
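Taken together with biosize() above, these two routines define the lifecycle of a driver-private buffer header. A minimal sketch of that pairing; the xx_ wrapper is hypothetical and the middle of the function (filling in b_edev, b_blkno, b_bcount, b_un.b_addr and performing the transfer) is left out.

static void
xx_private_buf(void)				/* hypothetical, illustrative only */
{
	struct buf *bp;

	bp = kmem_alloc(biosize(), KM_SLEEP);	/* size always comes from biosize() */
	bioinit(bp);				/* zero the header, init b_sem/b_io */

	/* describe and perform the transfer here */

	biofini(bp);				/* destroy the semaphores first */
	kmem_free(bp, biosize());		/* only then release the memory */
}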
2027 2030
2028 2031 /*
2029 2032 * bioclone(9F) - clone a buffer
2030 2033 */
2031 2034 struct buf *
2032 2035 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2033 2036 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2034 2037 {
2035 2038 struct buf *bufp;
2036 2039
2037 2040 ASSERT(bp);
2038 2041 if (bp_mem == NULL) {
2039 2042 bufp = kmem_alloc(sizeof (struct buf), sleep);
2040 2043 if (bufp == NULL) {
2041 2044 return (NULL);
2042 2045 }
2043 2046 bioinit(bufp);
2044 2047 } else {
2045 2048 bufp = bp_mem;
2046 2049 bioreset(bufp);
2047 2050 }
2048 2051
2049 2052 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2050 2053 B_ABRWRITE)
2051 2054
2052 2055 /*
2053 2056 * The cloned buffer does not inherit the B_REMAPPED flag.
2054 2057 */
2055 2058 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2056 2059 bufp->b_bcount = len;
2057 2060 bufp->b_blkno = blkno;
2058 2061 bufp->b_iodone = iodone;
2059 2062 bufp->b_proc = bp->b_proc;
2060 2063 bufp->b_edev = dev;
2061 2064 bufp->b_file = bp->b_file;
2062 2065 bufp->b_offset = bp->b_offset;
2063 2066
2064 2067 if (bp->b_flags & B_SHADOW) {
2065 2068 ASSERT(bp->b_shadow);
2066 2069 ASSERT(bp->b_flags & B_PHYS);
2067 2070
2068 2071 bufp->b_shadow = bp->b_shadow +
2069 2072 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2070 2073 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2071 2074 if (bp->b_flags & B_REMAPPED)
2072 2075 bufp->b_proc = NULL;
2073 2076 } else {
2074 2077 if (bp->b_flags & B_PAGEIO) {
2075 2078 struct page *pp;
2076 2079 off_t o;
2077 2080 int i;
2078 2081
2079 2082 pp = bp->b_pages;
2080 2083 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2081 2084 for (i = btop(o); i > 0; i--) {
2082 2085 pp = pp->p_next;
2083 2086 }
2084 2087 bufp->b_pages = pp;
2085 2088 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2086 2089 } else {
2087 2090 bufp->b_un.b_addr =
2088 2091 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2089 2092 if (bp->b_flags & B_REMAPPED)
2090 2093 bufp->b_proc = NULL;
2091 2094 }
2092 2095 }
2093 2096 return (bufp);
2094 2097 }
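A hedged sketch of how a layered (striping or concatenation) driver might use this to carve a child transfer out of a parent buffer. Everything prefixed xx_ is hypothetical; the child header is caller-supplied (bp_mem != NULL), so the example does not depend on how bioclone() allocates or frees headers.

extern dev_t xx_dev;				/* hypothetical underlying device */
extern int xx_child_done(struct buf *);		/* hypothetical iodone callback */
extern void xx_submit(struct buf *);		/* hypothetical transport routine */

static void
xx_issue_child(struct buf *pbp, struct buf *cbp, off_t off, size_t len,
    daddr_t cblkno)
{
	/*
	 * Map bytes [off, off + len) of the parent onto block cblkno of
	 * the underlying device; cbp is reset and reused, not allocated.
	 */
	cbp = bioclone(pbp, off, len, xx_dev, cblkno, xx_child_done,
	    cbp, KM_SLEEP);
	xx_submit(cbp);
}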