re #13613 rb4516 Tunables needs volatile keyword
--- old/usr/src/uts/common/fs/ufs/lufs_log.c
+++ new/usr/src/uts/common/fs/ufs/lufs_log.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 */
25 +/*
26 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
27 + */
25 28
26 29 #include <sys/systm.h>
27 30 #include <sys/types.h>
28 31 #include <sys/vnode.h>
29 32 #include <sys/errno.h>
30 33 #include <sys/sysmacros.h>
31 34 #include <sys/debug.h>
32 35 #include <sys/kmem.h>
33 36 #include <sys/conf.h>
34 37 #include <sys/proc.h>
35 38 #include <sys/cmn_err.h>
36 39 #include <sys/fssnap_if.h>
37 40 #include <sys/fs/ufs_inode.h>
38 41 #include <sys/fs/ufs_filio.h>
39 42 #include <sys/fs/ufs_log.h>
40 43 #include <sys/fs/ufs_bio.h>
41 44 #include <sys/atomic.h>
45 +#include <sys/sunddi.h>
42 46
43 -extern int maxphys;
44 47 extern uint_t bypass_snapshot_throttle_key;
45 48
46 49 extern struct kmem_cache *lufs_sv;
47 50 extern struct kmem_cache *lufs_bp;
48 51
49 52 static void
50 53 makebusy(ml_unit_t *ul, buf_t *bp)
51 54 {
52 55 sema_p(&bp->b_sem);
53 56 if ((bp->b_flags & B_ERROR) == 0)
54 57 return;
55 58 if (bp->b_flags & B_READ)
56 59 ldl_seterror(ul, "Error reading ufs log");
57 60 else
58 61 ldl_seterror(ul, "Error writing ufs log");
59 62 }
60 63
61 64 static int
62 65 logdone(buf_t *bp)
63 66 {
64 67 bp->b_flags |= B_DONE;
65 68
66 69 if (bp->b_flags & B_WRITE)
67 70 sema_v(&bp->b_sem);
68 71 else
69 72 /* wakeup the thread waiting on this buf */
70 73 sema_v(&bp->b_io);
71 74 return (0);
72 75 }
73 76
74 77 static int
75 78 ldl_strategy_done(buf_t *cb)
76 79 {
77 80 lufs_save_t *sv;
78 81 lufs_buf_t *lbp;
79 82 buf_t *bp;
80 83
81 84 ASSERT(SEMA_HELD(&cb->b_sem));
82 85 ASSERT((cb->b_flags & B_DONE) == 0);
83 86
84 87 /*
85 88 * Compute address of the ``save'' struct
86 89 */
87 90 lbp = (lufs_buf_t *)cb;
88 91 sv = (lufs_save_t *)lbp->lb_ptr;
89 92
90 93 if (cb->b_flags & B_ERROR)
91 94 sv->sv_error = 1;
92 95
93 96 /*
94 97 * If this is the last request, release the resources and
95 98 * ``done'' the original buffer header.
96 99 */
97 100 if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
98 101 kmem_cache_free(lufs_bp, lbp);
99 102 return (1);
100 103 }
101 104 /* Propagate any errors back to the original buffer header */
102 105 bp = sv->sv_bp;
103 106 if (sv->sv_error)
104 107 bp->b_flags |= B_ERROR;
105 108 kmem_cache_free(lufs_bp, lbp);
106 109 kmem_cache_free(lufs_sv, sv);
107 110
108 111 biodone(bp);
109 112 return (0);
110 113 }
111 114
112 115 /*
113 116 * Map the log logical block number to a physical disk block number
114 117 */
115 118 static int
116 119 map_frag(
117 120 ml_unit_t *ul,
118 121 daddr_t lblkno,
119 122 size_t bcount,
120 123 daddr_t *pblkno,
121 124 size_t *pbcount)
122 125 {
123 126 ic_extent_t *ext = ul->un_ebp->ic_extents;
124 127 uint32_t e = ul->un_ebp->ic_nextents;
125 128 uint32_t s = 0;
126 129 uint32_t i = e >> 1;
127 130 uint32_t lasti = i;
128 131 uint32_t bno_off;
129 132
130 133 again:
131 134 if (ext[i].ic_lbno <= lblkno) {
132 135 if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
133 136 /* FOUND IT */
134 137 bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
135 138 *pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
136 139 *pblkno = ext[i].ic_pbno + bno_off;
137 140 return (0);
138 141 } else
139 142 s = i;
140 143 } else
141 144 e = i;
142 145 i = s + ((e - s) >> 1);
143 146
144 147 if (i == lasti) {
145 148 *pbcount = bcount;
146 149 return (ENOENT);
147 150 }
148 151 lasti = i;
149 152
150 153 goto again;
151 154 }
152 155
153 156 /*
154 157 * The log is a set of extents (which typically will be only one, but
155 158 * may be more if the disk was close to full when the log was created)
156 159 * and hence the logical offsets into the log
157 160 * have to be translated into their real device locations before
158 161 * calling the device's strategy routine. The translation may result
159 162 * in several IO requests if this request spans extents.
160 163 */
161 164 void
162 165 ldl_strategy(ml_unit_t *ul, buf_t *pb)
163 166 {
164 167 lufs_save_t *sv;
165 168 lufs_buf_t *lbp;
166 169 buf_t *cb;
167 170 ufsvfs_t *ufsvfsp = ul->un_ufsvfs;
168 171 daddr_t lblkno, pblkno;
169 172 size_t nb_left, pbcount;
170 173 off_t offset;
171 174 dev_t dev = ul->un_dev;
172 175 int error;
173 176 int read = pb->b_flags & B_READ;
174 177
175 178 /*
176 179 * Allocate and initialise the save structure,
177 180 */
178 181 sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
179 182 sv->sv_error = 0;
180 183 sv->sv_bp = pb;
181 184 nb_left = pb->b_bcount;
182 185 sv->sv_nb_left = nb_left;
183 186
184 187 lblkno = pb->b_blkno;
185 188 offset = 0;
186 189
187 190 do {
188 191 error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);
189 192
190 193 lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
191 194 bioinit(&lbp->lb_buf);
192 195 lbp->lb_ptr = sv;
193 196
194 197 cb = bioclone(pb, offset, pbcount, dev,
195 198 pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);
196 199
197 200 offset += pbcount;
198 201 lblkno += btodb(pbcount);
199 202 nb_left -= pbcount;
200 203
201 204 if (error) {
202 205 cb->b_flags |= B_ERROR;
203 206 cb->b_resid = cb->b_bcount;
204 207 biodone(cb);
205 208 } else {
206 209 if (read) {
207 210 logstats.ls_ldlreads.value.ui64++;
208 211 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
209 212 lwp_stat_update(LWP_STAT_INBLK, 1);
210 213 } else {
211 214 logstats.ls_ldlwrites.value.ui64++;
212 215 lwp_stat_update(LWP_STAT_OUBLK, 1);
213 216 }
214 217
215 218 /*
216 219 * write through the snapshot driver if necessary
217 220 * We do not want this write to be throttled because
218 221 * we are holding the un_log mutex here. If we
219 222 * are throttled in fssnap_translate, the fssnap_taskq
220 223 * thread which can wake us up can get blocked on
221 224 * the un_log mutex resulting in a deadlock.
222 225 */
223 226 if (ufsvfsp->vfs_snapshot) {
224 227 (void) tsd_set(bypass_snapshot_throttle_key,
225 228 (void *)1);
226 229 fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);
227 230
228 231 (void) tsd_set(bypass_snapshot_throttle_key,
229 232 (void *)0);
230 233 } else {
231 234 (void) bdev_strategy(cb);
232 235 }
233 236 }
234 237
235 238 } while (nb_left);
236 239 }
237 240
238 241 static void
239 242 writelog(ml_unit_t *ul, buf_t *bp)
240 243 {
241 244 ASSERT(SEMA_HELD(&bp->b_sem));
242 245
243 246 /*
244 247 * This is really a B_ASYNC write but we want Presto to
245 248 * cache this write. The iodone routine, logdone, processes
246 249 * the buf correctly.
247 250 */
248 251 bp->b_flags = B_WRITE;
249 252 bp->b_edev = ul->un_dev;
250 253 bp->b_iodone = logdone;
251 254
252 255 /*
253 256 * return EIO for every IO if in hard error state
254 257 */
255 258 if (ul->un_flags & LDL_ERROR) {
256 259 bp->b_flags |= B_ERROR;
257 260 bp->b_error = EIO;
258 261 biodone(bp);
259 262 return;
260 263 }
261 264
262 265 ldl_strategy(ul, bp);
263 266 }
264 267
265 268 static void
266 269 readlog(ml_unit_t *ul, buf_t *bp)
267 270 {
268 271 ASSERT(SEMA_HELD(&bp->b_sem));
269 272 ASSERT(bp->b_bcount);
270 273
271 274 bp->b_flags = B_READ;
272 275 bp->b_edev = ul->un_dev;
273 276 bp->b_iodone = logdone;
274 277
275 278 /* all IO returns errors when in error state */
276 279 if (ul->un_flags & LDL_ERROR) {
277 280 bp->b_flags |= B_ERROR;
278 281 bp->b_error = EIO;
279 282 biodone(bp);
280 283 (void) trans_wait(bp);
281 284 return;
282 285 }
283 286
284 287 ldl_strategy(ul, bp);
285 288
286 289 if (trans_wait(bp))
287 290 ldl_seterror(ul, "Error reading ufs log");
288 291 }
289 292
290 293 /*
291 294 * NOTE: writers are single threaded thru the log layer.
292 295 * This means we can safely reference and change the cb and bp fields
293 296 * that ldl_read does not reference w/o holding the cb_rwlock or
294 297 * the bp makebusy lock.
295 298 */
296 299 static void
297 300 push_dirty_bp(ml_unit_t *ul, buf_t *bp)
298 301 {
299 302 buf_t *newbp;
300 303 cirbuf_t *cb = &ul->un_wrbuf;
301 304
302 305 ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
303 306 ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);
304 307
305 308 /*
306 309 * async write the buf
307 310 */
308 311 writelog(ul, bp);
309 312
310 313 /*
311 314 * no longer filling any buf
312 315 */
313 316 cb->cb_dirty = NULL;
314 317
315 318 /*
316 319 * no extra buffer space; all done
317 320 */
318 321 if (bp->b_bcount == bp->b_bufsize)
319 322 return;
320 323
321 324 /*
322 325 * give extra buffer space to a new bp
323 326 * try to take buf off of free list
324 327 */
325 328 if ((newbp = cb->cb_free) != NULL) {
326 329 cb->cb_free = newbp->b_forw;
327 330 } else {
328 331 newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
329 332 sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
330 333 sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
331 334 }
332 335 newbp->b_flags = 0;
333 336 newbp->b_bcount = 0;
334 337 newbp->b_file = NULL;
335 338 newbp->b_offset = -1;
336 339 newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
337 340 newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
338 341 bp->b_bufsize = bp->b_bcount;
339 342
340 343 /*
341 344 * lock out readers and put new buf at LRU position
342 345 */
343 346 rw_enter(&cb->cb_rwlock, RW_WRITER);
344 347 newbp->b_forw = bp->b_forw;
345 348 newbp->b_back = bp;
346 349 bp->b_forw->b_back = newbp;
347 350 bp->b_forw = newbp;
348 351 rw_exit(&cb->cb_rwlock);
349 352 }
350 353
351 354 static void
352 355 inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
353 356 {
354 357 buf_t *bp;
355 358 off_t elof = lof + nb;
356 359 off_t buflof;
357 360 off_t bufelof;
358 361
359 362 /*
360 363 * discard all bufs that overlap the range (lof, lof + nb)
361 364 */
362 365 rw_enter(&cb->cb_rwlock, RW_WRITER);
363 366 bp = cb->cb_bp;
364 367 do {
365 368 if (bp == cb->cb_dirty || bp->b_bcount == 0) {
366 369 bp = bp->b_forw;
367 370 continue;
368 371 }
369 372 buflof = dbtob(bp->b_blkno);
370 373 bufelof = buflof + bp->b_bcount;
371 374 if ((buflof < lof && bufelof <= lof) ||
372 375 (buflof >= elof && bufelof > elof)) {
373 376 bp = bp->b_forw;
374 377 continue;
375 378 }
376 379 makebusy(ul, bp);
377 380 bp->b_flags = 0;
378 381 bp->b_bcount = 0;
379 382 sema_v(&bp->b_sem);
380 383 bp = bp->b_forw;
381 384 } while (bp != cb->cb_bp);
382 385 rw_exit(&cb->cb_rwlock);
383 386 }
384 387
385 388 /*
386 389 * NOTE: writers are single threaded thru the log layer.
387 390 * This means we can safely reference and change the cb and bp fields
388 391 * that ldl_read does not reference w/o holding the cb_rwlock or
389 392 * the bp makebusy lock.
390 393 */
391 394 static buf_t *
392 395 get_write_bp(ml_unit_t *ul)
393 396 {
394 397 cirbuf_t *cb = &ul->un_wrbuf;
395 398 buf_t *bp;
396 399
397 400 /*
398 401 * cb_dirty is the buffer we are currently filling; if any
399 402 */
400 403 if ((bp = cb->cb_dirty) != NULL) {
401 404 makebusy(ul, bp);
402 405 return (bp);
403 406 }
404 407 /*
405 408 * discard any bp that overlaps the current tail since we are
406 409 * about to overwrite it.
407 410 */
408 411 inval_range(ul, cb, ul->un_tail_lof, 1);
409 412
410 413 /*
411 414 * steal LRU buf
412 415 */
413 416 rw_enter(&cb->cb_rwlock, RW_WRITER);
414 417 bp = cb->cb_bp->b_forw;
415 418 makebusy(ul, bp);
416 419
417 420 cb->cb_dirty = bp;
418 421 cb->cb_bp = bp;
419 422
420 423 bp->b_flags = 0;
421 424 bp->b_bcount = 0;
422 425 bp->b_blkno = btodb(ul->un_tail_lof);
423 426 ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
424 427 rw_exit(&cb->cb_rwlock);
425 428
426 429 /*
427 430 * NOTE:
428 431 * 1. un_tail_lof never addresses >= un_eol_lof
429 432 * 2. b_blkno + btodb(b_bufsize) may > un_eol_lof
430 433 * this case is handled in storebuf
431 434 */
432 435 return (bp);
433 436 }
434 437
435 438 void
436 439 alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
437 440 {
438 441 int i;
439 442 buf_t *bp;
440 443
441 444 /*
442 445 * Clear previous allocation
443 446 */
444 447 if (cb->cb_nb)
445 448 free_cirbuf(cb);
446 449
447 450 bzero(cb, sizeof (*cb));
448 451 rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
449 452
450 453 rw_enter(&cb->cb_rwlock, RW_WRITER);
451 454
452 455 /*
453 456 * preallocate 3 bp's and put them on the free list.
454 457 */
455 458 for (i = 0; i < 3; ++i) {
456 459 bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
457 460 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
458 461 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
459 462 bp->b_offset = -1;
460 463 bp->b_forw = cb->cb_free;
461 464 cb->cb_free = bp;
462 465 }
463 466
464 467 cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
465 468 cb->cb_nb = bufsize;
466 469
467 470 /*
468 471 * first bp claims entire write buffer
469 472 */
470 473 bp = cb->cb_free;
471 474 cb->cb_free = bp->b_forw;
472 475
473 476 bp->b_forw = bp;
474 477 bp->b_back = bp;
475 478 cb->cb_bp = bp;
476 479 bp->b_un.b_addr = cb->cb_va;
477 480 bp->b_bufsize = cb->cb_nb;
478 481
479 482 rw_exit(&cb->cb_rwlock);
480 483 }
481 484
482 485 void
483 486 alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
484 487 {
485 488 caddr_t va;
486 489 size_t nb;
487 490 buf_t *bp;
488 491
489 492 /*
490 493 * Clear previous allocation
491 494 */
492 495 if (cb->cb_nb)
493 496 free_cirbuf(cb);
494 497
495 498 bzero(cb, sizeof (*cb));
496 499 rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);
497 500
498 501 rw_enter(&cb->cb_rwlock, RW_WRITER);
499 502
500 503 cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
501 504 cb->cb_nb = bufsize;
502 505
503 506 /*
504 507 * preallocate N bufs that are hard-sized to blksize
505 508 * in other words, the read buffer pool is a linked list
506 509 * of statically sized bufs.
507 510 */
508 511 va = cb->cb_va;
509 512 while ((nb = bufsize) != 0) {
510 513 if (nb > blksize)
511 514 nb = blksize;
512 515 bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
513 516 bzero(bp, sizeof (buf_t));
514 517 sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
515 518 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
516 519 bp->b_un.b_addr = va;
517 520 bp->b_bufsize = nb;
518 521 if (cb->cb_bp) {
519 522 bp->b_forw = cb->cb_bp->b_forw;
520 523 bp->b_back = cb->cb_bp;
521 524 cb->cb_bp->b_forw->b_back = bp;
522 525 cb->cb_bp->b_forw = bp;
523 526 } else
524 527 bp->b_forw = bp->b_back = bp;
525 528 cb->cb_bp = bp;
526 529 bufsize -= nb;
527 530 va += nb;
528 531 }
529 532
530 533 rw_exit(&cb->cb_rwlock);
531 534 }
532 535
533 536 void
534 537 free_cirbuf(cirbuf_t *cb)
535 538 {
536 539 buf_t *bp;
537 540
538 541 if (cb->cb_nb == 0)
539 542 return;
540 543
541 544 rw_enter(&cb->cb_rwlock, RW_WRITER);
542 545 ASSERT(cb->cb_dirty == NULL);
543 546
544 547 /*
545 548 * free the active bufs
546 549 */
547 550 while ((bp = cb->cb_bp) != NULL) {
548 551 if (bp == bp->b_forw)
549 552 cb->cb_bp = NULL;
550 553 else
551 554 cb->cb_bp = bp->b_forw;
552 555 bp->b_back->b_forw = bp->b_forw;
553 556 bp->b_forw->b_back = bp->b_back;
554 557 sema_destroy(&bp->b_sem);
555 558 sema_destroy(&bp->b_io);
556 559 kmem_free(bp, sizeof (buf_t));
557 560 }
558 561
559 562 /*
560 563 * free the free bufs
561 564 */
562 565 while ((bp = cb->cb_free) != NULL) {
563 566 cb->cb_free = bp->b_forw;
564 567 sema_destroy(&bp->b_sem);
565 568 sema_destroy(&bp->b_io);
566 569 kmem_free(bp, sizeof (buf_t));
567 570 }
568 571 kmem_free(cb->cb_va, cb->cb_nb);
569 572 cb->cb_va = NULL;
570 573 cb->cb_nb = 0;
571 574 rw_exit(&cb->cb_rwlock);
572 575 rw_destroy(&cb->cb_rwlock);
573 576 }
574 577
575 578 static int
576 579 within_range(off_t lof, daddr_t blkno, ulong_t bcount)
577 580 {
578 581 off_t blof = dbtob(blkno);
579 582
580 583 return ((lof >= blof) && (lof < (blof + bcount)));
581 584 }
582 585
583 586 static buf_t *
584 587 find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
585 588 {
586 589 buf_t *bp;
587 590
588 591 /*
589 592 * find a buf that contains the offset lof
590 593 */
591 594 rw_enter(&cb->cb_rwlock, RW_READER);
592 595 bp = cb->cb_bp;
593 596 do {
594 597 if (bp->b_bcount &&
595 598 within_range(lof, bp->b_blkno, bp->b_bcount)) {
596 599 makebusy(ul, bp);
597 600 rw_exit(&cb->cb_rwlock);
598 601 return (bp);
599 602 }
600 603 bp = bp->b_forw;
601 604 } while (bp != cb->cb_bp);
602 605 rw_exit(&cb->cb_rwlock);
603 606
604 607 return (NULL);
605 608 }
606 609
607 610 static off_t
608 611 find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
609 612 {
610 613 buf_t *bp, *bpend;
611 614 off_t rlof;
612 615
613 616 /*
614 617 * we mustn't:
615 618 * o read past eol
616 619 * o read past the tail
617 620 * o read data that may be being written.
618 621 */
619 622 rw_enter(&cb->cb_rwlock, RW_READER);
620 623 bpend = bp = cb->cb_bp->b_forw;
621 624 rlof = ul->un_tail_lof;
622 625 do {
623 626 if (bp->b_bcount) {
624 627 rlof = dbtob(bp->b_blkno);
625 628 break;
626 629 }
627 630 bp = bp->b_forw;
628 631 } while (bp != bpend);
629 632 rw_exit(&cb->cb_rwlock);
630 633
631 634 if (lof <= rlof)
632 635 /* lof is prior to the range represented by the write buf */
633 636 return (rlof);
634 637 else
635 638 /* lof follows the range represented by the write buf */
636 639 return ((off_t)ul->un_eol_lof);
637 640 }
638 641
639 642 static buf_t *
640 643 get_read_bp(ml_unit_t *ul, off_t lof)
641 644 {
642 645 cirbuf_t *cb;
643 646 buf_t *bp;
644 647 off_t rlof;
645 648
646 649 /*
647 650 * retrieve as much data as possible from the incore buffers
648 651 */
649 652 if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
650 653 logstats.ls_lreadsinmem.value.ui64++;
651 654 return (bp);
652 655 }
653 656 if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
654 657 logstats.ls_lreadsinmem.value.ui64++;
655 658 return (bp);
656 659 }
657 660
658 661 /*
659 662 * steal the LRU buf
660 663 */
661 664 cb = &ul->un_rdbuf;
662 665 rw_enter(&cb->cb_rwlock, RW_WRITER);
663 666 bp = cb->cb_bp->b_forw;
664 667 makebusy(ul, bp);
665 668 bp->b_flags = 0;
666 669 bp->b_bcount = 0;
667 670 cb->cb_bp = bp;
668 671 rw_exit(&cb->cb_rwlock);
669 672
670 673 /*
671 674 * don't read past the tail or the end-of-log
672 675 */
673 676 bp->b_blkno = btodb(lof);
674 677 lof = dbtob(bp->b_blkno);
675 678 rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
676 679 bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
677 680 readlog(ul, bp);
678 681 return (bp);
679 682 }
680 683
681 684 /*
682 685 * NOTE: writers are single threaded thru the log layer.
683 686 * This means we can safely reference and change the cb and bp fields
684 687 * that ldl_read does not reference w/o holding the cb_rwlock or
685 688 * the bp makebusy lock.
686 689 */
687 690 static int
688 691 extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
689 692 {
690 693 buf_t *bpforw = bp->b_forw;
691 694
692 695 ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
693 696
694 697 /*
695 698 * there is no `next' bp; do nothing
696 699 */
697 700 if (bpforw == bp)
698 701 return (0);
699 702
700 703 /*
701 704 * buffer space is not adjacent; do nothing
702 705 */
703 706 if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
704 707 return (0);
705 708
706 709 /*
707 710 * locking protocol requires giving up any bp locks before
708 711 * acquiring cb_rwlock. This is okay because we hold
709 712 * un_log_mutex.
710 713 */
711 714 sema_v(&bp->b_sem);
712 715
713 716 /*
714 717 * lock out ldl_read
715 718 */
716 719 rw_enter(&cb->cb_rwlock, RW_WRITER);
717 720
718 721 /*
719 722 * wait for current IO to finish w/next bp; if necessary
720 723 */
721 724 makebusy(ul, bpforw);
722 725
723 726 /*
724 727 * free the next bp and steal its space
725 728 */
726 729 bp->b_forw = bpforw->b_forw;
727 730 bpforw->b_forw->b_back = bp;
728 731 bp->b_bufsize += bpforw->b_bufsize;
729 732 sema_v(&bpforw->b_sem);
730 733 bpforw->b_forw = cb->cb_free;
731 734 cb->cb_free = bpforw;
732 735 makebusy(ul, bp);
733 736 rw_exit(&cb->cb_rwlock);
734 737
735 738 return (1);
736 739 }
737 740
738 741 static size_t
739 742 storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
740 743 {
741 744 size_t copy_nb;
742 745 size_t nb_in_sec;
743 746 sect_trailer_t *st;
744 747 size_t nb_left = nb;
745 748 cirbuf_t *cb = &ul->un_wrbuf;
746 749
747 750 again:
748 751 nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
749 752 copy_nb = MIN(nb_left, nb_in_sec);
750 753
751 754 ASSERT(copy_nb);
752 755
753 756 bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
754 757 bp->b_bcount += copy_nb;
755 758 va += copy_nb;
756 759 nb_left -= copy_nb;
757 760 ul->un_tail_lof += copy_nb;
758 761
759 762 if ((nb_in_sec -= copy_nb) == 0) {
760 763 st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);
761 764
762 765 st->st_tid = ul->un_logmap->mtm_tid;
763 766 st->st_ident = ul->un_tail_ident++;
764 767 bp->b_bcount += sizeof (sect_trailer_t);
765 768 ul->un_tail_lof += sizeof (sect_trailer_t);
766 769 /*
767 770 * log wrapped; async write this bp
768 771 */
769 772 if (ul->un_tail_lof == ul->un_eol_lof) {
770 773 ul->un_tail_lof = ul->un_bol_lof;
771 774 push_dirty_bp(ul, bp);
772 775 return (nb - nb_left);
773 776 }
774 777 /*
775 778 * out of bp space; get more or async write buf
776 779 */
777 780 if (bp->b_bcount == bp->b_bufsize) {
778 781 if (!extend_write_bp(ul, cb, bp)) {
779 782 push_dirty_bp(ul, bp);
780 783 return (nb - nb_left);
781 784 }
782 785 }
783 786 }
784 787 if (nb_left)
785 788 goto again;
786 789
787 790 sema_v(&bp->b_sem);
788 791 return (nb);
789 792 }
790 793
791 794 static void
792 795 fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
793 796 {
794 797 offset_t src_mof = me->me_mof;
795 798 size_t src_nb = me->me_nb;
796 799
797 800 if (src_mof > dst_mof) {
798 801 ASSERT(src_mof < (dst_mof + dst_nb));
799 802 dst_va += (src_mof - dst_mof);
800 803 dst_nb -= (src_mof - dst_mof);
801 804 } else {
802 805 ASSERT(dst_mof < (src_mof + src_nb));
803 806 src_nb -= (dst_mof - src_mof);
804 807 }
805 808
806 809 src_nb = MIN(src_nb, dst_nb);
807 810 ASSERT(src_nb);
808 811 bzero(dst_va, src_nb);
809 812 }
810 813
811 814 /*
812 815 * dst_va == NULL means don't copy anything
813 816 */
814 817 static ulong_t
815 818 fetchbuf(
816 819 ml_unit_t *ul,
817 820 buf_t *bp,
818 821 caddr_t dst_va,
819 822 size_t dst_nb,
820 823 off_t *dst_lofp)
821 824 {
822 825 caddr_t copy_va;
823 826 size_t copy_nb;
824 827 size_t nb_sec;
825 828 off_t dst_lof = *dst_lofp;
826 829 ulong_t sav_dst_nb = dst_nb;
827 830 ulong_t src_nb = bp->b_bcount;
828 831 off_t src_lof = dbtob(bp->b_blkno);
829 832 off_t src_elof = src_lof + src_nb;
830 833 caddr_t src_va = bp->b_un.b_addr;
831 834
832 835 /*
833 836 * copy from bp to dst_va
834 837 */
835 838 while (dst_nb) {
836 839 /*
837 840 * compute address within bp
838 841 */
839 842 copy_va = src_va + (dst_lof - src_lof);
840 843
841 844 /*
842 845 * adjust copy size to amount of data in bp
843 846 */
844 847 copy_nb = MIN(dst_nb, src_elof - dst_lof);
845 848
846 849 /*
847 850 * adjust copy size to amount of data in sector
848 851 */
849 852 nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
850 853 copy_nb = MIN(copy_nb, nb_sec);
851 854
852 855 /*
853 856 * dst_va == NULL means don't do copy (see logseek())
854 857 */
855 858 if (dst_va) {
856 859 bcopy(copy_va, dst_va, copy_nb);
857 860 dst_va += copy_nb;
858 861 }
859 862 dst_lof += copy_nb;
860 863 dst_nb -= copy_nb;
861 864 nb_sec -= copy_nb;
862 865
863 866 /*
864 867 * advance over sector trailer
865 868 */
866 869 if (nb_sec == 0)
867 870 dst_lof += sizeof (sect_trailer_t);
868 871
869 872 /*
870 873 * exhausted buffer
871 874 * return current lof for next read
872 875 */
873 876 if (dst_lof == src_elof) {
874 877 sema_v(&bp->b_sem);
875 878 if (dst_lof == ul->un_eol_lof)
876 879 dst_lof = ul->un_bol_lof;
877 880 *dst_lofp = dst_lof;
878 881 return (sav_dst_nb - dst_nb);
879 882 }
880 883 }
881 884
882 885 /*
883 886 * copy complete - return current lof
884 887 */
885 888 sema_v(&bp->b_sem);
886 889 *dst_lofp = dst_lof;
887 890 return (sav_dst_nb);
888 891 }
889 892
890 893 void
891 894 ldl_round_commit(ml_unit_t *ul)
892 895 {
893 896 int wrapped;
894 897 buf_t *bp;
895 898 sect_trailer_t *st;
896 899 size_t bcount;
897 900 cirbuf_t *cb = &ul->un_wrbuf;
898 901
899 902 /*
900 903 * if nothing to write; then do nothing
901 904 */
902 905 if ((bp = cb->cb_dirty) == NULL)
903 906 return;
904 907 makebusy(ul, bp);
905 908
906 909 /*
907 910 * round up to sector boundary and set new tail
908 911 * don't readjust st_ident if buf is already rounded
909 912 */
910 913 bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
911 914 if (bcount == bp->b_bcount) {
912 915 sema_v(&bp->b_sem);
913 916 return;
914 917 }
915 918 bp->b_bcount = bcount;
916 919 ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
917 920 wrapped = 0;
918 921 if (ul->un_tail_lof == ul->un_eol_lof) {
919 922 ul->un_tail_lof = ul->un_bol_lof;
920 923 ++wrapped;
921 924 }
922 925 ASSERT(ul->un_tail_lof != ul->un_head_lof);
923 926
924 927 /*
925 928 * fix up the sector trailer
926 929 */
927 930 /* LINTED */
928 931 st = (sect_trailer_t *)
929 932 ((bp->b_un.b_addr + bcount) - sizeof (*st));
930 933 st->st_tid = ul->un_logmap->mtm_tid;
931 934 st->st_ident = ul->un_tail_ident++;
932 935
933 936 /*
934 937 * if tail wrapped or we have exhausted this buffer
935 938 * async write the buffer
936 939 */
937 940 if (wrapped || bcount == bp->b_bufsize)
938 941 push_dirty_bp(ul, bp);
939 942 else
940 943 sema_v(&bp->b_sem);
941 944 }
942 945
943 946 void
944 947 ldl_push_commit(ml_unit_t *ul)
945 948 {
946 949 buf_t *bp;
947 950 cirbuf_t *cb = &ul->un_wrbuf;
948 951
949 952 /*
950 953 * if nothing to write; then do nothing
951 954 */
952 955 if ((bp = cb->cb_dirty) == NULL)
953 956 return;
954 957 makebusy(ul, bp);
955 958 push_dirty_bp(ul, bp);
956 959 }
957 960
958 961 int
959 962 ldl_need_commit(ml_unit_t *ul)
960 963 {
961 964 return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
962 965 }
963 966
964 967 int
965 968 ldl_has_space(ml_unit_t *ul, mapentry_t *me)
966 969 {
967 970 off_t nfb;
968 971 off_t nb;
969 972
970 973 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
971 974
972 975 /*
973 976 * Add up the size used by the deltas
974 977 * round nb up to a sector length plus an extra sector
975 978 * w/o the extra sector we couldn't distinguish
976 979 * a full log (head == tail) from an empty log (head == tail)
977 980 */
978 981 for (nb = DEV_BSIZE; me; me = me->me_hash) {
979 982 nb += sizeof (struct delta);
980 983 if (me->me_dt != DT_CANCEL)
981 984 nb += me->me_nb;
982 985 }
983 986 nb = P2ROUNDUP(nb, DEV_BSIZE);
984 987
985 988 if (ul->un_head_lof <= ul->un_tail_lof)
986 989 nfb = (ul->un_head_lof - ul->un_bol_lof) +
987 990 (ul->un_eol_lof - ul->un_tail_lof);
988 991 else
989 992 nfb = ul->un_head_lof - ul->un_tail_lof;
990 993
991 994 return (nb < nfb);
992 995 }
993 996
994 997 void
995 998 ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
996 999 {
997 1000 buf_t *bp;
998 1001 caddr_t va;
999 1002 size_t nb;
1000 1003 size_t actual;
1001 1004
1002 1005 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1003 1006
1004 1007 /* Write the delta */
1005 1008
1006 1009 nb = sizeof (struct delta);
1007 1010 va = (caddr_t)&me->me_delta;
1008 1011 bp = get_write_bp(ul);
1009 1012
1010 1013 while (nb) {
1011 1014 if (ul->un_flags & LDL_ERROR) {
1012 1015 sema_v(&bp->b_sem);
1013 1016 return;
1014 1017 }
1015 1018 actual = storebuf(ul, bp, va, nb);
1016 1019 ASSERT(actual);
1017 1020 va += actual;
1018 1021 nb -= actual;
1019 1022 if (nb)
1020 1023 bp = get_write_bp(ul);
1021 1024 }
1022 1025
1023 1026 /* If a commit, cancel, or 0's; we're almost done */
1024 1027 switch (me->me_dt) {
1025 1028 case DT_COMMIT:
1026 1029 case DT_CANCEL:
1027 1030 case DT_ABZERO:
1028 1031 /* roll needs to know where the next delta will go */
1029 1032 me->me_lof = ul->un_tail_lof;
1030 1033 return;
1031 1034 default:
1032 1035 break;
1033 1036 }
1034 1037
1035 1038 /* Now write the data */
1036 1039
1037 1040 ASSERT(me->me_nb != 0);
1038 1041
1039 1042 nb = me->me_nb;
1040 1043 va = (me->me_mof - bufmof) + bufp;
1041 1044 bp = get_write_bp(ul);
1042 1045
1043 1046 /* Save where we will put the data */
1044 1047 me->me_lof = ul->un_tail_lof;
1045 1048
1046 1049 while (nb) {
1047 1050 if (ul->un_flags & LDL_ERROR) {
1048 1051 sema_v(&bp->b_sem);
1049 1052 return;
1050 1053 }
1051 1054 actual = storebuf(ul, bp, va, nb);
1052 1055 ASSERT(actual);
1053 1056 va += actual;
1054 1057 nb -= actual;
1055 1058 if (nb)
1056 1059 bp = get_write_bp(ul);
1057 1060 }
1058 1061 }
1059 1062
1060 1063 void
1061 1064 ldl_waito(ml_unit_t *ul)
1062 1065 {
1063 1066 buf_t *bp;
1064 1067 cirbuf_t *cb = &ul->un_wrbuf;
1065 1068
1066 1069 rw_enter(&cb->cb_rwlock, RW_WRITER);
1067 1070 /*
1068 1071 * wait on them
1069 1072 */
1070 1073 bp = cb->cb_bp;
1071 1074 do {
1072 1075 if ((bp->b_flags & B_DONE) == 0) {
1073 1076 makebusy(ul, bp);
1074 1077 sema_v(&bp->b_sem);
1075 1078 }
1076 1079 bp = bp->b_forw;
1077 1080 } while (bp != cb->cb_bp);
1078 1081 rw_exit(&cb->cb_rwlock);
1079 1082 }
1080 1083
1081 1084 /*
1082 1085 * seek nb bytes from location lof
1083 1086 */
1084 1087 static int
1085 1088 logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
1086 1089 {
1087 1090 buf_t *bp;
1088 1091 ulong_t actual;
1089 1092
1090 1093 while (nb) {
1091 1094 bp = get_read_bp(ul, lof);
1092 1095 if (bp->b_flags & B_ERROR) {
1093 1096 sema_v(&bp->b_sem);
1094 1097 return (EIO);
1095 1098 }
1096 1099 actual = fetchbuf(ul, bp, NULL, nb, &lof);
1097 1100 ASSERT(actual);
1098 1101 nb -= actual;
1099 1102 }
1100 1103 *lofp = lof;
1101 1104 ASSERT(nb == 0);
1102 1105 return (0);
1103 1106 }
1104 1107
1105 1108 int
1106 1109 ldl_read(
1107 1110 ml_unit_t *ul, /* Log unit */
1108 1111 caddr_t va, /* address of buffer to read into */
1109 1112 offset_t mof, /* mof of buffer */
1110 1113 off_t nb, /* length of buffer */
1111 1114 mapentry_t *me) /* Map entry list */
1112 1115 {
1113 1116 buf_t *bp;
1114 1117 crb_t *crb;
1115 1118 caddr_t rva; /* address to read into */
1116 1119 size_t rnb; /* # of bytes to read */
1117 1120 off_t lof; /* log device offset to read from */
1118 1121 off_t skip;
1119 1122 ulong_t actual;
1120 1123 int error;
1121 1124 caddr_t eva = va + nb; /* end of buffer */
1122 1125
1123 1126 for (; me; me = me->me_agenext) {
1124 1127 ASSERT(me->me_dt != DT_CANCEL);
1125 1128
1126 1129 /*
1127 1130 * check for a cached roll buffer
1128 1131 */
1129 1132 crb = me->me_crb;
1130 1133 if (crb) {
1131 1134 if (mof > crb->c_mof) {
1132 1135 /*
1133 1136 * This mapentry overlaps with the beginning of
1134 1137 * the supplied buffer
1135 1138 */
1136 1139 skip = mof - crb->c_mof;
1137 1140 bcopy(crb->c_buf + skip, va,
1138 1141 MIN(nb, crb->c_nb - skip));
1139 1142 } else {
1140 1143 /*
1141 1144 * This mapentry starts at or after
1142 1145 * the supplied buffer.
1143 1146 */
1144 1147 skip = crb->c_mof - mof;
1145 1148 bcopy(crb->c_buf, va + skip,
1146 1149 MIN(crb->c_nb, nb - skip));
1147 1150 }
1148 1151 logstats.ls_lreadsinmem.value.ui64++;
1149 1152 continue;
1150 1153 }
1151 1154
1152 1155 /*
1153 1156 * check for a delta full of zeroes - there's no log data
1154 1157 */
1155 1158 if (me->me_dt == DT_ABZERO) {
1156 1159 fetchzeroes(va, mof, nb, me);
1157 1160 continue;
1158 1161 }
1159 1162
1160 1163 if (mof > me->me_mof) {
1161 1164 rnb = (size_t)(mof - me->me_mof);
1162 1165 error = logseek(ul, me->me_lof, rnb, &lof);
1163 1166 if (error)
1164 1167 return (EIO);
1165 1168 rva = va;
1166 1169 rnb = me->me_nb - rnb;
1167 1170 rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
1168 1171 } else {
1169 1172 lof = me->me_lof;
1170 1173 rva = (me->me_mof - mof) + va;
1171 1174 rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
1172 1175 }
1173 1176
1174 1177 while (rnb) {
1175 1178 bp = get_read_bp(ul, lof);
1176 1179 if (bp->b_flags & B_ERROR) {
1177 1180 sema_v(&bp->b_sem);
1178 1181 return (EIO);
1179 1182 }
1180 1183 ASSERT(((me->me_flags & ME_ROLL) == 0) ||
1181 1184 (bp != ul->un_wrbuf.cb_dirty));
1182 1185 actual = fetchbuf(ul, bp, rva, rnb, &lof);
1183 1186 ASSERT(actual);
1184 1187 rva += actual;
1185 1188 rnb -= actual;
1186 1189 }
1187 1190 }
1188 1191 return (0);
1189 1192 }
1190 1193
1191 1194 void
1192 1195 ldl_savestate(ml_unit_t *ul)
1193 1196 {
1194 1197 int error;
1195 1198 buf_t *bp = ul->un_bp;
1196 1199 ml_odunit_t *ud = (void *)bp->b_un.b_addr;
1197 1200 ml_odunit_t *ud2 = (void *)(bp->b_un.b_addr + DEV_BSIZE);
1198 1201
1199 1202 #if DEBUG
1200 1203 /*
1201 1204 * Scan test is running; don't update intermediate state
1202 1205 */
1203 1206 if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
1204 1207 return;
1205 1208 #endif /* DEBUG */
1206 1209
1207 1210 mutex_enter(&ul->un_state_mutex);
1208 1211 bcopy(&ul->un_ondisk, ud, sizeof (*ud));
1209 1212 ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
1210 1213 bcopy(ud, ud2, sizeof (*ud));
1211 1214
1212 1215 /* If a snapshot is enabled write through the snapshot driver. */
1213 1216 if (ul->un_ufsvfs->vfs_snapshot)
1214 1217 UFS_BWRITE2(ul->un_ufsvfs, bp);
1215 1218 else
1216 1219 BWRITE2(bp);
1217 1220 logstats.ls_ldlwrites.value.ui64++;
1218 1221 error = bp->b_flags & B_ERROR;
1219 1222 mutex_exit(&ul->un_state_mutex);
1220 1223 if (error)
1221 1224 ldl_seterror(ul, "Error writing ufs log state");
1222 1225 }
1223 1226
1224 1227 /*
1225 1228 * The head will be set to (new_lof - header) since ldl_sethead is
1226 1229 * called with the new_lof of the data portion of a delta.
1227 1230 */
1228 1231 void
1229 1232 ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
1230 1233 {
1231 1234 off_t nb;
1232 1235 off_t new_lof;
1233 1236 uint32_t new_ident;
1234 1237 daddr_t beg_blkno;
1235 1238 daddr_t end_blkno;
1236 1239
1237 1240 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1238 1241
1239 1242 if (data_lof == -1) {
1240 1243 /* log is empty */
1241 1244 new_ident = lufs_hd_genid(ul);
1242 1245 new_lof = ul->un_tail_lof;
1243 1246
1244 1247 } else {
1245 1248 /* compute header's lof */
1246 1249 new_ident = ul->un_head_ident;
1247 1250 new_lof = data_lof - sizeof (struct delta);
1248 1251
1249 1252 /* whoops, header spans sectors; subtract out sector trailer */
1250 1253 if (btodb(new_lof) != btodb(data_lof))
1251 1254 new_lof -= sizeof (sect_trailer_t);
1252 1255
1253 1256 /* whoops, header wrapped the log; go to last sector */
1254 1257 if (new_lof < ul->un_bol_lof) {
1255 1258 /* sector offset */
1256 1259 new_lof -= dbtob(btodb(new_lof));
1257 1260 /* add to last sector's lof */
1258 1261 new_lof += (ul->un_eol_lof - DEV_BSIZE);
1259 1262 }
1260 1263 ul->un_head_tid = tid;
1261 1264 }
1262 1265
1263 1266 /*
1264 1267 * check for nop
1265 1268 */
1266 1269 if (new_lof == ul->un_head_lof)
1267 1270 return;
1268 1271
1269 1272 /*
1270 1273 * invalidate the affected bufs and calculate new ident
1271 1274 */
1272 1275 if (new_lof > ul->un_head_lof) {
1273 1276 nb = new_lof - ul->un_head_lof;
1274 1277 inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1275 1278 inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1276 1279
1277 1280 end_blkno = btodb(new_lof);
1278 1281 beg_blkno = btodb(ul->un_head_lof);
1279 1282 new_ident += (end_blkno - beg_blkno);
1280 1283 } else {
1281 1284 nb = ul->un_eol_lof - ul->un_head_lof;
1282 1285 inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
1283 1286 inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);
1284 1287
1285 1288 end_blkno = btodb(ul->un_eol_lof);
1286 1289 beg_blkno = btodb(ul->un_head_lof);
1287 1290 new_ident += (end_blkno - beg_blkno);
1288 1291
1289 1292 nb = new_lof - ul->un_bol_lof;
1290 1293 inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
1291 1294 inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);
1292 1295
1293 1296 end_blkno = btodb(new_lof);
1294 1297 beg_blkno = btodb(ul->un_bol_lof);
1295 1298 new_ident += (end_blkno - beg_blkno);
1296 1299 }
1297 1300 /*
1298 1301 * don't update the head if there has been an error
1299 1302 */
1300 1303 if (ul->un_flags & LDL_ERROR)
1301 1304 return;
1302 1305
1303 1306 /* Fix up the head and ident */
1304 1307 ASSERT(new_lof >= ul->un_bol_lof);
1305 1308 ul->un_head_lof = new_lof;
1306 1309 ul->un_head_ident = new_ident;
1307 1310 if (data_lof == -1) {
1308 1311 ul->un_tail_ident = ul->un_head_ident;
1309 1312 }
1310 1313
1311 1314
1312 1315 /* Commit to the database */
1313 1316 ldl_savestate(ul);
1314 1317
1315 1318 ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
1316 1319 ldl_sethead_debug(ul));
1317 1320 }
1318 1321
1319 1322 /*
1320 1323 * The tail will be set to the sector following lof+nb
1321 1324 * lof + nb == size of the last delta + commit record
1322 1325 * this function is called once after the log scan has completed.
1323 1326 */
1324 1327 void
1325 1328 ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
1326 1329 {
1327 1330 off_t new_lof;
1328 1331 uint32_t new_ident;
1329 1332 daddr_t beg_blkno;
1330 1333 daddr_t end_blkno;
1331 1334
1332 1335 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1333 1336
1334 1337 if (lof == -1) {
1335 1338 ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
1336 1339 ul->un_head_lof = ul->un_tail_lof;
1337 1340 ul->un_head_ident = lufs_hd_genid(ul);
1338 1341 ul->un_tail_ident = ul->un_head_ident;
1339 1342
1340 1343 /* Commit to the database */
1341 1344 ldl_savestate(ul);
1342 1345
1343 1346 return;
1344 1347 }
1345 1348
1346 1349 /*
1347 1350 * new_lof is the offset of the sector following the last commit
1348 1351 */
1349 1352 (void) logseek(ul, lof, nb, &new_lof);
1350 1353 ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));
1351 1354
1352 1355 /*
1353 1356 * calculate new ident
1354 1357 */
1355 1358 if (new_lof > ul->un_head_lof) {
1356 1359 end_blkno = btodb(new_lof);
1357 1360 beg_blkno = btodb(ul->un_head_lof);
1358 1361 new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1359 1362 } else {
1360 1363 end_blkno = btodb(ul->un_eol_lof);
1361 1364 beg_blkno = btodb(ul->un_head_lof);
1362 1365 new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
1363 1366
1364 1367 end_blkno = btodb(new_lof);
1365 1368 beg_blkno = btodb(ul->un_bol_lof);
1366 1369 new_ident += (end_blkno - beg_blkno);
1367 1370 }
1368 1371
1369 1372 /* Fix up the tail and ident */
1370 1373 ul->un_tail_lof = new_lof;
1371 1374 ul->un_tail_ident = new_ident;
1372 1375
1373 1376 /* Commit to the database */
1374 1377 ldl_savestate(ul);
1375 1378 }
1376 1379
1377 1380 /*
1378 1381 * LOGSCAN STUFF
1379 1382 */
1380 1383 static int
1381 1384 ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
1382 1385 {
1383 1386 ulong_t ident;
1384 1387 size_t nblk, i;
1385 1388 sect_trailer_t *st;
1386 1389
1387 1390 /*
1388 1391 * compute ident for first sector in the buffer
1389 1392 */
1390 1393 ident = ul->un_head_ident;
1391 1394 if (bp->b_blkno >= btodb(ul->un_head_lof)) {
1392 1395 ident += (bp->b_blkno - btodb(ul->un_head_lof));
1393 1396 } else {
1394 1397 ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
1395 1398 ident += (bp->b_blkno - btodb(ul->un_bol_lof));
1396 1399 }
1397 1400 /*
1398 1401 * truncate the buffer down to the last valid sector
1399 1402 */
1400 1403 nblk = btodb(bp->b_bcount);
1401 1404 bp->b_bcount = 0;
1402 1405 /* LINTED */
1403 1406 st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
1404 1407 for (i = 0; i < nblk; ++i) {
1405 1408 if (st->st_ident != ident)
1406 1409 break;
1407 1410
1408 1411 /* remember last valid tid for ldl_logscan_error() */
1409 1412 ul->un_tid = st->st_tid;
1410 1413
1411 1414 /* LINTED */
1412 1415 st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
1413 1416 ++ident;
1414 1417 bp->b_bcount += DEV_BSIZE;
1415 1418 }
1416 1419 /*
1417 1420 * make sure that lof is still within range
1418 1421 */
1419 1422 return (within_range(lof, bp->b_blkno, bp->b_bcount));
1420 1423 }
1421 1424
1422 1425 ulong_t
1423 1426 ldl_logscan_nbcommit(off_t lof)
1424 1427 {
1425 1428 /*
1426 1429 * lof is the offset following the commit header. However,
1427 1430 * if the commit header fell on the end-of-sector, then lof
1428 1431 * has already been advanced to the beginning of the next
1429 1432 * sector. So do nothing. Otherwise, return the remaining
1430 1433 * bytes in the sector.
1431 1434 */
1432 1435 if ((lof & (DEV_BSIZE - 1)) == 0)
1433 1436 return (0);
1434 1437 return (NB_LEFT_IN_SECTOR(lof));
1435 1438 }
1436 1439
1437 1440 int
1438 1441 ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
1439 1442 {
1440 1443 buf_t *bp;
1441 1444 ulong_t actual;
1442 1445
1443 1446 ASSERT(ul->un_head_lof != ul->un_tail_lof);
1444 1447
1445 1448 /*
1446 1449 * Check the log data doesn't go out of bounds
1447 1450 */
1448 1451 if (ul->un_head_lof < ul->un_tail_lof) {
1449 1452 if (!WITHIN(*lofp, nb, ul->un_head_lof,
1450 1453 (ul->un_tail_lof - ul->un_head_lof))) {
1451 1454 return (EIO);
1452 1455 }
1453 1456 } else {
1454 1457 if (OVERLAP(*lofp, nb, ul->un_tail_lof,
1455 1458 (ul->un_head_lof - ul->un_tail_lof))) {
1456 1459 return (EIO);
1457 1460 }
1458 1461 }
1459 1462
1460 1463 while (nb) {
1461 1464 bp = get_read_bp(ul, *lofp);
1462 1465 if (bp->b_flags & B_ERROR) {
1463 1466 sema_v(&bp->b_sem);
1464 1467 return (EIO);
1465 1468 }
1466 1469 /*
1467 1470 * out-of-seq idents means partial transaction
1468 1471 * panic, non-corrupting powerfail, ...
1469 1472 */
1470 1473 if (!ldl_logscan_ident(ul, bp, *lofp)) {
1471 1474 sema_v(&bp->b_sem);
1472 1475 return (EIO);
1473 1476 }
1474 1477 /*
1475 1478 * copy the header into the caller's buf
1476 1479 */
1477 1480 actual = fetchbuf(ul, bp, va, nb, lofp);
1478 1481 if (va)
1479 1482 va += actual;
1480 1483 nb -= actual;
1481 1484 }
1482 1485 return (0);
1483 1486 }
1484 1487
1485 1488 void
1486 1489 ldl_logscan_begin(ml_unit_t *ul)
1487 1490 {
1488 1491 size_t bufsize;
1489 1492
1490 1493 ASSERT(ul->un_wrbuf.cb_dirty == NULL);
1491 1494
1492 1495 /*
1493 1496 * logscan has begun
1494 1497 */
1495 1498 ul->un_flags |= LDL_SCAN;
1496 1499
1497 1500 /*
1498 1501 * reset the circular bufs
1499 1502 */
1500 1503 bufsize = ldl_bufsize(ul);
1501 1504 alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
1502 1505 alloc_wrbuf(&ul->un_wrbuf, bufsize);
1503 1506
1504 1507 /*
1505 1508 * set the tail to reflect a full log
1506 1509 */
1507 1510 ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;
1508 1511
1509 1512 if (ul->un_tail_lof < ul->un_bol_lof)
1510 1513 ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
1511 1514 if (ul->un_tail_lof >= ul->un_eol_lof)
1512 1515 ul->un_tail_lof = ul->un_bol_lof;
1513 1516
1514 1517 /*
1515 1518 * un_tid is used during error processing; it is initialized to
1516 1519 * the tid of the delta at un_head_lof;
1517 1520 */
1518 1521 ul->un_tid = ul->un_head_tid;
1519 1522 }
1520 1523
1521 1524 void
1522 1525 ldl_logscan_end(ml_unit_t *ul)
1523 1526 {
1524 1527 size_t bufsize;
1525 1528
1526 1529 /*
1527 1530 * reset the circular bufs
1528 1531 */
1529 1532 bufsize = ldl_bufsize(ul);
1530 1533 alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
1531 1534 alloc_wrbuf(&ul->un_wrbuf, bufsize);
1532 1535
1533 1536 /*
1534 1537 * Done w/scan
1535 1538 */
1536 1539 ul->un_flags &= ~LDL_SCAN;
1537 1540 }
1538 1541
1539 1542 int
1540 1543 ldl_need_roll(ml_unit_t *ul)
1541 1544 {
1542 1545 off_t busybytes;
1543 1546 off_t head;
1544 1547 off_t tail;
1545 1548 off_t bol;
1546 1549 off_t eol;
1547 1550 off_t nb;
1548 1551
1549 1552 /*
1550 1553 * snapshot the log state
1551 1554 */
1552 1555 head = ul->un_head_lof;
1553 1556 tail = ul->un_tail_lof;
1554 1557 bol = ul->un_bol_lof;
1555 1558 eol = ul->un_eol_lof;
1556 1559 nb = ul->un_logsize;
1557 1560
1558 1561 /*
1559 1562 * compute number of busy (inuse) bytes
1560 1563 */
1561 1564 if (head <= tail)
1562 1565 busybytes = tail - head;
1563 1566 else
1564 1567 busybytes = (eol - head) + (tail - bol);
1565 1568
1566 1569 /*
1567 1570 * return TRUE if > 75% full
1568 1571 */
1569 1572 return (busybytes > (nb - (nb >> 2)));
1570 1573 }
1571 1574
1572 1575 void
1573 1576 ldl_seterror(ml_unit_t *ul, char *why)
1574 1577 {
1575 1578 /*
1576 1579 * already in error state; do nothing
1577 1580 */
1578 1581 if (ul->un_flags & LDL_ERROR)
1579 1582 return;
1580 1583
1581 1584 ul->un_flags |= LDL_ERROR; /* incore */
1582 1585 ul->un_badlog = 1; /* ondisk (cleared by fsck) */
1583 1586
1584 1587 /*
1585 1588 * Commit to state sectors
1586 1589 */
1587 1590 uniqtime(&ul->un_timestamp);
1588 1591 ldl_savestate(ul);
1589 1592
1590 1593 /* Pretty print */
1591 1594 cmn_err(CE_WARN, "%s", why);
1592 1595 cmn_err(CE_WARN, "ufs log for %s changed state to Error",
1593 1596 ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1594 1597 cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
1595 1598 ul->un_ufsvfs->vfs_fs->fs_fsmnt);
1596 1599
1597 1600 /*
1598 1601 * If we aren't in the middle of scan (aka snarf); tell ufs
1599 1602 * to hard lock itself.
1600 1603 */
1601 1604 if ((ul->un_flags & LDL_SCAN) == 0)
1602 1605 ufs_trans_onerror();
1603 1606 }
1604 1607
1605 1608 size_t
1606 1609 ldl_bufsize(ml_unit_t *ul)
1607 1610 {
1608 1611 size_t bufsize;
1609 1612 extern uint32_t ldl_minbufsize;
1610 1613
1611 1614 /*
1612 1615 * initial guess is the maxtransfer value for this log device
1613 1616 * increase if too small
1614 1617 * decrease if too large
1615 1618 */
1616 1619 bufsize = dbtob(btod(ul->un_maxtransfer));
1617 1620 if (bufsize < ldl_minbufsize)
1618 1621 bufsize = ldl_minbufsize;
1619 1622 if (bufsize > maxphys)
1620 1623 bufsize = maxphys;
1621 1624 if (bufsize > ul->un_maxtransfer)
1622 1625 bufsize = ul->un_maxtransfer;
1623 1626 return (bufsize);
1624 1627 }