/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/atomic.h>
#include <sys/sunddi.h>

extern uint_t           bypass_snapshot_throttle_key;

extern struct kmem_cache        *lufs_sv;
extern struct kmem_cache        *lufs_bp;
static void
makebusy(ml_unit_t *ul, buf_t *bp)
{
        sema_p(&bp->b_sem);
        if ((bp->b_flags & B_ERROR) == 0)
                return;
        if (bp->b_flags & B_READ)
                ldl_seterror(ul, "Error reading ufs log");
        else
                ldl_seterror(ul, "Error writing ufs log");
}

static int
logdone(buf_t *bp)
{
        bp->b_flags |= B_DONE;

        if (bp->b_flags & B_WRITE)
                sema_v(&bp->b_sem);
        else
                /* wakeup the thread waiting on this buf */
                sema_v(&bp->b_io);
        return (0);
}

static int
ldl_strategy_done(buf_t *cb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *bp;

        ASSERT(SEMA_HELD(&cb->b_sem));
        ASSERT((cb->b_flags & B_DONE) == 0);

        /*
         * Compute address of the ``save'' struct
         */
        lbp = (lufs_buf_t *)cb;
        sv = (lufs_save_t *)lbp->lb_ptr;

        if (cb->b_flags & B_ERROR)
                sv->sv_error = 1;

        /*
         * If more child requests are outstanding, just free this one;
         * otherwise release the resources and ``done'' the original
         * buffer header.
         */
        if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
                kmem_cache_free(lufs_bp, lbp);
                return (1);
        }
        /* Propagate any errors back to the original buffer header */
        bp = sv->sv_bp;
        if (sv->sv_error)
                bp->b_flags |= B_ERROR;
        kmem_cache_free(lufs_bp, lbp);
        kmem_cache_free(lufs_sv, sv);

        biodone(bp);
        return (0);
}

/*
 * Map the log logical block number to a physical disk block number
 */
static int
map_frag(
        ml_unit_t       *ul,
        daddr_t         lblkno,
        size_t          bcount,
        daddr_t         *pblkno,
        size_t          *pbcount)
{
        ic_extent_t     *ext = ul->un_ebp->ic_extents;
        uint32_t        e = ul->un_ebp->ic_nextents;
        uint32_t        s = 0;
        uint32_t        i = e >> 1;
        uint32_t        lasti = i;
        uint32_t        bno_off;

again:
        if (ext[i].ic_lbno <= lblkno) {
                if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
                        /* FOUND IT */
                        bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
                        *pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
                        *pblkno = ext[i].ic_pbno + bno_off;
                        return (0);
                } else
                        s = i;
        } else
                e = i;
        i = s + ((e - s) >> 1);

        if (i == lasti) {
                *pbcount = bcount;
                return (ENOENT);
        }
        lasti = i;

        goto again;
}
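
/*
 * For illustration (hypothetical values, not part of the original
 * source): map_frag() binary searches the extent table, which is
 * sorted by ic_lbno.  Assuming 512-byte disk blocks and a single
 * extent { ic_lbno = 0, ic_nbno = 64, ic_pbno = 5000 }, a call with
 * lblkno = 10 and bcount = 64K yields bno_off = 10, *pblkno = 5010
 * and *pbcount = MIN(64K, dbtob(54)) = 27648 bytes; the rest of the
 * request must be mapped through the next extent (or fails with
 * ENOENT if there is none).
 */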

/*
 * The log is a set of extents (which typically will be only one, but
 * may be more if the disk was close to full when the log was created),
 * so the logical offsets into the log have to be translated into their
 * real device locations before calling the device's strategy routine.
 * The translation may result in several IO requests if this request
 * spans extents.
 */
void
ldl_strategy(ml_unit_t *ul, buf_t *pb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *cb;
        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        daddr_t         lblkno, pblkno;
        size_t          nb_left, pbcount;
        off_t           offset;
        dev_t           dev     = ul->un_dev;
        int             error;
        int             read = pb->b_flags & B_READ;

        /*
         * Allocate and initialise the save structure.
         */
        sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
        sv->sv_error = 0;
        sv->sv_bp = pb;
        nb_left = pb->b_bcount;
        sv->sv_nb_left = nb_left;

        lblkno = pb->b_blkno;
        offset = 0;

        do {
                error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);

                lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
                bioinit(&lbp->lb_buf);
                lbp->lb_ptr = sv;

                cb = bioclone(pb, offset, pbcount, dev,
                    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);

                offset += pbcount;
                lblkno += btodb(pbcount);
                nb_left -= pbcount;

                if (error) {
                        cb->b_flags |= B_ERROR;
                        cb->b_resid = cb->b_bcount;
                        biodone(cb);
                } else {
                        if (read) {
                                logstats.ls_ldlreads.value.ui64++;
                                ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
                                lwp_stat_update(LWP_STAT_INBLK, 1);
                        } else {
                                logstats.ls_ldlwrites.value.ui64++;
                                lwp_stat_update(LWP_STAT_OUBLK, 1);
                        }

                        /*
                         * write through the snapshot driver if necessary
                         * We do not want this write to be throttled because
                         * we are holding the un_log mutex here. If we
                         * are throttled in fssnap_translate, the fssnap_taskq
                         * thread which can wake us up can get blocked on
                         * the un_log mutex resulting in a deadlock.
                         */
                        if (ufsvfsp->vfs_snapshot) {
                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)1);
                                fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);

                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)0);
                        } else {
                                (void) bdev_strategy(cb);
                        }
                }

        } while (nb_left);
}
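
/*
 * For illustration (hypothetical values, not part of the original
 * source): with extents { ic_lbno = 0, ic_nbno = 64 } and
 * { ic_lbno = 64, ic_nbno = 128 }, a 64K request whose lblkno starts
 * 10 blocks into the first extent is split into two bioclone()d
 * children: 27648 bytes against the first extent and 37888 bytes
 * against the second.  Each child completes through
 * ldl_strategy_done(), which counts sv_nb_left down and biodone()s
 * the original buf when the last child finishes.
 */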

static void
writelog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * This is really a B_ASYNC write but we want Presto to
         * cache this write.  The iodone routine, logdone, processes
         * the buf correctly.
         */
        bp->b_flags = B_WRITE;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /*
         * return EIO for every IO if in hard error state
         */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                biodone(bp);
                return;
        }

        ldl_strategy(ul, bp);
}

static void
readlog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        ASSERT(bp->b_bcount);

        bp->b_flags = B_READ;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /* all IO returns errors when in error state */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                biodone(bp);
                (void) trans_wait(bp);
                return;
        }

        ldl_strategy(ul, bp);

        if (trans_wait(bp))
                ldl_seterror(ul, "Error reading ufs log");
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static void
push_dirty_bp(ml_unit_t *ul, buf_t *bp)
{
        buf_t           *newbp;
        cirbuf_t        *cb             = &ul->un_wrbuf;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
        ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);

        /*
         * async write the buf
         */
        writelog(ul, bp);

        /*
         * no longer filling any buf
         */
        cb->cb_dirty = NULL;

        /*
         * no extra buffer space; all done
         */
        if (bp->b_bcount == bp->b_bufsize)
                return;

        /*
         * give extra buffer space to a new bp
         *      try to take buf off of free list
         */
        if ((newbp = cb->cb_free) != NULL) {
                cb->cb_free = newbp->b_forw;
        } else {
                newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
        }
        newbp->b_flags = 0;
        newbp->b_bcount = 0;
        newbp->b_file = NULL;
        newbp->b_offset = -1;
        newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
        newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
        bp->b_bufsize = bp->b_bcount;

        /*
         * lock out readers and put new buf at LRU position
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        newbp->b_forw = bp->b_forw;
        newbp->b_back = bp;
        bp->b_forw->b_back = newbp;
        bp->b_forw = newbp;
        rw_exit(&cb->cb_rwlock);
}
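
/*
 * For illustration (hypothetical values, not part of the original
 * source): if the dirty bp owns a 16K slice of the circular buffer
 * but only 4K (sector-rounded) has been filled, push_dirty_bp()
 * writes the 4K and shrinks the bp's b_bufsize to 4K; a new bp
 * inheriting the remaining 12K (starting at b_un.b_addr + 4K) is
 * linked in at the LRU position so the space can be reused
 * immediately.
 */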

static void
inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
{
        buf_t           *bp;
        off_t           elof    = lof + nb;
        off_t           buflof;
        off_t           bufelof;

        /*
         * discard all bufs that overlap the range (lof, lof + nb)
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp;
        do {
                if (bp == cb->cb_dirty || bp->b_bcount == 0) {
                        bp = bp->b_forw;
                        continue;
                }
                buflof = dbtob(bp->b_blkno);
                bufelof = buflof + bp->b_bcount;
                if ((buflof < lof && bufelof <= lof) ||
                    (buflof >= elof && bufelof > elof)) {
                        bp = bp->b_forw;
                        continue;
                }
                makebusy(ul, bp);
                bp->b_flags = 0;
                bp->b_bcount = 0;
                sema_v(&bp->b_sem);
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static buf_t *
get_write_bp(ml_unit_t *ul)
{
        cirbuf_t        *cb = &ul->un_wrbuf;
        buf_t           *bp;

        /*
         * cb_dirty is the buffer we are currently filling, if any
         */
        if ((bp = cb->cb_dirty) != NULL) {
                makebusy(ul, bp);
                return (bp);
        }
        /*
         * discard any bp that overlaps the current tail since we are
         * about to overwrite it.
         */
        inval_range(ul, cb, ul->un_tail_lof, 1);

        /*
         * steal LRU buf
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);

        cb->cb_dirty = bp;
        cb->cb_bp = bp;

        bp->b_flags = 0;
        bp->b_bcount = 0;
        bp->b_blkno = btodb(ul->un_tail_lof);
        ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
        rw_exit(&cb->cb_rwlock);

        /*
         * NOTE:
         *      1. un_tail_lof never addresses >= un_eol_lof
         *      2. b_blkno + btodb(b_bufsize) may be > un_eol_lof
         *              this case is handled in storebuf
         */
        return (bp);
}

void
alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
{
        int     i;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * preallocate 3 bp's and put them on the free list.
         */
        for (i = 0; i < 3; ++i) {
                bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_offset = -1;
                bp->b_forw = cb->cb_free;
                cb->cb_free = bp;
        }

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * first bp claims entire write buffer
         */
        bp = cb->cb_free;
        cb->cb_free = bp->b_forw;

        bp->b_forw = bp;
        bp->b_back = bp;
        cb->cb_bp = bp;
        bp->b_un.b_addr = cb->cb_va;
        bp->b_bufsize = cb->cb_nb;

        rw_exit(&cb->cb_rwlock);
}
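
/*
 * For illustration (sketch, not part of the original source): after
 * alloc_wrbuf() the write cirbuf looks like
 *
 *      cb_bp -> bp0 (ring of one: b_un.b_addr = cb_va,
 *                    b_bufsize = cb_nb)
 *      cb_free -> bp1 -> bp2
 *
 * As push_dirty_bp() splits off unfilled space, bufs migrate from the
 * free list into the ring and back again.
 */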

void
alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
{
        caddr_t va;
        size_t  nb;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * preallocate N bufs that are hard-sized to blksize
         *      in other words, the read buffer pool is a linked list
         *      of statically sized bufs.
         */
        va = cb->cb_va;
        while ((nb = bufsize) != 0) {
                if (nb > blksize)
                        nb = blksize;
                bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
                bzero(bp, sizeof (buf_t));
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_un.b_addr = va;
                bp->b_bufsize = nb;
                if (cb->cb_bp) {
                        bp->b_forw = cb->cb_bp->b_forw;
                        bp->b_back = cb->cb_bp;
                        cb->cb_bp->b_forw->b_back = bp;
                        cb->cb_bp->b_forw = bp;
                } else
                        bp->b_forw = bp->b_back = bp;
                cb->cb_bp = bp;
                bufsize -= nb;
                va += nb;
        }

        rw_exit(&cb->cb_rwlock);
}
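
/*
 * For illustration (hypothetical values, not part of the original
 * source): alloc_rdbuf(cb, 32K, 8K) carves the 32K cb_va arena into a
 * ring of four 8K bufs; a bufsize that is not a multiple of blksize
 * leaves the final buf with the remainder, since the loop takes
 * MIN(remaining, blksize) on each pass.
 */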

void
free_cirbuf(cirbuf_t *cb)
{
        buf_t   *bp;

        if (cb->cb_nb == 0)
                return;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        ASSERT(cb->cb_dirty == NULL);

        /*
         * free the active bufs
         */
        while ((bp = cb->cb_bp) != NULL) {
                if (bp == bp->b_forw)
                        cb->cb_bp = NULL;
                else
                        cb->cb_bp = bp->b_forw;
                bp->b_back->b_forw = bp->b_forw;
                bp->b_forw->b_back = bp->b_back;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }

        /*
         * free the free bufs
         */
        while ((bp = cb->cb_free) != NULL) {
                cb->cb_free = bp->b_forw;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }
        kmem_free(cb->cb_va, cb->cb_nb);
        cb->cb_va = NULL;
        cb->cb_nb = 0;
        rw_exit(&cb->cb_rwlock);
        rw_destroy(&cb->cb_rwlock);
}

static int
within_range(off_t lof, daddr_t blkno, ulong_t bcount)
{
        off_t   blof    = dbtob(blkno);

        return ((lof >= blof) && (lof < (blof + bcount)));
}

static buf_t *
find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t *bp;

        /*
         * find a buf that contains the offset lof
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bp = cb->cb_bp;
        do {
                if (bp->b_bcount &&
                    within_range(lof, bp->b_blkno, bp->b_bcount)) {
                        makebusy(ul, bp);
                        rw_exit(&cb->cb_rwlock);
                        return (bp);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);

        return (NULL);
}

static off_t
find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp, *bpend;
        off_t   rlof;

        /*
         * we mustn't:
         *      o read past eol
         *      o read past the tail
         *      o read data that may be being written.
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bpend = bp = cb->cb_bp->b_forw;
        rlof = ul->un_tail_lof;
        do {
                if (bp->b_bcount) {
                        rlof = dbtob(bp->b_blkno);
                        break;
                }
                bp = bp->b_forw;
        } while (bp != bpend);
        rw_exit(&cb->cb_rwlock);

        if (lof <= rlof)
                /* lof is prior to the range represented by the write buf */
                return (rlof);
        else
                /* lof follows the range represented by the write buf */
                return ((off_t)ul->un_eol_lof);
}
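
/*
 * For illustration (hypothetical offsets, not part of the original
 * source): if the write bufs cover [0x3000, 0x4000) and the tail is
 * at 0x4000, a read starting at 0x1000 may extend only up to 0x3000
 * (rlof), while a read starting at 0x5000 lies beyond the write
 * window and may extend to un_eol_lof, wrapping at end-of-log.
 */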

static buf_t *
get_read_bp(ml_unit_t *ul, off_t lof)
{
        cirbuf_t        *cb;
        buf_t           *bp;
        off_t           rlof;

        /*
         * retrieve as much data as possible from the incore buffers
         */
        if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }
        if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }

        /*
         * steal the LRU buf
         */
        cb = &ul->un_rdbuf;
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);
        bp->b_flags = 0;
        bp->b_bcount = 0;
        cb->cb_bp = bp;
        rw_exit(&cb->cb_rwlock);

        /*
         * don't read past the tail or the end-of-log
         */
        bp->b_blkno = btodb(lof);
        lof = dbtob(bp->b_blkno);
        rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
        bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
        readlog(ul, bp);
        return (bp);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static int
extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
{
        buf_t   *bpforw = bp->b_forw;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);

        /*
         * there is no `next' bp; do nothing
         */
        if (bpforw == bp)
                return (0);

        /*
         * buffer space is not adjacent; do nothing
         */
        if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
                return (0);

        /*
         * locking protocol requires giving up any bp locks before
         * acquiring cb_rwlock.  This is okay because we hold
         * un_log_mutex.
         */
        sema_v(&bp->b_sem);

        /*
         * lock out ldl_read
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * wait for current IO to finish w/next bp; if necessary
         */
        makebusy(ul, bpforw);

        /*
         * free the next bp and steal its space
         */
        bp->b_forw = bpforw->b_forw;
        bpforw->b_forw->b_back = bp;
        bp->b_bufsize += bpforw->b_bufsize;
        sema_v(&bpforw->b_sem);
        bpforw->b_forw = cb->cb_free;
        cb->cb_free = bpforw;
        makebusy(ul, bp);
        rw_exit(&cb->cb_rwlock);

        return (1);
}

static size_t
storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
{
        size_t          copy_nb;
        size_t          nb_in_sec;
        sect_trailer_t  *st;
        size_t          nb_left = nb;
        cirbuf_t        *cb     = &ul->un_wrbuf;

again:
        nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
        copy_nb = MIN(nb_left, nb_in_sec);

        ASSERT(copy_nb);

        bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
        bp->b_bcount += copy_nb;
        va += copy_nb;
        nb_left -= copy_nb;
        ul->un_tail_lof += copy_nb;

        if ((nb_in_sec -= copy_nb) == 0) {
                st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);

                st->st_tid = ul->un_logmap->mtm_tid;
                st->st_ident = ul->un_tail_ident++;
                bp->b_bcount += sizeof (sect_trailer_t);
                ul->un_tail_lof += sizeof (sect_trailer_t);
                /*
                 * log wrapped; async write this bp
                 */
                if (ul->un_tail_lof == ul->un_eol_lof) {
                        ul->un_tail_lof = ul->un_bol_lof;
                        push_dirty_bp(ul, bp);
                        return (nb - nb_left);
                }
                /*
                 * out of bp space; get more or async write buf
                 */
                if (bp->b_bcount == bp->b_bufsize) {
                        if (!extend_write_bp(ul, cb, bp)) {
                                push_dirty_bp(ul, bp);
                                return (nb - nb_left);
                        }
                }
        }
        if (nb_left)
                goto again;

        sema_v(&bp->b_sem);
        return (nb);
}
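
/*
 * For illustration (not part of the original source, assuming
 * 512-byte sectors and an 8-byte sect_trailer_t): each log sector
 * carries LDL_USABLE_BSIZE == 504 bytes of delta data followed by the
 * trailer.  Storing 1000 bytes into an empty bp copies 504 bytes,
 * stamps the sector trailer (st_tid, st_ident), then copies the
 * remaining 496 bytes into the next sector; un_tail_lof advances by
 * 1008 bytes in all.
 */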

static void
fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
{
        offset_t        src_mof = me->me_mof;
        size_t          src_nb  = me->me_nb;

        if (src_mof > dst_mof) {
                ASSERT(src_mof < (dst_mof + dst_nb));
                dst_va += (src_mof - dst_mof);
                dst_nb -= (src_mof - dst_mof);
        } else {
                ASSERT(dst_mof < (src_mof + src_nb));
                src_nb -= (dst_mof - src_mof);
        }

        src_nb = MIN(src_nb, dst_nb);
        ASSERT(src_nb);
        bzero(dst_va, src_nb);
}

/*
 * dst_va == NULL means don't copy anything
 */
static ulong_t
fetchbuf(
        ml_unit_t *ul,
        buf_t *bp,
        caddr_t dst_va,
        size_t dst_nb,
        off_t *dst_lofp)
{
        caddr_t copy_va;
        size_t  copy_nb;
        size_t  nb_sec;
        off_t   dst_lof         = *dst_lofp;
        ulong_t sav_dst_nb      = dst_nb;
        ulong_t src_nb          = bp->b_bcount;
        off_t   src_lof         = dbtob(bp->b_blkno);
        off_t   src_elof        = src_lof + src_nb;
        caddr_t src_va          = bp->b_un.b_addr;

        /*
         * copy from bp to dst_va
         */
        while (dst_nb) {
                /*
                 * compute address within bp
                 */
                copy_va = src_va + (dst_lof - src_lof);

                /*
                 * adjust copy size to amount of data in bp
                 */
                copy_nb = MIN(dst_nb, src_elof - dst_lof);

                /*
                 * adjust copy size to amount of data in sector
                 */
                nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
                copy_nb = MIN(copy_nb, nb_sec);

                /*
                 * dst_va == NULL means don't do copy (see logseek())
                 */
                if (dst_va) {
                        bcopy(copy_va, dst_va, copy_nb);
                        dst_va += copy_nb;
                }
                dst_lof += copy_nb;
                dst_nb -= copy_nb;
                nb_sec -= copy_nb;

                /*
                 * advance over sector trailer
                 */
                if (nb_sec == 0)
                        dst_lof += sizeof (sect_trailer_t);

                /*
                 * exhausted buffer
                 *      return current lof for next read
                 */
                if (dst_lof == src_elof) {
                        sema_v(&bp->b_sem);
                        if (dst_lof == ul->un_eol_lof)
                                dst_lof = ul->un_bol_lof;
                        *dst_lofp = dst_lof;
                        return (sav_dst_nb - dst_nb);
                }
        }

        /*
         * copy complete - return current lof
         */
        sema_v(&bp->b_sem);
        *dst_lofp = dst_lof;
        return (sav_dst_nb);
}
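
/*
 * For illustration (not part of the original source): fetchbuf() is
 * the read side of the sector-trailer scheme above.  Reading 1000
 * bytes from a sector-aligned lof copies 504 bytes, hops the 8-byte
 * trailer, then copies 496 more, so *dst_lofp advances by 1008; if
 * the buf is exhausted first, the bytes copied so far are returned
 * and the caller fetches another bp at the updated lof (which wraps
 * from eol back to bol).
 */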

void
ldl_round_commit(ml_unit_t *ul)
{
        int             wrapped;
        buf_t           *bp;
        sect_trailer_t  *st;
        size_t          bcount;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);

        /*
         * round up to sector boundary and set new tail
         *      don't readjust st_ident if buf is already rounded
         */
        bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
        if (bcount == bp->b_bcount) {
                sema_v(&bp->b_sem);
                return;
        }
        bp->b_bcount = bcount;
        ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
        wrapped = 0;
        if (ul->un_tail_lof == ul->un_eol_lof) {
                ul->un_tail_lof = ul->un_bol_lof;
                ++wrapped;
        }
        ASSERT(ul->un_tail_lof != ul->un_head_lof);

        /*
         * fix up the sector trailer
         */
        /* LINTED */
        st = (sect_trailer_t *)
            ((bp->b_un.b_addr + bcount) - sizeof (*st));
        st->st_tid = ul->un_logmap->mtm_tid;
        st->st_ident = ul->un_tail_ident++;

        /*
         * if tail wrapped or we have exhausted this buffer
         *      async write the buffer
         */
        if (wrapped || bcount == bp->b_bufsize)
                push_dirty_bp(ul, bp);
        else
                sema_v(&bp->b_sem);
}

void
ldl_push_commit(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);
        push_dirty_bp(ul, bp);
}

int
ldl_need_commit(ml_unit_t *ul)
{
        return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
}
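
/*
 * For illustration (hypothetical values, not part of the original
 * source): ldl_need_commit() returns true once outstanding
 * reservations exceed 75% of the maximum (maxresv - maxresv/4);
 * e.g. with un_maxresv of 1MB a commit is requested once un_resv
 * passes 768K.
 */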

int
ldl_has_space(ml_unit_t *ul, mapentry_t *me)
{
        off_t   nfb;
        off_t   nb;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /*
         * Add up the size used by the deltas
         * round nb up to a sector length plus an extra sector
         *      w/o the extra sector we couldn't distinguish
         *      a full log (head == tail) from an empty log (head == tail)
         */
        for (nb = DEV_BSIZE; me; me = me->me_hash) {
                nb += sizeof (struct delta);
                if (me->me_dt != DT_CANCEL)
                        nb += me->me_nb;
        }
        nb = P2ROUNDUP(nb, DEV_BSIZE);

        if (ul->un_head_lof <= ul->un_tail_lof)
                nfb = (ul->un_head_lof - ul->un_bol_lof) +
                    (ul->un_eol_lof - ul->un_tail_lof);
        else
                nfb = ul->un_head_lof - ul->un_tail_lof;

        return (nb < nfb);
}
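
/*
 * For illustration (hypothetical offsets, not part of the original
 * source): the free-byte count nfb accounts for wrap.  With
 * bol = 0x400, eol = 0x10400, head = 0x8000 and tail = 0x9000, the
 * free space is the two disjoint pieces
 * (head - bol) + (eol - tail) = 0x7c00 + 0x7400 = 0xf000; once the
 * tail wraps below the head, it is simply head - tail.
 */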

void
ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
{
        buf_t           *bp;
        caddr_t         va;
        size_t          nb;
        size_t          actual;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /* Write the delta */

        nb = sizeof (struct delta);
        va = (caddr_t)&me->me_delta;
        bp = get_write_bp(ul);

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                ASSERT(actual);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }

        /* If a commit, cancel, or 0's, we're almost done */
        switch (me->me_dt) {
                case DT_COMMIT:
                case DT_CANCEL:
                case DT_ABZERO:
                        /* roll needs to know where the next delta will go */
                        me->me_lof = ul->un_tail_lof;
                        return;
                default:
                        break;
        }

        /* Now write the data */

        ASSERT(me->me_nb != 0);

        nb = me->me_nb;
        va = (me->me_mof - bufmof) + bufp;
        bp = get_write_bp(ul);

        /* Save where we will put the data */
        me->me_lof = ul->un_tail_lof;

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                ASSERT(actual);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }
}

void
ldl_waito(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        /*
         * wait for all outstanding writes on the write bufs to finish
         */
        bp = cb->cb_bp;
        do {
                if ((bp->b_flags & B_DONE) == 0) {
                        makebusy(ul, bp);
                        sema_v(&bp->b_sem);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}

/*
 * seek nb bytes from location lof
 */
static int
logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
{
        buf_t   *bp;
        ulong_t actual;

        while (nb) {
                bp = get_read_bp(ul, lof);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                actual = fetchbuf(ul, bp, NULL, nb, &lof);
                ASSERT(actual);
                nb -= actual;
        }
        *lofp = lof;
        ASSERT(nb == 0);
        return (0);
}
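
/*
 * For illustration (not part of the original source): logseek()
 * advances a log offset without copying, by calling fetchbuf() with a
 * NULL destination; this is how callers such as ldl_read() and
 * ldl_settail() skip over delta headers and data while still
 * accounting for the sector trailers and end-of-log wrap that lie in
 * between.
 */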

int
ldl_read(
        ml_unit_t *ul,          /* Log unit */
        caddr_t va,             /* address of buffer to read into */
        offset_t mof,           /* mof of buffer */
        off_t nb,               /* length of buffer */
        mapentry_t *me)         /* Map entry list */
{
        buf_t   *bp;
        crb_t   *crb;
        caddr_t rva;                    /* address to read into */
        size_t  rnb;                    /* # of bytes to read */
        off_t   lof;                    /* log device offset to read from */
        off_t   skip;
        ulong_t actual;
        int     error;
        caddr_t eva     = va + nb;      /* end of buffer */

        for (; me; me = me->me_agenext) {
                ASSERT(me->me_dt != DT_CANCEL);

                /*
                 * check for a cached roll buffer
                 */
                crb = me->me_crb;
                if (crb) {
                        if (mof > crb->c_mof) {
                                /*
                                 * This mapentry overlaps with the beginning of
                                 * the supplied buffer
                                 */
                                skip = mof - crb->c_mof;
                                bcopy(crb->c_buf + skip, va,
                                    MIN(nb, crb->c_nb - skip));
                        } else {
                                /*
                                 * This mapentry starts at or after
                                 * the supplied buffer.
                                 */
                                skip = crb->c_mof - mof;
                                bcopy(crb->c_buf, va + skip,
                                    MIN(crb->c_nb, nb - skip));
                        }
                        logstats.ls_lreadsinmem.value.ui64++;
                        continue;
                }

                /*
                 * check for a delta full of zeroes - there's no log data
                 */
                if (me->me_dt == DT_ABZERO) {
                        fetchzeroes(va, mof, nb, me);
                        continue;
                }

                if (mof > me->me_mof) {
                        rnb = (size_t)(mof - me->me_mof);
                        error = logseek(ul, me->me_lof, rnb, &lof);
                        if (error)
                                return (EIO);
                        rva = va;
                        rnb = me->me_nb - rnb;
                        rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
                } else {
                        lof = me->me_lof;
                        rva = (me->me_mof - mof) + va;
                        rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
                }

                while (rnb) {
                        bp = get_read_bp(ul, lof);
                        if (bp->b_flags & B_ERROR) {
                                sema_v(&bp->b_sem);
                                return (EIO);
                        }
                        ASSERT(((me->me_flags & ME_ROLL) == 0) ||
                            (bp != ul->un_wrbuf.cb_dirty));
                        actual = fetchbuf(ul, bp, rva, rnb, &lof);
                        ASSERT(actual);
                        rva += actual;
                        rnb -= actual;
                }
        }
        return (0);
}

void
ldl_savestate(ml_unit_t *ul)
{
        int             error;
        buf_t           *bp     = ul->un_bp;
        ml_odunit_t     *ud     = (void *)bp->b_un.b_addr;
        ml_odunit_t     *ud2    = (void *)(bp->b_un.b_addr + DEV_BSIZE);

#if     DEBUG
        /*
         * Scan test is running; don't update intermediate state
         */
        if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
                return;
#endif  /* DEBUG */

        mutex_enter(&ul->un_state_mutex);
        bcopy(&ul->un_ondisk, ud, sizeof (*ud));
        ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
        bcopy(ud, ud2, sizeof (*ud));

        /* If a snapshot is enabled write through the snapshot driver. */
        if (ul->un_ufsvfs->vfs_snapshot)
                UFS_BWRITE2(ul->un_ufsvfs, bp);
        else
                BWRITE2(bp);
        logstats.ls_ldlwrites.value.ui64++;
        error = bp->b_flags & B_ERROR;
        mutex_exit(&ul->un_state_mutex);
        if (error)
                ldl_seterror(ul, "Error writing ufs log state");
}

/*
 * The head will be set to (new_lof - header) since ldl_sethead is
 * called with the new_lof of the data portion of a delta.
 */
void
ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
{
        off_t           nb;
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (data_lof == -1) {
                /* log is empty */
                new_ident = lufs_hd_genid(ul);
                new_lof = ul->un_tail_lof;

        } else {
                /* compute header's lof */
                new_ident = ul->un_head_ident;
                new_lof = data_lof - sizeof (struct delta);

                /* whoops, header spans sectors; subtract out sector trailer */
                if (btodb(new_lof) != btodb(data_lof))
                        new_lof -= sizeof (sect_trailer_t);

                /* whoops, header wrapped the log; go to last sector */
                if (new_lof < ul->un_bol_lof) {
                        /* sector offset */
                        new_lof -= dbtob(btodb(new_lof));
                        /* add to last sector's lof */
                        new_lof += (ul->un_eol_lof - DEV_BSIZE);
                }
                ul->un_head_tid = tid;
        }

        /*
         * check for nop
         */
        if (new_lof == ul->un_head_lof)
                return;

        /*
         * invalidate the affected bufs and calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                nb = new_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);
        } else {
                nb = ul->un_eol_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);

                nb = new_lof - ul->un_bol_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }
        /*
         * don't update the head if there has been an error
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        /* Fix up the head and ident */
        ASSERT(new_lof >= ul->un_bol_lof);
        ul->un_head_lof = new_lof;
        ul->un_head_ident = new_ident;
        if (data_lof == -1) {
                ul->un_tail_ident = ul->un_head_ident;
        }

        /* Commit to the database */
        ldl_savestate(ul);

        ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
            ldl_sethead_debug(ul));
}

/*
 * The tail will be set to the sector following lof+nb
 *      lof + nb == size of the last delta + commit record
 *      this function is called once after the log scan has completed.
 */
void
ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
{
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (lof == -1) {
                ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
                ul->un_head_lof = ul->un_tail_lof;
                ul->un_head_ident = lufs_hd_genid(ul);
                ul->un_tail_ident = ul->un_head_ident;

                /* Commit to the database */
                ldl_savestate(ul);

                return;
        }

        /*
         * new_lof is the offset of the sector following the last commit
         */
        (void) logseek(ul, lof, nb, &new_lof);
        ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));

        /*
         * calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
        } else {
                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }

        /* Fix up the tail and ident */
        ul->un_tail_lof = new_lof;
        ul->un_tail_ident = new_ident;

        /* Commit to the database */
        ldl_savestate(ul);
}

/*
 * LOGSCAN STUFF
 */
static int
ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
{
        ulong_t         ident;
        size_t          nblk, i;
        sect_trailer_t  *st;

        /*
         * compute ident for first sector in the buffer
         */
        ident = ul->un_head_ident;
        if (bp->b_blkno >= btodb(ul->un_head_lof)) {
                ident += (bp->b_blkno - btodb(ul->un_head_lof));
        } else {
                ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
                ident += (bp->b_blkno - btodb(ul->un_bol_lof));
        }
        /*
         * truncate the buffer down to the last valid sector
         */
        nblk = btodb(bp->b_bcount);
        bp->b_bcount = 0;
        /* LINTED */
        st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
        for (i = 0; i < nblk; ++i) {
                if (st->st_ident != ident)
                        break;

                /* remember last valid tid for ldl_logscan_error() */
                ul->un_tid = st->st_tid;

                /* LINTED */
                st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
                ++ident;
                bp->b_bcount += DEV_BSIZE;
        }
        /*
         * make sure that lof is still within range
         */
        return (within_range(lof, bp->b_blkno, bp->b_bcount));
}
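
/*
 * For illustration (hypothetical values, not part of the original
 * source): sector idents form a sequence that increases by one per
 * sector starting at the head.  If the head sits at sector 100 with
 * un_head_ident 7000, the scan expects sector 105 to carry
 * st_ident 7005; for sectors below the head (the log wrapped), the
 * sequence continues from end-of-log through bol.  The first
 * mismatching trailer marks the end of valid data.
 */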

ulong_t
ldl_logscan_nbcommit(off_t lof)
{
        /*
         * lof is the offset following the commit header.  However,
         * if the commit header fell on the end-of-sector, then lof
         * has already been advanced to the beginning of the next
         * sector.  So do nothing.  Otherwise, return the remaining
         * bytes in the sector.
         */
        if ((lof & (DEV_BSIZE - 1)) == 0)
                return (0);
        return (NB_LEFT_IN_SECTOR(lof));
}
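
/*
 * For illustration (hypothetical values, not part of the original
 * source, assuming 512-byte sectors with 8-byte trailers, i.e. 504
 * usable bytes): a commit header ending at lof 1304 (offset 280
 * within its sector) leaves NB_LEFT_IN_SECTOR(1304) == 504 - 280 ==
 * 224 data bytes to skip; a sector-aligned lof means the commit
 * consumed the whole sector and nothing remains.
 */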

int
ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
{
        buf_t   *bp;
        ulong_t actual;

        ASSERT(ul->un_head_lof != ul->un_tail_lof);

        /*
         * Check the log data doesn't go out of bounds
         */
        if (ul->un_head_lof < ul->un_tail_lof) {
                if (!WITHIN(*lofp, nb, ul->un_head_lof,
                    (ul->un_tail_lof - ul->un_head_lof))) {
                        return (EIO);
                }
        } else {
                if (OVERLAP(*lofp, nb, ul->un_tail_lof,
                    (ul->un_head_lof - ul->un_tail_lof))) {
                        return (EIO);
                }
        }

        while (nb) {
                bp = get_read_bp(ul, *lofp);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * out-of-seq idents means partial transaction
                 *      panic, non-corrupting powerfail, ...
                 */
                if (!ldl_logscan_ident(ul, bp, *lofp)) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * copy the header into the caller's buf
                 */
                actual = fetchbuf(ul, bp, va, nb, lofp);
                if (va)
                        va += actual;
                nb -= actual;
        }
        return (0);
}

void
ldl_logscan_begin(ml_unit_t *ul)
{
        size_t  bufsize;

        ASSERT(ul->un_wrbuf.cb_dirty == NULL);

        /*
         * logscan has begun
         */
        ul->un_flags |= LDL_SCAN;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * set the tail to reflect a full log
         */
        ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;

        if (ul->un_tail_lof < ul->un_bol_lof)
                ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
        if (ul->un_tail_lof >= ul->un_eol_lof)
                ul->un_tail_lof = ul->un_bol_lof;

        /*
         * un_tid is used during error processing; it is initialized to
         * the tid of the delta at un_head_lof.
         */
        ul->un_tid = ul->un_head_tid;
}

void
ldl_logscan_end(ml_unit_t *ul)
{
        size_t  bufsize;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * Done w/scan
         */
        ul->un_flags &= ~LDL_SCAN;
}

int
ldl_need_roll(ml_unit_t *ul)
{
        off_t   busybytes;
        off_t   head;
        off_t   tail;
        off_t   bol;
        off_t   eol;
        off_t   nb;

        /*
         * snapshot the log state
         */
        head = ul->un_head_lof;
        tail = ul->un_tail_lof;
        bol = ul->un_bol_lof;
        eol = ul->un_eol_lof;
        nb = ul->un_logsize;

        /*
         * compute number of busy (inuse) bytes
         */
        if (head <= tail)
                busybytes = tail - head;
        else
                busybytes = (eol - head) + (tail - bol);

        /*
         * return TRUE if > 75% full
         */
        return (busybytes > (nb - (nb >> 2)));
}
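
/*
 * For illustration (hypothetical values, not part of the original
 * source, taking un_logsize as 64K with bol = 0 and eol = 0x10000):
 * a wrapped state with head = 0x3000 and tail = 0x2000 gives
 * (eol - head) + (tail - bol) = 0xd000 + 0x2000 = 0xf000 busy bytes,
 * which exceeds the 75% threshold of 0xc000, so a roll is requested.
 */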

void
ldl_seterror(ml_unit_t *ul, char *why)
{
        /*
         * already in error state; do nothing
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        ul->un_flags |= LDL_ERROR;   /* incore */
        ul->un_badlog = 1;           /* ondisk (cleared by fsck) */

        /*
         * Commit to state sectors
         */
        uniqtime(&ul->un_timestamp);
        ldl_savestate(ul);

        /* Pretty print */
        cmn_err(CE_WARN, "%s", why);
        cmn_err(CE_WARN, "ufs log for %s changed state to Error",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);
        cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);

        /*
         * If we aren't in the middle of scan (aka snarf), tell ufs
         * to hard lock itself.
         */
        if ((ul->un_flags & LDL_SCAN) == 0)
                ufs_trans_onerror();
}

size_t
ldl_bufsize(ml_unit_t *ul)
{
        size_t          bufsize;
        extern uint32_t ldl_minbufsize;

        /*
         * initial guess is the maxtransfer value for this log device
         *      increase if too small
         *      decrease if too large
         */
        bufsize = dbtob(btod(ul->un_maxtransfer));
        if (bufsize < ldl_minbufsize)
                bufsize = ldl_minbufsize;
        if (bufsize > maxphys)
                bufsize = maxphys;
        if (bufsize > ul->un_maxtransfer)
                bufsize = ul->un_maxtransfer;
        return (bufsize);
}
}