/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/atomic.h>

extern int              maxphys;
extern uint_t           bypass_snapshot_throttle_key;

extern struct kmem_cache        *lufs_sv;
extern struct kmem_cache        *lufs_bp;

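/*
 * Wait for exclusive access to a log buf.  If an earlier I/O on the
 * buf failed, push the log into its error state.
 */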
static void
makebusy(ml_unit_t *ul, buf_t *bp)
{
        sema_p(&bp->b_sem);
        if ((bp->b_flags & B_ERROR) == 0)
                return;
        if (bp->b_flags & B_READ)
                ldl_seterror(ul, "Error reading ufs log");
        else
                ldl_seterror(ul, "Error writing ufs log");
}

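/*
 * iodone routine for log bufs: a completed write releases b_sem; a
 * completed read posts b_io to wake the waiting reader (see readlog).
 */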
static int
logdone(buf_t *bp)
{
        bp->b_flags |= B_DONE;

        if (bp->b_flags & B_WRITE)
                sema_v(&bp->b_sem);
        else
                /* wakeup the thread waiting on this buf */
                sema_v(&bp->b_io);
        return (0);
}

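/*
 * iodone routine for the cloned bufs issued by ldl_strategy.  Each
 * fragment frees its clone; the last fragment to finish propagates
 * any error and biodone's the original buf.
 */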
static int
ldl_strategy_done(buf_t *cb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *bp;

        ASSERT(SEMA_HELD(&cb->b_sem));
        ASSERT((cb->b_flags & B_DONE) == 0);

        /*
         * Compute address of the ``save'' struct
         */
        lbp = (lufs_buf_t *)cb;
        sv = (lufs_save_t *)lbp->lb_ptr;

        if (cb->b_flags & B_ERROR)
                sv->sv_error = 1;

        /*
         * If this is not the last request, just release this fragment.
         * The last request releases the resources and ``done''s the
         * original buffer header.
         */
        if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
                kmem_cache_free(lufs_bp, lbp);
                return (1);
        }
        /* Propagate any errors back to the original buffer header */
        bp = sv->sv_bp;
        if (sv->sv_error)
                bp->b_flags |= B_ERROR;
        kmem_cache_free(lufs_bp, lbp);
        kmem_cache_free(lufs_sv, sv);

        biodone(bp);
        return (0);
}

/*
 * Map the log logical block number to a physical disk block number
 */
static int
map_frag(
        ml_unit_t       *ul,
        daddr_t         lblkno,
        size_t          bcount,
        daddr_t         *pblkno,
        size_t          *pbcount)
{
        ic_extent_t     *ext = ul->un_ebp->ic_extents;
        uint32_t        e = ul->un_ebp->ic_nextents;
        uint32_t        s = 0;
        uint32_t        i = e >> 1;
        uint32_t        lasti = i;
        uint32_t        bno_off;

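        /*
         * binary search the (sorted) extent array for the extent
         * containing lblkno; s and e bracket the remaining candidates
         */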
again:
        if (ext[i].ic_lbno <= lblkno) {
                if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
                        /* FOUND IT */
                        bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
                        *pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
                        *pblkno = ext[i].ic_pbno + bno_off;
                        return (0);
                } else
                        s = i;
        } else
                e = i;
        i = s + ((e - s) >> 1);

        if (i == lasti) {
                *pbcount = bcount;
                return (ENOENT);
        }
        lasti = i;

        goto again;
}

/*
 * The log is a set of extents (typically just one, but there may be
 * more if the disk was close to full when the log was created), so
 * logical offsets into the log have to be translated into their real
 * device locations before calling the device's strategy routine.
 * The translation may result in several IO requests if this request
 * spans extents.
 */
void
ldl_strategy(ml_unit_t *ul, buf_t *pb)
{
        lufs_save_t     *sv;
        lufs_buf_t      *lbp;
        buf_t           *cb;
        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        daddr_t         lblkno, pblkno;
        size_t          nb_left, pbcount;
        off_t           offset;
        dev_t           dev     = ul->un_dev;
        int             error;
        int             read = pb->b_flags & B_READ;

        /*
         * Allocate and initialise the save structure.
         */
        sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
        sv->sv_error = 0;
        sv->sv_bp = pb;
        nb_left = pb->b_bcount;
        sv->sv_nb_left = nb_left;

        lblkno = pb->b_blkno;
        offset = 0;

        do {
                error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);

                lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
                bioinit(&lbp->lb_buf);
                lbp->lb_ptr = sv;

                cb = bioclone(pb, offset, pbcount, dev,
                    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);

                offset += pbcount;
                lblkno += btodb(pbcount);
                nb_left -= pbcount;

                if (error) {
                        cb->b_flags |= B_ERROR;
                        cb->b_resid = cb->b_bcount;
                        biodone(cb);
                } else {
                        if (read) {
                                logstats.ls_ldlreads.value.ui64++;
                                ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
                                lwp_stat_update(LWP_STAT_INBLK, 1);
                        } else {
                                logstats.ls_ldlwrites.value.ui64++;
                                lwp_stat_update(LWP_STAT_OUBLK, 1);
                        }

                        /*
                         * Write through the snapshot driver if necessary.
                         * We do not want this write to be throttled because
                         * we are holding the un_log mutex here.  If we
                         * are throttled in fssnap_translate, the fssnap_taskq
                         * thread which can wake us up can get blocked on
                         * the un_log mutex, resulting in a deadlock.
                         */
                        if (ufsvfsp->vfs_snapshot) {
                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)1);
                                fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);

                                (void) tsd_set(bypass_snapshot_throttle_key,
                                    (void *)0);
                        } else {
                                (void) bdev_strategy(cb);
                        }
                }

        } while (nb_left);
}

static void
writelog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * This is really a B_ASYNC write but we want Presto to
         * cache this write.  The iodone routine, logdone, processes
         * the buf correctly.
         */
        bp->b_flags = B_WRITE;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /*
         * return EIO for every IO if in hard error state
         */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                biodone(bp);
                return;
        }

        ldl_strategy(ul, bp);
}

static void
readlog(ml_unit_t *ul, buf_t *bp)
{
        ASSERT(SEMA_HELD(&bp->b_sem));
        ASSERT(bp->b_bcount);

        bp->b_flags = B_READ;
        bp->b_edev = ul->un_dev;
        bp->b_iodone = logdone;

        /* all IO returns errors when in error state */
        if (ul->un_flags & LDL_ERROR) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EIO;
                biodone(bp);
                (void) trans_wait(bp);
                return;
        }

        ldl_strategy(ul, bp);

        if (trans_wait(bp))
                ldl_seterror(ul, "Error reading ufs log");
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static void
push_dirty_bp(ml_unit_t *ul, buf_t *bp)
{
        buf_t           *newbp;
        cirbuf_t        *cb             = &ul->un_wrbuf;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
        ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);

        /*
         * async write the buf
         */
        writelog(ul, bp);

        /*
         * no longer filling any buf
         */
        cb->cb_dirty = NULL;

        /*
         * no extra buffer space; all done
         */
        if (bp->b_bcount == bp->b_bufsize)
                return;

        /*
         * give extra buffer space to a new bp
         *      try to take buf off of free list
         */
        if ((newbp = cb->cb_free) != NULL) {
                cb->cb_free = newbp->b_forw;
        } else {
                newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
        }
        newbp->b_flags = 0;
        newbp->b_bcount = 0;
        newbp->b_file = NULL;
        newbp->b_offset = -1;
        newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
        newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
        bp->b_bufsize = bp->b_bcount;

        /*
         * lock out readers and put new buf at LRU position
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        newbp->b_forw = bp->b_forw;
        newbp->b_back = bp;
        bp->b_forw->b_back = newbp;
        bp->b_forw = newbp;
        rw_exit(&cb->cb_rwlock);
}

static void
inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
{
        buf_t           *bp;
        off_t           elof    = lof + nb;
        off_t           buflof;
        off_t           bufelof;

        /*
         * discard all bufs that overlap the range (lof, lof + nb)
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp;
        do {
                if (bp == cb->cb_dirty || bp->b_bcount == 0) {
                        bp = bp->b_forw;
                        continue;
                }
                buflof = dbtob(bp->b_blkno);
                bufelof = buflof + bp->b_bcount;
                if ((buflof < lof && bufelof <= lof) ||
                    (buflof >= elof && bufelof > elof)) {
                        bp = bp->b_forw;
                        continue;
                }
                makebusy(ul, bp);
                bp->b_flags = 0;
                bp->b_bcount = 0;
                sema_v(&bp->b_sem);
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static buf_t *
get_write_bp(ml_unit_t *ul)
{
        cirbuf_t        *cb = &ul->un_wrbuf;
        buf_t           *bp;

        /*
         * cb_dirty is the buffer we are currently filling, if any
         */
        if ((bp = cb->cb_dirty) != NULL) {
                makebusy(ul, bp);
                return (bp);
        }
        /*
         * discard any bp that overlaps the current tail since we are
         * about to overwrite it.
         */
        inval_range(ul, cb, ul->un_tail_lof, 1);

        /*
         * steal LRU buf
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);

        cb->cb_dirty = bp;
        cb->cb_bp = bp;

        bp->b_flags = 0;
        bp->b_bcount = 0;
        bp->b_blkno = btodb(ul->un_tail_lof);
        ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
        rw_exit(&cb->cb_rwlock);

        /*
         * NOTE:
         *      1. un_tail_lof never addresses >= un_eol_lof
         *      2. b_blkno + btodb(b_bufsize) may be > un_eol_lof
         *              this case is handled in storebuf
         */
        return (bp);
}

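/*
 * Allocate the write buffer: a single bufsize-byte region fronted by
 * a ring of buf headers.  Initially one bp claims the entire space;
 * three spare headers are kept on the free list.
 */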
void
alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
{
        int     i;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * preallocate 3 bp's and put them on the free list.
         */
        for (i = 0; i < 3; ++i) {
                bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_offset = -1;
                bp->b_forw = cb->cb_free;
                cb->cb_free = bp;
        }

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * first bp claims entire write buffer
         */
        bp = cb->cb_free;
        cb->cb_free = bp->b_forw;

        bp->b_forw = bp;
        bp->b_back = bp;
        cb->cb_bp = bp;
        bp->b_un.b_addr = cb->cb_va;
        bp->b_bufsize = cb->cb_nb;

        rw_exit(&cb->cb_rwlock);
}

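/*
 * Allocate the read buffer: bufsize bytes carved into a ring of
 * fixed-size bufs of at most blksize bytes each.
 */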
void
alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
{
        caddr_t va;
        size_t  nb;
        buf_t   *bp;

        /*
         * Clear previous allocation
         */
        if (cb->cb_nb)
                free_cirbuf(cb);

        bzero(cb, sizeof (*cb));
        rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

        rw_enter(&cb->cb_rwlock, RW_WRITER);

        cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
        cb->cb_nb = bufsize;

        /*
         * preallocate N bufs that are hard-sized to blksize
         *      in other words, the read buffer pool is a linked list
         *      of statically sized bufs.
         */
        va = cb->cb_va;
        while ((nb = bufsize) != 0) {
                if (nb > blksize)
                        nb = blksize;
                bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
                bzero(bp, sizeof (buf_t));
                sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
                sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
                bp->b_un.b_addr = va;
                bp->b_bufsize = nb;
                if (cb->cb_bp) {
                        bp->b_forw = cb->cb_bp->b_forw;
                        bp->b_back = cb->cb_bp;
                        cb->cb_bp->b_forw->b_back = bp;
                        cb->cb_bp->b_forw = bp;
                } else
                        bp->b_forw = bp->b_back = bp;
                cb->cb_bp = bp;
                bufsize -= nb;
                va += nb;
        }

        rw_exit(&cb->cb_rwlock);
}

void
free_cirbuf(cirbuf_t *cb)
{
        buf_t   *bp;

        if (cb->cb_nb == 0)
                return;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        ASSERT(cb->cb_dirty == NULL);

        /*
         * free the active bufs
         */
        while ((bp = cb->cb_bp) != NULL) {
                if (bp == bp->b_forw)
                        cb->cb_bp = NULL;
                else
                        cb->cb_bp = bp->b_forw;
                bp->b_back->b_forw = bp->b_forw;
                bp->b_forw->b_back = bp->b_back;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }

        /*
         * free the free bufs
         */
        while ((bp = cb->cb_free) != NULL) {
                cb->cb_free = bp->b_forw;
                sema_destroy(&bp->b_sem);
                sema_destroy(&bp->b_io);
                kmem_free(bp, sizeof (buf_t));
        }
        kmem_free(cb->cb_va, cb->cb_nb);
        cb->cb_va = NULL;
        cb->cb_nb = 0;
        rw_exit(&cb->cb_rwlock);
        rw_destroy(&cb->cb_rwlock);
}

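/*
 * Does log offset lof fall within the bcount bytes starting at disk
 * block blkno?
 */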
static int
within_range(off_t lof, daddr_t blkno, ulong_t bcount)
{
        off_t   blof    = dbtob(blkno);

        return ((lof >= blof) && (lof < (blof + bcount)));
}

static buf_t *
find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t *bp;

        /*
         * find a buf that contains the offset lof
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bp = cb->cb_bp;
        do {
                if (bp->b_bcount &&
                    within_range(lof, bp->b_blkno, bp->b_bcount)) {
                        makebusy(ul, bp);
                        rw_exit(&cb->cb_rwlock);
                        return (bp);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);

        return (NULL);
}

static off_t
find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
        buf_t   *bp, *bpend;
        off_t   rlof;

        /*
         * we mustn't:
         *      o read past eol
         *      o read past the tail
         *      o read data that may be being written.
         */
        rw_enter(&cb->cb_rwlock, RW_READER);
        bpend = bp = cb->cb_bp->b_forw;
        rlof = ul->un_tail_lof;
        do {
                if (bp->b_bcount) {
                        rlof = dbtob(bp->b_blkno);
                        break;
                }
                bp = bp->b_forw;
        } while (bp != bpend);
        rw_exit(&cb->cb_rwlock);

        if (lof <= rlof)
                /* lof is prior to the range represented by the write buf */
                return (rlof);
        else
                /* lof follows the range represented by the write buf */
                return ((off_t)ul->un_eol_lof);
}

static buf_t *
get_read_bp(ml_unit_t *ul, off_t lof)
{
        cirbuf_t        *cb;
        buf_t           *bp;
        off_t           rlof;

        /*
         * retrieve as much data as possible from the incore buffers
         */
        if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }
        if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
                logstats.ls_lreadsinmem.value.ui64++;
                return (bp);
        }

        /*
         * steal the LRU buf
         */
        cb = &ul->un_rdbuf;
        rw_enter(&cb->cb_rwlock, RW_WRITER);
        bp = cb->cb_bp->b_forw;
        makebusy(ul, bp);
        bp->b_flags = 0;
        bp->b_bcount = 0;
        cb->cb_bp = bp;
        rw_exit(&cb->cb_rwlock);

        /*
         * don't read past the tail or the end-of-log
         */
        bp->b_blkno = btodb(lof);
        lof = dbtob(bp->b_blkno);
        rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
        bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
        readlog(ul, bp);
        return (bp);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static int
extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
{
        buf_t   *bpforw = bp->b_forw;

        ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);

        /*
         * there is no `next' bp; do nothing
         */
        if (bpforw == bp)
                return (0);

        /*
         * buffer space is not adjacent; do nothing
         */
        if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
                return (0);

        /*
         * locking protocol requires giving up any bp locks before
         * acquiring cb_rwlock.  This is okay because we hold
         * un_log_mutex.
         */
        sema_v(&bp->b_sem);

        /*
         * lock out ldl_read
         */
        rw_enter(&cb->cb_rwlock, RW_WRITER);

        /*
         * wait for current IO to finish on the next bp, if necessary
         */
        makebusy(ul, bpforw);

        /*
         * free the next bp and steal its space
         */
        bp->b_forw = bpforw->b_forw;
        bpforw->b_forw->b_back = bp;
        bp->b_bufsize += bpforw->b_bufsize;
        sema_v(&bpforw->b_sem);
        bpforw->b_forw = cb->cb_free;
        cb->cb_free = bpforw;
        makebusy(ul, bp);
        rw_exit(&cb->cb_rwlock);

        return (1);
}

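/*
 * Copy up to nb bytes from va into the write buf, filling in a sector
 * trailer at each DEV_BSIZE boundary and advancing the log tail.  The
 * buf is written out asynchronously if the log wraps or the buf fills
 * and cannot be extended; returns the number of bytes actually stored.
 */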
static size_t
storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
{
        size_t          copy_nb;
        size_t          nb_in_sec;
        sect_trailer_t  *st;
        size_t          nb_left = nb;
        cirbuf_t        *cb     = &ul->un_wrbuf;

again:
        nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
        copy_nb = MIN(nb_left, nb_in_sec);

        ASSERT(copy_nb);

        bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
        bp->b_bcount += copy_nb;
        va += copy_nb;
        nb_left -= copy_nb;
        ul->un_tail_lof += copy_nb;

        if ((nb_in_sec -= copy_nb) == 0) {
                st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);

                st->st_tid = ul->un_logmap->mtm_tid;
                st->st_ident = ul->un_tail_ident++;
                bp->b_bcount += sizeof (sect_trailer_t);
                ul->un_tail_lof += sizeof (sect_trailer_t);
                /*
                 * log wrapped; async write this bp
                 */
                if (ul->un_tail_lof == ul->un_eol_lof) {
                        ul->un_tail_lof = ul->un_bol_lof;
                        push_dirty_bp(ul, bp);
                        return (nb - nb_left);
                }
                /*
                 * out of bp space; get more or async write buf
                 */
                if (bp->b_bcount == bp->b_bufsize) {
                        if (!extend_write_bp(ul, cb, bp)) {
                                push_dirty_bp(ul, bp);
                                return (nb - nb_left);
                        }
                }
        }
        if (nb_left)
                goto again;

        sema_v(&bp->b_sem);
        return (nb);
}

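/*
 * Zero-fill the part of the caller's buffer that overlaps an
 * allocated-but-zero (DT_ABZERO) map entry; such deltas carry no
 * data in the log.
 */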
static void
fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
{
        offset_t        src_mof = me->me_mof;
        size_t          src_nb  = me->me_nb;

        if (src_mof > dst_mof) {
                ASSERT(src_mof < (dst_mof + dst_nb));
                dst_va += (src_mof - dst_mof);
                dst_nb -= (src_mof - dst_mof);
        } else {
                ASSERT(dst_mof < (src_mof + src_nb));
                src_nb -= (dst_mof - src_mof);
        }

        src_nb = MIN(src_nb, dst_nb);
        ASSERT(src_nb);
        bzero(dst_va, src_nb);
}

/*
 * dst_va == NULL means don't copy anything
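 *
 * Copy log data from bp into dst_va, skipping sector trailers, until
 * dst_nb bytes have been copied or the buf is exhausted.  *dst_lofp is
 * advanced (wrapping at end-of-log) and the number of bytes copied is
 * returned; the buf's b_sem is released before returning.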
 */
static ulong_t
fetchbuf(
        ml_unit_t *ul,
        buf_t *bp,
        caddr_t dst_va,
        size_t dst_nb,
        off_t *dst_lofp)
{
        caddr_t copy_va;
        size_t  copy_nb;
        size_t  nb_sec;
        off_t   dst_lof         = *dst_lofp;
        ulong_t sav_dst_nb      = dst_nb;
        ulong_t src_nb          = bp->b_bcount;
        off_t   src_lof         = dbtob(bp->b_blkno);
        off_t   src_elof        = src_lof + src_nb;
        caddr_t src_va          = bp->b_un.b_addr;

        /*
         * copy from bp to dst_va
         */
        while (dst_nb) {
                /*
                 * compute address within bp
                 */
                copy_va = src_va + (dst_lof - src_lof);

                /*
                 * adjust copy size to amount of data in bp
                 */
                copy_nb = MIN(dst_nb, src_elof - dst_lof);

                /*
                 * adjust copy size to amount of data in sector
                 */
                nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
                copy_nb = MIN(copy_nb, nb_sec);

                /*
                 * dst_va == NULL means don't do copy (see logseek())
                 */
                if (dst_va) {
                        bcopy(copy_va, dst_va, copy_nb);
                        dst_va += copy_nb;
                }
                dst_lof += copy_nb;
                dst_nb -= copy_nb;
                nb_sec -= copy_nb;

                /*
                 * advance over sector trailer
                 */
                if (nb_sec == 0)
                        dst_lof += sizeof (sect_trailer_t);

                /*
                 * exhausted buffer
                 *      return current lof for next read
                 */
                if (dst_lof == src_elof) {
                        sema_v(&bp->b_sem);
                        if (dst_lof == ul->un_eol_lof)
                                dst_lof = ul->un_bol_lof;
                        *dst_lofp = dst_lof;
                        return (sav_dst_nb - dst_nb);
                }
        }

        /*
         * copy complete - return current lof
         */
        sema_v(&bp->b_sem);
        *dst_lofp = dst_lof;
        return (sav_dst_nb);
}

void
ldl_round_commit(ml_unit_t *ul)
{
        int             wrapped;
        buf_t           *bp;
        sect_trailer_t  *st;
        size_t          bcount;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);

        /*
         * round up to sector boundary and set new tail
         *      don't readjust st_ident if buf is already rounded
         */
        bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
        if (bcount == bp->b_bcount) {
                sema_v(&bp->b_sem);
                return;
        }
        bp->b_bcount = bcount;
        ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
        wrapped = 0;
        if (ul->un_tail_lof == ul->un_eol_lof) {
                ul->un_tail_lof = ul->un_bol_lof;
                ++wrapped;
        }
        ASSERT(ul->un_tail_lof != ul->un_head_lof);

        /*
         * fix up the sector trailer
         */
        /* LINTED */
        st = (sect_trailer_t *)
            ((bp->b_un.b_addr + bcount) - sizeof (*st));
        st->st_tid = ul->un_logmap->mtm_tid;
        st->st_ident = ul->un_tail_ident++;

        /*
         * if tail wrapped or we have exhausted this buffer
         *      async write the buffer
         */
        if (wrapped || bcount == bp->b_bufsize)
                push_dirty_bp(ul, bp);
        else
                sema_v(&bp->b_sem);
}

void
ldl_push_commit(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        /*
         * if nothing to write; then do nothing
         */
        if ((bp = cb->cb_dirty) == NULL)
                return;
        makebusy(ul, bp);
        push_dirty_bp(ul, bp);
}

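/*
 * A commit is needed once more than 75% of the log reservation
 * (un_maxresv) is in use.
 */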
int
ldl_need_commit(ml_unit_t *ul)
{
        return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
}

int
ldl_has_space(ml_unit_t *ul, mapentry_t *me)
{
        off_t   nfb;
        off_t   nb;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /*
         * Add up the size used by the deltas
         * round nb up to a sector length plus an extra sector
         *      w/o the extra sector we couldn't distinguish
         *      a full log (head == tail) from an empty log (head == tail)
         */
        for (nb = DEV_BSIZE; me; me = me->me_hash) {
                nb += sizeof (struct delta);
                if (me->me_dt != DT_CANCEL)
                        nb += me->me_nb;
        }
        nb = P2ROUNDUP(nb, DEV_BSIZE);

        if (ul->un_head_lof <= ul->un_tail_lof)
                nfb = (ul->un_head_lof - ul->un_bol_lof) +
                    (ul->un_eol_lof - ul->un_tail_lof);
        else
                nfb = ul->un_head_lof - ul->un_tail_lof;

        return (nb < nfb);
}

void
ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
{
        buf_t           *bp;
        caddr_t         va;
        size_t          nb;
        size_t          actual;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        /* Write the delta */

        nb = sizeof (struct delta);
        va = (caddr_t)&me->me_delta;
        bp = get_write_bp(ul);

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                ASSERT(actual);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }

        /* If a commit, cancel, or 0's; we're almost done */
        switch (me->me_dt) {
                case DT_COMMIT:
                case DT_CANCEL:
                case DT_ABZERO:
                        /* roll needs to know where the next delta will go */
                        me->me_lof = ul->un_tail_lof;
                        return;
                default:
                        break;
        }

        /* Now write the data */

        ASSERT(me->me_nb != 0);

        nb = me->me_nb;
        va = (me->me_mof - bufmof) + bufp;
        bp = get_write_bp(ul);

        /* Save where we will put the data */
        me->me_lof = ul->un_tail_lof;

        while (nb) {
                if (ul->un_flags & LDL_ERROR) {
                        sema_v(&bp->b_sem);
                        return;
                }
                actual = storebuf(ul, bp, va, nb);
                ASSERT(actual);
                va += actual;
                nb -= actual;
                if (nb)
                        bp = get_write_bp(ul);
        }
}

void
ldl_waito(ml_unit_t *ul)
{
        buf_t           *bp;
        cirbuf_t        *cb     = &ul->un_wrbuf;

        rw_enter(&cb->cb_rwlock, RW_WRITER);
        /*
         * wait for any outstanding IO to complete
         */
        bp = cb->cb_bp;
        do {
                if ((bp->b_flags & B_DONE) == 0) {
                        makebusy(ul, bp);
                        sema_v(&bp->b_sem);
                }
                bp = bp->b_forw;
        } while (bp != cb->cb_bp);
        rw_exit(&cb->cb_rwlock);
}

/*
 * seek nb bytes from location lof
 */
static int
logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
{
        buf_t   *bp;
        ulong_t actual;

        while (nb) {
                bp = get_read_bp(ul, lof);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                actual = fetchbuf(ul, bp, NULL, nb, &lof);
                ASSERT(actual);
                nb -= actual;
        }
        *lofp = lof;
        ASSERT(nb == 0);
        return (0);
}

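/*
 * Read log data for each map entry in the list into the caller's
 * buffer, satisfying what it can from the cached roll buffers and
 * the incore bufs before going to the log device.
 */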
int
ldl_read(
        ml_unit_t *ul,          /* Log unit */
        caddr_t va,             /* address of buffer to read into */
        offset_t mof,           /* mof of buffer */
        off_t nb,               /* length of buffer */
        mapentry_t *me)         /* Map entry list */
{
        buf_t   *bp;
        crb_t   *crb;
        caddr_t rva;                    /* address to read into */
        size_t  rnb;                    /* # of bytes to read */
        off_t   lof;                    /* log device offset to read from */
        off_t   skip;
        ulong_t actual;
        int     error;
        caddr_t eva     = va + nb;      /* end of buffer */

        for (; me; me = me->me_agenext) {
                ASSERT(me->me_dt != DT_CANCEL);

                /*
                 * check for a cached roll buffer
                 */
                crb = me->me_crb;
                if (crb) {
                        if (mof > crb->c_mof) {
                                /*
                                 * This mapentry overlaps with the beginning of
                                 * the supplied buffer
                                 */
                                skip = mof - crb->c_mof;
                                bcopy(crb->c_buf + skip, va,
                                    MIN(nb, crb->c_nb - skip));
                        } else {
                                /*
                                 * This mapentry starts at or after
                                 * the supplied buffer.
                                 */
                                skip = crb->c_mof - mof;
                                bcopy(crb->c_buf, va + skip,
                                    MIN(crb->c_nb, nb - skip));
                        }
                        logstats.ls_lreadsinmem.value.ui64++;
                        continue;
                }

                /*
                 * check for a delta full of zeroes - there's no log data
                 */
                if (me->me_dt == DT_ABZERO) {
                        fetchzeroes(va, mof, nb, me);
                        continue;
                }

                if (mof > me->me_mof) {
                        rnb = (size_t)(mof - me->me_mof);
                        error = logseek(ul, me->me_lof, rnb, &lof);
                        if (error)
                                return (EIO);
                        rva = va;
                        rnb = me->me_nb - rnb;
                        rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
                } else {
                        lof = me->me_lof;
                        rva = (me->me_mof - mof) + va;
                        rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
                }

                while (rnb) {
                        bp = get_read_bp(ul, lof);
                        if (bp->b_flags & B_ERROR) {
                                sema_v(&bp->b_sem);
                                return (EIO);
                        }
                        ASSERT(((me->me_flags & ME_ROLL) == 0) ||
                            (bp != ul->un_wrbuf.cb_dirty));
                        actual = fetchbuf(ul, bp, rva, rnb, &lof);
                        ASSERT(actual);
                        rva += actual;
                        rnb -= actual;
                }
        }
        return (0);
}

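/*
 * Write the incore log state (un_ondisk) to both copies of the
 * on-disk state sector.
 */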
void
ldl_savestate(ml_unit_t *ul)
{
        int             error;
        buf_t           *bp     = ul->un_bp;
        ml_odunit_t     *ud     = (void *)bp->b_un.b_addr;
        ml_odunit_t     *ud2    = (void *)(bp->b_un.b_addr + DEV_BSIZE);

#if     DEBUG
        /*
         * Scan test is running; don't update intermediate state
         */
        if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
                return;
#endif  /* DEBUG */

        mutex_enter(&ul->un_state_mutex);
        bcopy(&ul->un_ondisk, ud, sizeof (*ud));
        ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
        bcopy(ud, ud2, sizeof (*ud));

        /* If a snapshot is enabled, write through the snapshot driver. */
        if (ul->un_ufsvfs->vfs_snapshot)
                UFS_BWRITE2(ul->un_ufsvfs, bp);
        else
                BWRITE2(bp);
        logstats.ls_ldlwrites.value.ui64++;
        error = bp->b_flags & B_ERROR;
        mutex_exit(&ul->un_state_mutex);
        if (error)
                ldl_seterror(ul, "Error writing ufs log state");
}

/*
 * The head will be set to (new_lof - header) since ldl_sethead is
 * called with the new_lof of the data portion of a delta.
 */
void
ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
{
        off_t           nb;
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (data_lof == -1) {
                /* log is empty */
                new_ident = lufs_hd_genid(ul);
                new_lof = ul->un_tail_lof;

        } else {
                /* compute header's lof */
                new_ident = ul->un_head_ident;
                new_lof = data_lof - sizeof (struct delta);

                /* whoops, header spans sectors; subtract out sector trailer */
                if (btodb(new_lof) != btodb(data_lof))
                        new_lof -= sizeof (sect_trailer_t);

                /* whoops, header wrapped the log; go to last sector */
                if (new_lof < ul->un_bol_lof) {
                        /* sector offset */
                        new_lof -= dbtob(btodb(new_lof));
                        /* add to last sector's lof */
                        new_lof += (ul->un_eol_lof - DEV_BSIZE);
                }
                ul->un_head_tid = tid;
        }

        /*
         * check for nop
         */
        if (new_lof == ul->un_head_lof)
                return;

        /*
         * invalidate the affected bufs and calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                nb = new_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);
        } else {
                nb = ul->un_eol_lof - ul->un_head_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident += (end_blkno - beg_blkno);

                nb = new_lof - ul->un_bol_lof;
                inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
                inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }
        /*
         * don't update the head if there has been an error
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        /* Fix up the head and ident */
        ASSERT(new_lof >= ul->un_bol_lof);
        ul->un_head_lof = new_lof;
        ul->un_head_ident = new_ident;
        if (data_lof == -1) {
                ul->un_tail_ident = ul->un_head_ident;
        }

        /* Commit to the database */
        ldl_savestate(ul);

        ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
            ldl_sethead_debug(ul));
}

/*
 * The tail will be set to the sector following lof+nb
 *      lof + nb == size of the last delta + commit record
 *      this function is called once after the log scan has completed.
 */
void
ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
{
        off_t           new_lof;
        uint32_t        new_ident;
        daddr_t         beg_blkno;
        daddr_t         end_blkno;

        ASSERT(MUTEX_HELD(&ul->un_log_mutex));

        if (lof == -1) {
                ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
                ul->un_head_lof = ul->un_tail_lof;
                ul->un_head_ident = lufs_hd_genid(ul);
                ul->un_tail_ident = ul->un_head_ident;

                /* Commit to the database */
                ldl_savestate(ul);

                return;
        }

        /*
         * new_lof is the offset of the sector following the last commit
         */
        (void) logseek(ul, lof, nb, &new_lof);
        ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));

        /*
         * calculate new ident
         */
        if (new_lof > ul->un_head_lof) {
                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
        } else {
                end_blkno = btodb(ul->un_eol_lof);
                beg_blkno = btodb(ul->un_head_lof);
                new_ident = ul->un_head_ident + (end_blkno - beg_blkno);

                end_blkno = btodb(new_lof);
                beg_blkno = btodb(ul->un_bol_lof);
                new_ident += (end_blkno - beg_blkno);
        }

        /* Fix up the tail and ident */
        ul->un_tail_lof = new_lof;
        ul->un_tail_ident = new_ident;

        /* Commit to the database */
        ldl_savestate(ul);
}

/*
 * LOGSCAN STUFF
 */
static int
ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
{
        ulong_t         ident;
        size_t          nblk, i;
        sect_trailer_t  *st;

        /*
         * compute ident for first sector in the buffer
         */
        ident = ul->un_head_ident;
        if (bp->b_blkno >= btodb(ul->un_head_lof)) {
                ident += (bp->b_blkno - btodb(ul->un_head_lof));
        } else {
                ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
                ident += (bp->b_blkno - btodb(ul->un_bol_lof));
        }
        /*
         * truncate the buffer down to the last valid sector
         */
        nblk = btodb(bp->b_bcount);
        bp->b_bcount = 0;
        /* LINTED */
        st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
        for (i = 0; i < nblk; ++i) {
                if (st->st_ident != ident)
                        break;

                /* remember last valid tid for ldl_logscan_error() */
                ul->un_tid = st->st_tid;

                /* LINTED */
                st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
                ++ident;
                bp->b_bcount += DEV_BSIZE;
        }
        /*
         * make sure that lof is still within range
         */
        return (within_range(lof, bp->b_blkno, bp->b_bcount));
}

ulong_t
ldl_logscan_nbcommit(off_t lof)
{
        /*
         * lof is the offset following the commit header.  However,
         * if the commit header fell on the end-of-sector, then lof
         * has already been advanced to the beginning of the next
         * sector.  So do nothing.  Otherwise, return the remaining
         * bytes in the sector.
         */
        if ((lof & (DEV_BSIZE - 1)) == 0)
                return (0);
        return (NB_LEFT_IN_SECTOR(lof));
}

int
ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
{
        buf_t   *bp;
        ulong_t actual;

        ASSERT(ul->un_head_lof != ul->un_tail_lof);

        /*
         * Check the log data doesn't go out of bounds
         */
        if (ul->un_head_lof < ul->un_tail_lof) {
                if (!WITHIN(*lofp, nb, ul->un_head_lof,
                    (ul->un_tail_lof - ul->un_head_lof))) {
                        return (EIO);
                }
        } else {
                if (OVERLAP(*lofp, nb, ul->un_tail_lof,
                    (ul->un_head_lof - ul->un_tail_lof))) {
                        return (EIO);
                }
        }

        while (nb) {
                bp = get_read_bp(ul, *lofp);
                if (bp->b_flags & B_ERROR) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * out-of-seq idents means partial transaction
                 *      panic, non-corrupting powerfail, ...
                 */
                if (!ldl_logscan_ident(ul, bp, *lofp)) {
                        sema_v(&bp->b_sem);
                        return (EIO);
                }
                /*
                 * copy the header into the caller's buf
                 */
                actual = fetchbuf(ul, bp, va, nb, lofp);
                if (va)
                        va += actual;
                nb -= actual;
        }
        return (0);
}

void
ldl_logscan_begin(ml_unit_t *ul)
{
        size_t  bufsize;

        ASSERT(ul->un_wrbuf.cb_dirty == NULL);

        /*
         * logscan has begun
         */
        ul->un_flags |= LDL_SCAN;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * set the tail to reflect a full log
         */
        ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;

        if (ul->un_tail_lof < ul->un_bol_lof)
                ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
        if (ul->un_tail_lof >= ul->un_eol_lof)
                ul->un_tail_lof = ul->un_bol_lof;

        /*
         * un_tid is used during error processing; it is initialized to
         * the tid of the delta at un_head_lof.
         */
        ul->un_tid = ul->un_head_tid;
}

void
ldl_logscan_end(ml_unit_t *ul)
{
        size_t  bufsize;

        /*
         * reset the circular bufs
         */
        bufsize = ldl_bufsize(ul);
        alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
        alloc_wrbuf(&ul->un_wrbuf, bufsize);

        /*
         * Done w/scan
         */
        ul->un_flags &= ~LDL_SCAN;
}

int
ldl_need_roll(ml_unit_t *ul)
{
        off_t   busybytes;
        off_t   head;
        off_t   tail;
        off_t   bol;
        off_t   eol;
        off_t   nb;

        /*
         * snapshot the log state
         */
        head = ul->un_head_lof;
        tail = ul->un_tail_lof;
        bol = ul->un_bol_lof;
        eol = ul->un_eol_lof;
        nb = ul->un_logsize;

        /*
         * compute number of busy (inuse) bytes
         */
        if (head <= tail)
                busybytes = tail - head;
        else
                busybytes = (eol - head) + (tail - bol);

        /*
         * return TRUE if > 75% full
         */
        return (busybytes > (nb - (nb >> 2)));
}

void
ldl_seterror(ml_unit_t *ul, char *why)
{
        /*
         * already in error state; do nothing
         */
        if (ul->un_flags & LDL_ERROR)
                return;

        ul->un_flags |= LDL_ERROR;   /* incore */
        ul->un_badlog = 1;           /* ondisk (cleared by fsck) */

        /*
         * Commit to state sectors
         */
        uniqtime(&ul->un_timestamp);
        ldl_savestate(ul);

        /* Pretty print */
        cmn_err(CE_WARN, "%s", why);
        cmn_err(CE_WARN, "ufs log for %s changed state to Error",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);
        cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
            ul->un_ufsvfs->vfs_fs->fs_fsmnt);

        /*
         * If we aren't in the middle of scan (aka snarf), tell ufs
         * to hard lock itself.
         */
        if ((ul->un_flags & LDL_SCAN) == 0)
                ufs_trans_onerror();
}

size_t
ldl_bufsize(ml_unit_t *ul)
{
        size_t          bufsize;
        extern uint32_t ldl_minbufsize;

        /*
         * initial guess is the maxtransfer value for this log device
         *      increase if too small
         *      decrease if too large
         */
        bufsize = dbtob(btod(ul->un_maxtransfer));
        if (bufsize < ldl_minbufsize)
                bufsize = ldl_minbufsize;
        if (bufsize > maxphys)
                bufsize = maxphys;
        if (bufsize > ul->un_maxtransfer)
                bufsize = ul->un_maxtransfer;
        return (bufsize);
}