/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging.
 * Currently there is only one per log, which rolls the log.
 */

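/*
 * Default, minimum, and maximum number of MAPBLOCKSIZE (8K) roll
 * buffers; log_roll_buffers() below clamps the tunable to this range.
 */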
#define LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 */
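/*
 * logmap_need_roll is true when the logmap holds more map entries than
 * the logmap_maxnme tunable allows; ldl_empty is true when the on-disk
 * log's head and tail offsets coincide.
 */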
#define logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
/*
 * Key for thread-specific data that lets the roll thread
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t         ml_scan;
extern kcondvar_t       ml_scan_cv;
extern int              maxphys;

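/*
 * Wait between roll passes: clear the logmap reference bit, wake any
 * waiters on a forced roll, and sleep (CPR-safely) until either
 * trans_roll_tics ticks elapse or someone signals mtm_to_roll_cv.
 */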
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
        mutex_enter(&logmap->mtm_mutex);
        logmap->mtm_ref = 0;
        if (logmap->mtm_flags & MTM_FORCE_ROLL) {
                cv_broadcast(&logmap->mtm_from_roll_cv);
        }
        logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
        CALLB_CPR_SAFE_BEGIN(cprinfop);
        (void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
            trans_roll_tics, TR_CLOCK_TICK);
        CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
        logmap->mtm_flags |= MTM_ROLLING;
        mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers(void)
{
        /*
         * sanity-check the tunable lufs_num_roll_bufs
         */
        if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
                return (lufs_min_roll_bufs);
        }
        if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
                return (lufs_max_roll_bufs);
        }
        return (lufs_num_roll_bufs);
}

/*
 * Find something to roll; if our cached roll buffers don't cover all
 * the deltas in that MAPBLOCK, read the master and overlay the deltas.
 * returns:
 *      0 if successful
 *      1 on finding nothing to roll
 *      2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
        offset_t        mof;
        buf_t           *bp;
        rollbuf_t       *rbp;
        mt_map_t        *logmap = ul->un_logmap;
        daddr_t         mblkno;
        int             i;
        int             error;
        int             nbuf;

        /*
         * Make sure there is really something to roll
         */
        mof = 0;
        if (!logmap_next_roll(logmap, &mof)) {
                return (1);
        }

        /*
         * build some master blocks + deltas to roll forward
         */
        rw_enter(&logmap->mtm_rwlock, RW_READER);
        nbuf = 0;
        do {
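                /*
                 * Round the map offset down to its MAPBLOCK boundary
                 * and convert that byte offset to a DEV_BSIZE disk
                 * block number.
                 */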
                mof = mof & (offset_t)MAPBLOCKMASK;
                mblkno = lbtodb(mof);

                /*
                 * Check for the case of a new delta to a buffer
                 * that is already set up
                 */
                for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                        if (P2ALIGN(rbp->rb_bh.b_blkno,
                            MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
                                TNF_PROBE_0(trans_roll_new_delta, "lufs",
                                    /* CSTYLED */);
                                trans_roll_new_delta++;
                                /* Flush out the current set of buffers */
                                goto flush_bufs;
                        }
                }

                /*
                 * Work out what to roll next. If it isn't cached then read
                 * it asynchronously from the master.
                 */
                bp = &rbp->rb_bh;
                bp->b_blkno = mblkno;
                bp->b_flags = B_READ;
                bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
                bp->b_bufsize = MAPBLOCKSIZE;
                if (top_read_roll(rbp, ul)) {
                        /* logmap deltas were in use */
                        if (nbuf == 0) {
                                /*
                                 * On the first buffer, wait for the logmap
                                 * user to finish by grabbing the logmap lock
                                 * exclusively rather than spinning
                                 */
                                rw_exit(&logmap->mtm_rwlock);
                                lrr_wait++;
                                rw_enter(&logmap->mtm_rwlock, RW_WRITER);
                                rw_exit(&logmap->mtm_rwlock);
                                return (1);
                        }
                        /* we have at least one buffer - flush it */
                        goto flush_bufs;
                }
                if ((bp->b_flags & B_INVAL) == 0) {
                        nbuf++;
                }
                mof += MAPBLOCKSIZE;
        } while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

        /*
         * If there was nothing to roll, cycle back
         */
        if (nbuf == 0) {
                rw_exit(&logmap->mtm_rwlock);
                return (1);
        }

flush_bufs:
        /*
         * For each buffer, if it isn't cached then wait for the read to
         * finish and overlay the deltas.
         */
        for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                if (!rbp->rb_crb) {
                        bp = &rbp->rb_bh;
                        if (trans_not_wait(bp)) {
                                ldl_seterror(ul,
                                    "Error reading master during ufs log roll");
                                error = 1;
                        }
                        /*
                         * sync read the data from the log
                         */
                        if (ldl_read(ul, bp->b_un.b_addr,
                            ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
                            MAPBLOCKSIZE, rbp->rb_age)) {
                                error = 1;
                        }
                }

                /*
                 * reset the age bit in the age list
                 */
                logmap_list_put_roll(logmap, rbp->rb_age);

                if (ul->un_flags & LDL_ERROR) {
                        error = 1;
                }
        }
        rw_exit(&logmap->mtm_rwlock);
        if (error)
                return (2);
        *retnbuf = nbuf;
        return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
        crb_t *crb = rbp->rb_crb;
        buf_t *bp = &rbp->rb_bh;

        bp->b_blkno = lbtodb(crb->c_mof);
        bp->b_un.b_addr = crb->c_buf;
        bp->b_bcount = crb->c_nb;
        bp->b_bufsize = crb->c_nb;
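        /* a cached roll buffer is always a whole number of sectors */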
        ASSERT((crb->c_nb & DEV_BMASK) == 0);
        bp->b_flags = B_WRITE;
        logstats.ls_rwrites.value.ui64++;

        /* if snapshots are enabled, write through the snapshot code */
        if (ufsvfsp->vfs_snapshot) {
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        } else {
                (void) bdev_strategy(bp);
        }
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
        buf_t           *bp = &rbp->rb_bh;
        buf_t           *bp2;
        rbsecmap_t      secmap = rbp->rb_secmap;
        int             j, k;

        ASSERT(secmap);
        ASSERT((bp->b_flags & B_INVAL) == 0);

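        /*
         * secmap has one bit per DEV_BSIZE sector of the MAPBLOCK (16
         * bits for an 8K MAPBLOCK of 512-byte sectors); a set bit means
         * that sector holds deltas and must be written.  For example,
         * secmap = 0x00F2 produces two writes: one sector at sector
         * offset 1, then four contiguous sectors starting at offset 4.
         */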
        do { /* for each contiguous block of sectors */
                /* find start of next sector to write */
                for (j = 0; j < 16; ++j) {
                        if (secmap & UINT16_C(1))
                                break;
                        secmap >>= 1;
                }
                bp->b_un.b_addr += (j << DEV_BSHIFT);
                bp->b_blkno += j;

                /* calculate number of sectors */
                secmap >>= 1;
                j++;
                for (k = 1; j < 16; ++j) {
                        if ((secmap & UINT16_C(1)) == 0)
                                break;
                        secmap >>= 1;
                        k++;
                }
                bp->b_bcount = k << DEV_BSHIFT;
                bp->b_flags = B_WRITE;
                logstats.ls_rwrites.value.ui64++;

                /* if snapshots are enabled, write through the snapshot */
                if (ufsvfsp->vfs_snapshot)
                        fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
                else
                        (void) bdev_strategy(bp);
                if (secmap) {
                        /*
                         * Allocate another buf_t to handle
                         * the next write in this MAPBLOCK.
                         * Chain them via b_list.
                         */
                        bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
                        bp->b_list = bp2;
                        bioinit(bp2);
                        bp2->b_iodone = trans_not_done;
                        bp2->b_bufsize = MAPBLOCKSIZE;
                        bp2->b_edev = bp->b_edev;
                        bp2->b_un.b_addr =
                            bp->b_un.b_addr + bp->b_bcount;
                        bp2->b_blkno = bp->b_blkno + k;
                        bp = bp2;
                }
        } while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        rollbuf_t       *rbp;
        buf_t           *bp, *bp2;
        rollbuf_t       *head, *prev, *rbp2;

        /*
         * Order the buffers by blkno
         */
        ASSERT(nbuf > 0);
#ifdef lint
        prev = rbs;
#endif
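        /*
         * Simple insertion sort into a singly-linked list (rb_next),
         * ascending by b_blkno, so that the writes below are issued
         * in disk order.
         */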
        for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
                for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
                        if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
                                if (rbp2 == head) {
                                        rbp->rb_next = head;
                                        head = rbp;
                                } else {
                                        prev->rb_next = rbp;
                                        rbp->rb_next = rbp2;
                                }
                                break;
                        }
                }
                if (rbp2 == NULL) {
                        prev->rb_next = rbp;
                        rbp->rb_next = NULL;
                }
        }

        /*
         * issue the in-order writes
         */
        for (rbp = head; rbp; rbp = rbp2) {
                if (rbp->rb_crb) {
                        log_roll_write_crb(ufsvfsp, rbp);
                } else {
                        log_roll_write_bufs(ufsvfsp, rbp);
                }
                /* null out the rb_next link for next set of rolling */
                rbp2 = rbp->rb_next;
                rbp->rb_next = NULL;
        }

        /*
         * wait for all the writes to finish
         */
        for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
                bp = &rbp->rb_bh;
                if (trans_not_wait(bp)) {
                        ldl_seterror(ul,
                            "Error writing master during ufs log roll");
                }

                /*
                 * Now wait for all the "cloned" buffer writes (if any)
                 * and free those headers
                 */
                bp2 = bp->b_list;
                bp->b_list = NULL;
                while (bp2) {
                        if (trans_not_wait(bp2)) {
                                ldl_seterror(ul,
                                    "Error writing master during ufs log roll");
                        }
                        bp = bp2;
                        bp2 = bp2->b_list;
                        kmem_free(bp, sizeof (buf_t));
                }
        }

        if (ul->un_flags & LDL_ERROR)
                return (1);
        return (0);
}

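/*
 * trans_roll()
 *	Body of the per-log roll thread: set up the roll buffers once,
 *	then loop forever rolling deltas from the log to the master,
 *	until asked to exit (MTM_ROLL_EXIT) or the log hits an error.
 */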
void
trans_roll(ml_unit_t *ul)
{
        callb_cpr_t     cprinfo;
        mt_map_t        *logmap = ul->un_logmap;
        rollbuf_t       *rbs;
        rollbuf_t       *rbp;
        buf_t           *bp;
        caddr_t         roll_bufs;
        uint32_t        nmblk;
        int             i;
        int             doingforceroll;
        int             nbuf;

        CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
            "trans_roll");

        /*
         * We do not want the roll thread's writes to be
         * throttled by the snapshot.
         * If they were throttled we could deadlock
         * between the roll thread and the snapshot taskq thread:
         * the roll thread wants the throttling semaphore, but
         * the snapshot taskq thread cannot release the semaphore
         * because it is writing to the log and the log is full.
         */

        (void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

        /*
         * setup some roll parameters
         */
        if (trans_roll_tics == 0)
                trans_roll_tics = 5 * hz;
        nmblk = log_roll_buffers();

        /*
         * allocate the buffers and buffer headers
         */
        roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
        rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

        /*
         * initialize the buffer headers
         */
        for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
                rbp->rb_next = NULL;
                bp = &rbp->rb_bh;
                bioinit(bp);
                bp->b_edev = ul->un_dev;
                bp->b_iodone = trans_not_done;
                bp->b_bufsize = MAPBLOCKSIZE;
        }

        doingforceroll = 0;

again:
        /*
         * LOOP FOREVER
         */

        /*
         * exit on demand
         */
        mutex_enter(&logmap->mtm_mutex);
        if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
                kmem_free(rbs, nmblk * sizeof (rollbuf_t));
                kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
                logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
                    MTM_ROLL_EXIT | MTM_ROLLING);
                cv_broadcast(&logmap->mtm_from_roll_cv);
                CALLB_CPR_EXIT(&cprinfo);
                thread_exit();
                /* NOTREACHED */
        }

        /*
         * MT_SCAN debug mode
         *      don't roll except in FORCEROLL situations
         */
        if (logmap->mtm_debug & MT_SCAN)
                if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
                        mutex_exit(&logmap->mtm_mutex);
                        trans_roll_wait(logmap, &cprinfo);
                        goto again;
                }
        ASSERT(logmap->mtm_trimlof == 0);

        /*
         * If we've finished a force roll cycle then wake up any
         * waiters.
         */
        if (doingforceroll) {
                doingforceroll = 0;
                logmap->mtm_flags &= ~MTM_FORCE_ROLL;
                mutex_exit(&logmap->mtm_mutex);
                cv_broadcast(&logmap->mtm_from_roll_cv);
        } else {
                mutex_exit(&logmap->mtm_mutex);
        }

        /*
         * If someone wants us to roll something, then do it
         */
        if (logmap->mtm_flags & MTM_FORCE_ROLL) {
                doingforceroll = 1;
                goto rollsomething;
        }

        /*
         * Log is busy, check if the logmap is getting full.
         */
        if (logmap_need_roll(logmap)) {
                goto rollsomething;
        }

        /*
         * Check if the log is idle and is not empty
         */
        if (!logmap->mtm_ref && !ldl_empty(ul)) {
                goto rollsomething;
        }

        /*
         * Log is busy, check if it's getting full
         */
        if (ldl_need_roll(ul)) {
                goto rollsomething;
        }

        /*
         * nothing to do; wait a bit and then start over
         */
        trans_roll_wait(logmap, &cprinfo);
        goto again;

        /*
         * ROLL SOMETHING
         */

rollsomething:
        /*
         * Use the cached roll buffers, or read the master
         * and overlay the deltas
         */
        switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
        case 1: trans_roll_wait(logmap, &cprinfo);
                /* FALLTHROUGH */
        case 2: goto again;
        /* default case is success */
        }

        /*
         * Asynchronously write out the deltas
         */
        if (log_roll_write(ul, rbs, nbuf))
                goto again;

        /*
         * free up the deltas in the logmap
         */
        for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                bp = &rbp->rb_bh;
                logmap_remove_roll(logmap,
                    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
        }

        /*
         * free up log space, if possible
         */
        logmap_sethead(logmap, ul);

        /*
         * LOOP
         */
        goto again;
}