/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging.
 * Currently there is only one, for rolling the log (one per log).
 */

#define LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define LUFS_DEFAULT_MAX_ROLL_BUFS 64
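
/*
 * Each roll buffer covers one MAPBLOCK (8 Kbytes) of the master
 * file system, so these tunables bound the memory used for rolling.
 */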

/*
 * Macros
 */
#define logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;      /* max logmap entries before a roll is needed */
int trans_roll_tics = 0;        /* roll timeout in ticks; 0 means 5 * hz */
uint64_t trans_roll_new_delta = 0; /* stat: new delta hit a set-up buffer */
uint64_t lrr_wait = 0;          /* stat: roll read waited on in-use deltas */
/*
 * Key for thread-specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t         ml_scan;
extern kcondvar_t       ml_scan_cv;

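/*
 * Wait for more roll work: wake any threads waiting on a forced
 * roll, mark this thread as no longer rolling, and sleep (CPR-safe)
 * until kicked via mtm_to_roll_cv or the roll timeout expires.
 */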
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
        mutex_enter(&logmap->mtm_mutex);
        logmap->mtm_ref = 0;
        if (logmap->mtm_flags & MTM_FORCE_ROLL) {
                cv_broadcast(&logmap->mtm_from_roll_cv);
        }
        logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
        CALLB_CPR_SAFE_BEGIN(cprinfop);
        (void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
            trans_roll_tics, TR_CLOCK_TICK);
        CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
        logmap->mtm_flags |= MTM_ROLLING;
        mutex_exit(&logmap->mtm_mutex);
}

/*
 * Returns the number of 8K buffers to use for rolling the log.
 */
static uint32_t
log_roll_buffers(void)
{
        /*
         * sanity check the tunable lufs_num_roll_bufs
         */
        if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
                return (lufs_min_roll_bufs);
        }
        if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
                return (lufs_max_roll_bufs);
        }
        return (lufs_num_roll_bufs);
}

/*
 * Find something to roll; then, if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK, read the master
 * and overlay the deltas.
 * returns:
 *      0 if successful
 *      1 on finding nothing to roll
 *      2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
        offset_t        mof;
        buf_t           *bp;
        rollbuf_t       *rbp;
        mt_map_t        *logmap = ul->un_logmap;
        daddr_t         mblkno;
        int             i;
        int             error;
        int             nbuf;

        /*
         * Make sure there is really something to roll
         */
        mof = 0;
        if (!logmap_next_roll(logmap, &mof)) {
                return (1);
        }

        /*
         * build some master blocks + deltas to roll forward
         */
        rw_enter(&logmap->mtm_rwlock, RW_READER);
        nbuf = 0;
        do {
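                /*
                 * Align the roll offset down to a MAPBLOCK boundary
                 * and convert the byte offset to a disk block number.
                 */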
                mof = mof & (offset_t)MAPBLOCKMASK;
                mblkno = lbtodb(mof);

                /*
                 * Check for the case of a new delta to an already
                 * set-up buffer
                 */
                for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                        if (P2ALIGN(rbp->rb_bh.b_blkno,
                            MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
                                TNF_PROBE_0(trans_roll_new_delta, "lufs",
                                    /* CSTYLED */);
                                trans_roll_new_delta++;
                                /* Flush out the current set of buffers */
                                goto flush_bufs;
                        }
                }

                /*
                 * Work out what to roll next. If it isn't cached then read
                 * it asynchronously from the master.
                 */
                bp = &rbp->rb_bh;
                bp->b_blkno = mblkno;
                bp->b_flags = B_READ;
                bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
                bp->b_bufsize = MAPBLOCKSIZE;
                if (top_read_roll(rbp, ul)) {
                        /* logmap deltas were in use */
                        if (nbuf == 0) {
                                /*
                                 * On first buffer wait for the logmap user
                                 * to finish by grabbing the logmap lock
                                 * exclusively rather than spinning
                                 */
                                rw_exit(&logmap->mtm_rwlock);
                                lrr_wait++;
                                rw_enter(&logmap->mtm_rwlock, RW_WRITER);
                                rw_exit(&logmap->mtm_rwlock);
                                return (1);
                        }
                        /* we have at least one buffer - flush it */
                        goto flush_bufs;
                }
                if ((bp->b_flags & B_INVAL) == 0) {
                        nbuf++;
                }
                mof += MAPBLOCKSIZE;
        } while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

        /*
         * If there was nothing to roll, cycle back
         */
        if (nbuf == 0) {
                rw_exit(&logmap->mtm_rwlock);
                return (1);
        }

flush_bufs:
        /*
         * For each buffer, if it isn't cached then wait for the read to
         * finish and overlay the deltas.
         */
        for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                if (!rbp->rb_crb) {
                        bp = &rbp->rb_bh;
                        if (trans_not_wait(bp)) {
                                ldl_seterror(ul,
                                    "Error reading master during ufs log roll");
                                error = 1;
                        }
                        /*
                         * sync read the data from the log
                         */
                        if (ldl_read(ul, bp->b_un.b_addr,
                            ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
                            MAPBLOCKSIZE, rbp->rb_age)) {
                                error = 1;
                        }
                }

                /*
                 * reset the age bit in the age list
                 */
                logmap_list_put_roll(logmap, rbp->rb_age);

                if (ul->un_flags & LDL_ERROR) {
                        error = 1;
                }
        }
        rw_exit(&logmap->mtm_rwlock);
        if (error)
                return (2);
        *retnbuf = nbuf;
        return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
        crb_t *crb = rbp->rb_crb;
        buf_t *bp = &rbp->rb_bh;

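        /*
         * The cached roll buffer already holds the rolled image of
         * the deltas, so point the buf at it and write it directly
         * to the master device.
         */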
        bp->b_blkno = lbtodb(crb->c_mof);
        bp->b_un.b_addr = crb->c_buf;
        bp->b_bcount = crb->c_nb;
        bp->b_bufsize = crb->c_nb;
        ASSERT((crb->c_nb & DEV_BMASK) == 0);
        bp->b_flags = B_WRITE;
        logstats.ls_rwrites.value.ui64++;

        /* if a snapshot is active, write through the snapshot driver */
        if (ufsvfsp->vfs_snapshot) {
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        } else {
                (void) bdev_strategy(bp);
        }
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
        buf_t           *bp = &rbp->rb_bh;
        buf_t           *bp2;
        rbsecmap_t      secmap = rbp->rb_secmap;
        int             j, k;

        ASSERT(secmap);
        ASSERT((bp->b_flags & B_INVAL) == 0);

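        /*
         * Each bit in the 16-bit sector map corresponds to one
         * DEV_BSIZE sector within this MAPBLOCK; a separate write
         * is issued for every contiguous run of set bits.
         */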
        do { /* for each contiguous block of sectors */
                /* find start of next sector to write */
                for (j = 0; j < 16; ++j) {
                        if (secmap & UINT16_C(1))
                                break;
                        secmap >>= 1;
                }
                bp->b_un.b_addr += (j << DEV_BSHIFT);
                bp->b_blkno += j;

                /* calculate number of sectors */
                secmap >>= 1;
                j++;
                for (k = 1; j < 16; ++j) {
                        if ((secmap & UINT16_C(1)) == 0)
                                break;
                        secmap >>= 1;
                        k++;
                }
                bp->b_bcount = k << DEV_BSHIFT;
                bp->b_flags = B_WRITE;
                logstats.ls_rwrites.value.ui64++;

                /* if a snapshot is active, write through the snapshot driver */
                if (ufsvfsp->vfs_snapshot)
                        fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
                else
                        (void) bdev_strategy(bp);
                if (secmap) {
                        /*
                         * Allocate another buf_t to handle
                         * the next write in this MAPBLOCK.
                         * Chain them via b_list.
                         */
                        bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
                        bp->b_list = bp2;
                        bioinit(bp2);
                        bp2->b_iodone = trans_not_done;
                        bp2->b_bufsize = MAPBLOCKSIZE;
                        bp2->b_edev = bp->b_edev;
                        bp2->b_un.b_addr =
                            bp->b_un.b_addr + bp->b_bcount;
                        bp2->b_blkno = bp->b_blkno + k;
                        bp = bp2;
                }
        } while (secmap);
}

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{

        ufsvfs_t        *ufsvfsp = ul->un_ufsvfs;
        rollbuf_t       *rbp;
        buf_t           *bp, *bp2;
        rollbuf_t       *head, *prev, *rbp2;

        /*
         * Order the buffers by blkno
         */
        ASSERT(nbuf > 0);
#ifdef lint
        prev = rbs;
#endif
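        /*
         * Insertion sort into a NULL-terminated singly linked list
         * threaded through rb_next, ascending by b_blkno.
         */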
        for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
                for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
                        if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
                                if (rbp2 == head) {
                                        rbp->rb_next = head;
                                        head = rbp;
                                } else {
                                        prev->rb_next = rbp;
                                        rbp->rb_next = rbp2;
                                }
                                break;
                        }
                }
                if (rbp2 == NULL) {
                        prev->rb_next = rbp;
                        rbp->rb_next = NULL;
                }
        }

        /*
         * issue the in-order writes
         */
        for (rbp = head; rbp; rbp = rbp2) {
                if (rbp->rb_crb) {
                        log_roll_write_crb(ufsvfsp, rbp);
                } else {
                        log_roll_write_bufs(ufsvfsp, rbp);
                }
                /* null out the rb_next link for next set of rolling */
                rbp2 = rbp->rb_next;
                rbp->rb_next = NULL;
        }

        /*
         * wait for all the writes to finish
         */
        for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
                bp = &rbp->rb_bh;
                if (trans_not_wait(bp)) {
                        ldl_seterror(ul,
                            "Error writing master during ufs log roll");
                }

                /*
                 * Now wait for all the "cloned" buffer writes (if any)
                 * and free those headers
                 */
                bp2 = bp->b_list;
                bp->b_list = NULL;
                while (bp2) {
                        if (trans_not_wait(bp2)) {
                                ldl_seterror(ul,
                                    "Error writing master during ufs log roll");
                        }
                        bp = bp2;
                        bp2 = bp2->b_list;
                        kmem_free(bp, sizeof (buf_t));
                }
        }

        if (ul->un_flags & LDL_ERROR)
                return (1);
        return (0);
}

void
trans_roll(ml_unit_t *ul)
{
        callb_cpr_t     cprinfo;
        mt_map_t        *logmap = ul->un_logmap;
        rollbuf_t       *rbs;
        rollbuf_t       *rbp;
        buf_t           *bp;
        caddr_t         roll_bufs;
        uint32_t        nmblk;
        int             i;
        int             doingforceroll;
        int             nbuf;

        CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
            "trans_roll");

        /*
         * We do not want the roll thread's writes to be
         * throttled by the snapshot.
         * If they were throttled, we could deadlock between the
         * roll thread and the snapshot taskq thread: the roll
         * thread wants the throttling semaphore, and the snapshot
         * taskq thread cannot release it because it is writing to
         * the log and the log is full.
         */

        (void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

        /*
         * setup some roll parameters
         */
        if (trans_roll_tics == 0)
                trans_roll_tics = 5 * hz;
        nmblk = log_roll_buffers();

        /*
         * allocate the buffers and buffer headers
         */
        roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
        rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

        /*
         * initialize the buffer headers
         */
        for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
                rbp->rb_next = NULL;
                bp = &rbp->rb_bh;
                bioinit(bp);
                bp->b_edev = ul->un_dev;
                bp->b_iodone = trans_not_done;
                bp->b_bufsize = MAPBLOCKSIZE;
        }

        doingforceroll = 0;

again:
        /*
         * LOOP FOREVER
         */

        /*
         * exit on demand
         */
        mutex_enter(&logmap->mtm_mutex);
        if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
                kmem_free(rbs, nmblk * sizeof (rollbuf_t));
                kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
                logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
                    MTM_ROLL_EXIT | MTM_ROLLING);
                cv_broadcast(&logmap->mtm_from_roll_cv);
                CALLB_CPR_EXIT(&cprinfo);
                thread_exit();
                /* NOTREACHED */
        }

        /*
         * MT_SCAN debug mode
         *      don't roll except in FORCEROLL situations
         */
        if (logmap->mtm_debug & MT_SCAN)
                if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
                        mutex_exit(&logmap->mtm_mutex);
                        trans_roll_wait(logmap, &cprinfo);
                        goto again;
                }
        ASSERT(logmap->mtm_trimlof == 0);

        /*
         * If we've finished a force roll cycle then wakeup any
         * waiters.
         */
        if (doingforceroll) {
                doingforceroll = 0;
                logmap->mtm_flags &= ~MTM_FORCE_ROLL;
                mutex_exit(&logmap->mtm_mutex);
                cv_broadcast(&logmap->mtm_from_roll_cv);
        } else {
                mutex_exit(&logmap->mtm_mutex);
        }

        /*
         * If someone wants us to roll something, then do it
         */
        if (logmap->mtm_flags & MTM_FORCE_ROLL) {
                doingforceroll = 1;
                goto rollsomething;
        }

        /*
         * Log is busy, check if logmap is getting full.
         */
        if (logmap_need_roll(logmap)) {
                goto rollsomething;
        }

        /*
         * Check if the log is idle and is not empty
         */
        if (!logmap->mtm_ref && !ldl_empty(ul)) {
                goto rollsomething;
        }

        /*
         * Log is busy, check if it's getting full
         */
        if (ldl_need_roll(ul)) {
                goto rollsomething;
        }

        /*
         * nothing to do; wait a bit and then start over
         */
        trans_roll_wait(logmap, &cprinfo);
        goto again;

        /*
         * ROLL SOMETHING
         */

rollsomething:
        /*
         * Use the cached roll buffers, or read the master
         * and overlay the deltas
         */
        switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
        case 1: trans_roll_wait(logmap, &cprinfo);
                /* FALLTHROUGH */
        case 2: goto again;
        /* default case is success */
        }

        /*
         * Asynchronously write out the deltas
         */
        if (log_roll_write(ul, rbs, nbuf))
                goto again;

        /*
         * free up the deltas in the logmap
         */
        for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
                bp = &rbp->rb_bh;
                logmap_remove_roll(logmap,
                    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
        }

        /*
         * free up log space, if possible
         */
        logmap_sethead(logmap, ul);

        /*
         * LOOP
         */
        goto again;
}