re #13613 rb4516 Tunables need volatile keyword
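
This change adds the volatile qualifier to the doiflush and dopageflush
tunables in fsflush.c (plus a Nexenta copyright notice). These tunables can
be changed in a running kernel, for example patched with mdb -kw; without
volatile, an optimizing compiler is entitled to load each value once and
keep it in a register, so the fsflush daemon's loop might never observe the
change. A short illustrative sketch follows the listing below.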
    
      
    
    
          --- old/usr/src/uts/common/fs/fsflush.c
          +++ new/usr/src/uts/common/fs/fsflush.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  22   22  /*        All Rights Reserved   */
  23   23  
  24      -
  25   24  /*
  26   25   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  27   26   * Use is subject to license terms.
  28   27   */
       28 +/*
       29 + * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
       30 + */
  29   31  
  30   32  #include <sys/types.h>
  31   33  #include <sys/t_lock.h>
  32   34  #include <sys/param.h>
  33   35  #include <sys/tuneable.h>
  34   36  #include <sys/inline.h>
  35   37  #include <sys/systm.h>
  36   38  #include <sys/proc.h>
  37   39  #include <sys/user.h>
  38   40  #include <sys/var.h>
  39   41  #include <sys/buf.h>
  40   42  #include <sys/vfs.h>
  41   43  #include <sys/cred.h>
  42   44  #include <sys/kmem.h>
  43   45  #include <sys/vnode.h>
  44   46  #include <sys/swap.h>
  45   47  #include <sys/vm.h>
  46   48  #include <sys/debug.h>
  47   49  #include <sys/cmn_err.h>
  48   50  #include <sys/sysinfo.h>
  49   51  #include <sys/callb.h>
  
  50   52  #include <sys/reboot.h>
  51   53  #include <sys/time.h>
  52   54  #include <sys/fs/ufs_inode.h>
  53   55  #include <sys/fs/ufs_bio.h>
  54   56  
  55   57  #include <vm/hat.h>
  56   58  #include <vm/page.h>
  57   59  #include <vm/pvn.h>
  58   60  #include <vm/seg_kmem.h>
  59   61  
  60      -int doiflush = 1;       /* non-zero to turn inode flushing on */
  61      -int dopageflush = 1;    /* non-zero to turn page flushing on */
       62 +volatile int doiflush = 1;      /* non-zero to turn inode flushing on */
       63 +volatile int dopageflush = 1;   /* non-zero to turn page flushing on */
  62   64  
  63   65  /*
  64   66   * To improve boot performance, don't run the inode flushing loop until
  65   67   * the specified number of seconds after boot.  To revert to the old
  66   68   * behavior, set fsflush_iflush_delay to 0.  We have not created any new
  67   69   * filesystem danger that did not exist previously, since there is always a
  68   70   * window in between when fsflush does the inode flush loop during which the
  69   71   * system could crash, fail to sync the filesystem, and fsck will be needed
  70   72   * to recover.  We have, however, widened this window.  Finally,
  71   73   * we never delay inode flushing if we're booting into single user mode,
  72   74   * where the administrator may be modifying files or using fsck.  This
  73   75   * modification avoids inode flushes during boot whose only purpose is to
  74   76   * update atimes on files which have been accessed during boot.
  75   77   */
  76   78  int fsflush_iflush_delay = 60;
  77   79  
  78   80  kcondvar_t fsflush_cv;
  79   81  static kmutex_t fsflush_lock;   /* just for the cv_wait */
  80   82  ksema_t fsflush_sema;           /* to serialize with reboot */
  81   83  
  82   84  /*
  83   85   * some statistics for fsflush_do_pages
  84   86   */
  85   87  typedef struct {
  86   88          ulong_t fsf_scan;       /* number of pages scanned */
  87   89          ulong_t fsf_examined;   /* number of page_t's actually examined, can */
  88   90                                  /* be less than fsf_scan due to large pages */
  89   91          ulong_t fsf_locked;     /* pages we actually page_lock()ed */
  90   92          ulong_t fsf_modified;   /* number of modified pages found */
  91   93          ulong_t fsf_coalesce;   /* number of page coalesces done */
  92   94          ulong_t fsf_time;       /* nanoseconds of run time */
  93   95          ulong_t fsf_releases;   /* number of page_release() done */
  94   96  } fsf_stat_t;
  95   97  
  96   98  fsf_stat_t fsf_recent;  /* counts for most recent duty cycle */
  97   99  fsf_stat_t fsf_total;   /* total of counts */
  98  100  ulong_t fsf_cycles;     /* number of runs reflected in fsf_total */
  99  101  
 100  102  /*
 101  103   * data used to determine when we can coalesce consecutive free pages
 102  104   * into larger pages.
 103  105   */
 104  106  #define MAX_PAGESIZES   32
 105  107  static ulong_t          fsf_npgsz;
 106  108  static pgcnt_t          fsf_pgcnt[MAX_PAGESIZES];
 107  109  static pgcnt_t          fsf_mask[MAX_PAGESIZES];
 108  110  
 109  111  
 110  112  /*
 111  113   * Scan page_t's and issue I/O's for modified pages.
 112  114   *
 113  115   * Also coalesces consecutive small sized free pages into the next larger
 114  116   * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 115  117   * spent scanning on later passes and for anybody allocating large pages.
 116  118   */
 117  119  static void
 118  120  fsflush_do_pages()
 119  121  {
 120  122          vnode_t         *vp;
 121  123          ulong_t         pcount;
 122  124          hrtime_t        timer = gethrtime();
 123  125          ulong_t         releases = 0;
 124  126          ulong_t         nexamined = 0;
 125  127          ulong_t         nlocked = 0;
 126  128          ulong_t         nmodified = 0;
 127  129          ulong_t         ncoalesce = 0;
 128  130          ulong_t         cnt;
 129  131          int             mod;
 130  132          int             fspage = 1;
 131  133          u_offset_t      offset;
 132  134          uint_t          szc;
 133  135  
 134  136          page_t          *coal_page = NULL;  /* 1st page in group to coalesce */
 135  137          uint_t          coal_szc = 0;       /* size code, coal_page->p_szc */
 136  138          uint_t          coal_cnt = 0;       /* count of pages seen */
 137  139  
 138  140          static ulong_t  nscan = 0;
 139  141          static pgcnt_t  last_total_pages = 0;
 140  142          static page_t   *pp = NULL;
 141  143  
 142  144          /*
 143  145           * Check to see if total_pages has changed.
 144  146           */
 145  147          if (total_pages != last_total_pages) {
 146  148                  last_total_pages = total_pages;
 147  149                  nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
 148  150          }
 149  151  
 150  152          if (pp == NULL)
 151  153                  pp = memsegs->pages;
 152  154  
 153  155          pcount = 0;
 154  156          while (pcount < nscan) {
 155  157  
 156  158                  /*
 157  159                   * move to the next page, skipping over large pages
 158  160                   * and issuing prefetches.
 159  161                   */
 160  162                  if (pp->p_szc && fspage == 0) {
 161  163                          pfn_t pfn;
 162  164  
 163  165                          pfn  = page_pptonum(pp);
 164  166                          cnt = page_get_pagecnt(pp->p_szc);
 165  167                          cnt -= pfn & (cnt - 1);
 166  168                  } else
 167  169                          cnt = 1;
 168  170  
 169  171                  pp = page_nextn(pp, cnt);
 170  172                  prefetch_page_r((void *)pp);
 171  173                  ASSERT(pp != NULL);
 172  174                  pcount += cnt;
 173  175  
 174  176                  /*
  175  177                   * Do a bunch of dirty tests (i.e., no locking) to determine
 176  178                   * if we can quickly skip this page. These tests are repeated
 177  179                   * after acquiring the page lock.
 178  180                   */
 179  181                  ++nexamined;
 180  182                  if (PP_ISSWAP(pp)) {
 181  183                          fspage = 0;
 182  184                          coal_page = NULL;
 183  185                          continue;
 184  186                  }
 185  187  
 186  188                  /*
 187  189                   * skip free pages too, but try coalescing them into larger
 188  190                   * pagesizes
 189  191                   */
 190  192                  if (PP_ISFREE(pp)) {
 191  193                          /*
 192  194                           * skip pages with a file system identity or that
 193  195                           * are already maximum size
 194  196                           */
 195  197                          fspage = 0;
 196  198                          szc = pp->p_szc;
 197  199                          if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
 198  200                                  coal_page = NULL;
 199  201                                  continue;
 200  202                          }
 201  203  
 202  204                          /*
 203  205                           * If not in a coalescing candidate page or the size
 204  206                           * codes are different, start a new candidate.
 205  207                           */
 206  208                          if (coal_page == NULL || coal_szc != szc) {
 207  209  
 208  210                                  /*
 209  211                                   * page must be properly aligned
 210  212                                   */
 211  213                                  if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
 212  214                                          coal_page = NULL;
 213  215                                          continue;
 214  216                                  }
 215  217                                  coal_page = pp;
 216  218                                  coal_szc = szc;
 217  219                                  coal_cnt = 1;
 218  220                                  continue;
 219  221                          }
 220  222  
 221  223                          /*
 222  224                           * acceptable to add this to existing candidate page
 223  225                           */
 224  226                          ++coal_cnt;
 225  227                          if (coal_cnt < fsf_pgcnt[coal_szc])
 226  228                                  continue;
 227  229  
 228  230                          /*
 229  231                           * We've got enough pages to coalesce, so do it.
 230  232                           * After promoting, we clear coal_page, so it will
 231  233                           * take another pass to promote this to an even
 232  234                           * larger page.
 233  235                           */
 234  236                          ++ncoalesce;
 235  237                          (void) page_promote_size(coal_page, coal_szc);
 236  238                          coal_page = NULL;
 237  239                          continue;
 238  240                  } else {
 239  241                          coal_page = NULL;
 240  242                  }
 241  243  
 242  244                  if (PP_ISKAS(pp) ||
 243  245                      PAGE_LOCKED(pp) ||
 244  246                      pp->p_lckcnt != 0 ||
 245  247                      pp->p_cowcnt != 0) {
 246  248                          fspage = 0;
 247  249                          continue;
 248  250                  }
 249  251  
 250  252  
 251  253                  /*
 252  254                   * Reject pages that can't be "exclusively" locked.
 253  255                   */
 254  256                  if (!page_trylock(pp, SE_EXCL))
 255  257                          continue;
 256  258                  ++nlocked;
 257  259  
 258  260  
 259  261                  /*
 260  262                   * After locking the page, redo the above checks.
 261  263                   * Since we locked the page, leave out the PAGE_LOCKED() test.
 262  264                   */
 263  265                  vp = pp->p_vnode;
 264  266                  if (PP_ISSWAP(pp) ||
 265  267                      PP_ISFREE(pp) ||
 266  268                      vp == NULL ||
 267  269                      PP_ISKAS(pp) ||
 268  270                      (vp->v_flag & VISSWAP) != 0) {
 269  271                          page_unlock(pp);
 270  272                          fspage = 0;
 271  273                          continue;
 272  274                  }
 273  275                  if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
 274  276                          page_unlock(pp);
 275  277                          continue;
 276  278                  }
 277  279  
 278  280                  fspage = 1;
 279  281                  ASSERT(vp->v_type != VCHR);
 280  282  
 281  283                  /*
 282  284                   * Check the modified bit. Leaving the bit alone in hardware.
 283  285                   * It will be cleared if we do the putpage.
 284  286                   */
 285  287                  if (IS_VMODSORT(vp))
 286  288                          mod = hat_ismod(pp);
 287  289                  else
 288  290                          mod = hat_pagesync(pp,
 289  291                              HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;
 290  292  
 291  293                  if (mod) {
 292  294                          ++nmodified;
 293  295                          offset = pp->p_offset;
 294  296  
 295  297                          /*
 296  298                           * Hold the vnode before releasing the page lock
 297  299                           * to prevent it from being freed and re-used by
 298  300                           * some other thread.
 299  301                           */
 300  302                          VN_HOLD(vp);
 301  303  
 302  304                          page_unlock(pp);
 303  305  
 304  306                          (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
 305  307                              kcred, NULL);
 306  308  
 307  309                          VN_RELE(vp);
 308  310                  } else {
 309  311  
 310  312                          /*
 311  313                           * Catch any pages which should be on the cache list,
 312  314                           * but aren't yet.
 313  315                           */
 314  316                          if (hat_page_is_mapped(pp) == 0) {
 315  317                                  ++releases;
 316  318                                  (void) page_release(pp, 1);
 317  319                          } else {
 318  320                                  page_unlock(pp);
 319  321                          }
 320  322                  }
 321  323          }
 322  324  
 323  325          /*
 324  326           * maintain statistics
 325  327           * reset every million wakeups, just to avoid overflow
 326  328           */
 327  329          if (++fsf_cycles == 1000000) {
 328  330                  fsf_cycles = 0;
 329  331                  fsf_total.fsf_scan = 0;
 330  332                  fsf_total.fsf_examined = 0;
 331  333                  fsf_total.fsf_locked = 0;
 332  334                  fsf_total.fsf_modified = 0;
 333  335                  fsf_total.fsf_coalesce = 0;
 334  336                  fsf_total.fsf_time = 0;
 335  337                  fsf_total.fsf_releases = 0;
 336  338          } else {
 337  339                  fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
 338  340                  fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
 339  341                  fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
 340  342                  fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
 341  343                  fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
 342  344                  fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
 343  345                  fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
 344  346          }
 345  347  }
 346  348  
 347  349  /*
 348  350   * As part of file system hardening, this daemon is awakened
 349  351   * every second to flush cached data which includes the
 350  352   * buffer cache, the inode cache and mapped pages.
 351  353   */
 352  354  void
 353  355  fsflush()
 354  356  {
 355  357          struct buf *bp, *dwp;
 356  358          struct hbuf *hp;
 357  359          int autoup;
 358  360          unsigned int ix, icount, count = 0;
 359  361          callb_cpr_t cprinfo;
 360  362          uint_t          bcount;
 361  363          kmutex_t        *hmp;
 362  364          struct vfssw *vswp;
 363  365  
 364  366          proc_fsflush = ttoproc(curthread);
 365  367          proc_fsflush->p_cstime = 0;
 366  368          proc_fsflush->p_stime =  0;
 367  369          proc_fsflush->p_cutime =  0;
 368  370          proc_fsflush->p_utime = 0;
 369  371          bcopy("fsflush", curproc->p_user.u_psargs, 8);
 370  372          bcopy("fsflush", curproc->p_user.u_comm, 7);
 371  373  
 372  374          mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
 373  375          sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);
 374  376  
 375  377          /*
 376  378           * Setup page coalescing.
 377  379           */
 378  380          fsf_npgsz = page_num_pagesizes();
 379  381          ASSERT(fsf_npgsz < MAX_PAGESIZES);
 380  382          for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
 381  383                  fsf_pgcnt[ix] =
 382  384                      page_get_pagesize(ix + 1) / page_get_pagesize(ix);
 383  385                  fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
 384  386          }
 385  387  
 386  388          autoup = v.v_autoup * hz;
 387  389          icount = v.v_autoup / tune.t_fsflushr;
 388  390          CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
 389  391  loop:
 390  392          sema_v(&fsflush_sema);
 391  393          mutex_enter(&fsflush_lock);
 392  394          CALLB_CPR_SAFE_BEGIN(&cprinfo);
 393  395          cv_wait(&fsflush_cv, &fsflush_lock);            /* wait for clock */
 394  396          CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
 395  397          mutex_exit(&fsflush_lock);
 396  398          sema_p(&fsflush_sema);
 397  399  
 398  400          /*
 399  401           * Write back all old B_DELWRI buffers on the freelist.
 400  402           */
 401  403          bcount = 0;
 402  404          for (ix = 0; ix < v.v_hbuf; ix++) {
 403  405  
 404  406                  hp = &hbuf[ix];
 405  407                  dwp = (struct buf *)&dwbuf[ix];
 406  408  
 407  409                  bcount += (hp->b_length);
 408  410  
 409  411                  if (dwp->av_forw == dwp) {
 410  412                          continue;
 411  413                  }
 412  414  
 413  415                  hmp = &hbuf[ix].b_lock;
 414  416                  mutex_enter(hmp);
 415  417                  bp = dwp->av_forw;
 416  418  
 417  419                  /*
 418  420                   * Go down only on the delayed write lists.
 419  421                   */
 420  422                  while (bp != dwp) {
 421  423  
 422  424                          ASSERT(bp->b_flags & B_DELWRI);
 423  425  
 424  426                          if ((bp->b_flags & B_DELWRI) &&
 425  427                              (ddi_get_lbolt() - bp->b_start >= autoup) &&
 426  428                              sema_tryp(&bp->b_sem)) {
 427  429                                  bp->b_flags |= B_ASYNC;
 428  430                                  hp->b_length--;
 429  431                                  notavail(bp);
 430  432                                  mutex_exit(hmp);
 431  433                                  if (bp->b_vp == NULL) {
 432  434                                          BWRITE(bp);
 433  435                                  } else {
 434  436                                          UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
 435  437                                              bp);
 436  438                                  }
 437  439                                  mutex_enter(hmp);
 438  440                                  bp = dwp->av_forw;
 439  441                          } else {
 440  442                                  bp = bp->av_forw;
 441  443                          }
 442  444                  }
 443  445                  mutex_exit(hmp);
 444  446          }
 445  447  
 446  448          /*
 447  449           *
 448  450           * There is no need to wakeup any thread waiting on bio_mem_cv
 449  451           * since brelse will wake them up as soon as IO is complete.
 450  452           */
 451  453          bfreelist.b_bcount = bcount;
 452  454  
 453  455          if (dopageflush)
 454  456                  fsflush_do_pages();
 455  457  
 456  458          if (!doiflush)
 457  459                  goto loop;
 458  460  
 459  461          /*
 460  462           * If the system was not booted to single user mode, skip the
 461  463           * inode flushing until after fsflush_iflush_delay secs have elapsed.
 462  464           */
 463  465          if ((boothowto & RB_SINGLE) == 0 &&
 464  466              (ddi_get_lbolt64() / hz) < fsflush_iflush_delay)
 465  467                  goto loop;
 466  468  
 467  469          /*
 468  470           * Flush cached attribute information (e.g. inodes).
 469  471           */
 470  472          if (++count >= icount) {
 471  473                  count = 0;
 472  474  
 473  475                  /*
 474  476                   * Sync back cached data.
 475  477                   */
 476  478                  RLOCK_VFSSW();
 477  479                  for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
 478  480                          if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
 479  481                                  vfs_refvfssw(vswp);
 480  482                                  RUNLOCK_VFSSW();
 481  483                                  (void) fsop_sync_by_kind(vswp - vfssw,
 482  484                                      SYNC_ATTR, kcred);
 483  485                                  vfs_unrefvfssw(vswp);
 484  486                                  RLOCK_VFSSW();
 485  487                          }
 486  488                  }
 487  489                  RUNLOCK_VFSSW();
 488  490          }
 489  491          goto loop;
 490  492  }
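
Why volatile matters here, as a minimal userland sketch. This is
illustrative C, not illumos kernel code: the program, the do_work name, and
the pthread-based patcher() are assumptions standing in for a live mdb -kw
write to a tunable. With the qualifier, every test of the loop condition
reloads the variable from memory, which is what fsflush's once-a-second
checks of dopageflush and doiflush rely on. Note that volatile only forces
the reload; it is not a synchronization primitive.

/*
 * Userland sketch of the tunable problem (assumed example, not from the
 * webrev).  Build: cc -O2 sketch.c -lpthread
 * Remove the 'volatile' below and an optimizing compiler may hoist the
 * load of do_work out of the loop and spin on a register copy forever.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

volatile int do_work = 1;       /* stand-in for dopageflush/doiflush */

static void *
patcher(void *arg)
{
        /* Stands in for an administrator clearing the tunable at runtime. */
        (void) sleep(1);
        do_work = 0;
        return (NULL);
}

int
main(void)
{
        pthread_t tid;

        (void) pthread_create(&tid, NULL, patcher, NULL);

        /*
         * With volatile, each iteration reloads do_work and the loop ends
         * once patcher() stores 0.  Without it, the test may be compiled
         * against a value that was read only once.
         */
        while (do_work)
                ;

        (void) pthread_join(tid, NULL);
        (void) printf("observed do_work == 0\n");
        return (0);
}

In the patched fsflush() above, the corresponding reads are the
"if (dopageflush)" and "if (!doiflush)" tests in the daemon loop: with the
variables declared volatile, each wakeup re-reads them, so clearing a
tunable at runtime actually stops the corresponding flushing.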
  