/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/tuneable.h>
#include <sys/inline.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/var.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/swap.h>
#include <sys/vm.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/sysinfo.h>
#include <sys/callb.h>
#include <sys/reboot.h>
#include <sys/time.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>

volatile int doiflush = 1;      /* non-zero to turn inode flushing on */
volatile int dopageflush = 1;   /* non-zero to turn page flushing on */

/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot.  To revert to the old
 * behavior, set fsflush_iflush_delay to 0.  This creates no new
 * filesystem danger: there has always been a window between inode
 * flush passes during which the system could crash without having
 * synced the filesystem, leaving fsck to recover.  We have, however,
 * widened this window.  Finally, we never delay inode flushing when
 * booting into single user mode, where the administrator may be
 * modifying files or using fsck.  This modification avoids inode
 * flushes during boot whose only purpose is to update atimes on files
 * which have been accessed during boot.
 */
int fsflush_iflush_delay = 60;

kcondvar_t fsflush_cv;
static kmutex_t fsflush_lock;   /* just for the cv_wait */
ksema_t fsflush_sema;           /* to serialize with reboot */

/*
 * some statistics for fsflush_do_pages
 */
typedef struct {
        ulong_t fsf_scan;       /* number of pages scanned */
        ulong_t fsf_examined;   /* number of page_t's actually examined, can */
                                /* be less than fsf_scan due to large pages */
        ulong_t fsf_locked;     /* pages we actually page_lock()ed */
        ulong_t fsf_modified;   /* number of modified pages found */
        ulong_t fsf_coalesce;   /* number of page coalesces done */
        ulong_t fsf_time;       /* nanoseconds of run time */
        ulong_t fsf_releases;   /* number of page_release() done */
} fsf_stat_t;

fsf_stat_t fsf_recent;  /* counts for most recent duty cycle */
fsf_stat_t fsf_total;   /* total of counts */
ulong_t fsf_cycles;     /* number of runs reflected in fsf_total */

/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger pages.
 */
#define MAX_PAGESIZES   32
static ulong_t          fsf_npgsz;
static pgcnt_t          fsf_pgcnt[MAX_PAGESIZES];
static pgcnt_t          fsf_mask[MAX_PAGESIZES];


/*
 * Scan page_t's and issue I/O's for modified pages.
 *
 * Also coalesces consecutive small sized free pages into the next larger
 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
 * spent scanning on later passes and for anybody allocating large pages.
 */
static void
fsflush_do_pages(void)
{
        vnode_t         *vp;
        ulong_t         pcount;
        hrtime_t        timer = gethrtime();
        ulong_t         releases = 0;
        ulong_t         nexamined = 0;
        ulong_t         nlocked = 0;
        ulong_t         nmodified = 0;
        ulong_t         ncoalesce = 0;
        ulong_t         cnt;
        int             mod;
        int             fspage = 1;
        u_offset_t      offset;
        uint_t          szc;

        page_t          *coal_page = NULL;  /* 1st page in group to coalesce */
        uint_t          coal_szc = 0;       /* size code, coal_page->p_szc */
        uint_t          coal_cnt = 0;       /* count of pages seen */

        static ulong_t  nscan = 0;
        static pgcnt_t  last_total_pages = 0;
        static page_t   *pp = NULL;

        /*
         * Check to see if total_pages has changed.
         */
        if (total_pages != last_total_pages) {
                last_total_pages = total_pages;
                nscan = (last_total_pages * (tune.t_fsflushr))/v.v_autoup;
        }
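
        /*
         * Note on the nscan computation above: fsflush wakes up every
         * tune.t_fsflushr seconds and aims to cycle through all of
         * memory every v.v_autoup seconds, so each pass scans
         * total_pages * t_fsflushr / v_autoup pages.
         */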

        if (pp == NULL)
                pp = memsegs->pages;

        pcount = 0;
        while (pcount < nscan) {

                /*
                 * move to the next page, skipping over large pages
                 * and issuing prefetches.
                 */
                if (pp->p_szc && fspage == 0) {
                        pfn_t pfn;

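                        /*
                         * pfn & (cnt - 1) is pp's index within its large
                         * page, so after the subtraction cnt is the number
                         * of constituent pages from pp through the end of
                         * the large page; advancing by cnt lands on the
                         * first page past it.
                         */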
                        pfn  = page_pptonum(pp);
                        cnt = page_get_pagecnt(pp->p_szc);
                        cnt -= pfn & (cnt - 1);
                } else
                        cnt = 1;

                pp = page_nextn(pp, cnt);
                prefetch_page_r((void *)pp);
                ASSERT(pp != NULL);
                pcount += cnt;

                /*
                 * Do a bunch of dirty tests (i.e. no locking) to determine
                 * if we can quickly skip this page. These tests are repeated
                 * after acquiring the page lock.
                 */
                ++nexamined;
                if (PP_ISSWAP(pp)) {
                        fspage = 0;
                        coal_page = NULL;
                        continue;
                }

                /*
                 * skip free pages too, but try coalescing them into larger
                 * pagesizes
                 */
                if (PP_ISFREE(pp)) {
                        /*
                         * skip pages with a file system identity or that
                         * are already maximum size
                         */
                        fspage = 0;
                        szc = pp->p_szc;
                        if (pp->p_vnode != NULL || szc == fsf_npgsz - 1) {
                                coal_page = NULL;
                                continue;
                        }

                        /*
                         * If not in a coalescing candidate page or the size
                         * codes are different, start a new candidate.
                         */
                        if (coal_page == NULL || coal_szc != szc) {

                                /*
                                 * The first page of a candidate group must
                                 * be aligned on the next larger pagesize
                                 * boundary.
                                 */
                                if ((page_pptonum(pp) & fsf_mask[szc]) != 0) {
                                        coal_page = NULL;
                                        continue;
                                }
                                coal_page = pp;
                                coal_szc = szc;
                                coal_cnt = 1;
                                continue;
                        }

                        /*
                         * acceptable to add this to existing candidate page
                         */
                        ++coal_cnt;
                        if (coal_cnt < fsf_pgcnt[coal_szc])
                                continue;

                        /*
                         * We've got enough pages to coalesce, so do it.
                         * After promoting, we clear coal_page, so it will
                         * take another pass to promote this to an even
                         * larger page.
                         */
                        ++ncoalesce;
                        (void) page_promote_size(coal_page, coal_szc);
                        coal_page = NULL;
                        continue;
                } else {
                        coal_page = NULL;
                }


                if (PP_ISKAS(pp) ||
                    PAGE_LOCKED(pp) ||
                    pp->p_lckcnt != 0 ||
                    pp->p_cowcnt != 0) {
                        fspage = 0;
                        continue;
                }

                /*
                 * Reject pages that can't be "exclusively" locked.
                 */
                if (!page_trylock(pp, SE_EXCL))
                        continue;
                ++nlocked;

                /*
                 * After locking the page, redo the above checks.
                 * Since we locked the page, leave out the PAGE_LOCKED() test.
                 */
                vp = pp->p_vnode;
                if (PP_ISSWAP(pp) ||
                    PP_ISFREE(pp) ||
                    vp == NULL ||
                    PP_ISKAS(pp) ||
                    (vp->v_flag & VISSWAP) != 0) {
                        page_unlock(pp);
                        fspage = 0;
                        continue;
                }
                if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
                        page_unlock(pp);
                        continue;
                }

                fspage = 1;
                ASSERT(vp->v_type != VCHR);

                /*
                 * Check the modified bit, leaving it alone in hardware;
                 * it will be cleared if we do the putpage.  For VMODSORT
                 * vnodes the mod state is maintained in the page itself,
                 * so hat_ismod() suffices; otherwise ask the hat to sync
                 * the mappings, stopping as soon as a modified one is
                 * found.
                 */
                if (IS_VMODSORT(vp))
                        mod = hat_ismod(pp);
                else
                        mod = hat_pagesync(pp,
                            HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD;

                if (mod) {
                        ++nmodified;
                        offset = pp->p_offset;

                        /*
                         * Hold the vnode before releasing the page lock
                         * to prevent it from being freed and re-used by
                         * some other thread.
                         */
                        VN_HOLD(vp);

                        page_unlock(pp);

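                        /*
                         * Queue an asynchronous write of just this one
                         * page; B_ASYNC means we don't wait for the I/O.
                         */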
                        (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_ASYNC,
                            kcred, NULL);

                        VN_RELE(vp);
                } else {

                        /*
                         * Catch any pages which should be on the cache list,
                         * but aren't yet.
                         */
                        if (hat_page_is_mapped(pp) == 0) {
                                ++releases;
                                (void) page_release(pp, 1);
                        } else {
                                page_unlock(pp);
                        }
                }
        }

        /*
         * Maintain statistics: each assignment below records the counts
         * for the most recent pass and accumulates them into the totals.
         * Reset everything every million wakeups, just to avoid overflow.
         */
        if (++fsf_cycles == 1000000) {
                fsf_cycles = 0;
                fsf_total.fsf_scan = 0;
                fsf_total.fsf_examined = 0;
                fsf_total.fsf_locked = 0;
                fsf_total.fsf_modified = 0;
                fsf_total.fsf_coalesce = 0;
                fsf_total.fsf_time = 0;
                fsf_total.fsf_releases = 0;
        } else {
                fsf_total.fsf_scan += fsf_recent.fsf_scan = nscan;
                fsf_total.fsf_examined += fsf_recent.fsf_examined = nexamined;
                fsf_total.fsf_locked += fsf_recent.fsf_locked = nlocked;
                fsf_total.fsf_modified += fsf_recent.fsf_modified = nmodified;
                fsf_total.fsf_coalesce += fsf_recent.fsf_coalesce = ncoalesce;
                fsf_total.fsf_time += fsf_recent.fsf_time = gethrtime() - timer;
                fsf_total.fsf_releases += fsf_recent.fsf_releases = releases;
        }
}


/*
 * As part of file system hardening, this daemon is awakened every
 * tune.t_fsflushr seconds (one second by default) to flush cached data,
 * which includes the buffer cache, the inode cache and mapped pages.
 */
void
fsflush(void)
{
        struct buf *bp, *dwp;
        struct hbuf *hp;
        int autoup;
        unsigned int ix, icount, count = 0;
        callb_cpr_t cprinfo;
        uint_t          bcount;
        kmutex_t        *hmp;
        struct vfssw *vswp;

        proc_fsflush = ttoproc(curthread);
        proc_fsflush->p_cstime = 0;
        proc_fsflush->p_stime = 0;
        proc_fsflush->p_cutime = 0;
        proc_fsflush->p_utime = 0;
        bcopy("fsflush", curproc->p_user.u_psargs, 8);
        bcopy("fsflush", curproc->p_user.u_comm, 7);

        mutex_init(&fsflush_lock, NULL, MUTEX_DEFAULT, NULL);
        sema_init(&fsflush_sema, 0, NULL, SEMA_DEFAULT, NULL);

        /*
         * Setup page coalescing.
         */
        fsf_npgsz = page_num_pagesizes();
        ASSERT(fsf_npgsz < MAX_PAGESIZES);
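
        /*
         * For each size code, fsf_pgcnt[szc] is the number of szc-sized
         * pages that make up one page of the next larger size, and
         * fsf_mask[szc] masks a pfn to its offset within that larger
         * page (used above to check candidate alignment).
         */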
        for (ix = 0; ix < fsf_npgsz - 1; ++ix) {
                fsf_pgcnt[ix] =
                    page_get_pagesize(ix + 1) / page_get_pagesize(ix);
                fsf_mask[ix] = page_get_pagecnt(ix + 1) - 1;
        }

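        /*
         * autoup is the age, in lbolt ticks, at which a delayed-write
         * buffer is old enough to write back; icount is the number of
         * wakeups between inode flush passes, so attributes get synced
         * about once every v_autoup seconds.
         */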
        autoup = v.v_autoup * hz;
        icount = v.v_autoup / tune.t_fsflushr;
        CALLB_CPR_INIT(&cprinfo, &fsflush_lock, callb_generic_cpr, "fsflush");
loop:
        sema_v(&fsflush_sema);
        mutex_enter(&fsflush_lock);
        CALLB_CPR_SAFE_BEGIN(&cprinfo);
        cv_wait(&fsflush_cv, &fsflush_lock);            /* wait for clock */
        CALLB_CPR_SAFE_END(&cprinfo, &fsflush_lock);
        mutex_exit(&fsflush_lock);
        sema_p(&fsflush_sema);
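
        /*
         * fsflush_sema is released (sema_v) only while we sleep above,
         * so holding it for the rest of the loop serializes the flush
         * work with reboot, which acquires the semaphore before tearing
         * things down.
         */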

        /*
         * Write back all old B_DELWRI buffers on the delayed write lists.
         */
        bcount = 0;
        for (ix = 0; ix < v.v_hbuf; ix++) {

                hp = &hbuf[ix];
                dwp = (struct buf *)&dwbuf[ix];

                bcount += (hp->b_length);

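                /*
                 * dwbuf[ix] is a circular list; if it points to itself
                 * the list is empty and there is nothing to write back.
                 */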
                if (dwp->av_forw == dwp) {
                        continue;
                }

                hmp = &hbuf[ix].b_lock;
                mutex_enter(hmp);
                bp = dwp->av_forw;

                /*
                 * Go down only on the delayed write lists.
                 */
                while (bp != dwp) {

                        ASSERT(bp->b_flags & B_DELWRI);

                        if ((bp->b_flags & B_DELWRI) &&
                            (ddi_get_lbolt() - bp->b_start >= autoup) &&
                            sema_tryp(&bp->b_sem)) {
                                bp->b_flags |= B_ASYNC;
                                hp->b_length--;
                                notavail(bp);
                                mutex_exit(hmp);
                                if (bp->b_vp == NULL) {
                                        BWRITE(bp);
                                } else {
                                        UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs,
                                            bp);
                                }
                                mutex_enter(hmp);
                                bp = dwp->av_forw;
                        } else {
                                bp = bp->av_forw;
                        }
                }
                mutex_exit(hmp);
        }


        /*
         * Publish the number of buffers on the hash lists.  There is no
         * need to wake up any thread waiting on bio_mem_cv, since brelse
         * will wake them up as soon as the I/O is complete.
         */
        bfreelist.b_bcount = bcount;


        if (dopageflush)
                fsflush_do_pages();

        if (!doiflush)
                goto loop;

        /*
         * If the system was not booted to single user mode, skip the
         * inode flushing until after fsflush_iflush_delay seconds have
         * elapsed since boot.
         */
        if ((boothowto & RB_SINGLE) == 0 &&
            (ddi_get_lbolt64() / hz) < fsflush_iflush_delay)
                goto loop;

        /*
         * Flush cached attribute information (e.g. inodes).
         */
        if (++count >= icount) {
                count = 0;

                /*
                 * Sync back cached data.
                 */
                RLOCK_VFSSW();
                for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
                        if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
                                vfs_refvfssw(vswp);
                                RUNLOCK_VFSSW();
                                (void) fsop_sync_by_kind(vswp - vfssw,
                                    SYNC_ATTR, kcred);
                                vfs_unrefvfssw(vswp);
                                RLOCK_VFSSW();
                        }
                }
                RUNLOCK_VFSSW();
        }
        goto loop;
}