/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - paged vnode.
 *
 * This file supplies vm support for the vnode operations that deal with pages.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/vmsystm.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/cpuvar.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/rm.h>
#include <vm/pvn.h>
#include <vm/page.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <sys/fs/swapnode.h>

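/*
 * Tunables (presumably for debugging): when set non-zero, these disable
 * klustering in pvn_read_kluster() and pvn_write_kluster() respectively,
 * limiting each call to a single page.
 */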
int pvn_nofodklust = 0;
int pvn_write_noklust = 0;

uint_t pvn_vmodsort_supported = 0;	/* set if HAT supports VMODSORT */
uint_t pvn_vmodsort_disable = 0;	/* set in /etc/system to disable HAT */
					/* support for vmodsort for testing */

static struct kmem_cache *marker_cache = NULL;

/*
 * Find the largest contiguous block of pages which contains `addr' for
 * file offset `off', stays within the file system limits (`vp_off' and
 * `vp_len') and the address space limits, maps to consecutive file
 * offsets, and for which no pages currently exist.
 */
page_t *
pvn_read_kluster(
	struct vnode *vp,
	u_offset_t off,
	struct seg *seg,
	caddr_t addr,
	u_offset_t *offp,			/* return values */
	size_t *lenp,				/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int isra)
{
	ssize_t deltaf, deltab;
	page_t *pp;
	page_t *plist = NULL;
	spgcnt_t pagesavail;
	u_offset_t vp_end;

	ASSERT(off >= vp_off && off < vp_off + vp_len);

	/*
	 * We only want to do klustering/read ahead if there
	 * are more than minfree pages currently available.
	 */
	pagesavail = freemem - minfree;

	if (pagesavail <= 0)
		if (isra)
			return ((page_t *)NULL);    /* ra case - give up */
		else
			pagesavail = 1;             /* must return a page */

	/* We calculate in pages instead of bytes due to 32-bit overflows */
	if (pagesavail < (spgcnt_t)btopr(vp_len)) {
		/*
		 * Don't have enough free memory for the
		 * max request, try sizing down vp request.
		 */
		deltab = (ssize_t)(off - vp_off);
		vp_len -= deltab;
		vp_off += deltab;
		if (pagesavail < btopr(vp_len)) {
			/*
			 * Still not enough memory, just settle for
			 * pagesavail which is at least 1.
			 */
			vp_len = ptob(pagesavail);
		}
	}

	vp_end = vp_off + vp_len;
	ASSERT(off >= vp_off && off < vp_end);

	if (isra && SEGOP_KLUSTER(seg, addr, 0))
		return ((page_t *)NULL);	/* segment driver says no */

	if ((plist = page_create_va(vp, off,
	    PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
		return ((page_t *)NULL);

	if (vp_len <= PAGESIZE || pvn_nofodklust) {
		*offp = off;
		*lenp = MIN(vp_len, PAGESIZE);
	} else {
		/*
		 * Scan backwards from "off" by incrementing "deltab" and
		 * comparing "off" with "vp_off + deltab" to avoid
		 * "signed" versus "unsigned" conversion problems.
		 */
		for (deltab = PAGESIZE; off >= vp_off + deltab;
		    deltab += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, -deltab))
				break;		/* page not eligible */
			if ((pp = page_create_va(vp, off - deltab,
			    PAGESIZE, PG_EXCL, seg, addr - deltab))
			    == NULL)
				break;		/* already have the page */
			/*
			 * Add page to front of page list.
			 */
			page_add(&plist, pp);
		}
		deltab -= PAGESIZE;

		/* scan forward from "off" */
		for (deltaf = PAGESIZE; off + deltaf < vp_end;
		    deltaf += PAGESIZE) {
			/*
			 * Call back to the segment driver to verify that
			 * the klustering/read ahead operation makes sense.
			 */
			if (SEGOP_KLUSTER(seg, addr, deltaf))
				break;		/* page not file extension */
			if ((pp = page_create_va(vp, off + deltaf,
			    PAGESIZE, PG_EXCL, seg, addr + deltaf))
			    == NULL)
				break;		/* already have page */

			/*
			 * Add page to end of page list.
			 */
			page_add(&plist, pp);
			plist = plist->p_next;
		}
		*offp = off = off - deltab;
		*lenp = deltab + deltaf;
		ASSERT(off >= vp_off);

		/*
		 * If we ended up getting more than was actually
		 * requested, retract the returned length to only
		 * reflect what was requested.  This might happen
		 * if we were allowed to kluster pages across a
		 * span of (say) 5 frags, and frag size is less
		 * than PAGESIZE.  We need a whole number of
		 * pages to contain those frags, but the returned
		 * size should only allow the returned range to
		 * extend as far as the end of the frags.
		 */
		if ((vp_off + vp_len) < (off + *lenp)) {
			ASSERT(vp_end > off);
			*lenp = vp_end - off;
		}
	}
	TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
	    "pvn_read_kluster:seg %p addr %x isra %x",
	    seg, addr, isra);
	return (plist);
}

/*
 * Handle pages for this vnode on either side of the page "pp"
 * which has been locked by the caller.  This routine will also
 * do klustering in the range [vp_off, vp_off + vp_len], stopping
 * at the first page that is not found.  The offset and length
 * of the pages included are returned in "*offp" and "*lenp".
 *
 * Returns a list of dirty locked pages all ready to be
 * written back.
 */
page_t *
pvn_write_kluster(
	struct vnode *vp,
	page_t *pp,
	u_offset_t *offp,		/* return values */
	size_t *lenp,			/* return values */
	u_offset_t vp_off,
	size_t vp_len,
	int flags)
{
	u_offset_t off;
	page_t *dirty;
	size_t deltab, deltaf;
	se_t se;
	u_offset_t vp_end;

	off = pp->p_offset;

	/*
	 * Klustering should not be done if we are invalidating
	 * pages since we could destroy pages that belong to
	 * some other process if this is a swap vnode.
	 */
	if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
		*offp = off;
		*lenp = PAGESIZE;
		return (pp);
	}

	if (flags & (B_FREE | B_INVAL))
		se = SE_EXCL;
	else
		se = SE_SHARED;

	dirty = pp;
	/*
	 * Scan backwards looking for pages to kluster by incrementing
	 * "deltab" and comparing "off" with "vp_off + deltab" to
	 * avoid "signed" versus "unsigned" conversion problems.
	 */
	for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
		pp = page_lookup_nowait(vp, off - deltab, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
	}
	deltab -= PAGESIZE;

	vp_end = vp_off + vp_len;
	/* now scan forwards looking for pages to kluster */
	for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
		pp = page_lookup_nowait(vp, off + deltaf, se);
		if (pp == NULL)
			break;		/* page not found */
		if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
			break;
		page_add(&dirty, pp);
		dirty = dirty->p_next;
	}

	*offp = off - deltab;
	*lenp = deltab + deltaf;
	return (dirty);
}

/*
 * Generic entry point used to release the "shared/exclusive" lock
 * and the "p_iolock" on pages after i/o is complete.
 */
void
pvn_io_done(page_t *plist)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		page_unlock(pp);
	}
}

/*
 * Entry point to be used by file system getpage subr's and
 * other such routines which either want to unlock pages (B_ASYNC
 * request) or destroy a list of pages if an error occurred.
 */
void
pvn_read_done(page_t *plist, int flags)
{
	page_t *pp;

	while (plist != NULL) {
		pp = plist;
		page_sub(&plist, pp);
		page_io_unlock(pp);
		if (flags & B_ERROR) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else {
			(void) page_release(pp, 0);
		}
	}
}

/*
 * Automagic pageout.
 * When memory gets tight, start freeing pages popping out of the
 * write queue.
 */
int	write_free = 1;
pgcnt_t	pages_before_pager = 200;	/* LMXXX */

/*
 * Routine to be called when page-outs complete.
 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 * after waiting for i/o to complete (biowait) to free the list of
 * pages associated with the buffer.  These pages must be locked
 * before i/o is initiated.
 *
 * If a write error occurs, the pages are marked as modified
 * so the write will be re-tried later.
 */

void
pvn_write_done(page_t *plist, int flags)
{
	int dfree = 0;
	int pgrec = 0;
	int pgout = 0;
	int pgpgout = 0;
	int anonpgout = 0;
	int anonfree = 0;
	int fspgout = 0;
	int fsfree = 0;
	int execpgout = 0;
	int execfree = 0;
	page_t *pp;
	struct cpu *cpup;
	struct vnode *vp = NULL;	/* for probe */
	uint_t ppattr;
	kmutex_t *vphm = NULL;

	ASSERT((flags & B_READ) == 0);

	/*
	 * If we are about to start paging anyway, start freeing pages.
	 */
	if (write_free && freemem < lotsfree + pages_before_pager &&
	    (flags & B_ERROR) == 0) {
		flags |= B_FREE;
	}

	/*
	 * Handle each page involved in the i/o operation.
	 */
	while (plist != NULL) {
		pp = plist;
		ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
		page_sub(&plist, pp);

		/* Kernel probe support */
		if (vp == NULL)
			vp = pp->p_vnode;

		if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
			/*
			 * Move page to the top of the v_pages list.
			 * Skip pages modified during IO.
			 */
			vphm = page_vnode_mutex(vp);
			mutex_enter(vphm);
			if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
				page_vpsub(&vp->v_pages, pp);
				page_vpadd(&vp->v_pages, pp);
			}
			mutex_exit(vphm);
		}

		if (flags & B_ERROR) {
			/*
			 * Write operation failed.  We don't want
			 * to destroy (or free) the page unless B_FORCE
			 * is set. We set the mod bit again and release
			 * all locks on the page so that it will get written
			 * back again later when things are hopefully
			 * better again.
			 * If B_INVAL and B_FORCE are both set we really have
			 * to destroy the page.
			 */
			if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
				page_io_unlock(pp);
				/*LINTED: constant in conditional context*/
				VN_DISPOSE(pp, B_INVAL, 0, kcred);
			} else {
				hat_setmod_only(pp);
				page_io_unlock(pp);
				page_unlock(pp);
			}
		} else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
			/*
			 * If B_INVALCURONLY is set, then we handle that case
			 * in the next conditional if hat_page_is_mapped()
			 * indicates that there are no additional mappings
			 * to the page.
			 */

			/*
			 * XXX - Failed writes with B_INVAL set are
			 * not handled appropriately.
			 */
			page_io_unlock(pp);
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
			/*
			 * Update statistics for pages being paged out
			 */
			if (pp->p_vnode) {
				if (IS_SWAPFSVP(pp->p_vnode)) {
					anonpgout++;
				} else {
					if (pp->p_vnode->v_flag & VVMEXEC) {
						execpgout++;
					} else {
						fspgout++;
					}
				}
			}
			page_io_unlock(pp);
			pgout = 1;
			pgpgout++;
			TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
			    "page_ws_out:pp %p", pp);

			/*
			 * The page_struct_lock need not be acquired to
			 * examine "p_lckcnt" and "p_cowcnt" since we'll
			 * have an "exclusive" lock if the upgrade succeeds.
			 */
			if (page_tryupgrade(pp) &&
			    pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
				/*
				 * Check if someone has reclaimed the
				 * page.  If ref and mod are not set, no
				 * one is using it so we can free it.
				 * The rest of the system is careful
				 * to use the NOSYNC flag to unload
				 * translations set up for i/o w/o
				 * affecting ref and mod bits.
				 *
				 * Obtain a copy of the real hardware
				 * mod bit using hat_pagesync(pp,
				 * HAT_SYNC_DONTZERO) to avoid having
				 * to flush the cache.
				 */
				ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
				    HAT_SYNC_STOPON_MOD);
			ck_refmod:
				if (!(ppattr & (P_REF | P_MOD))) {
					if (hat_page_is_mapped(pp)) {
						/*
						 * Doesn't look like the page
						 * was modified so now we
						 * really have to unload the
						 * translations.  Meanwhile
						 * another CPU could've
						 * modified it so we have to
						 * check again.  We don't loop
						 * forever here because now
						 * the translations are gone
						 * and no one can get a new one
						 * since we have the "exclusive"
						 * lock on the page.
						 */
						(void) hat_pageunload(pp,
						    HAT_FORCE_PGUNLOAD);
						ppattr = hat_page_getattr(pp,
						    P_REF | P_MOD);
						goto ck_refmod;
					}
					/*
					 * Update statistics for pages being
					 * freed
					 */
					if (pp->p_vnode) {
						if (IS_SWAPFSVP(pp->p_vnode)) {
							anonfree++;
						} else {
							if (pp->p_vnode->v_flag
							    & VVMEXEC) {
								execfree++;
							} else {
								fsfree++;
							}
						}
					}
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_FREE,
					    (flags & B_DONTNEED), kcred);
					dfree++;
				} else {
					page_unlock(pp);
					pgrec++;
					TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
					    "page_ws_free:pp %p", pp);
				}
			} else {
				/*
				 * Page is either `locked' in memory
				 * or was reclaimed and now has a
				 * "shared" lock, so release it.
				 */
				page_unlock(pp);
			}
		} else {
			/*
			 * Neither B_FREE nor B_INVAL nor B_ERROR.
			 * Just release locks.
			 */
			page_io_unlock(pp);
			page_unlock(pp);
		}
	}

	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get cpup now that CPU cannot change */
	CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
	CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
	CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
	CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
	CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
	CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
	CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
	CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
	CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
	CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
	CPU_STATS_EXIT_K();

	/* Kernel probe */
	TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
	    tnf_opaque,	vnode,			vp,
	    tnf_ulong,	pages_pageout,		pgpgout,
	    tnf_ulong,	pages_freed,		dfree,
	    tnf_ulong,	pages_reclaimed,	pgrec);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
 * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
 * B_DELWRI indicates that this page is part of a kluster
 * operation and is only to be considered if it doesn't involve any
 * waiting here.  B_TRUNC indicates that the file is being truncated
 * and so no i/o needs to be done. B_FORCE indicates that the page
 * must be destroyed so don't try writing it out.
 *
 * The caller must ensure that the page is locked.  Returns 1, if
 * the page should be written back (the "iolock" is held in this
 * case), or 0 if the page has been dealt with or has been
 * unlocked.
 */
int
pvn_getdirty(page_t *pp, int flags)
{
	ASSERT((flags & (B_INVAL | B_FREE)) ?
	    PAGE_EXCL(pp) : PAGE_SHARED(pp));
	ASSERT(PP_ISFREE(pp) == 0);

	/*
	 * If trying to invalidate or free a logically `locked' page,
	 * forget it.  Don't need page_struct_lock to check p_lckcnt and
	 * p_cowcnt as the page is exclusively locked.
	 */
	if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
	    (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
		page_unlock(pp);
		return (0);
	}

	/*
	 * Now acquire the i/o lock so we can add it to the dirty
	 * list (if necessary).  We avoid blocking on the i/o lock
	 * in the following cases:
	 *
	 *	If B_DELWRI is set, which implies that this request is
	 *	due to a klustering operation.
	 *
	 *	If this is an async (B_ASYNC) operation and we are not doing
	 *	invalidation (B_INVAL) [The current i/o or fsflush will ensure
	 *	that the page is written out].
	 */
	if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
		if (!page_io_trylock(pp)) {
			page_unlock(pp);
			return (0);
		}
	} else {
		page_io_lock(pp);
	}

	/*
	 * If we want to free or invalidate the page then
	 * we need to unload it so that anyone who wants
	 * it will have to take a minor fault to get it.
	 * If we are only invalidating the page for the
	 * current process, then pass in a different flag.
	 * Otherwise, we're just writing the page back so we
	 * need to sync up the hardware and software mod bit to
	 * detect any future modifications.  We clear the
	 * software mod bit when we put the page on the dirty
	 * list.
	 */
	if (flags & B_INVALCURONLY) {
		(void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
	} else if (flags & (B_INVAL | B_FREE)) {
		(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
	} else {
		(void) hat_pagesync(pp, HAT_SYNC_ZERORM);
	}

	if (!hat_ismod(pp) || (flags & B_TRUNC)) {
		/*
		 * Don't need to add it to the
		 * list after all.
		 */
		page_io_unlock(pp);
		if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		} else if (flags & B_FREE) {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
		} else {
			/*
			 * This is the advisory path for the callers
			 * of VOP_PUTPAGE() who prefer freeing the
			 * page _only_ if no one else is accessing it.
			 * E.g. segmap_release()
			 * We also take this path for B_INVALCURONLY and
			 * let page_release call VN_DISPOSE if no one else is
			 * using the page.
			 *
			 * The above hat_ismod() check is useless because:
			 * (1) we may not be holding SE_EXCL lock;
			 * (2) we've not unloaded _all_ translations
			 *
			 * Let page_release() do the heavy-lifting.
			 */
			(void) page_release(pp, 1);
		}
		return (0);
	}

	/*
	 * Page is dirty, get it ready for the write back
	 * and add page to the dirty list.
	 */
	hat_clrrefmod(pp);

	/*
	 * If we're going to free the page when we're done
	 * then we can let others try to use it starting now.
	 * We'll detect the fact that they used it when the
	 * i/o is done and avoid freeing the page.
	 */
	if (flags & (B_FREE | B_INVALCURONLY))
		page_downgrade(pp);

	TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);

	return (1);
}

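/*
 * kmem cache constructor for the marker pages used by pvn_vplist_dirty().
 * The p_hash field is tagged with PVN_VPLIST_HASH_TAG so that markers can
 * be distinguished from real pages (see pvn_vplist_setdirty()).
 */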
/*ARGSUSED*/
static int
marker_constructor(void *buf, void *cdrarg, int kmflags)
{
	page_t *mark = buf;
	bzero(mark, sizeof (page_t));
	mark->p_hash = PVN_VPLIST_HASH_TAG;
	return (0);
}

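/*
 * One-time initialization: record whether the HAT supports VMODSORT
 * (unless disabled via /etc/system) and create the kmem cache used to
 * allocate marker pages.
 */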
void
pvn_init()
{
	if (pvn_vmodsort_disable == 0)
		pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
	marker_cache = kmem_cache_create("marker_cache",
	    sizeof (page_t), 0, marker_constructor,
	    NULL, NULL, NULL, NULL, 0);
}

/*
 * Process a vnode's page list for all pages whose offset is >= off.
 * Pages are to either be free'd, invalidated, or written back to disk.
 *
 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 * is specified, otherwise they are "shared" locked.
 *
 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 *
 * Special marker page_t's are inserted in the list in order
 * to keep track of where we are in the list when locks are dropped.
 *
 * Note the list is circular and insertions can happen only at the
 * head and tail of the list. The algorithm ensures visiting all pages
 * on the list in the following way:
 *
 *    Drop two marker pages at the end of the list.
 *
 *    Move one marker page backwards towards the start of the list until
 *    it is at the list head, processing the pages passed along the way.
 *
 *    Due to race conditions when the vphm mutex is dropped, additional pages
 *    can be added to either end of the list, so we'll continue to move
 *    the marker and process pages until it is up against the end marker.
 *
 * There is one special exit condition. If we are processing a VMODSORT
 * vnode and only writing back modified pages, we can stop as soon as
 * we run into an unmodified page.  This makes fsync(3) operations fast.
 */
int
pvn_vplist_dirty(
	vnode_t		*vp,
	u_offset_t	off,
	int		(*putapage)(vnode_t *, page_t *, u_offset_t *,
			size_t *, int, cred_t *),
	int		flags,
	cred_t		*cred)
{
	page_t		*pp;
	page_t		*mark;		/* marker page that moves toward head */
	page_t		*end;		/* marker page at end of list */
	int		err = 0;
	int		error;
	kmutex_t	*vphm;
	se_t		se;
	page_t		**where_to_move;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return (0);

	/*
	 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
	 *
	 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
	 * from getting blocked while flushing pages to a dead NFS server.
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
		mutex_exit(&vp->v_lock);
		return (EAGAIN);
	}

	while (vp->v_flag & VVMLOCK)
		cv_wait(&vp->v_cv, &vp->v_lock);

	if (vp->v_pages == NULL) {
		mutex_exit(&vp->v_lock);
		return (0);
	}

	vp->v_flag |= VVMLOCK;
	mutex_exit(&vp->v_lock);

	/*
	 * Set up the marker pages used to walk the list
	 */
	end = kmem_cache_alloc(marker_cache, KM_SLEEP);
	end->p_vnode = vp;
	end->p_offset = (u_offset_t)-2;
	mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
	mark->p_vnode = vp;
	mark->p_offset = (u_offset_t)-1;

	/*
	 * Grab the lock protecting the vnode's page list;
	 * note that this lock is dropped at times in the loop.
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);
	if (vp->v_pages == NULL)
		goto leave;

	/*
	 * insert the markers and loop through the list of pages
	 */
	page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
	page_vpadd(&mark->p_vpnext, end);
	for (;;) {

		/*
		 * If only doing an async write back, then we can
		 * stop as soon as we get to the start of the list.
		 */
		if (flags == B_ASYNC && vp->v_pages == mark)
			break;

		/*
		 * otherwise stop when we've gone through all the pages
		 */
		if (mark->p_vpprev == end)
			break;

		pp = mark->p_vpprev;
		if (vp->v_pages == pp)
			where_to_move = &vp->v_pages;
		else
			where_to_move = &pp->p_vpprev->p_vpnext;

		ASSERT(pp->p_vnode == vp);

		/*
		 * If just flushing dirty pages to disk and this vnode
		 * is using a sorted list of pages, we can stop processing
		 * as soon as we find an unmodified page, since all the
		 * modified pages are visited first.
		 */
		if (IS_VMODSORT(vp) &&
		    !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
			if (!hat_ismod(pp) && !page_io_locked(pp)) {
#ifdef	DEBUG
				/*
				 * For debug kernels examine what should be
				 * all the remaining clean pages, asserting
				 * that they are not modified.
				 */
				page_t	*chk = pp;
				int	attr;

				page_vpsub(&vp->v_pages, mark);
				page_vpadd(where_to_move, mark);
				do {
					chk = chk->p_vpprev;
					ASSERT(chk != end);
					if (chk == mark)
						continue;
					attr = hat_page_getattr(chk, P_MOD |
					    P_REF);
					if ((attr & P_MOD) == 0)
						continue;
					panic("v_pages list not all clean: "
					    "page_t*=%p vnode=%p off=%lx "
					    "attr=0x%x last clean page_t*=%p\n",
					    (void *)chk, (void *)chk->p_vnode,
					    (long)chk->p_offset, attr,
					    (void *)pp);
				} while (chk != vp->v_pages);
#endif
				break;
			} else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
				/*
				 * Page is unmodified but its io lock is
				 * held, so wait until the IO is done.
				 * Block only for sync IO since we don't
				 * want to block async IO.
				 */
				mutex_exit(vphm);
				page_io_wait(pp);
				mutex_enter(vphm);
				continue;
			}
		}

		/*
		 * Skip this page if the offset is out of the desired range.
		 * Just move the marker and continue.
		 */
		if (pp->p_offset < off) {
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
			continue;
		}

		/*
		 * If we are supposed to invalidate or free this
		 * page, then we need an exclusive lock.
		 */
		se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;

		/*
		 * We must acquire the page lock for all synchronous
		 * operations (invalidate, free and write).
		 */
		if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
			/*
			 * If the page_lock() drops the mutex
			 * we must retry the loop.
			 */
			if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
				continue;

			/*
			 * It's ok to move the marker page now.
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);
		} else {

			/*
			 * update the marker page for all remaining cases
			 */
			page_vpsub(&vp->v_pages, mark);
			page_vpadd(where_to_move, mark);

			/*
			 * For write backs, if we can't lock the page, it's
			 * invalid or in the process of being destroyed.  Skip
			 * it, assuming someone else is writing it.
			 */
			if (!page_trylock(pp, se))
				continue;
		}

		ASSERT(pp->p_vnode == vp);

		/*
		 * Successfully locked the page, now figure out what to
		 * do with it. Free pages are easily dealt with, invalidate
		 * if desired or just go on to the next page.
		 */
		if (PP_ISFREE(pp)) {
			if ((flags & B_INVAL) == 0) {
				page_unlock(pp);
				continue;
			}

			/*
			 * Invalidate (destroy) the page.
			 */
			mutex_exit(vphm);
			page_destroy_free(pp);
			mutex_enter(vphm);
			continue;
		}

		/*
		 * pvn_getdirty() figures out what to do with a dirty page.
		 * If the page is dirty, the putapage() routine will write it
		 * and will kluster any other adjacent dirty pages it can.
		 *
		 * pvn_getdirty() and `(*putapage)' unlock the page.
		 */
		mutex_exit(vphm);
		if (pvn_getdirty(pp, flags)) {
			error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
			if (!err)
				err = error;
		}
		mutex_enter(vphm);
	}
	page_vpsub(&vp->v_pages, mark);
	page_vpsub(&vp->v_pages, end);

leave:
	/*
	 * Release the v_pages mutex, clear VVMLOCK and wake up blocked threads.
	 */
	mutex_exit(vphm);
	kmem_cache_free(marker_cache, mark);
	kmem_cache_free(marker_cache, end);
	mutex_enter(&vp->v_lock);
	vp->v_flag &= ~VVMLOCK;
	cv_broadcast(&vp->v_cv);
	mutex_exit(&vp->v_lock);
	return (err);
}

/*
 * Walk the vp->v_pages list; for every page call the callback function
 * pointed to by *page_check. If page_check returns non-zero, then mark the
 * page as modified and, if VMODSORT is set, move it to the end of the
 * v_pages list. Moving makes sense only if we have at least two pages -
 * this also avoids having v_pages temporarily being NULL after calling
 * page_vpsub() if there was just one page.
 */
void
pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
{
	page_t		*pp, *next, *end;
	kmutex_t	*vphm;
	int		shuffle;

	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	if (vp->v_pages == NULL) {
		mutex_exit(vphm);
		return;
	}

	end = vp->v_pages->p_vpprev;
	shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
	pp = vp->v_pages;

	for (;;) {
		next = pp->p_vpnext;
		if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
			/*
			 * hat_setmod_only() in contrast to hat_setmod() does
			 * not shuffle the pages and does not grab the mutex
			 * page_vnode_mutex. Exactly what we need.
			 */
			hat_setmod_only(pp);
			if (shuffle) {
				page_vpsub(&vp->v_pages, pp);
				ASSERT(vp->v_pages != NULL);
				page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
				    pp);
			}
		}
		/* Stop if we have just processed the last page. */
		if (pp == end)
			break;
		pp = next;
	}

	mutex_exit(vphm);
}

/*
 * Zero out zbytes worth of data. Caller should be aware that this
 * routine may enter back into the fs layer (xxx_getpage). Locks
 * that the xxx_getpage routine may need should not be held while
 * calling this.
 */
void
pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
{
	caddr_t addr;

	ASSERT(vp->v_type != VCHR);

	if (vp->v_pages == NULL)
		return;

	/*
	 * zbytes may be zero but there still may be some portion of
	 * a page which needs clearing (since zbytes is a function
	 * of filesystem block size, not pagesize.)
	 */
	if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
		return;

	/*
	 * We get the last page and handle the partial
	 * zeroing via kernel mappings.  This will make the page
	 * dirty so that we know that when this page is written
	 * back, the zeroed information will go out with it.  If
	 * the page is not currently in memory, then the kzero
	 * operation will cause it to be brought in.  We use kzero
	 * instead of bzero so that if the page cannot be read in
	 * for any reason, the system will not panic.  We need
	 * to zero out at least the fs-given zbytes, but we
	 * might also have to do more to get the entire last page.
	 */

	if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
		panic("pvn_vptrunc zbytes");
	addr = segmap_getmapflt(segkmap, vp, vplen,
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
	(void) kzero(addr + (vplen & MAXBOFFSET),
	    MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
	(void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
}

/*
 * Handles common work of the VOP_GETPAGE routines by iterating page by page
 * calling the getpage helper for each.
 */
int
pvn_getpages(
	int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
		size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	page_t *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{
	page_t **ppp;
	u_offset_t o, eoff;
	size_t sz, xlen;
	int err;

	/* ensure that we have enough space */
	ASSERT(pl == NULL || plsz >= len);

	/*
	 * Loop one page at a time and let the getapage function fill
	 * in the next page in the array.  We only allow one page to be
	 * returned at a time (except for the last page) so that we
	 * don't have any problems with duplicates and other such
	 * painful problems.  This is a very simple minded algorithm,
	 * but it does the job correctly.  We hope that the cost of a
	 * getapage call for a resident page that we might have been
	 * able to get from an earlier call doesn't cost too much.
	 */
	ppp = pl;
	sz = (pl != NULL) ? PAGESIZE : 0;
	eoff = off + len;
	xlen = len;
	for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
	    xlen -= PAGESIZE) {
		if (o + PAGESIZE >= eoff && pl != NULL) {
			/*
			 * Last time through - allow all of what's
			 * left of the pl[] array to be used.
			 */
			sz = plsz - (o - off);
		}
		err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
		    rw, cred);
		if (err) {
			/*
			 * Release any pages we already got.
			 */
			if (o > off && pl != NULL) {
				for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
					(void) page_release(*ppp, 1);
			}
			break;
		}
		if (pl != NULL)
			ppp++;
	}
	return (err);
}

/*
 * Initialize the page list array.
 */
/*ARGSUSED*/
void
pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
    u_offset_t off, size_t io_len, enum seg_rw rw)
{
	ssize_t sz;
	page_t *ppcur, **ppp;

	/*
	 * Set up to load plsz worth
	 * starting at the needed page.
	 */
	while (pp != NULL && pp->p_offset != off) {
		/*
		 * Remove page from the i/o list,
		 * release the i/o and the page lock.
		 */
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}

	if (pp == NULL) {
		pl[0] = NULL;
		return;
	}

	sz = plsz;

	/*
	 * Initialize the page list array.
	 */
	ppp = pl;
	do {
		ppcur = pp;
		*ppp++ = ppcur;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		if (rw != S_CREATE)
			page_downgrade(ppcur);
		sz -= PAGESIZE;
	} while (sz > 0 && pp != NULL);
	*ppp = NULL;		/* terminate list */

	/*
	 * Now free the remaining pages that weren't
	 * loaded in the page list.
	 */
	while (pp != NULL) {
		ppcur = pp;
		page_sub(&pp, ppcur);
		page_io_unlock(ppcur);
		(void) page_release(ppcur, 1);
	}
}