--- old/usr/src/uts/common/vm/vm_pvn.c
+++ new/usr/src/uts/common/vm/vm_pvn.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 + * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 28 /* All Rights Reserved */
29 29
30 30 /*
31 31 * University Copyright- Copyright (c) 1982, 1986, 1988
32 32 * The Regents of the University of California
33 33 * All Rights Reserved
34 34 *
35 35 * University Acknowledgment- Portions of this document are derived from
36 36 * software developed by the University of California, Berkeley, and its
37 37 * contributors.
38 38 */
39 39
40 40 /*
41 41 * VM - paged vnode.
42 42 *
43 43 * This file supplies vm support for the vnode operations that deal with pages.
44 44 */
45 45 #include <sys/types.h>
46 46 #include <sys/t_lock.h>
47 47 #include <sys/param.h>
48 48 #include <sys/sysmacros.h>
49 49 #include <sys/systm.h>
50 50 #include <sys/time.h>
51 51 #include <sys/buf.h>
52 52 #include <sys/vnode.h>
53 53 #include <sys/uio.h>
54 54 #include <sys/vmsystm.h>
55 55 #include <sys/mman.h>
56 56 #include <sys/vfs.h>
57 57 #include <sys/cred.h>
58 58 #include <sys/user.h>
59 59 #include <sys/kmem.h>
60 60 #include <sys/cmn_err.h>
61 61 #include <sys/debug.h>
62 62 #include <sys/cpuvar.h>
63 63 #include <sys/vtrace.h>
64 64 #include <sys/tnf_probe.h>
65 65
66 66 #include <vm/hat.h>
67 67 #include <vm/as.h>
68 68 #include <vm/seg.h>
69 69 #include <vm/rm.h>
70 70 #include <vm/pvn.h>
71 71 #include <vm/page.h>
72 72 #include <vm/seg_map.h>
73 73 #include <vm/seg_kmem.h>
74 74 #include <sys/fs/swapnode.h>
75 75
76 76 int pvn_nofodklust = 0;
77 77 int pvn_write_noklust = 0;
78 78
79 79 uint_t pvn_vmodsort_supported = 0; /* set if HAT supports VMODSORT */
80 80 uint_t pvn_vmodsort_disable = 0; /* set in /etc/system to disable HAT */
81 81 /* support for vmodsort for testing */
82 82
83 83 static struct kmem_cache *marker_cache = NULL;
84 84
85 85 /*
86 86 * Find the largest contiguous range of pages containing `addr' at
87 87 * file offset `offset' that lies within the file system block limits
88 88 * (`vp_off' and `vp_len') and the address space limits, for which no
89 89 * pages currently exist and which maps to consecutive file offsets.
90 90 */
91 91 page_t *
92 92 pvn_read_kluster(
93 93 struct vnode *vp,
94 94 u_offset_t off,
95 95 struct seg *seg,
96 96 caddr_t addr,
97 97 u_offset_t *offp, /* return values */
98 98 size_t *lenp, /* return values */
99 99 u_offset_t vp_off,
100 100 size_t vp_len,
101 101 int isra)
102 102 {
103 103 ssize_t deltaf, deltab;
104 104 page_t *pp;
105 105 page_t *plist = NULL;
106 106 spgcnt_t pagesavail;
107 107 u_offset_t vp_end;
108 108
109 109 ASSERT(off >= vp_off && off < vp_off + vp_len);
110 110
111 111 /*
112 112 * We only want to do klustering/read ahead if there
113 113 * are more than minfree pages currently available.
114 114 */
115 115 pagesavail = freemem - minfree;
116 116
117 117 if (pagesavail <= 0)
118 118 if (isra)
119 119 return ((page_t *)NULL); /* ra case - give up */
120 120 else
121 121 pagesavail = 1; /* must return a page */
122 122
123 123 /* We calculate in pages instead of bytes due to 32-bit overflows */
124 124 if (pagesavail < (spgcnt_t)btopr(vp_len)) {
125 125 /*
126 126 * Don't have enough free memory for the
127 127 * max request, try sizing down vp request.
128 128 */
129 129 deltab = (ssize_t)(off - vp_off);
130 130 vp_len -= deltab;
131 131 vp_off += deltab;
132 132 if (pagesavail < btopr(vp_len)) {
133 133 /*
134 134 * Still not enough memory, just settle for
135 135 * pagesavail which is at least 1.
136 136 */
137 137 vp_len = ptob(pagesavail);
138 138 }
139 139 }
140 140
141 141 vp_end = vp_off + vp_len;
142 142 ASSERT(off >= vp_off && off < vp_end);
143 143
144 144 if (isra && SEGOP_KLUSTER(seg, addr, 0))
145 145 return ((page_t *)NULL); /* segment driver says no */
146 146
147 147 if ((plist = page_create_va(vp, off,
148 148 PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
149 149 return ((page_t *)NULL);
150 150
151 151 if (vp_len <= PAGESIZE || pvn_nofodklust) {
152 152 *offp = off;
153 153 *lenp = MIN(vp_len, PAGESIZE);
154 154 } else {
155 155 /*
156 156 * Scan back from front by incrementing "deltab" and
157 157 * comparing "off" with "vp_off + deltab" to avoid
158 158 * "signed" versus "unsigned" conversion problems.
159 159 */
160 160 for (deltab = PAGESIZE; off >= vp_off + deltab;
161 161 deltab += PAGESIZE) {
162 162 /*
163 163 * Call back to the segment driver to verify that
164 164 * the klustering/read ahead operation makes sense.
165 165 */
166 166 if (SEGOP_KLUSTER(seg, addr, -deltab))
167 167 break; /* page not eligible */
168 168 if ((pp = page_create_va(vp, off - deltab,
169 169 PAGESIZE, PG_EXCL, seg, addr - deltab))
170 170 == NULL)
171 171 break; /* already have the page */
172 172 /*
173 173 * Add page to front of page list.
174 174 */
175 175 page_add(&plist, pp);
176 176 }
177 177 deltab -= PAGESIZE;
178 178
179 179 /* scan forward from front */
180 180 for (deltaf = PAGESIZE; off + deltaf < vp_end;
181 181 deltaf += PAGESIZE) {
182 182 /*
183 183 * Call back to the segment driver to verify that
184 184 * the klustering/read ahead operation makes sense.
185 185 */
186 186 if (SEGOP_KLUSTER(seg, addr, deltaf))
187 187 break; /* page not file extension */
188 188 if ((pp = page_create_va(vp, off + deltaf,
189 189 PAGESIZE, PG_EXCL, seg, addr + deltaf))
190 190 == NULL)
191 191 break; /* already have page */
192 192
193 193 /*
194 194 * Add page to end of page list.
195 195 */
196 196 page_add(&plist, pp);
197 197 plist = plist->p_next;
198 198 }
199 199 *offp = off = off - deltab;
200 200 *lenp = deltab + deltaf;
201 201 ASSERT(off >= vp_off);
202 202
203 203 /*
204 204 * If we ended up getting more than was actually
205 205 * requested, retract the returned length to only
206 206 * reflect what was requested. This might happen
207 207 * if we were allowed to kluster pages across a
208 208 * span of (say) 5 frags, and frag size is less
209 209 * than PAGESIZE. We need a whole number of
210 210 * pages to contain those frags, but the returned
211 211 * size should only allow the returned range to
212 212 * extend as far as the end of the frags.
213 213 */
214 214 if ((vp_off + vp_len) < (off + *lenp)) {
215 215 ASSERT(vp_end > off);
216 216 *lenp = vp_end - off;
217 217 }
218 218 }
219 219 TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
220 220 "pvn_read_kluster:seg %p addr %x isra %x",
221 221 seg, addr, isra);
222 222 return (plist);
223 223 }
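
To make the calling convention concrete, here is a minimal sketch of a getapage-style helper built around pvn_read_kluster(). Everything prefixed my_ is hypothetical, the kluster window is an assumption, and error paths are trimmed; pageio_setup(), bdev_strategy(), biowait() and pageio_done() appear in their usual DDI roles, and device/block mapping is elided.

/*
 * Sketch only (hypothetical my_getapage): read the page at `off',
 * klustering neighbors within an assumed window, then hand the pages
 * to the caller via pvn_plist_init().
 */
static int
my_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	const u_offset_t klustsz = 8 * PAGESIZE;	/* assumed window */
	u_offset_t io_off;
	size_t io_len;
	page_t *pp;
	struct buf *bp;
	int err;

	pp = pvn_read_kluster(vp, off, seg, addr, &io_off, &io_len,
	    off & ~(klustsz - 1), klustsz, 0 /* not read-ahead */);
	if (pp == NULL)
		return (0);	/* already cached; a real fs would */
				/* page_lookup() it here (elided) */

	bp = pageio_setup(pp, io_len, vp, B_READ);
	/* fs-specific (elided): set bp->b_edev/bp->b_blkno via a bmap */
	(void) bdev_strategy(bp);
	err = biowait(bp);
	pageio_done(bp);

	if (err != 0) {
		pvn_read_done(pp, B_ERROR);	/* destroys the pages */
		return (err);
	}
	if (pl != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	else
		pvn_read_done(pp, 0);		/* read-ahead: release */
	return (0);
}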
224 224
225 225 /*
226 226 * Handle pages for this vnode on either side of the page "pp"
227 227 * which has been locked by the caller. This routine will also
228 228 * do klustering in the range [vp_off, vp_off + vp_len] up
229 229 * which has been locked by the caller. This routine will also
230 230 * do klustering in the range [vp_off, vp_off + vp_len], stopping

231 231 *
232 232 * Returns a list of dirty locked pages all ready to be
233 233 * written back.
234 234 */
235 235 page_t *
236 236 pvn_write_kluster(
237 237 struct vnode *vp,
238 238 page_t *pp,
239 239 u_offset_t *offp, /* return values */
240 240 size_t *lenp, /* return values */
241 241 u_offset_t vp_off,
242 242 size_t vp_len,
243 243 int flags)
244 244 {
245 245 u_offset_t off;
246 246 page_t *dirty;
247 247 size_t deltab, deltaf;
248 248 se_t se;
249 249 u_offset_t vp_end;
250 250
251 251 off = pp->p_offset;
252 252
253 253 /*
254 254 * Klustering should not be done if we are invalidating
255 255 * pages since we could destroy pages that belong to
256 256 * some other process if this is a swap vnode.
257 257 */
258 258 if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
259 259 *offp = off;
260 260 *lenp = PAGESIZE;
261 261 return (pp);
262 262 }
263 263
264 264 if (flags & (B_FREE | B_INVAL))
265 265 se = SE_EXCL;
266 266 else
267 267 se = SE_SHARED;
268 268
269 269 dirty = pp;
270 270 /*
271 271 * Scan backwards looking for pages to kluster by incrementing
272 272 * "deltab" and comparing "off" with "vp_off + deltab" to
273 273 * avoid "signed" versus "unsigned" conversion problems.
274 274 */
275 275 for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
276 276 pp = page_lookup_nowait(vp, off - deltab, se);
277 277 if (pp == NULL)
278 278 break; /* page not found */
279 279 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
280 280 break;
281 281 page_add(&dirty, pp);
282 282 }
283 283 deltab -= PAGESIZE;
284 284
285 285 vp_end = vp_off + vp_len;
286 286 /* now scan forwards looking for pages to kluster */
287 287 for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
288 288 pp = page_lookup_nowait(vp, off + deltaf, se);
289 289 if (pp == NULL)
290 290 break; /* page not found */
291 291 if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
292 292 break;
293 293 page_add(&dirty, pp);
294 294 dirty = dirty->p_next;
295 295 }
296 296
297 297 *offp = off - deltab;
298 298 *lenp = deltab + deltaf;
299 299 return (dirty);
300 300 }
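
The write-side counterpart: a sketch of the putapage shape that pairs pvn_write_kluster() with pvn_write_done(). Names prefixed my_ are hypothetical; the page is assumed to have come from pvn_getdirty() (dirty and io-locked), and fs block mapping is again elided.

/*
 * Sketch only: write back a dirty, io-locked page together with any
 * klusterable neighbors.  Synchronous completion shown; an async
 * caller would finish in a b_iodone routine instead (see the sketch
 * after pvn_write_done()).
 */
static int
my_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	const u_offset_t klustsz = 8 * PAGESIZE;	/* assumed window */
	u_offset_t io_off;
	size_t io_len;
	page_t *dirty;
	struct buf *bp;
	int err = 0;

	dirty = pvn_write_kluster(vp, pp, &io_off, &io_len,
	    pp->p_offset & ~(klustsz - 1), klustsz, flags);

	bp = pageio_setup(dirty, io_len, vp, B_WRITE | flags);
	/* fs-specific (elided): set bp->b_edev/bp->b_blkno via a bmap; */
	/* an async caller would also point bp->b_iodone at a callback */
	(void) bdev_strategy(bp);

	if ((flags & B_ASYNC) == 0) {
		err = biowait(bp);
		pageio_done(bp);
		pvn_write_done(dirty, (err ? B_ERROR : 0) | B_WRITE | flags);
	}
	if (offp != NULL)
		*offp = io_off;
	if (lenp != NULL)
		*lenp = io_len;
	return (err);
}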
301 301
302 302 /*
303 303 * Generic entry point used to release the "shared/exclusive" lock
304 304 * and the "p_iolock" on pages after i/o is complete.
305 305 */
306 306 void
307 307 pvn_io_done(page_t *plist)
308 308 {
309 309 page_t *pp;
310 310
311 311 while (plist != NULL) {
312 312 pp = plist;
313 313 page_sub(&plist, pp);
314 314 page_io_unlock(pp);
315 315 page_unlock(pp);
316 316 }
317 317 }
318 318
319 319 /*
320 320 * Entry point to be used by file system getpage subr's and
321 321 * other such routines which either want to unlock pages (B_ASYNC
322 322 * request) or destroy a list of pages if an error occurred.
323 323 */
324 324 void
325 325 pvn_read_done(page_t *plist, int flags)
326 326 {
327 327 page_t *pp;
328 328
329 329 while (plist != NULL) {
330 330 pp = plist;
331 331 page_sub(&plist, pp);
332 332 page_io_unlock(pp);
333 333 if (flags & B_ERROR) {
334 334 /*LINTED: constant in conditional context*/
335 335 VN_DISPOSE(pp, B_INVAL, 0, kcred);
336 336 } else {
337 337 (void) page_release(pp, 0);
338 338 }
339 339 }
340 340 }
341 341
342 342 /*
343 343 * Automagic pageout.
344 344 * When memory gets tight, start freeing pages popping out of the
345 345 * write queue.
346 346 */
347 347 int write_free = 1;
348 348 pgcnt_t pages_before_pager = 200; /* LMXXX */
349 349
350 350 /*
351 351 * Routine to be called when page-out's complete.
352 352 * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
353 353 * after waiting for i/o to complete (biowait) to free the list of
354 354 * pages associated with the buffer. These pages must be locked
355 355 * before i/o is initiated.
356 356 *
357 357 * If a write error occurs, the pages are marked as modified
358 358 * so the write will be re-tried later.
359 359 */
360 360
361 361 void
362 362 pvn_write_done(page_t *plist, int flags)
363 363 {
364 364 int dfree = 0;
365 365 int pgrec = 0;
366 366 int pgout = 0;
367 367 int pgpgout = 0;
368 368 int anonpgout = 0;
369 369 int anonfree = 0;
370 370 int fspgout = 0;
371 371 int fsfree = 0;
372 372 int execpgout = 0;
373 373 int execfree = 0;
374 374 page_t *pp;
375 375 struct cpu *cpup;
376 376 struct vnode *vp = NULL; /* for probe */
377 377 uint_t ppattr;
378 378 kmutex_t *vphm = NULL;
379 379
380 380 ASSERT((flags & B_READ) == 0);
381 381
382 382 /*
383 383 * If we are about to start paging anyway, start freeing pages.
384 384 */
385 385 if (write_free && freemem < lotsfree + pages_before_pager &&
386 386 (flags & B_ERROR) == 0) {
387 387 flags |= B_FREE;
388 388 }
389 389
390 390 /*
391 391 * Handle each page involved in the i/o operation.
392 392 */
393 393 while (plist != NULL) {
394 394 pp = plist;
395 395 ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
396 396 page_sub(&plist, pp);
397 397
398 398 /* Kernel probe support */
399 399 if (vp == NULL)
400 400 vp = pp->p_vnode;
401 401
402 402 if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
403 403 /*
404 404 * Move page to the top of the v_page list.
405 405 * Skip pages modified during IO.
406 406 */
407 407 vphm = page_vnode_mutex(vp);
408 408 mutex_enter(vphm);
409 409 if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
410 410 page_vpsub(&vp->v_pages, pp);
411 411 page_vpadd(&vp->v_pages, pp);
412 412 }
413 413 mutex_exit(vphm);
414 414 }
415 415
416 416 if (flags & B_ERROR) {
417 417 /*
418 418 * Write operation failed. We don't want
419 419 * to destroy (or free) the page unless B_FORCE
420 420 * is set. We set the mod bit again and release
421 421 * all locks on the page so that it will get written
422 422 * back again later when things are hopefully
423 423 * better again.
424 424 * If B_INVAL and B_FORCE are set we really have
425 425 * to destroy the page.
426 426 */
427 427 if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
428 428 page_io_unlock(pp);
429 429 /*LINTED: constant in conditional context*/
430 430 VN_DISPOSE(pp, B_INVAL, 0, kcred);
431 431 } else {
432 432 hat_setmod_only(pp);
433 433 page_io_unlock(pp);
434 434 page_unlock(pp);
435 435 }
436 436 } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
437 437 /*
438 438 * If B_INVALCURONLY is set, then we handle that case
439 439 * in the next conditional if hat_page_is_mapped()
440 440 * indicates that there are no additional mappings
441 441 * to the page.
442 442 */
443 443
444 444 /*
445 445 * XXX - Failed writes with B_INVAL set are
446 446 * not handled appropriately.
447 447 */
448 448 page_io_unlock(pp);
449 449 /*LINTED: constant in conditional context*/
450 450 VN_DISPOSE(pp, B_INVAL, 0, kcred);
451 451 } else if (flags & B_FREE || !hat_page_is_mapped(pp)) {
452 452 /*
453 453 * Update statistics for pages being paged out
454 454 */
455 455 if (pp->p_vnode) {
456 456 if (IS_SWAPFSVP(pp->p_vnode)) {
457 457 anonpgout++;
458 458 } else {
459 459 if (pp->p_vnode->v_flag & VVMEXEC) {
460 460 execpgout++;
461 461 } else {
462 462 fspgout++;
463 463 }
464 464 }
465 465 }
466 466 page_io_unlock(pp);
467 467 pgout = 1;
468 468 pgpgout++;
469 469 TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
470 470 "page_ws_out:pp %p", pp);
471 471
472 472 /*
473 473 * The page_struct_lock need not be acquired to
474 474 * examine "p_lckcnt" and "p_cowcnt" since we'll
475 475 * have an "exclusive" lock if the upgrade succeeds.
476 476 */
477 477 if (page_tryupgrade(pp) &&
478 478 pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
479 479 /*
480 480 * Check if someone has reclaimed the
481 481 * page. If ref and mod are not set, no
482 482 * one is using it so we can free it.
483 483 * The rest of the system is careful
484 484 * to use the NOSYNC flag to unload
485 485 * translations set up for i/o w/o
486 486 * affecting ref and mod bits.
487 487 *
488 488 * Obtain a copy of the real hardware
489 489 * mod bit using hat_pagesync(pp, HAT_SYNC_DONTZERO)
490 490 * to avoid having to flush the cache.
491 491 */
492 492 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
493 493 HAT_SYNC_STOPON_MOD);
494 494 ck_refmod:
495 495 if (!(ppattr & (P_REF | P_MOD))) {
496 496 if (hat_page_is_mapped(pp)) {
497 497 /*
498 498 * Doesn't look like the page
499 499 * was modified so now we
500 500 * really have to unload the
501 501 * translations. Meanwhile
502 502 * another CPU could've
503 503 * modified it so we have to
504 504 * check again. We don't loop
505 505 * forever here because now
506 506 * the translations are gone
507 507 * and no one can get a new one
508 508 * since we have the "exclusive"
509 509 * lock on the page.
510 510 */
511 511 (void) hat_pageunload(pp,
512 512 HAT_FORCE_PGUNLOAD);
513 513 ppattr = hat_page_getattr(pp,
514 514 P_REF | P_MOD);
515 515 goto ck_refmod;
516 516 }
517 517 /*
518 518 * Update statistics for pages being
519 519 * freed
520 520 */
521 521 if (pp->p_vnode) {
522 522 if (IS_SWAPFSVP(pp->p_vnode)) {
523 523 anonfree++;
524 524 } else {
525 525 if (pp->p_vnode->v_flag
526 526 & VVMEXEC) {
527 527 execfree++;
528 528 } else {
529 529 fsfree++;
530 530 }
531 531 }
532 532 }
533 533 /*LINTED: constant in conditional ctx*/
534 534 VN_DISPOSE(pp, B_FREE,
535 535 (flags & B_DONTNEED), kcred);
536 536 dfree++;
537 537 } else {
538 538 page_unlock(pp);
539 539 pgrec++;
540 540 TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
541 541 "page_ws_free:pp %p", pp);
542 542 }
543 543 } else {
544 544 /*
545 545 * Page is either `locked' in memory
546 546 * or was reclaimed and now has a
547 547 * "shared" lock, so release it.
548 548 */
549 549 page_unlock(pp);
550 550 }
551 551 } else {
552 552 /*
553 553 * Neither B_FREE nor B_INVAL nor B_ERROR.
554 554 * Just release locks.
555 555 */
556 556 page_io_unlock(pp);
557 557 page_unlock(pp);
558 558 }
559 559 }
560 560
561 561 CPU_STATS_ENTER_K();
562 562 cpup = CPU; /* get cpup now that CPU cannot change */
563 563 CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
564 564 CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
565 565 CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
566 566 CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
567 567 CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
568 568 CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
569 569 CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
570 570 CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
571 571 CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
572 572 CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
573 573 CPU_STATS_EXIT_K();
574 574
575 575 /* Kernel probe */
576 576 TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
577 577 tnf_opaque, vnode, vp,
578 578 tnf_ulong, pages_pageout, pgpgout,
579 579 tnf_ulong, pages_freed, dfree,
580 580 tnf_ulong, pages_reclaimed, pgrec);
581 581 }
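
For B_ASYNC writes the i/o completes in interrupt or taskq context, so the pvn_write_done() call typically lives in a b_iodone callback along these lines. This is a sketch only: my_iodone is hypothetical, and the flags are assumed to have been stashed somewhere the callback can recover them (hardcoded here).

/*
 * Sketch only: release the page list once an async write lands.
 * bp->b_pages still carries the list handed to pageio_setup().
 */
static int
my_iodone(struct buf *bp)
{
	int flags = B_WRITE | B_ASYNC;	/* assumed saved at issue time */

	if (bp->b_flags & B_ERROR)
		flags |= B_ERROR;	/* pages get re-dirtied, not freed */
	pvn_write_done(bp->b_pages, flags);
	pageio_done(bp);
	return (0);
}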
582 582
583 583 /*
584 584 * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
585 585 * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
586 586 * B_DELWRI indicates that this page is part of a kluster
587 587 * operation and is only to be considered if it doesn't involve any
588 588 * waiting here. B_TRUNC indicates that the file is being truncated
589 589 * and so no i/o needs to be done. B_FORCE indicates that the page
590 590 * must be destroyed so don't try writing it out.
591 591 *
592 592 * The caller must ensure that the page is locked. Returns 1 if
593 593 * the page should be written back (the "iolock" is held in this
594 594 * case), or 0 if the page has been dealt with or has been
595 595 * unlocked.
596 596 */
597 597 int
598 598 pvn_getdirty(page_t *pp, int flags)
599 599 {
600 600 ASSERT((flags & (B_INVAL | B_FREE)) ?
601 601 PAGE_EXCL(pp) : PAGE_SHARED(pp));
602 602 ASSERT(PP_ISFREE(pp) == 0);
603 603
604 604 /*
605 605 * If trying to invalidate or free a logically `locked' page,
606 606 * forget it. Don't need page_struct_lock to check p_lckcnt and
607 607 * p_cowcnt as the page is exclusively locked.
608 608 */
609 609 if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
610 610 (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
611 611 page_unlock(pp);
612 612 return (0);
613 613 }
614 614
615 615 /*
616 616 * Now acquire the i/o lock so we can add it to the dirty
617 617 * list (if necessary). We avoid blocking on the i/o lock
618 618 * in the following cases:
619 619 *
620 620 * If B_DELWRI is set, which implies that this request is
621 621 * due to a klustering operation.
622 622 *
623 623 * If this is an async (B_ASYNC) operation and we are not doing
624 624 * invalidation (B_INVAL) [The current i/o or fsflush will ensure
625 625 * that the page is written out].
626 626 */
627 627 if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
628 628 if (!page_io_trylock(pp)) {
629 629 page_unlock(pp);
630 630 return (0);
631 631 }
632 632 } else {
633 633 page_io_lock(pp);
634 634 }
635 635
636 636 /*
637 637 * If we want to free or invalidate the page then
638 638 * we need to unload it so that anyone who wants
639 639 * it will have to take a minor fault to get it.
640 640 * If we are only invalidating the page for the
641 641 * current process, then pass in a different flag.
642 642 * Otherwise, we're just writing the page back so we
643 643 * need to sync up the hardware and software mod bit to
644 644 * detect any future modifications. We clear the
645 645 * software mod bit when we put the page on the dirty
646 646 * list.
647 647 */
648 648 if (flags & B_INVALCURONLY) {
649 649 (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
650 650 } else if (flags & (B_INVAL | B_FREE)) {
651 651 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
652 652 } else {
653 653 (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
654 654 }
655 655
656 656 if (!hat_ismod(pp) || (flags & B_TRUNC)) {
657 657 /*
658 658 * Don't need to add it to the
659 659 * list after all.
660 660 */
661 661 page_io_unlock(pp);
662 662 if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
663 663 /*LINTED: constant in conditional context*/
664 664 VN_DISPOSE(pp, B_INVAL, 0, kcred);
665 665 } else if (flags & B_FREE) {
666 666 /*LINTED: constant in conditional context*/
667 667 VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
668 668 } else {
669 669 /*
670 670 * This is the advisory path for the callers
671 671 * of VOP_PUTPAGE() who prefer freeing the
672 672 * page _only_ if no one else is accessing it.
673 673 * E.g. segmap_release()
674 674 * We also take this path for B_INVALCURONLY and
675 675 * let page_release call VN_DISPOSE if no one else is
676 676 * using the page.
677 677 *
678 678 * The above hat_ismod() check is useless because:
679 679 * (1) we may not be holding SE_EXCL lock;
680 680 * (2) we've not unloaded _all_ translations
681 681 *
682 682 * Let page_release() do the heavy-lifting.
683 683 */
684 684 (void) page_release(pp, 1);
685 685 }
686 686 return (0);
687 687 }
688 688
689 689 /*
690 690 * Page is dirty, get it ready for the write back
691 691 * and add page to the dirty list.
692 692 */
693 693 hat_clrrefmod(pp);
694 694
695 695 /*
696 696 * If we're going to free the page when we're done
697 697 * then we can let others try to use it starting now.
698 698 * We'll detect the fact that they used it when the
699 699 * i/o is done and avoid freeing the page.
700 700 */
701 701 if (flags & (B_FREE | B_INVALCURONLY))
702 702 page_downgrade(pp);
703 703
704 704
705 705 TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
706 706
707 707 return (1);
708 708 }
709 709
710 710
711 711 /*ARGSUSED*/
712 712 static int
713 713 marker_constructor(void *buf, void *cdrarg, int kmflags)
714 714 {
715 715 page_t *mark = buf;
716 716 bzero(mark, sizeof (page_t));
717 717 mark->p_hash = PVN_VPLIST_HASH_TAG;
718 718 return (0);
719 719 }
720 720
721 721 void
722 722 pvn_init()
723 723 {
724 724 if (pvn_vmodsort_disable == 0)
725 725 pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
726 726 marker_cache = kmem_cache_create("marker_cache",
727 727 sizeof (page_t), 0, marker_constructor,
728 728 NULL, NULL, NULL, NULL, 0);
729 729 }
730 730
731 731
732 732 /*
733 733 * Process a vnode's page list for all pages whose offset is >= off.
734 734 * Pages are to either be free'd, invalidated, or written back to disk.
735 735 *
736 736 * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
737 737 * is specified, otherwise they are "shared" locked.
738 738 *
739 739 * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
740 740 *
741 741 * Special marker page_t's are inserted in the list in order
742 742 * to keep track of where we are in the list when locks are dropped.
743 743 *
744 744 * Note the list is circular and insertions can happen only at the
745 745 * head and tail of the list. The algorithm ensures visiting all pages
746 746 * on the list in the following way:
747 747 *
748 748 * Drop two marker pages at the end of the list.
749 749 *
750 750 * Move one marker page backwards towards the start of the list until
751 751 * it is at the list head, processing the pages passed along the way.
752 752 *
753 753 * Due to race conditions when the vphm mutex is dropped, additional pages
754 754 * can be added to either end of the list, so we'll continue to move
755 755 * the marker and process pages until it is up against the end marker.
756 756 *
757 757 * There is one special exit condition. If we are processing a VMODSORT
758 758 * vnode and only writing back modified pages, we can stop as soon as
759 759 * we run into an unmodified page. This makes fsync(3) operations fast.
760 760 */
761 761 int
762 762 pvn_vplist_dirty(
763 763 vnode_t *vp,
764 764 u_offset_t off,
765 765 int (*putapage)(vnode_t *, page_t *, u_offset_t *,
766 766 size_t *, int, cred_t *),
767 767 int flags,
768 768 cred_t *cred)
769 769 {
770 770 page_t *pp;
771 771 page_t *mark; /* marker page that moves toward head */
772 772 page_t *end; /* marker page at end of list */
773 773 int err = 0;
774 774 int error;
775 775 kmutex_t *vphm;
776 776 se_t se;
777 777 page_t **where_to_move;
778 778
779 779 ASSERT(vp->v_type != VCHR);
780 780
781 781 if (vp->v_pages == NULL)
782 782 return (0);
783 783
784 784
785 785 /*
786 786 * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
787 787 *
788 788 * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
789 789 * from getting blocked while flushing pages to a dead NFS server.
790 790 */
791 791 mutex_enter(&vp->v_lock);
792 792 if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
793 793 mutex_exit(&vp->v_lock);
794 794 return (EAGAIN);
795 795 }
796 796
797 797 while (vp->v_flag & VVMLOCK)
798 798 cv_wait(&vp->v_cv, &vp->v_lock);
799 799
800 800 if (vp->v_pages == NULL) {
801 801 mutex_exit(&vp->v_lock);
802 802 return (0);
803 803 }
804 804
805 805 vp->v_flag |= VVMLOCK;
806 806 mutex_exit(&vp->v_lock);
807 807
808 808
809 809 /*
810 810 * Set up the marker pages used to walk the list
811 811 */
812 812 end = kmem_cache_alloc(marker_cache, KM_SLEEP);
813 813 end->p_vnode = vp;
814 814 end->p_offset = (u_offset_t)-2;
815 815 mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
816 816 mark->p_vnode = vp;
817 817 mark->p_offset = (u_offset_t)-1;
818 818
819 819 /*
820 820 * Grab the lock protecting the vnode's page list
821 821 * note that this lock is dropped at times in the loop.
822 822 */
823 823 vphm = page_vnode_mutex(vp);
824 824 mutex_enter(vphm);
825 825 if (vp->v_pages == NULL)
826 826 goto leave;
827 827
828 828 /*
829 829 * insert the markers and loop through the list of pages
830 830 */
831 831 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
832 832 page_vpadd(&mark->p_vpnext, end);
833 833 for (;;) {
834 834
835 835 /*
836 836 * If only doing an async write back, then we can
837 837 * stop as soon as we get to start of the list.
838 838 */
839 839 if (flags == B_ASYNC && vp->v_pages == mark)
840 840 break;
841 841
842 842 /*
843 843 * otherwise stop when we've gone through all the pages
844 844 */
845 845 if (mark->p_vpprev == end)
846 846 break;
847 847
848 848 pp = mark->p_vpprev;
849 849 if (vp->v_pages == pp)
850 850 where_to_move = &vp->v_pages;
851 851 else
852 852 where_to_move = &pp->p_vpprev->p_vpnext;
853 853
854 854 ASSERT(pp->p_vnode == vp);
855 855
856 856 /*
857 857 * If just flushing dirty pages to disk and this vnode
858 858 * is using a sorted list of pages, we can stop processing
859 859 * as soon as we find an unmodified page, since all the
860 860 * modified pages are visited first.
861 861 */
862 862 if (IS_VMODSORT(vp) &&
863 863 !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
864 864 if (!hat_ismod(pp) && !page_io_locked(pp)) {
865 865 #ifdef DEBUG
866 866 /*
867 867 * For debug kernels examine what should be
868 868 * all the remaining clean pages, asserting
869 869 * that they are not modified.
870 870 */
871 871 page_t *chk = pp;
872 872 int attr;
873 873
874 874 page_vpsub(&vp->v_pages, mark);
875 875 page_vpadd(where_to_move, mark);
876 876 do {
877 877 chk = chk->p_vpprev;
878 878 ASSERT(chk != end);
879 879 if (chk == mark)
880 880 continue;
881 881 attr = hat_page_getattr(chk, P_MOD |
882 882 P_REF);
883 883 if ((attr & P_MOD) == 0)
884 884 continue;
885 885 panic("v_pages list not all clean: "
886 886 "page_t*=%p vnode=%p off=%lx "
887 887 "attr=0x%x last clean page_t*=%p\n",
888 888 (void *)chk, (void *)chk->p_vnode,
889 889 (long)chk->p_offset, attr,
890 890 (void *)pp);
891 891 } while (chk != vp->v_pages);
892 892 #endif
893 893 break;
894 894 } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
895 895 /*
896 896 * Couldn't get io lock, wait until IO is done.
897 897 * Block only for sync IO since we don't want
898 898 * to block async IO.
899 899 */
900 900 mutex_exit(vphm);
901 901 page_io_wait(pp);
902 902 mutex_enter(vphm);
903 903 continue;
904 904 }
905 905 }
906 906
907 907 /*
908 908 * Skip this page if the offset is out of the desired range.
909 909 * Just move the marker and continue.
910 910 */
911 911 if (pp->p_offset < off) {
912 912 page_vpsub(&vp->v_pages, mark);
913 913 page_vpadd(where_to_move, mark);
914 914 continue;
915 915 }
916 916
917 917 /*
918 918 * If we are supposed to invalidate or free this
919 919 * page, then we need an exclusive lock.
920 920 */
921 921 se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
922 922
923 923 /*
924 924 * We must acquire the page lock for all synchronous
925 925 * operations (invalidate, free and write).
926 926 */
927 927 if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
928 928 /*
929 929 * If the page_lock() drops the mutex
930 930 * we must retry the loop.
931 931 */
932 932 if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
933 933 continue;
934 934
935 935 /*
936 936 * It's ok to move the marker page now.
937 937 */
938 938 page_vpsub(&vp->v_pages, mark);
939 939 page_vpadd(where_to_move, mark);
940 940 } else {
941 941
942 942 /*
943 943 * update the marker page for all remaining cases
944 944 */
945 945 page_vpsub(&vp->v_pages, mark);
946 946 page_vpadd(where_to_move, mark);
947 947
948 948 /*
949 949 * For write backs, if we can't lock the page, it's
950 950 * invalid or in the process of being destroyed. Skip
951 951 * it, assuming someone else is writing it.
952 952 */
953 953 if (!page_trylock(pp, se))
954 954 continue;
955 955 }
956 956
957 957 ASSERT(pp->p_vnode == vp);
958 958
959 959 /*
960 960 * Successfully locked the page, now figure out what to
961 961 * do with it. Free pages are easily dealt with: invalidate
962 962 * if desired, or just go on to the next page.
963 963 */
964 964 if (PP_ISFREE(pp)) {
965 965 if ((flags & B_INVAL) == 0) {
966 966 page_unlock(pp);
967 967 continue;
968 968 }
969 969
970 970 /*
971 971 * Invalidate (destroy) the page.
972 972 */
973 973 mutex_exit(vphm);
974 974 page_destroy_free(pp);
975 975 mutex_enter(vphm);
976 976 continue;
977 977 }
978 978
979 979 /*
980 980 * pvn_getdirty() figures out what to do with a dirty page.
981 981 * If the page is dirty, the putapage() routine will write it
982 982 * and will kluster any other adjacent dirty pages it can.
983 983 *
984 984 * pvn_getdirty() and `(*putapage)' unlock the page.
985 985 */
986 986 mutex_exit(vphm);
987 987 if (pvn_getdirty(pp, flags)) {
988 988 error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
989 989 if (!err)
990 990 err = error;
991 991 }
992 992 mutex_enter(vphm);
993 993 }
994 994 page_vpsub(&vp->v_pages, mark);
995 995 page_vpsub(&vp->v_pages, end);
996 996
997 997 leave:
998 998 /*
999 999 * Release the v_pages mutex and VVMLOCK, and wake up blocked threads
1000 1000 */
1001 1001 mutex_exit(vphm);
1002 1002 kmem_cache_free(marker_cache, mark);
1003 1003 kmem_cache_free(marker_cache, end);
1004 1004 mutex_enter(&vp->v_lock);
1005 1005 vp->v_flag &= ~VVMLOCK;
1006 1006 cv_broadcast(&vp->v_cv);
1007 1007 mutex_exit(&vp->v_lock);
1008 1008 return (err);
1009 1009 }
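
The two-marker walk described in the block comment above is easy to see in isolation. The following userland-only demo (illustrative, not kernel code) replays the idea on a plain circular doubly-linked list: the end marker parks at the tail while mark migrates backwards toward the head, and the node in front of mark is processed at each step, so nodes added at either end while locks are dropped in the real code would still be visited.

#include <stdio.h>

typedef struct node {
	struct node *next, *prev;
	int val;			/* -1 flags a marker node */
} node_t;

static void
insert_before(node_t *pos, node_t *n)
{
	n->next = pos;
	n->prev = pos->prev;
	pos->prev->next = n;
	pos->prev = n;
}

static void
unlink_node(node_t *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int
main(void)
{
	node_t nodes[5], mark, end, *head;
	int i;

	/* Build a circular list with values 0..4. */
	head = &nodes[0];
	head->next = head->prev = head;
	head->val = 0;
	for (i = 1; i < 5; i++) {
		nodes[i].val = i;
		insert_before(head, &nodes[i]);	/* append at the tail */
	}

	/* Park both markers at the tail: ..., 4, mark, end, (head). */
	mark.val = end.val = -1;
	insert_before(head, &mark);
	insert_before(head, &end);

	/*
	 * Process the node in front of "mark", then hop "mark" over it,
	 * until "end" sits directly in front of "mark" -- i.e. the walk
	 * has wrapped past the head.
	 */
	while (mark.prev != &end) {
		node_t *pp = mark.prev;
		printf("processing %d\n", pp->val);	/* 4, 3, 2, 1, 0 */
		unlink_node(&mark);
		insert_before(pp, &mark);
	}
	return (0);
}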
1010 1010
1011 1011 /*
1012 1012 * Walk the vp->v_pages list, for every page call the callback function
1013 1013 * pointed to by *page_check. If page_check returns non-zero, then mark the
1014 1014 * page as modified and if VMODSORT is set, move it to the end of v_pages
1015 1015 * list. Moving makes sense only if we have at least two pages - this also
1016 1016 * avoids having v_pages temporarily being NULL after calling page_vpsub()
1017 1017 * if there was just one page.
1018 1018 */
1019 1019 void
1020 1020 pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1021 1021 {
1022 1022 page_t *pp, *next, *end;
1023 1023 kmutex_t *vphm;
1024 1024 int shuffle;
1025 1025
1026 1026 vphm = page_vnode_mutex(vp);
1027 1027 mutex_enter(vphm);
1028 1028
1029 1029 if (vp->v_pages == NULL) {
1030 1030 mutex_exit(vphm);
1031 1031 return;
1032 1032 }
1033 1033
1034 1034 end = vp->v_pages->p_vpprev;
1035 1035 shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1036 1036 pp = vp->v_pages;
1037 1037
1038 1038 for (;;) {
1039 1039 next = pp->p_vpnext;
1040 1040 if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1041 1041 /*
1042 1042 * hat_setmod_only() in contrast to hat_setmod() does
1043 1043 * not shuffle the pages and does not grab the mutex
1044 1044 * page_vnode_mutex. Exactly what we need.
1045 1045 */
1046 1046 hat_setmod_only(pp);
1047 1047 if (shuffle) {
1048 1048 page_vpsub(&vp->v_pages, pp);
1049 1049 ASSERT(vp->v_pages != NULL);
1050 1050 page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1051 1051 pp);
1052 1052 }
1053 1053 }
1054 1054 /* Stop if we have just processed the last page. */
1055 1055 if (pp == end)
1056 1056 break;
1057 1057 pp = next;
1058 1058 }
1059 1059
1060 1060 mutex_exit(vphm);
1061 1061 }
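
A usage sketch for the callback contract (hypothetical names): the predicate runs with the page_vnode_mutex held, so it should be cheap and must not block.

/*
 * Sketch only: mark every resident page of `vp' modified.  A real
 * predicate would test per-page state instead of returning 1.
 */
static int
my_page_check(page_t *pp)
{
	return (1);		/* treat every page as dirty */
}

static void
my_dirty_whole_vnode(vnode_t *vp)
{
	pvn_vplist_setdirty(vp, my_page_check);
}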
1062 1062
1063 1063 /*
1064 1064 * Zero out zbytes worth of data. Caller should be aware that this
1065 1065 * routine may enter back into the fs layer (xxx_getpage). Locks
1066 1066 * that the xxx_getpage routine may need should not be held while
1067 1067 * calling this.
1068 1068 */
1069 1069 void
1070 1070 pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1071 1071 {
1072 1072 caddr_t addr;
1073 1073
1074 1074 ASSERT(vp->v_type != VCHR);
1075 1075
1076 1076 if (vp->v_pages == NULL)
1077 1077 return;
1078 1078
1079 1079 /*
1080 1080 * zbytes may be zero but there still may be some portion of
1081 1081 * a page which needs clearing (since zbytes is a function
1082 1082 * of filesystem block size, not pagesize.)
1083 1083 */
1084 1084 if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1085 1085 return;
1086 1086
1087 1087 /*
1088 1088 * We get the last page and handle the partial
1089 1089 * zeroing via kernel mappings. This will make the page
1090 1090 * dirty so that we know that when this page is written
1091 1091 * back, the zeroed information will go out with it. If
1092 1092 * the page is not currently in memory, then the kzero
1093 1093 * operation will cause it to be brought in. We use kzero
1094 1094 * instead of bzero so that if the page cannot be read in
1095 1095 * for any reason, the system will not panic. We need
1096 1096 * to zero out a minimum of the fs-given zbytes, but we
1097 1097 * might also have to do more to get the entire last page.
1098 1098 */
1099 1099
1100 1100 if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1101 1101 panic("pvn_vptrunc zbytes");
1102 1102 addr = segmap_getmapflt(segkmap, vp, vplen,
1103 1103 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1104 1104 (void) kzero(addr + (vplen & MAXBOFFSET),
1105 1105 MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1106 1106 (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1107 1107 }
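
The MAX() terms are the heart of it: zero at least the fs-given zbytes, but never less than the tail of the page containing vplen. A standalone sketch of that computation, with illustrative numbers assuming 4K pages and an 8K file system block:

#include <stdio.h>

#define MY_PAGESIZE	4096UL
#define MY_PAGEOFFSET	(MY_PAGESIZE - 1)

/* Bytes pvn_vpzero() would pass to kzero() starting at `vplen'. */
static unsigned long
zero_len(unsigned long vplen, unsigned long zbytes)
{
	unsigned long page_tail = MY_PAGESIZE - (vplen & MY_PAGEOFFSET);

	return (zbytes > page_tail ? zbytes : page_tail);
}

int
main(void)
{
	/* File ends at 9000; the 8K block ends at 16384, so zbytes = 7384. */
	printf("%lu\n", zero_len(9000, 7384));	/* 7384: whole block tail */
	/* zbytes == 0, but the page tail 9000..12287 still needs clearing. */
	printf("%lu\n", zero_len(9000, 0));	/* 3288 */
	return (0);
}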
1108 1108
1109 1109 /*
1110 1110 * Handles common work of the VOP_GETPAGE routines by iterating page by page
1111 1111 * calling the getpage helper for each.
1112 1112 */
1113 1113 int
1114 1114 pvn_getpages(
1115 1115 int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1116 1116 size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1117 1117 struct vnode *vp,
1118 1118 u_offset_t off,
1119 1119 size_t len,
1120 1120 uint_t *protp,
1121 1121 page_t *pl[],
1122 1122 size_t plsz,
1123 1123 struct seg *seg,
1124 1124 caddr_t addr,
1125 1125 enum seg_rw rw,
1126 1126 struct cred *cred)
1127 1127 {
1128 1128 page_t **ppp;
1129 1129 u_offset_t o, eoff;
1130 1130 size_t sz, xlen;
1131 1131 int err;
1132 1132
1133 1133 /* ensure that we have enough space */
1134 1134 ASSERT(pl == NULL || plsz >= len);
1135 1135
1136 1136 /*
1137 1137 * Loop one page at a time and let getapage function fill
1138 1138 * in the next page in array. We only allow one page to be
1139 1139 * returned at a time (except for the last page) so that we
1140 1140 * don't have any problems with duplicates and other such
1141 1141 * painful problems. This is a very simple-minded algorithm,
1142 1142 * but it does the job correctly. We hope that the cost of a
1143 1143 * getapage call for a resident page that we might have been
1144 1144 * able to get from an earlier call doesn't cost too much.
1145 1145 */
1146 1146 ppp = pl;
1147 1147 sz = (pl != NULL) ? PAGESIZE : 0;
1148 1148 eoff = off + len;
1149 1149 xlen = len;
1150 1150 for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1151 1151 xlen -= PAGESIZE) {
1152 1152 if (o + PAGESIZE >= eoff && pl != NULL) {
1153 1153 /*
1154 1154 * Last time through - allow all of
1155 1155 * what's left of the pl[] array to be used.
1156 1156 */
1157 1157 sz = plsz - (o - off);
1158 1158 }
1159 1159 err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1160 1160 rw, cred);
1161 1161 if (err) {
1162 1162 /*
1163 1163 * Release any pages we already got.
1164 1164 */
1165 1165 if (o > off && pl != NULL) {
1166 1166 for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1167 1167 (void) page_release(*ppp, 1);
1168 1168 }
1169 1169 break;
1170 1170 }
1171 1171 if (pl != NULL)
1172 1172 ppp++;
1173 1173 }
1174 1174 return (err);
1175 1175 }
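
Tying it together, here is the usual shape of a VOP_GETPAGE entry point that delegates the iteration to pvn_getpages(), using a page-at-a-time helper like the my_getapage sketched after pvn_read_kluster() (names hypothetical; ufs_getpage() and ufs_getapage() are the classic real pairing).

/*
 * Sketch only: the VOP_GETPAGE entry hands the iteration to
 * pvn_getpages(), which calls my_getapage() once per page.
 */
static int
my_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	if (protp != NULL)
		*protp = PROT_ALL;	/* fs-specific protections elided */

	return (pvn_getpages(my_getapage, vp, (u_offset_t)off, len,
	    protp, pl, plsz, seg, addr, rw, cr));
}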
1176 1176
1177 1177 /*
1178 1178 * Initialize the page list array.
1179 1179 */
1180 1180 /*ARGSUSED*/
1181 1181 void
1182 1182 pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1183 1183 u_offset_t off, size_t io_len, enum seg_rw rw)
1184 1184 {
1185 1185 ssize_t sz;
1186 1186 page_t *ppcur, **ppp;
1187 1187
1188 1188 /*
1189 1189 * Set up to load plsz worth
1190 1190 * starting at the needed page.
1191 1191 */
1192 1192 while (pp != NULL && pp->p_offset != off) {
1193 1193 /*
1194 1194 * Remove page from the i/o list,
1195 1195 * release the i/o and the page lock.
1196 1196 */
1197 1197 ppcur = pp;
1198 1198 page_sub(&pp, ppcur);
1199 1199 page_io_unlock(ppcur);
1200 1200 (void) page_release(ppcur, 1);
1201 1201 }
1202 1202
1203 1203 if (pp == NULL) {
1204 1204 pl[0] = NULL;
1205 1205 return;
1206 1206 }
1207 1207
1208 1208 sz = plsz;
1209 1209
1210 1210 /*
1211 1211 * Initialize the page list array.
1212 1212 */
1213 1213 ppp = pl;
1214 1214 do {
1215 1215 ppcur = pp;
1216 1216 *ppp++ = ppcur;
1217 1217 page_sub(&pp, ppcur);
1218 1218 page_io_unlock(ppcur);
1219 1219 if (rw != S_CREATE)
1220 1220 page_downgrade(ppcur);
1221 1221 sz -= PAGESIZE;
1222 1222 } while (sz > 0 && pp != NULL);
1223 1223 *ppp = NULL; /* terminate list */
1224 1224
1225 1225 /*
1226 1226 * Now free the remaining pages that weren't
1227 1227 * loaded in the page list.
1228 1228 */
1229 1229 while (pp != NULL) {
1230 1230 ppcur = pp;
1231 1231 page_sub(&pp, ppcur);
1232 1232 page_io_unlock(ppcur);
1233 1233 (void) page_release(ppcur, 1);
1234 1234 }
1235 1235 }