OS-881 To work around OS-580, add support to only invalidate mappings from a single process
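
Summary of the change: this patch adds a B_INVALCURONLY flag to the pvn page
routines.  When a caller combines it with B_INVAL, pvn_getdirty() unloads only
the current process's translations (hat_pageunload() with HAT_CURPROC_PGUNLOAD
instead of HAT_FORCE_PGUNLOAD), and the page itself is destroyed only if no
other process still has it mapped.  Below is a minimal sketch of a hypothetical
caller.  The helper name "example_inval_curproc", its putapage argument, and
the assumption that B_INVALCURONLY is defined next to the other B_* flags in
the companion headers are illustrative only; they are not part of this webrev.

#include <sys/types.h>
#include <sys/buf.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <vm/page.h>
#include <vm/pvn.h>

/*
 * Illustrative only: invalidate a vnode's pages starting at "off",
 * dropping just the calling process's mappings.  Pages still mapped
 * by other processes survive.
 */
static int
example_inval_curproc(vnode_t *vp, u_offset_t off,
    int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *,
    int, cred_t *), cred_t *cr)
{
        /*
         * B_INVALCURONLY modifies B_INVAL: translations for the current
         * process are unloaded, but the page is freed only when nobody
         * else maps it (see pvn_getdirty()/pvn_write_done() below).
         */
        return (pvn_vplist_dirty(vp, off, putapage,
            B_INVAL | B_INVALCURONLY, cr));
}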
    
      
    
    
          --- old/usr/src/uts/common/vm/vm_pvn.c
          +++ new/usr/src/uts/common/vm/vm_pvn.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
       24 + * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  24   25   */
  25   26  
  26   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27   28  /*        All Rights Reserved   */
  28   29  
  29   30  /*
  30   31   * University Copyright- Copyright (c) 1982, 1986, 1988
  31   32   * The Regents of the University of California
  32   33   * All Rights Reserved
  33   34   *
  34   35   * University Acknowledgment- Portions of this document are derived from
  35   36   * software developed by the University of California, Berkeley, and its
  36   37   * contributors.
  37   38   */
  38   39  
  39   40  /*
  40   41   * VM - paged vnode.
  41   42   *
  42   43   * This file supplies vm support for the vnode operations that deal with pages.
  43   44   */
  44   45  #include <sys/types.h>
  45   46  #include <sys/t_lock.h>
  46   47  #include <sys/param.h>
  47   48  #include <sys/sysmacros.h>
  48   49  #include <sys/systm.h>
  49   50  #include <sys/time.h>
  50   51  #include <sys/buf.h>
  51   52  #include <sys/vnode.h>
  52   53  #include <sys/uio.h>
  53   54  #include <sys/vmsystm.h>
  54   55  #include <sys/mman.h>
  55   56  #include <sys/vfs.h>
  56   57  #include <sys/cred.h>
  57   58  #include <sys/user.h>
  58   59  #include <sys/kmem.h>
  59   60  #include <sys/cmn_err.h>
  60   61  #include <sys/debug.h>
  61   62  #include <sys/cpuvar.h>
  62   63  #include <sys/vtrace.h>
  63   64  #include <sys/tnf_probe.h>
  64   65  
  65   66  #include <vm/hat.h>
  66   67  #include <vm/as.h>
  67   68  #include <vm/seg.h>
  68   69  #include <vm/rm.h>
  69   70  #include <vm/pvn.h>
  70   71  #include <vm/page.h>
  71   72  #include <vm/seg_map.h>
  72   73  #include <vm/seg_kmem.h>
  73   74  #include <sys/fs/swapnode.h>
  74   75  
  75   76  int pvn_nofodklust = 0;
  76   77  int pvn_write_noklust = 0;
  77   78  
  78   79  uint_t pvn_vmodsort_supported = 0;      /* set if HAT supports VMODSORT */
  79   80  uint_t pvn_vmodsort_disable = 0;        /* set in /etc/system to disable HAT */
  80   81                                          /* support for vmodsort for testing */
  81   82  
  82   83  static struct kmem_cache *marker_cache = NULL;
  83   84  
  84   85  /*
  85   86   * Find the largest contiguous block which contains `addr' for file offset
  86   87   * `offset' in it while living within the file system block sizes (`vp_off'
  87   88   * and `vp_len') and the address space limits for which no pages currently
  88   89   * exist and which map to consecutive file offsets.
  89   90   */
  90   91  page_t *
  91   92  pvn_read_kluster(
  92   93          struct vnode *vp,
  93   94          u_offset_t off,
  94   95          struct seg *seg,
  95   96          caddr_t addr,
  96   97          u_offset_t *offp,                       /* return values */
  97   98          size_t *lenp,                           /* return values */
  98   99          u_offset_t vp_off,
  99  100          size_t vp_len,
 100  101          int isra)
 101  102  {
 102  103          ssize_t deltaf, deltab;
 103  104          page_t *pp;
 104  105          page_t *plist = NULL;
 105  106          spgcnt_t pagesavail;
 106  107          u_offset_t vp_end;
 107  108  
 108  109          ASSERT(off >= vp_off && off < vp_off + vp_len);
 109  110  
 110  111          /*
 111  112           * We only want to do klustering/read ahead if there
 112  113           * are more than minfree pages currently available.
 113  114           */
 114  115          pagesavail = freemem - minfree;
 115  116  
 116  117          if (pagesavail <= 0)
 117  118                  if (isra)
 118  119                          return ((page_t *)NULL);    /* ra case - give up */
 119  120                  else
 120  121                          pagesavail = 1;             /* must return a page */
 121  122  
 122  123          /* We calculate in pages instead of bytes due to 32-bit overflows */
 123  124          if (pagesavail < (spgcnt_t)btopr(vp_len)) {
 124  125                  /*
 125  126                   * Don't have enough free memory for the
 126  127                   * max request, try sizing down vp request.
 127  128                   */
 128  129                  deltab = (ssize_t)(off - vp_off);
 129  130                  vp_len -= deltab;
 130  131                  vp_off += deltab;
 131  132                  if (pagesavail < btopr(vp_len)) {
 132  133                          /*
 133  134                           * Still not enough memory, just settle for
 134  135                           * pagesavail which is at least 1.
 135  136                           */
 136  137                          vp_len = ptob(pagesavail);
 137  138                  }
 138  139          }
 139  140  
 140  141          vp_end = vp_off + vp_len;
 141  142          ASSERT(off >= vp_off && off < vp_end);
 142  143  
 143  144          if (isra && SEGOP_KLUSTER(seg, addr, 0))
 144  145                  return ((page_t *)NULL);        /* segment driver says no */
 145  146  
 146  147          if ((plist = page_create_va(vp, off,
 147  148              PAGESIZE, PG_EXCL | PG_WAIT, seg, addr)) == NULL)
 148  149                  return ((page_t *)NULL);
 149  150  
 150  151          if (vp_len <= PAGESIZE || pvn_nofodklust) {
 151  152                  *offp = off;
 152  153                  *lenp = MIN(vp_len, PAGESIZE);
 153  154          } else {
 154  155                  /*
 155  156                   * Scan back from front by incrementing "deltab" and
 156  157                   * comparing "off" with "vp_off + deltab" to avoid
 157  158                   * "signed" versus "unsigned" conversion problems.
 158  159                   */
 159  160                  for (deltab = PAGESIZE; off >= vp_off + deltab;
 160  161                      deltab += PAGESIZE) {
 161  162                          /*
 162  163                           * Call back to the segment driver to verify that
 163  164                           * the klustering/read ahead operation makes sense.
 164  165                           */
 165  166                          if (SEGOP_KLUSTER(seg, addr, -deltab))
 166  167                                  break;          /* page not eligible */
 167  168                          if ((pp = page_create_va(vp, off - deltab,
 168  169                              PAGESIZE, PG_EXCL, seg, addr - deltab))
 169  170                              == NULL)
 170  171                                  break;          /* already have the page */
 171  172                          /*
 172  173                           * Add page to front of page list.
 173  174                           */
 174  175                          page_add(&plist, pp);
 175  176                  }
 176  177                  deltab -= PAGESIZE;
 177  178  
 178  179                  /* scan forward from front */
 179  180                  for (deltaf = PAGESIZE; off + deltaf < vp_end;
 180  181                      deltaf += PAGESIZE) {
 181  182                          /*
 182  183                           * Call back to the segment driver to verify that
 183  184                           * the klustering/read ahead operation makes sense.
 184  185                           */
 185  186                          if (SEGOP_KLUSTER(seg, addr, deltaf))
 186  187                                  break;          /* page not file extension */
 187  188                          if ((pp = page_create_va(vp, off + deltaf,
 188  189                              PAGESIZE, PG_EXCL, seg, addr + deltaf))
 189  190                              == NULL)
 190  191                                  break;          /* already have page */
 191  192  
 192  193                          /*
 193  194                           * Add page to end of page list.
 194  195                           */
 195  196                          page_add(&plist, pp);
 196  197                          plist = plist->p_next;
 197  198                  }
 198  199                  *offp = off = off - deltab;
 199  200                  *lenp = deltab + deltaf;
 200  201                  ASSERT(off >= vp_off);
 201  202  
 202  203                  /*
 203  204                   * If we ended up getting more than was actually
 204  205                   * requested, retract the returned length to only
 205  206                   * reflect what was requested.  This might happen
 206  207                   * if we were allowed to kluster pages across a
 207  208                   * span of (say) 5 frags, and frag size is less
 208  209                   * than PAGESIZE.  We need a whole number of
 209  210                   * pages to contain those frags, but the returned
 210  211                   * size should only allow the returned range to
 211  212                   * extend as far as the end of the frags.
 212  213                   */
 213  214                  if ((vp_off + vp_len) < (off + *lenp)) {
 214  215                          ASSERT(vp_end > off);
 215  216                          *lenp = vp_end - off;
 216  217                  }
 217  218          }
 218  219          TRACE_3(TR_FAC_VM, TR_PVN_READ_KLUSTER,
 219  220              "pvn_read_kluster:seg %p addr %x isra %x",
 220  221              seg, addr, isra);
 221  222          return (plist);
 222  223  }
 223  224  
 224  225  /*
 225  226   * Handle pages for this vnode on either side of the page "pp"
 226  227   * which has been locked by the caller.  This routine will also
 227  228   * do klustering in the range [vp_off, vp_off + vp_len] up
 228  229   * until a page which is not found.  The offset and length
 229  230   * of pages included is returned in "*offp" and "*lenp".
 230  231   *
 231  232   * Returns a list of dirty locked pages all ready to be
 232  233   * written back.
 233  234   */
 234  235  page_t *
 235  236  pvn_write_kluster(
 236  237          struct vnode *vp,
 237  238          page_t *pp,
 238  239          u_offset_t *offp,               /* return values */
 239  240          size_t *lenp,                   /* return values */
 240  241          u_offset_t vp_off,
 241  242          size_t vp_len,
 242  243          int flags)
 243  244  {
 244  245          u_offset_t off;
 245  246          page_t *dirty;
 246  247          size_t deltab, deltaf;
 247  248          se_t se;
 248  249          u_offset_t vp_end;
 249  250  
 250  251          off = pp->p_offset;
 251  252  
 252  253          /*
 253  254           * Klustering should not be done if we are invalidating
 254  255           * pages since we could destroy pages that belong to
 255  256           * some other process if this is a swap vnode.
 256  257           */
 257  258          if (pvn_write_noklust || ((flags & B_INVAL) && IS_SWAPVP(vp))) {
 258  259                  *offp = off;
 259  260                  *lenp = PAGESIZE;
 260  261                  return (pp);
 261  262          }
 262  263  
 263  264          if (flags & (B_FREE | B_INVAL))
 264  265                  se = SE_EXCL;
 265  266          else
 266  267                  se = SE_SHARED;
 267  268  
 268  269          dirty = pp;
 269  270          /*
 270  271           * Scan backwards looking for pages to kluster by incrementing
 271  272           * "deltab" and comparing "off" with "vp_off + deltab" to
 272  273           * avoid "signed" versus "unsigned" conversion problems.
 273  274           */
 274  275          for (deltab = PAGESIZE; off >= vp_off + deltab; deltab += PAGESIZE) {
 275  276                  pp = page_lookup_nowait(vp, off - deltab, se);
 276  277                  if (pp == NULL)
 277  278                          break;          /* page not found */
 278  279                  if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 279  280                          break;
 280  281                  page_add(&dirty, pp);
 281  282          }
 282  283          deltab -= PAGESIZE;
 283  284  
 284  285          vp_end = vp_off + vp_len;
 285  286          /* now scan forwards looking for pages to kluster */
 286  287          for (deltaf = PAGESIZE; off + deltaf < vp_end; deltaf += PAGESIZE) {
 287  288                  pp = page_lookup_nowait(vp, off + deltaf, se);
 288  289                  if (pp == NULL)
 289  290                          break;          /* page not found */
 290  291                  if (pvn_getdirty(pp, flags | B_DELWRI) == 0)
 291  292                          break;
 292  293                  page_add(&dirty, pp);
 293  294                  dirty = dirty->p_next;
 294  295          }
 295  296  
 296  297          *offp = off - deltab;
 297  298          *lenp = deltab + deltaf;
 298  299          return (dirty);
 299  300  }
 300  301  
 301  302  /*
 302  303   * Generic entry point used to release the "shared/exclusive" lock
 303  304   * and the "p_iolock" on pages after i/o is complete.
 304  305   */
 305  306  void
 306  307  pvn_io_done(page_t *plist)
 307  308  {
 308  309          page_t *pp;
 309  310  
 310  311          while (plist != NULL) {
 311  312                  pp = plist;
 312  313                  page_sub(&plist, pp);
 313  314                  page_io_unlock(pp);
 314  315                  page_unlock(pp);
 315  316          }
 316  317  }
 317  318  
 318  319  /*
 319  320   * Entry point to be used by file system getpage subr's and
 320  321   * other such routines which either want to unlock pages (B_ASYNC
 321  322   * request) or destroy a list of pages if an error occurred.
 322  323   */
 323  324  void
 324  325  pvn_read_done(page_t *plist, int flags)
 325  326  {
 326  327          page_t *pp;
 327  328  
 328  329          while (plist != NULL) {
 329  330                  pp = plist;
 330  331                  page_sub(&plist, pp);
 331  332                  page_io_unlock(pp);
 332  333                  if (flags & B_ERROR) {
 333  334                          /*LINTED: constant in conditional context*/
 334  335                          VN_DISPOSE(pp, B_INVAL, 0, kcred);
 335  336                  } else {
 336  337                          (void) page_release(pp, 0);
 337  338                  }
 338  339          }
 339  340  }
 340  341  
 341  342  /*
 342  343   * Automagic pageout.
 343  344   * When memory gets tight, start freeing pages popping out of the
 344  345   * write queue.
 345  346   */
 346  347  int     write_free = 1;
 347  348  pgcnt_t pages_before_pager = 200;       /* LMXXX */
 348  349  
 349  350  /*
 350  351   * Routine to be called when page-out's complete.
 351  352   * The caller, typically VOP_PUTPAGE, has to explicitly call this routine
 352  353   * after waiting for i/o to complete (biowait) to free the list of
 353  354   * pages associated with the buffer.  These pages must be locked
 354  355   * before i/o is initiated.
 355  356   *
 356  357   * If a write error occurs, the pages are marked as modified
 357  358   * so the write will be re-tried later.
 358  359   */
 359  360  
 360  361  void
 361  362  pvn_write_done(page_t *plist, int flags)
 362  363  {
 363  364          int dfree = 0;
 364  365          int pgrec = 0;
 365  366          int pgout = 0;
 366  367          int pgpgout = 0;
 367  368          int anonpgout = 0;
 368  369          int anonfree = 0;
 369  370          int fspgout = 0;
 370  371          int fsfree = 0;
 371  372          int execpgout = 0;
 372  373          int execfree = 0;
 373  374          page_t *pp;
 374  375          struct cpu *cpup;
 375  376          struct vnode *vp = NULL;        /* for probe */
 376  377          uint_t ppattr;
 377  378          kmutex_t *vphm = NULL;
 378  379  
 379  380          ASSERT((flags & B_READ) == 0);
 380  381  
 381  382          /*
 382  383           * If we are about to start paging anyway, start freeing pages.
 383  384           */
 384  385          if (write_free && freemem < lotsfree + pages_before_pager &&
 385  386              (flags & B_ERROR) == 0) {
 386  387                  flags |= B_FREE;
 387  388          }
 388  389  
 389  390          /*
 390  391           * Handle each page involved in the i/o operation.
 391  392           */
 392  393          while (plist != NULL) {
 393  394                  pp = plist;
 394  395                  ASSERT(PAGE_LOCKED(pp) && page_iolock_assert(pp));
 395  396                  page_sub(&plist, pp);
 396  397  
 397  398                  /* Kernel probe support */
 398  399                  if (vp == NULL)
 399  400                          vp = pp->p_vnode;
 400  401  
 401  402                  if (((flags & B_ERROR) == 0) && IS_VMODSORT(vp)) {
 402  403                          /*
 403  404                           * Move page to the top of the v_page list.
 404  405                           * Skip pages modified during IO.
 405  406                           */
 406  407                          vphm = page_vnode_mutex(vp);
 407  408                          mutex_enter(vphm);
 408  409                          if ((pp->p_vpnext != pp) && !hat_ismod(pp)) {
 409  410                                  page_vpsub(&vp->v_pages, pp);
 410  411                                  page_vpadd(&vp->v_pages, pp);
 411  412                          }
 412  413                          mutex_exit(vphm);
 413  414                  }
 414  415  
 415  416                  if (flags & B_ERROR) {
 416  417                          /*
 417  418                           * Write operation failed.  We don't want
 418  419                           * to destroy (or free) the page unless B_FORCE
 419  420                           * is set. We set the mod bit again and release
 420  421                           * all locks on the page so that it will get written
 421  422                           * back again later when things are hopefully
 422  423                           * better again.
 423  424                           * If B_INVAL and B_FORCE is set we really have
 424  425                           * to destroy the page.
  
 425  426                           */
 426  427                          if ((flags & (B_INVAL|B_FORCE)) == (B_INVAL|B_FORCE)) {
 427  428                                  page_io_unlock(pp);
 428  429                                  /*LINTED: constant in conditional context*/
 429  430                                  VN_DISPOSE(pp, B_INVAL, 0, kcred);
 430  431                          } else {
 431  432                                  hat_setmod_only(pp);
 432  433                                  page_io_unlock(pp);
 433  434                                  page_unlock(pp);
 434  435                          }
 435      -                } else if (flags & B_INVAL) {
      436 +                } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
 436  437                          /*
      438 +                         * If B_INVALCURONLY is set, then we handle that case
      439 +                         * in the next conditional if hat_page_is_mapped()
      440 +                         * indicates that there are no additional mappings
      441 +                         * to the page.
      442 +                         */
      443 +
      444 +                        /*
 437  445                           * XXX - Failed writes with B_INVAL set are
 438  446                           * not handled appropriately.
 439  447                           */
 440  448                          page_io_unlock(pp);
 441  449                          /*LINTED: constant in conditional context*/
 442  450                          VN_DISPOSE(pp, B_INVAL, 0, kcred);
 443  451                  } else if (flags & B_FREE ||!hat_page_is_mapped(pp)) {
 444  452                          /*
 445  453                           * Update statistics for pages being paged out
 446  454                           */
 447  455                          if (pp->p_vnode) {
 448  456                                  if (IS_SWAPFSVP(pp->p_vnode)) {
 449  457                                          anonpgout++;
 450  458                                  } else {
 451  459                                          if (pp->p_vnode->v_flag & VVMEXEC) {
 452  460                                                  execpgout++;
 453  461                                          } else {
 454  462                                                  fspgout++;
 455  463                                          }
 456  464                                  }
 457  465                          }
 458  466                          page_io_unlock(pp);
 459  467                          pgout = 1;
 460  468                          pgpgout++;
 461  469                          TRACE_1(TR_FAC_VM, TR_PAGE_WS_OUT,
 462  470                              "page_ws_out:pp %p", pp);
 463  471  
 464  472                          /*
 465  473                           * The page_struct_lock need not be acquired to
 466  474                           * examine "p_lckcnt" and "p_cowcnt" since we'll
 467  475                           * have an "exclusive" lock if the upgrade succeeds.
 468  476                           */
 469  477                          if (page_tryupgrade(pp) &&
 470  478                              pp->p_lckcnt == 0 && pp->p_cowcnt == 0) {
 471  479                                  /*
 472  480                                   * Check if someone has reclaimed the
 473  481                                   * page.  If ref and mod are not set, no
 474  482                                   * one is using it so we can free it.
 475  483                                   * The rest of the system is careful
 476  484                                   * to use the NOSYNC flag to unload
 477  485                                   * translations set up for i/o w/o
 478  486                                   * affecting ref and mod bits.
 479  487                                   *
 480  488                                   * Obtain a copy of the real hardware
 481  489                                   * mod bit using hat_pagesync(pp, HAT_DONTZERO)
 482  490                                   * to avoid having to flush the cache.
 483  491                                   */
 484  492                                  ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
 485  493                                      HAT_SYNC_STOPON_MOD);
 486  494                          ck_refmod:
 487  495                                  if (!(ppattr & (P_REF | P_MOD))) {
 488  496                                          if (hat_page_is_mapped(pp)) {
 489  497                                                  /*
 490  498                                                   * Doesn't look like the page
 491  499                                                   * was modified so now we
 492  500                                                   * really have to unload the
 493  501                                                   * translations.  Meanwhile
 494  502                                                   * another CPU could've
 495  503                                                   * modified it so we have to
 496  504                                                   * check again.  We don't loop
 497  505                                                   * forever here because now
 498  506                                                   * the translations are gone
 499  507                                                   * and no one can get a new one
 500  508                                                   * since we have the "exclusive"
 501  509                                                   * lock on the page.
 502  510                                                   */
 503  511                                                  (void) hat_pageunload(pp,
 504  512                                                      HAT_FORCE_PGUNLOAD);
 505  513                                                  ppattr = hat_page_getattr(pp,
 506  514                                                      P_REF | P_MOD);
 507  515                                                  goto ck_refmod;
 508  516                                          }
 509  517                                          /*
 510  518                                           * Update statistics for pages being
 511  519                                           * freed
 512  520                                           */
 513  521                                          if (pp->p_vnode) {
 514  522                                                  if (IS_SWAPFSVP(pp->p_vnode)) {
 515  523                                                          anonfree++;
 516  524                                                  } else {
 517  525                                                          if (pp->p_vnode->v_flag
 518  526                                                              & VVMEXEC) {
 519  527                                                                  execfree++;
 520  528                                                          } else {
 521  529                                                                  fsfree++;
 522  530                                                          }
 523  531                                                  }
 524  532                                          }
 525  533                                          /*LINTED: constant in conditional ctx*/
 526  534                                          VN_DISPOSE(pp, B_FREE,
 527  535                                              (flags & B_DONTNEED), kcred);
 528  536                                          dfree++;
 529  537                                  } else {
 530  538                                          page_unlock(pp);
 531  539                                          pgrec++;
 532  540                                          TRACE_1(TR_FAC_VM, TR_PAGE_WS_FREE,
 533  541                                              "page_ws_free:pp %p", pp);
 534  542                                  }
 535  543                          } else {
 536  544                                  /*
 537  545                                   * Page is either `locked' in memory
 538  546                                   * or was reclaimed and now has a
 539  547                                   * "shared" lock, so release it.
 540  548                                   */
 541  549                                  page_unlock(pp);
 542  550                          }
 543  551                  } else {
 544  552                          /*
 545  553                           * Neither B_FREE nor B_INVAL nor B_ERROR.
 546  554                           * Just release locks.
 547  555                           */
 548  556                          page_io_unlock(pp);
 549  557                          page_unlock(pp);
 550  558                  }
 551  559          }
 552  560  
 553  561          CPU_STATS_ENTER_K();
 554  562          cpup = CPU;             /* get cpup now that CPU cannot change */
 555  563          CPU_STATS_ADDQ(cpup, vm, dfree, dfree);
 556  564          CPU_STATS_ADDQ(cpup, vm, pgrec, pgrec);
 557  565          CPU_STATS_ADDQ(cpup, vm, pgout, pgout);
 558  566          CPU_STATS_ADDQ(cpup, vm, pgpgout, pgpgout);
 559  567          CPU_STATS_ADDQ(cpup, vm, anonpgout, anonpgout);
 560  568          CPU_STATS_ADDQ(cpup, vm, anonfree, anonfree);
 561  569          CPU_STATS_ADDQ(cpup, vm, fspgout, fspgout);
 562  570          CPU_STATS_ADDQ(cpup, vm, fsfree, fsfree);
 563  571          CPU_STATS_ADDQ(cpup, vm, execpgout, execpgout);
 564  572          CPU_STATS_ADDQ(cpup, vm, execfree, execfree);
 565  573          CPU_STATS_EXIT_K();
  
 566  574  
 567  575          /* Kernel probe */
 568  576          TNF_PROBE_4(pageout, "vm pageio io", /* CSTYLED */,
 569  577              tnf_opaque, vnode,                  vp,
 570  578              tnf_ulong,  pages_pageout,          pgpgout,
 571  579              tnf_ulong,  pages_freed,            dfree,
 572  580              tnf_ulong,  pages_reclaimed,        pgrec);
 573  581  }
 574  582  
 575  583  /*
 576      - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI,
 577      - * B_TRUNC, B_FORCE}.  B_DELWRI indicates that this page is part of a kluster
      584 + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE,
      585 + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}.
      586 + * B_DELWRI indicates that this page is part of a kluster
 578  587   * operation and is only to be considered if it doesn't involve any
 579  588   * waiting here.  B_TRUNC indicates that the file is being truncated
 580  589   * and so no i/o needs to be done. B_FORCE indicates that the page
  581  590   * must be destroyed so don't try writing it out.
 582  591   *
 583  592   * The caller must ensure that the page is locked.  Returns 1, if
 584  593   * the page should be written back (the "iolock" is held in this
 585  594   * case), or 0 if the page has been dealt with or has been
 586  595   * unlocked.
 587  596   */
 588  597  int
 589  598  pvn_getdirty(page_t *pp, int flags)
 590  599  {
 591  600          ASSERT((flags & (B_INVAL | B_FREE)) ?
 592  601              PAGE_EXCL(pp) : PAGE_SHARED(pp));
 593  602          ASSERT(PP_ISFREE(pp) == 0);
 594  603  
 595  604          /*
 596  605           * If trying to invalidate or free a logically `locked' page,
 597  606           * forget it.  Don't need page_struct_lock to check p_lckcnt and
 598  607           * p_cowcnt as the page is exclusively locked.
 599  608           */
 600  609          if ((flags & (B_INVAL | B_FREE)) && !(flags & (B_TRUNC|B_FORCE)) &&
 601  610              (pp->p_lckcnt != 0 || pp->p_cowcnt != 0)) {
 602  611                  page_unlock(pp);
 603  612                  return (0);
 604  613          }
 605  614  
 606  615          /*
 607  616           * Now acquire the i/o lock so we can add it to the dirty
 608  617           * list (if necessary).  We avoid blocking on the i/o lock
 609  618           * in the following cases:
 610  619           *
 611  620           *      If B_DELWRI is set, which implies that this request is
  612  621           *      due to a klustering operation.
 613  622           *
 614  623           *      If this is an async (B_ASYNC) operation and we are not doing
 615  624           *      invalidation (B_INVAL) [The current i/o or fsflush will ensure
  616  625           *      that the page is written out].
 617  626           */
 618  627          if ((flags & B_DELWRI) || ((flags & (B_INVAL | B_ASYNC)) == B_ASYNC)) {
 619  628                  if (!page_io_trylock(pp)) {
 620  629                          page_unlock(pp);
  
 621  630                          return (0);
 622  631                  }
 623  632          } else {
 624  633                  page_io_lock(pp);
 625  634          }
 626  635  
 627  636          /*
 628  637           * If we want to free or invalidate the page then
 629  638           * we need to unload it so that anyone who wants
 630  639           * it will have to take a minor fault to get it.
      640 +         * If we are only invalidating the page for the
      641 +         * current process, then pass in a different flag.
 631  642           * Otherwise, we're just writing the page back so we
  632  643           * need to sync up the hardware and software mod bit to
 633  644           * detect any future modifications.  We clear the
 634  645           * software mod bit when we put the page on the dirty
 635  646           * list.
 636  647           */
 637      -        if (flags & (B_INVAL | B_FREE)) {
      648 +        if (flags & B_INVALCURONLY) {
      649 +                (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD);
      650 +        } else if (flags & (B_INVAL | B_FREE)) {
 638  651                  (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 639  652          } else {
 640  653                  (void) hat_pagesync(pp, HAT_SYNC_ZERORM);
 641  654          }
 642  655  
 643  656          if (!hat_ismod(pp) || (flags & B_TRUNC)) {
 644  657                  /*
 645  658                   * Don't need to add it to the
 646  659                   * list after all.
 647  660                   */
 648  661                  page_io_unlock(pp);
 649      -                if (flags & B_INVAL) {
      662 +                if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
 650  663                          /*LINTED: constant in conditional context*/
 651  664                          VN_DISPOSE(pp, B_INVAL, 0, kcred);
 652  665                  } else if (flags & B_FREE) {
 653  666                          /*LINTED: constant in conditional context*/
 654  667                          VN_DISPOSE(pp, B_FREE, (flags & B_DONTNEED), kcred);
 655  668                  } else {
 656  669                          /*
 657  670                           * This is advisory path for the callers
 658  671                           * of VOP_PUTPAGE() who prefer freeing the
 659  672                           * page _only_ if no one else is accessing it.
 660  673                           * E.g. segmap_release()
      674 +                         * We also take this path for B_INVALCURONLY and
      675 +                         * let page_release call VN_DISPOSE if no one else is
      676 +                         * using the page.
 661  677                           *
 662  678                           * The above hat_ismod() check is useless because:
 663  679                           * (1) we may not be holding SE_EXCL lock;
 664  680                           * (2) we've not unloaded _all_ translations
 665  681                           *
 666  682                           * Let page_release() do the heavy-lifting.
 667  683                           */
 668  684                          (void) page_release(pp, 1);
 669  685                  }
 670  686                  return (0);
 671  687          }
 672  688  
 673  689          /*
 674  690           * Page is dirty, get it ready for the write back
  
 675  691           * and add page to the dirty list.
 676  692           */
 677  693          hat_clrrefmod(pp);
 678  694  
 679  695          /*
 680  696           * If we're going to free the page when we're done
 681  697           * then we can let others try to use it starting now.
 682  698           * We'll detect the fact that they used it when the
 683  699           * i/o is done and avoid freeing the page.
 684  700           */
 685      -        if (flags & B_FREE)
      701 +        if (flags & (B_FREE | B_INVALCURONLY))
 686  702                  page_downgrade(pp);
 687  703  
 688  704  
 689  705          TRACE_1(TR_FAC_VM, TR_PVN_GETDIRTY, "pvn_getdirty:pp %p", pp);
 690  706  
 691  707          return (1);
 692  708  }
 693  709  
 694  710  
 695  711  /*ARGSUSED*/
 696  712  static int
 697  713  marker_constructor(void *buf, void *cdrarg, int kmflags)
 698  714  {
 699  715          page_t *mark = buf;
 700  716          bzero(mark, sizeof (page_t));
 701  717          mark->p_hash = PVN_VPLIST_HASH_TAG;
 702  718          return (0);
 703  719  }
 704  720  
 705  721  void
 706  722  pvn_init()
 707  723  {
 708  724          if (pvn_vmodsort_disable == 0)
 709  725                  pvn_vmodsort_supported = hat_supported(HAT_VMODSORT, NULL);
 710  726          marker_cache = kmem_cache_create("marker_cache",
 711  727              sizeof (page_t), 0, marker_constructor,
 712  728              NULL, NULL, NULL, NULL, 0);
 713  729  }
 714  730  
 715  731  
 716  732  /*
 717  733   * Process a vnode's page list for all pages whose offset is >= off.
 718  734   * Pages are to either be free'd, invalidated, or written back to disk.
 719  735   *
 720  736   * An "exclusive" lock is acquired for each page if B_INVAL or B_FREE
 721  737   * is specified, otherwise they are "shared" locked.
 722  738   *
 723  739   * Flags are {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_TRUNC}
 724  740   *
 725  741   * Special marker page_t's are inserted in the list in order
 726  742   * to keep track of where we are in the list when locks are dropped.
 727  743   *
 728  744   * Note the list is circular and insertions can happen only at the
 729  745   * head and tail of the list. The algorithm ensures visiting all pages
 730  746   * on the list in the following way:
 731  747   *
 732  748   *    Drop two marker pages at the end of the list.
 733  749   *
 734  750   *    Move one marker page backwards towards the start of the list until
 735  751   *    it is at the list head, processing the pages passed along the way.
 736  752   *
 737  753   *    Due to race conditions when the vphm mutex is dropped, additional pages
 738  754   *    can be added to either end of the list, so we'll continue to move
 739  755   *    the marker and process pages until it is up against the end marker.
 740  756   *
 741  757   * There is one special exit condition. If we are processing a VMODSORT
 742  758   * vnode and only writing back modified pages, we can stop as soon as
 743  759   * we run into an unmodified page.  This makes fsync(3) operations fast.
 744  760   */
 745  761  int
 746  762  pvn_vplist_dirty(
 747  763          vnode_t         *vp,
 748  764          u_offset_t      off,
 749  765          int             (*putapage)(vnode_t *, page_t *, u_offset_t *,
 750  766                          size_t *, int, cred_t *),
 751  767          int             flags,
 752  768          cred_t          *cred)
 753  769  {
 754  770          page_t          *pp;
 755  771          page_t          *mark;          /* marker page that moves toward head */
 756  772          page_t          *end;           /* marker page at end of list */
 757  773          int             err = 0;
 758  774          int             error;
 759  775          kmutex_t        *vphm;
 760  776          se_t            se;
 761  777          page_t          **where_to_move;
 762  778  
 763  779          ASSERT(vp->v_type != VCHR);
 764  780  
 765  781          if (vp->v_pages == NULL)
 766  782                  return (0);
 767  783  
 768  784  
 769  785          /*
 770  786           * Serialize vplist_dirty operations on this vnode by setting VVMLOCK.
 771  787           *
 772  788           * Don't block on VVMLOCK if B_ASYNC is set. This prevents sync()
 773  789           * from getting blocked while flushing pages to a dead NFS server.
 774  790           */
 775  791          mutex_enter(&vp->v_lock);
 776  792          if ((vp->v_flag & VVMLOCK) && (flags & B_ASYNC)) {
 777  793                  mutex_exit(&vp->v_lock);
 778  794                  return (EAGAIN);
 779  795          }
 780  796  
 781  797          while (vp->v_flag & VVMLOCK)
 782  798                  cv_wait(&vp->v_cv, &vp->v_lock);
 783  799  
 784  800          if (vp->v_pages == NULL) {
 785  801                  mutex_exit(&vp->v_lock);
 786  802                  return (0);
 787  803          }
 788  804  
 789  805          vp->v_flag |= VVMLOCK;
 790  806          mutex_exit(&vp->v_lock);
 791  807  
 792  808  
 793  809          /*
 794  810           * Set up the marker pages used to walk the list
 795  811           */
 796  812          end = kmem_cache_alloc(marker_cache, KM_SLEEP);
 797  813          end->p_vnode = vp;
 798  814          end->p_offset = (u_offset_t)-2;
 799  815          mark = kmem_cache_alloc(marker_cache, KM_SLEEP);
 800  816          mark->p_vnode = vp;
 801  817          mark->p_offset = (u_offset_t)-1;
 802  818  
 803  819          /*
 804  820           * Grab the lock protecting the vnode's page list
 805  821           * note that this lock is dropped at times in the loop.
 806  822           */
 807  823          vphm = page_vnode_mutex(vp);
 808  824          mutex_enter(vphm);
 809  825          if (vp->v_pages == NULL)
 810  826                  goto leave;
 811  827  
 812  828          /*
 813  829           * insert the markers and loop through the list of pages
 814  830           */
 815  831          page_vpadd(&vp->v_pages->p_vpprev->p_vpnext, mark);
 816  832          page_vpadd(&mark->p_vpnext, end);
 817  833          for (;;) {
 818  834  
 819  835                  /*
 820  836                   * If only doing an async write back, then we can
 821  837                   * stop as soon as we get to start of the list.
 822  838                   */
 823  839                  if (flags == B_ASYNC && vp->v_pages == mark)
 824  840                          break;
 825  841  
 826  842                  /*
 827  843                   * otherwise stop when we've gone through all the pages
 828  844                   */
 829  845                  if (mark->p_vpprev == end)
 830  846                          break;
 831  847  
 832  848                  pp = mark->p_vpprev;
 833  849                  if (vp->v_pages == pp)
 834  850                          where_to_move = &vp->v_pages;
 835  851                  else
 836  852                          where_to_move = &pp->p_vpprev->p_vpnext;
 837  853  
 838  854                  ASSERT(pp->p_vnode == vp);
 839  855  
 840  856                  /*
 841  857                   * If just flushing dirty pages to disk and this vnode
 842  858                   * is using a sorted list of pages, we can stop processing
  843  859                   * as soon as we find an unmodified page, since all the
 844  860                   * modified pages are visited first.
 845  861                   */
 846  862                  if (IS_VMODSORT(vp) &&
 847  863                      !(flags & (B_INVAL | B_FREE | B_TRUNC))) {
 848  864                          if (!hat_ismod(pp) && !page_io_locked(pp)) {
 849  865  #ifdef  DEBUG
 850  866                                  /*
 851  867                                   * For debug kernels examine what should be
 852  868                                   * all the remaining clean pages, asserting
 853  869                                   * that they are not modified.
 854  870                                   */
 855  871                                  page_t  *chk = pp;
 856  872                                  int     attr;
 857  873  
 858  874                                  page_vpsub(&vp->v_pages, mark);
 859  875                                  page_vpadd(where_to_move, mark);
 860  876                                  do {
 861  877                                          chk = chk->p_vpprev;
 862  878                                          ASSERT(chk != end);
 863  879                                          if (chk == mark)
 864  880                                                  continue;
 865  881                                          attr = hat_page_getattr(chk, P_MOD |
 866  882                                              P_REF);
 867  883                                          if ((attr & P_MOD) == 0)
 868  884                                                  continue;
 869  885                                          panic("v_pages list not all clean: "
 870  886                                              "page_t*=%p vnode=%p off=%lx "
 871  887                                              "attr=0x%x last clean page_t*=%p\n",
 872  888                                              (void *)chk, (void *)chk->p_vnode,
 873  889                                              (long)chk->p_offset, attr,
 874  890                                              (void *)pp);
 875  891                                  } while (chk != vp->v_pages);
 876  892  #endif
 877  893                                  break;
 878  894                          } else if (!(flags & B_ASYNC) && !hat_ismod(pp)) {
 879  895                                  /*
 880  896                                   * Couldn't get io lock, wait until IO is done.
 881  897                                   * Block only for sync IO since we don't want
 882  898                                   * to block async IO.
 883  899                                   */
 884  900                                  mutex_exit(vphm);
 885  901                                  page_io_wait(pp);
 886  902                                  mutex_enter(vphm);
 887  903                                  continue;
 888  904                          }
 889  905                  }
 890  906  
 891  907                  /*
 892  908                   * Skip this page if the offset is out of the desired range.
 893  909                   * Just move the marker and continue.
 894  910                   */
 895  911                  if (pp->p_offset < off) {
 896  912                          page_vpsub(&vp->v_pages, mark);
 897  913                          page_vpadd(where_to_move, mark);
 898  914                          continue;
 899  915                  }
 900  916  
 901  917                  /*
 902  918                   * If we are supposed to invalidate or free this
 903  919                   * page, then we need an exclusive lock.
 904  920                   */
 905  921                  se = (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED;
 906  922  
 907  923                  /*
 908  924                   * We must acquire the page lock for all synchronous
 909  925                   * operations (invalidate, free and write).
 910  926                   */
 911  927                  if ((flags & B_INVAL) != 0 || (flags & B_ASYNC) == 0) {
 912  928                          /*
 913  929                           * If the page_lock() drops the mutex
 914  930                           * we must retry the loop.
 915  931                           */
 916  932                          if (!page_lock(pp, se, vphm, P_NO_RECLAIM))
 917  933                                  continue;
 918  934  
 919  935                          /*
 920  936                           * It's ok to move the marker page now.
 921  937                           */
 922  938                          page_vpsub(&vp->v_pages, mark);
 923  939                          page_vpadd(where_to_move, mark);
 924  940                  } else {
 925  941  
 926  942                          /*
 927  943                           * update the marker page for all remaining cases
 928  944                           */
 929  945                          page_vpsub(&vp->v_pages, mark);
 930  946                          page_vpadd(where_to_move, mark);
 931  947  
 932  948                          /*
 933  949                           * For write backs, If we can't lock the page, it's
 934  950                           * invalid or in the process of being destroyed.  Skip
 935  951                           * it, assuming someone else is writing it.
 936  952                           */
 937  953                          if (!page_trylock(pp, se))
 938  954                                  continue;
 939  955                  }
 940  956  
 941  957                  ASSERT(pp->p_vnode == vp);
 942  958  
 943  959                  /*
 944  960                   * Successfully locked the page, now figure out what to
 945  961                   * do with it. Free pages are easily dealt with, invalidate
 946  962                   * if desired or just go on to the next page.
 947  963                   */
 948  964                  if (PP_ISFREE(pp)) {
 949  965                          if ((flags & B_INVAL) == 0) {
 950  966                                  page_unlock(pp);
 951  967                                  continue;
 952  968                          }
 953  969  
 954  970                          /*
 955  971                           * Invalidate (destroy) the page.
 956  972                           */
 957  973                          mutex_exit(vphm);
 958  974                          page_destroy_free(pp);
 959  975                          mutex_enter(vphm);
 960  976                          continue;
 961  977                  }
 962  978  
 963  979                  /*
  964  980                   * pvn_getdirty() figures out what to do with a dirty page.
 965  981                   * If the page is dirty, the putapage() routine will write it
 966  982                   * and will kluster any other adjacent dirty pages it can.
 967  983                   *
 968  984                   * pvn_getdirty() and `(*putapage)' unlock the page.
 969  985                   */
 970  986                  mutex_exit(vphm);
 971  987                  if (pvn_getdirty(pp, flags)) {
 972  988                          error = (*putapage)(vp, pp, NULL, NULL, flags, cred);
 973  989                          if (!err)
 974  990                                  err = error;
 975  991                  }
 976  992                  mutex_enter(vphm);
 977  993          }
 978  994          page_vpsub(&vp->v_pages, mark);
 979  995          page_vpsub(&vp->v_pages, end);
 980  996  
 981  997  leave:
 982  998          /*
 983  999           * Release v_pages mutex, also VVMLOCK and wakeup blocked thrds
 984 1000           */
 985 1001          mutex_exit(vphm);
 986 1002          kmem_cache_free(marker_cache, mark);
 987 1003          kmem_cache_free(marker_cache, end);
 988 1004          mutex_enter(&vp->v_lock);
 989 1005          vp->v_flag &= ~VVMLOCK;
 990 1006          cv_broadcast(&vp->v_cv);
 991 1007          mutex_exit(&vp->v_lock);
 992 1008          return (err);
 993 1009  }
 994 1010  
 995 1011  /*
 996 1012   * Walk the vp->v_pages list, for every page call the callback function
 997 1013   * pointed by *page_check. If page_check returns non-zero, then mark the
 998 1014   * page as modified and if VMODSORT is set, move it to the end of v_pages
 999 1015   * list. Moving makes sense only if we have at least two pages - this also
1000 1016   * avoids having v_pages temporarily being NULL after calling page_vpsub()
1001 1017   * if there was just one page.
1002 1018   */
1003 1019  void
1004 1020  pvn_vplist_setdirty(vnode_t *vp, int (*page_check)(page_t *))
1005 1021  {
1006 1022          page_t  *pp, *next, *end;
1007 1023          kmutex_t        *vphm;
1008 1024          int     shuffle;
1009 1025  
1010 1026          vphm = page_vnode_mutex(vp);
1011 1027          mutex_enter(vphm);
1012 1028  
1013 1029          if (vp->v_pages == NULL) {
1014 1030                  mutex_exit(vphm);
1015 1031                  return;
1016 1032          }
1017 1033  
1018 1034          end = vp->v_pages->p_vpprev;
1019 1035          shuffle = IS_VMODSORT(vp) && (vp->v_pages != end);
1020 1036          pp = vp->v_pages;
1021 1037  
1022 1038          for (;;) {
1023 1039                  next = pp->p_vpnext;
1024 1040                  if (pp->p_hash != PVN_VPLIST_HASH_TAG && page_check(pp)) {
1025 1041                          /*
1026 1042                           * hat_setmod_only() in contrast to hat_setmod() does
1027 1043                           * not shuffle the pages and does not grab the mutex
1028 1044                           * page_vnode_mutex. Exactly what we need.
1029 1045                           */
1030 1046                          hat_setmod_only(pp);
1031 1047                          if (shuffle) {
1032 1048                                  page_vpsub(&vp->v_pages, pp);
1033 1049                                  ASSERT(vp->v_pages != NULL);
1034 1050                                  page_vpadd(&vp->v_pages->p_vpprev->p_vpnext,
1035 1051                                      pp);
1036 1052                          }
1037 1053                  }
1038 1054                  /* Stop if we have just processed the last page. */
1039 1055                  if (pp == end)
1040 1056                          break;
1041 1057                  pp = next;
1042 1058          }
1043 1059  
1044 1060          mutex_exit(vphm);
1045 1061  }
1046 1062  
1047 1063  /*
1048 1064   * Zero out zbytes worth of data. Caller should be aware that this
1049 1065   * routine may enter back into the fs layer (xxx_getpage). Locks
1050 1066   * that the xxx_getpage routine may need should not be held while
1051 1067   * calling this.
1052 1068   */
1053 1069  void
1054 1070  pvn_vpzero(struct vnode *vp, u_offset_t vplen, size_t zbytes)
1055 1071  {
1056 1072          caddr_t addr;
1057 1073  
1058 1074          ASSERT(vp->v_type != VCHR);
1059 1075  
1060 1076          if (vp->v_pages == NULL)
1061 1077                  return;
1062 1078  
1063 1079          /*
1064 1080           * zbytes may be zero but there still may be some portion of
1065 1081           * a page which needs clearing (since zbytes is a function
1066 1082           * of filesystem block size, not pagesize.)
1067 1083           */
1068 1084          if (zbytes == 0 && (PAGESIZE - (vplen & PAGEOFFSET)) == 0)
1069 1085                  return;
1070 1086  
1071 1087          /*
1072 1088           * We get the last page and handle the partial
1073 1089           * zeroing via kernel mappings.  This will make the page
1074 1090           * dirty so that we know that when this page is written
1075 1091           * back, the zeroed information will go out with it.  If
1076 1092           * the page is not currently in memory, then the kzero
 1077 1093           * operation will cause it to be brought in.  We use kzero
1078 1094           * instead of bzero so that if the page cannot be read in
1079 1095           * for any reason, the system will not panic.  We need
1080 1096           * to zero out a minimum of the fs given zbytes, but we
1081 1097           * might also have to do more to get the entire last page.
1082 1098           */
1083 1099  
1084 1100          if ((zbytes + (vplen & MAXBOFFSET)) > MAXBSIZE)
1085 1101                  panic("pvn_vptrunc zbytes");
1086 1102          addr = segmap_getmapflt(segkmap, vp, vplen,
1087 1103              MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)), 1, S_WRITE);
1088 1104          (void) kzero(addr + (vplen & MAXBOFFSET),
1089 1105              MAX(zbytes, PAGESIZE - (vplen & PAGEOFFSET)));
1090 1106          (void) segmap_release(segkmap, addr, SM_WRITE | SM_ASYNC);
1091 1107  }
1092 1108  
1093 1109  /*
1094 1110   * Handles common work of the VOP_GETPAGE routines by iterating page by page
1095 1111   * calling the getpage helper for each.
1096 1112   */
1097 1113  int
1098 1114  pvn_getpages(
1099 1115          int (*getpage)(vnode_t *, u_offset_t, size_t, uint_t *, page_t *[],
1100 1116                  size_t, struct seg *, caddr_t, enum seg_rw, cred_t *),
1101 1117          struct vnode *vp,
1102 1118          u_offset_t off,
1103 1119          size_t len,
1104 1120          uint_t *protp,
1105 1121          page_t *pl[],
1106 1122          size_t plsz,
1107 1123          struct seg *seg,
1108 1124          caddr_t addr,
1109 1125          enum seg_rw rw,
1110 1126          struct cred *cred)
1111 1127  {
1112 1128          page_t **ppp;
1113 1129          u_offset_t o, eoff;
1114 1130          size_t sz, xlen;
1115 1131          int err;
1116 1132  
1117 1133          /* ensure that we have enough space */
1118 1134          ASSERT(pl == NULL || plsz >= len);
1119 1135  
1120 1136          /*
1121 1137           * Loop one page at a time and let getapage function fill
1122 1138           * in the next page in array.  We only allow one page to be
1123 1139           * returned at a time (except for the last page) so that we
1124 1140           * don't have any problems with duplicates and other such
1125 1141           * painful problems.  This is a very simple minded algorithm,
1126 1142           * but it does the job correctly.  We hope that the cost of a
1127 1143           * getapage call for a resident page that we might have been
1128 1144           * able to get from an earlier call doesn't cost too much.
1129 1145           */
1130 1146          ppp = pl;
1131 1147          sz = (pl != NULL) ? PAGESIZE : 0;
1132 1148          eoff = off + len;
1133 1149          xlen = len;
1134 1150          for (o = off; o < eoff; o += PAGESIZE, addr += PAGESIZE,
1135 1151              xlen -= PAGESIZE) {
1136 1152                  if (o + PAGESIZE >= eoff && pl != NULL) {
1137 1153                          /*
 1138 1154                           * Last time through - allow all of
1139 1155                           * what's left of the pl[] array to be used.
1140 1156                           */
1141 1157                          sz = plsz - (o - off);
1142 1158                  }
1143 1159                  err = (*getpage)(vp, o, xlen, protp, ppp, sz, seg, addr,
1144 1160                      rw, cred);
1145 1161                  if (err) {
1146 1162                          /*
1147 1163                           * Release any pages we already got.
1148 1164                           */
1149 1165                          if (o > off && pl != NULL) {
1150 1166                                  for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
1151 1167                                          (void) page_release(*ppp, 1);
1152 1168                          }
1153 1169                          break;
1154 1170                  }
1155 1171                  if (pl != NULL)
1156 1172                          ppp++;
1157 1173          }
1158 1174          return (err);
1159 1175  }
1160 1176  
1161 1177  /*
1162 1178   * Initialize the page list array.
1163 1179   */
1164 1180  /*ARGSUSED*/
1165 1181  void
1166 1182  pvn_plist_init(page_t *pp, page_t *pl[], size_t plsz,
1167 1183      u_offset_t off, size_t io_len, enum seg_rw rw)
1168 1184  {
1169 1185          ssize_t sz;
1170 1186          page_t *ppcur, **ppp;
1171 1187  
1172 1188          /*
1173 1189           * Set up to load plsz worth
1174 1190           * starting at the needed page.
1175 1191           */
1176 1192          while (pp != NULL && pp->p_offset != off) {
1177 1193                  /*
1178 1194                   * Remove page from the i/o list,
1179 1195                   * release the i/o and the page lock.
1180 1196                   */
1181 1197                  ppcur = pp;
1182 1198                  page_sub(&pp, ppcur);
1183 1199                  page_io_unlock(ppcur);
1184 1200                  (void) page_release(ppcur, 1);
1185 1201          }
1186 1202  
1187 1203          if (pp == NULL) {
1188 1204                  pl[0] = NULL;
1189 1205                  return;
1190 1206          }
1191 1207  
1192 1208          sz = plsz;
1193 1209  
1194 1210          /*
1195 1211           * Initialize the page list array.
1196 1212           */
1197 1213          ppp = pl;
1198 1214          do {
1199 1215                  ppcur = pp;
1200 1216                  *ppp++ = ppcur;
1201 1217                  page_sub(&pp, ppcur);
1202 1218                  page_io_unlock(ppcur);
1203 1219                  if (rw != S_CREATE)
1204 1220                          page_downgrade(ppcur);
1205 1221                  sz -= PAGESIZE;
1206 1222          } while (sz > 0 && pp != NULL);
1207 1223          *ppp = NULL;            /* terminate list */
1208 1224  
1209 1225          /*
1210 1226           * Now free the remaining pages that weren't
1211 1227           * loaded in the page list.
1212 1228           */
1213 1229          while (pp != NULL) {
1214 1230                  ppcur = pp;
1215 1231                  page_sub(&pp, ppcur);
1216 1232                  page_io_unlock(ppcur);
1217 1233                  (void) page_release(ppcur, 1);
1218 1234          }
1219 1235  }
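
For quick reference, the new flag touches three spots in this file.  A
condensed, non-verbatim restatement of the logic above (same symbols as the
patch; illustrative only, not actual source):

/* pvn_getdirty(): decide how translations are unloaded. */
if (flags & B_INVALCURONLY)
        (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); /* this process only */
else if (flags & (B_INVAL | B_FREE))
        (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);   /* every mapping */

/*
 * pvn_getdirty() and pvn_write_done(): a plain B_INVAL still destroys the
 * page outright; with B_INVALCURONLY the page survives as long as some
 * other process maps it (page_release()/hat_page_is_mapped() make that call).
 */
if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) {
        /*LINTED: constant in conditional context*/
        VN_DISPOSE(pp, B_INVAL, 0, kcred);
}

/* pvn_getdirty(): keep only a shared lock so other users are not blocked. */
if (flags & (B_FREE | B_INVALCURONLY))
        page_downgrade(pp);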
  