--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24      - * Copyright 2016 Joyent, Inc.
       24 + * Copyright 2015, Joyent, Inc.  All rights reserved.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   */
  27   27  
  28   28  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  29   29  /*        All Rights Reserved   */
  30   30  
  31   31  /*
  32   32   * University Copyright- Copyright (c) 1982, 1986, 1988
  33   33   * The Regents of the University of California
  34   34   * All Rights Reserved
  35   35   *
  36   36   * University Acknowledgment- Portions of this document are derived from
  37   37   * software developed by the University of California, Berkeley, and its
  38   38   * contributors.
  39   39   */
  40   40  
  41   41  /*
  42   42   * VM - address spaces.
  43   43   */
  44   44  
  45   45  #include <sys/types.h>
  46   46  #include <sys/t_lock.h>
  47   47  #include <sys/param.h>
  48   48  #include <sys/errno.h>
  49   49  #include <sys/systm.h>
  50   50  #include <sys/mman.h>
  51   51  #include <sys/sysmacros.h>
  52   52  #include <sys/cpuvar.h>
  53   53  #include <sys/sysinfo.h>
  54   54  #include <sys/kmem.h>
  55   55  #include <sys/vnode.h>
  56   56  #include <sys/vmsystm.h>
  57   57  #include <sys/cmn_err.h>
  58   58  #include <sys/debug.h>
  59   59  #include <sys/tnf_probe.h>
  60   60  #include <sys/vtrace.h>
  61   61  #include <sys/ddi.h>
  62   62  
  63   63  #include <vm/hat.h>
  64   64  #include <vm/as.h>
  
  65   65  #include <vm/seg.h>
  66   66  #include <vm/seg_vn.h>
  67   67  #include <vm/seg_dev.h>
  68   68  #include <vm/seg_kmem.h>
  69   69  #include <vm/seg_map.h>
  70   70  #include <vm/seg_spt.h>
  71   71  #include <vm/page.h>
  72   72  
  73   73  clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
  74   74  
  75      -ulong_t as_user_seg_limit = 0xffff; /* max segments in an (non-kas) AS */
  76      -
  77   75  static struct kmem_cache *as_cache;
  78   76  
  79   77  static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
  80   78  static void as_clearwatchprot(struct as *, caddr_t, size_t);
  81   79  int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
  82   80  
  83   81  
  84   82  /*
  85   83   * Verifying the segment lists is very time-consuming; it may not be
  86   84   * desirable always to define VERIFY_SEGLIST when DEBUG is set.
  87   85   */
  88   86  #ifdef DEBUG
  89   87  #define VERIFY_SEGLIST
  90   88  int do_as_verify = 0;
  91   89  #endif
  92   90  
  93   91  /*
  94   92   * Allocate a new callback data structure entry and fill in the events of
  95   93   * interest, the address range of interest, and the callback argument.
  96   94   * Link the entry on the as->a_callbacks list. A callback entry for the
  97   95   * entire address space may be specified with vaddr = 0 and size = -1.
  98   96   *
  99   97   * CALLERS RESPONSIBILITY: If not calling from within the process context for
 100   98   * the specified as, the caller must guarantee persistence of the specified as
 101   99   * for the duration of this function (eg. pages being locked within the as
 102  100   * will guarantee persistence).
 103  101   */
 104  102  int
 105  103  as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
 106  104      caddr_t vaddr, size_t size, int sleepflag)
 107  105  {
 108  106          struct as_callback      *current_head, *cb;
 109  107          caddr_t                 saddr;
 110  108          size_t                  rsize;
 111  109  
 112  110          /* callback function and an event are mandatory */
 113  111          if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
 114  112                  return (EINVAL);
 115  113  
 116  114          /* Adding a callback after as_free has been called is not allowed */
 117  115          if (as == &kas)
 118  116                  return (ENOMEM);
 119  117  
 120  118          /*
 121  119           * vaddr = 0 and size = -1 is used to indicate that the callback range
 122  120           * is the entire address space so no rounding is done in that case.
 123  121           */
 124  122          if (size != -1) {
 125  123                  saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
 126  124                  rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
 127  125                      (size_t)saddr;
 128  126                  /* check for wraparound */
 129  127                  if (saddr + rsize < saddr)
 130  128                          return (ENOMEM);
 131  129          } else {
 132  130                  if (vaddr != 0)
 133  131                          return (EINVAL);
 134  132                  saddr = vaddr;
 135  133                  rsize = size;
 136  134          }
 137  135  
 138  136          /* Allocate and initialize a callback entry */
 139  137          cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
 140  138          if (cb == NULL)
 141  139                  return (EAGAIN);
 142  140  
 143  141          cb->ascb_func = cb_func;
 144  142          cb->ascb_arg = arg;
 145  143          cb->ascb_events = events;
 146  144          cb->ascb_saddr = saddr;
 147  145          cb->ascb_len = rsize;
 148  146  
 149  147          /* Add the entry to the list */
 150  148          mutex_enter(&as->a_contents);
 151  149          current_head = as->a_callbacks;
 152  150          as->a_callbacks = cb;
 153  151          cb->ascb_next = current_head;
 154  152  
 155  153          /*
 156  154           * The call to this function may lose in a race with
 157  155           * a pertinent event - eg. a thread does long term memory locking
 158  156           * but before the callback is added another thread executes as_unmap.
 159  157           * A broadcast here resolves that.
 160  158           */
 161  159          if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
 162  160                  AS_CLRUNMAPWAIT(as);
 163  161                  cv_broadcast(&as->a_cv);
 164  162          }
 165  163  
 166  164          mutex_exit(&as->a_contents);
 167  165          return (0);
 168  166  }
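
A usage sketch only (the client functions and their argument are hypothetical,
not part of this change): a driver holding long-term page locks could register
a callback covering the entire address space, using the vaddr = 0, size = -1
convention described above, and delete the entry from within the callback once
its locks have been dropped:

    #include <sys/types.h>
    #include <sys/kmem.h>
    #include <vm/as.h>

    /* Hypothetical client callback, run when a registered event fires. */
    static void
    my_as_cb(struct as *as, void *arg, uint_t events)
    {
            /* Drop the client's long-term page locks here ... */

            /*
             * ... then delete the entry so as_execute_callback() does not
             * block waiting for this callback to be cleaned up.
             */
            (void) as_delete_callback(as, arg);
    }

    /* Hypothetical registration: all events, whole address space. */
    static int
    my_as_register(struct as *as, void *my_state)
    {
            return (as_add_callback(as, my_as_cb, my_state, AS_ALL_EVENT,
                0, (size_t)-1, KM_SLEEP));
    }
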
 169  167  
 170  168  /*
 171  169   * Search the callback list for an entry which pertains to arg.
 172  170   *
 173  171   * This is called from within the client upon completion of the callback.
 174  172   * RETURN VALUES:
 175  173   *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 176  174   *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 177  175   *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 178  176   *                      entry will be made in as_do_callbacks)
 179  177   *
 180  178   * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 181  179   * set, it indicates that as_do_callbacks is processing this entry.  The
 182  180   * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 183  181   * to unblock as_do_callbacks, in case it is blocked.
 184  182   *
 185  183   * CALLERS RESPONSIBILITY: If not calling from within the process context for
 186  184   * the specified as, the caller must guarantee persistence of the specified as
 187  185   * for the duration of this function (eg. pages being locked within the as
 188  186   * will guarantee persistence).
 189  187   */
 190  188  uint_t
 191  189  as_delete_callback(struct as *as, void *arg)
 192  190  {
 193  191          struct as_callback **prevcb = &as->a_callbacks;
 194  192          struct as_callback *cb;
 195  193          uint_t rc = AS_CALLBACK_NOTFOUND;
 196  194  
 197  195          mutex_enter(&as->a_contents);
 198  196          for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
 199  197                  if (cb->ascb_arg != arg)
 200  198                          continue;
 201  199  
 202  200                  /*
 203  201                   * If the events indicate AS_CALLBACK_CALLED, just clear
 204  202                   * AS_ALL_EVENT in the events field and wakeup the thread
 205  203                   * that may be waiting in as_do_callbacks.  as_do_callbacks
 206  204                   * will take care of removing this entry from the list.  In
 207  205                   * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
 208  206                   * (AS_CALLBACK_CALLED not set), just remove it from the
 209  207                   * list, return the memory and return AS_CALLBACK_DELETED.
 210  208                   */
 211  209                  if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
 212  210                          /* leave AS_CALLBACK_CALLED */
 213  211                          cb->ascb_events &= ~AS_ALL_EVENT;
 214  212                          rc = AS_CALLBACK_DELETE_DEFERRED;
 215  213                          cv_broadcast(&as->a_cv);
 216  214                  } else {
 217  215                          *prevcb = cb->ascb_next;
 218  216                          kmem_free(cb, sizeof (struct as_callback));
 219  217                          rc = AS_CALLBACK_DELETED;
 220  218                  }
 221  219                  break;
 222  220          }
 223  221          mutex_exit(&as->a_contents);
 224  222          return (rc);
 225  223  }
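
Continuing the same hypothetical client, teardown would handle the three return
values documented above; only the deferred case needs care, since
as_do_callbacks() still owns the entry at that point:

    /* Hypothetical teardown for the registration sketched earlier. */
    static void
    my_as_unregister(struct as *as, void *my_state)
    {
            switch (as_delete_callback(as, my_state)) {
            case AS_CALLBACK_DELETED:
            case AS_CALLBACK_NOTFOUND:
                    /* Entry is gone (or was never added); nothing to do. */
                    break;
            case AS_CALLBACK_DELETE_DEFERRED:
                    /*
                     * as_do_callbacks() is running the callback now and will
                     * free the entry itself; keep any state the callback
                     * still needs alive until it has finished.
                     */
                    break;
            }
    }
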
 226  224  
 227  225  /*
 228  226   * Searches the as callback list for a matching entry.
 229  227   * Returns a pointer to the first matching callback, or NULL if
 230  228   * nothing is found.
 231  229   * This function never sleeps so it is ok to call it with more
  232  230   * locks held than the (required) a_contents mutex.
 233  231   *
 234  232   * See also comment on as_do_callbacks below.
 235  233   */
 236  234  static struct as_callback *
 237  235  as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
 238  236      size_t event_len)
 239  237  {
 240  238          struct as_callback      *cb;
 241  239  
 242  240          ASSERT(MUTEX_HELD(&as->a_contents));
 243  241          for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
 244  242                  /*
 245  243                   * If the callback has not already been called, then
 246  244                   * check if events or address range pertains.  An event_len
 247  245                   * of zero means do an unconditional callback.
 248  246                   */
 249  247                  if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
 250  248                      ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
 251  249                      (event_addr + event_len < cb->ascb_saddr) ||
 252  250                      (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
 253  251                          continue;
 254  252                  }
 255  253                  break;
 256  254          }
 257  255          return (cb);
 258  256  }
 259  257  
 260  258  /*
 261  259   * Executes a given callback and removes it from the callback list for
 262  260   * this address space.
 263  261   * This function may sleep so the caller must drop all locks except
 264  262   * a_contents before calling this func.
 265  263   *
 266  264   * See also comments on as_do_callbacks below.
 267  265   */
 268  266  static void
 269  267  as_execute_callback(struct as *as, struct as_callback *cb,
 270  268      uint_t events)
 271  269  {
 272  270          struct as_callback **prevcb;
 273  271          void    *cb_arg;
 274  272  
 275  273          ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
 276  274          cb->ascb_events |= AS_CALLBACK_CALLED;
 277  275          mutex_exit(&as->a_contents);
 278  276          (*cb->ascb_func)(as, cb->ascb_arg, events);
 279  277          mutex_enter(&as->a_contents);
 280  278          /*
 281  279           * the callback function is required to delete the callback
 282  280           * when the callback function determines it is OK for
 283  281           * this thread to continue. as_delete_callback will clear
 284  282           * the AS_ALL_EVENT in the events field when it is deleted.
 285  283           * If the callback function called as_delete_callback,
 286  284           * events will already be cleared and there will be no blocking.
 287  285           */
 288  286          while ((cb->ascb_events & events) != 0) {
 289  287                  cv_wait(&as->a_cv, &as->a_contents);
 290  288          }
 291  289          /*
 292  290           * This entry needs to be taken off the list. Normally, the
 293  291           * callback func itself does that, but unfortunately the list
 294  292           * may have changed while the callback was running because the
 295  293           * a_contents mutex was dropped and someone else other than the
 296  294           * callback func itself could have called as_delete_callback,
 297  295           * so we have to search to find this entry again.  The entry
 298  296           * must have AS_CALLBACK_CALLED, and have the same 'arg'.
 299  297           */
 300  298          cb_arg = cb->ascb_arg;
 301  299          prevcb = &as->a_callbacks;
 302  300          for (cb = as->a_callbacks; cb != NULL;
 303  301              prevcb = &cb->ascb_next, cb = *prevcb) {
 304  302                  if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
 305  303                      (cb_arg != cb->ascb_arg)) {
 306  304                          continue;
 307  305                  }
 308  306                  *prevcb = cb->ascb_next;
 309  307                  kmem_free(cb, sizeof (struct as_callback));
 310  308                  break;
 311  309          }
 312  310  }
 313  311  
 314  312  /*
 315  313   * Check the callback list for a matching event and intersection of
 316  314   * address range. If there is a match invoke the callback.  Skip an entry if:
 317  315   *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
  318  316   *    - not an event of interest
 319  317   *    - not address range of interest
 320  318   *
 321  319   * An event_len of zero indicates a request for an unconditional callback
 322  320   * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 323  321   * a_contents lock must be dropped before a callback, so only one callback
 324  322   * can be done before returning. Return -1 (true) if a callback was
 325  323   * executed and removed from the list, else return 0 (false).
 326  324   *
 327  325   * The logically separate parts, i.e. finding a matching callback and
 328  326   * executing a given callback have been separated into two functions
 329  327   * so that they can be called with different sets of locks held beyond
 330  328   * the always-required a_contents. as_find_callback does not sleep so
 331  329   * it is ok to call it if more locks than a_contents (i.e. the a_lock
 332  330   * rwlock) are held. as_execute_callback on the other hand may sleep
 333  331   * so all locks beyond a_contents must be dropped by the caller if one
 334  332   * does not want to end comatose.
 335  333   */
 336  334  static int
 337  335  as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
 338  336      size_t event_len)
 339  337  {
 340  338          struct as_callback *cb;
 341  339  
 342  340          if ((cb = as_find_callback(as, events, event_addr, event_len))) {
 343  341                  as_execute_callback(as, cb, events);
 344  342                  return (-1);
 345  343          }
 346  344          return (0);
 347  345  }
 348  346  
 349  347  /*
 350  348   * Search for the segment containing addr. If a segment containing addr
 351  349   * exists, that segment is returned.  If no such segment exists, and
 352  350   * the list spans addresses greater than addr, then the first segment
 353  351   * whose base is greater than addr is returned; otherwise, NULL is
 354  352   * returned unless tail is true, in which case the last element of the
 355  353   * list is returned.
 356  354   *
 357  355   * a_seglast is used to cache the last found segment for repeated
 358  356   * searches to the same addr (which happens frequently).
 359  357   */
 360  358  struct seg *
 361  359  as_findseg(struct as *as, caddr_t addr, int tail)
 362  360  {
 363  361          struct seg *seg = as->a_seglast;
 364  362          avl_index_t where;
 365  363  
 366  364          ASSERT(AS_LOCK_HELD(as));
 367  365  
 368  366          if (seg != NULL &&
 369  367              seg->s_base <= addr &&
 370  368              addr < seg->s_base + seg->s_size)
 371  369                  return (seg);
 372  370  
 373  371          seg = avl_find(&as->a_segtree, &addr, &where);
 374  372          if (seg != NULL)
 375  373                  return (as->a_seglast = seg);
 376  374  
 377  375          seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 378  376          if (seg == NULL && tail)
 379  377                  seg = avl_last(&as->a_segtree);
 380  378          return (as->a_seglast = seg);
 381  379  }
 382  380  
 383  381  #ifdef VERIFY_SEGLIST
 384  382  /*
 385  383   * verify that the linked list is coherent
 386  384   */
 387  385  static void
 388  386  as_verify(struct as *as)
 389  387  {
 390  388          struct seg *seg, *seglast, *p, *n;
 391  389          uint_t nsegs = 0;
 392  390  
 393  391          if (do_as_verify == 0)
 394  392                  return;
 395  393  
 396  394          seglast = as->a_seglast;
 397  395  
 398  396          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 399  397                  ASSERT(seg->s_as == as);
 400  398                  p = AS_SEGPREV(as, seg);
 401  399                  n = AS_SEGNEXT(as, seg);
 402  400                  ASSERT(p == NULL || p->s_as == as);
 403  401                  ASSERT(p == NULL || p->s_base < seg->s_base);
 404  402                  ASSERT(n == NULL || n->s_base > seg->s_base);
 405  403                  ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
 406  404                  if (seg == seglast)
 407  405                          seglast = NULL;
 408  406                  nsegs++;
 409  407          }
 410  408          ASSERT(seglast == NULL);
 411  409          ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
 412  410  }
 413  411  #endif /* VERIFY_SEGLIST */
 414  412  
 415  413  /*
 416  414   * Add a new segment to the address space. The avl_find()
 417  415   * may be expensive so we attempt to use last segment accessed
 418  416   * in as_gap() as an insertion point.
 419  417   */
 420  418  int
 421  419  as_addseg(struct as  *as, struct seg *newseg)
 422  420  {
 423  421          struct seg *seg;
 424  422          caddr_t addr;
 425  423          caddr_t eaddr;
 426  424          avl_index_t where;
 427  425  
 428  426          ASSERT(AS_WRITE_HELD(as));
 429  427  
 430  428          as->a_updatedir = 1;    /* inform /proc */
 431  429          gethrestime(&as->a_updatetime);
 432  430  
 433  431          if (as->a_lastgaphl != NULL) {
 434  432                  struct seg *hseg = NULL;
 435  433                  struct seg *lseg = NULL;
 436  434  
 437  435                  if (as->a_lastgaphl->s_base > newseg->s_base) {
 438  436                          hseg = as->a_lastgaphl;
 439  437                          lseg = AVL_PREV(&as->a_segtree, hseg);
 440  438                  } else {
 441  439                          lseg = as->a_lastgaphl;
 442  440                          hseg = AVL_NEXT(&as->a_segtree, lseg);
 443  441                  }
 444  442  
 445  443                  if (hseg && lseg && lseg->s_base < newseg->s_base &&
 446  444                      hseg->s_base > newseg->s_base) {
 447  445                          avl_insert_here(&as->a_segtree, newseg, lseg,
 448  446                              AVL_AFTER);
 449  447                          as->a_lastgaphl = NULL;
 450  448                          as->a_seglast = newseg;
 451  449                          return (0);
 452  450                  }
 453  451                  as->a_lastgaphl = NULL;
 454  452          }
 455  453  
 456  454          addr = newseg->s_base;
 457  455          eaddr = addr + newseg->s_size;
 458  456  again:
 459  457  
 460  458          seg = avl_find(&as->a_segtree, &addr, &where);
 461  459  
 462  460          if (seg == NULL)
 463  461                  seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 464  462  
 465  463          if (seg == NULL)
 466  464                  seg = avl_last(&as->a_segtree);
 467  465  
 468  466          if (seg != NULL) {
 469  467                  caddr_t base = seg->s_base;
 470  468  
 471  469                  /*
 472  470                   * If top of seg is below the requested address, then
 473  471                   * the insertion point is at the end of the linked list,
 474  472                   * and seg points to the tail of the list.  Otherwise,
 475  473                   * the insertion point is immediately before seg.
 476  474                   */
 477  475                  if (base + seg->s_size > addr) {
 478  476                          if (addr >= base || eaddr > base) {
 479  477  #ifdef __sparc
 480  478                                  extern struct seg_ops segnf_ops;
 481  479  
 482  480                                  /*
 483  481                                   * no-fault segs must disappear if overlaid.
 484  482                                   * XXX need new segment type so
 485  483                                   * we don't have to check s_ops
 486  484                                   */
 487  485                                  if (seg->s_ops == &segnf_ops) {
 488  486                                          seg_unmap(seg);
 489  487                                          goto again;
 490  488                                  }
 491  489  #endif
 492  490                                  return (-1);    /* overlapping segment */
 493  491                          }
 494  492                  }
 495  493          }
 496  494          as->a_seglast = newseg;
 497  495          avl_insert(&as->a_segtree, newseg, where);
 498  496  
 499  497  #ifdef VERIFY_SEGLIST
 500  498          as_verify(as);
 501  499  #endif
 502  500          return (0);
 503  501  }
 504  502  
 505  503  struct seg *
 506  504  as_removeseg(struct as *as, struct seg *seg)
 507  505  {
 508  506          avl_tree_t *t;
 509  507  
 510  508          ASSERT(AS_WRITE_HELD(as));
 511  509  
 512  510          as->a_updatedir = 1;    /* inform /proc */
 513  511          gethrestime(&as->a_updatetime);
 514  512  
 515  513          if (seg == NULL)
 516  514                  return (NULL);
 517  515  
 518  516          t = &as->a_segtree;
 519  517          if (as->a_seglast == seg)
 520  518                  as->a_seglast = NULL;
 521  519          as->a_lastgaphl = NULL;
 522  520  
 523  521          /*
 524  522           * if this segment is at an address higher than
 525  523           * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
 526  524           */
 527  525          if (as->a_lastgap &&
 528  526              (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
 529  527                  as->a_lastgap = AVL_NEXT(t, seg);
 530  528  
 531  529          /*
 532  530           * remove the segment from the seg tree
 533  531           */
 534  532          avl_remove(t, seg);
 535  533  
 536  534  #ifdef VERIFY_SEGLIST
 537  535          as_verify(as);
 538  536  #endif
 539  537          return (seg);
 540  538  }
 541  539  
 542  540  /*
 543  541   * Find a segment containing addr.
 544  542   */
 545  543  struct seg *
 546  544  as_segat(struct as *as, caddr_t addr)
 547  545  {
 548  546          struct seg *seg = as->a_seglast;
 549  547  
 550  548          ASSERT(AS_LOCK_HELD(as));
 551  549  
 552  550          if (seg != NULL && seg->s_base <= addr &&
 553  551              addr < seg->s_base + seg->s_size)
 554  552                  return (seg);
 555  553  
 556  554          seg = avl_find(&as->a_segtree, &addr, NULL);
 557  555          return (seg);
 558  556  }
 559  557  
 560  558  /*
 561  559   * Serialize all searches for holes in an address space to
 562  560   * prevent two or more threads from allocating the same virtual
 563  561   * address range.  The address space must not be "read/write"
 564  562   * locked by the caller since we may block.
 565  563   */
 566  564  void
 567  565  as_rangelock(struct as *as)
 568  566  {
 569  567          mutex_enter(&as->a_contents);
 570  568          while (AS_ISCLAIMGAP(as))
 571  569                  cv_wait(&as->a_cv, &as->a_contents);
 572  570          AS_SETCLAIMGAP(as);
 573  571          mutex_exit(&as->a_contents);
 574  572  }
 575  573  
 576  574  /*
 577  575   * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 578  576   */
 579  577  void
 580  578  as_rangeunlock(struct as *as)
 581  579  {
 582  580          mutex_enter(&as->a_contents);
 583  581          AS_CLRCLAIMGAP(as);
 584  582          cv_signal(&as->a_cv);
 585  583          mutex_exit(&as->a_contents);
 586  584  }
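
An illustrative sketch only (the caller and the elided mapping step are
hypothetical): the range lock brackets the search for a hole so that two
threads cannot claim the same virtual range, and the a_lock reader/writer lock
must not be held across it since as_rangelock() may block:

    /* Hypothetical caller reserving an unmapped virtual range. */
    static void
    my_reserve_range(struct as *as)
    {
            as_rangelock(as);
            /*
             * ... find a suitable hole and create the new segment here;
             * the gap search and mapping calls are elided ...
             */
            as_rangeunlock(as);
    }
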
 587  585  
 588  586  /*
  589  587   * compare segments (or just an address) by segment address range
 590  588   */
 591  589  static int
 592  590  as_segcompar(const void *x, const void *y)
 593  591  {
 594  592          struct seg *a = (struct seg *)x;
 595  593          struct seg *b = (struct seg *)y;
 596  594  
 597  595          if (a->s_base < b->s_base)
 598  596                  return (-1);
 599  597          if (a->s_base >= b->s_base + b->s_size)
 600  598                  return (1);
 601  599          return (0);
 602  600  }
 603  601  
 604  602  
 605  603  void
 606  604  as_avlinit(struct as *as)
 607  605  {
 608  606          avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
 609  607              offsetof(struct seg, s_tree));
 610  608          avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
 611  609              offsetof(struct watched_page, wp_link));
 612  610  }
 613  611  
 614  612  /*ARGSUSED*/
 615  613  static int
 616  614  as_constructor(void *buf, void *cdrarg, int kmflags)
 617  615  {
 618  616          struct as *as = buf;
 619  617  
 620  618          mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
 621  619          cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
 622  620          rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
 623  621          as_avlinit(as);
 624  622          return (0);
 625  623  }
 626  624  
 627  625  /*ARGSUSED1*/
 628  626  static void
 629  627  as_destructor(void *buf, void *cdrarg)
 630  628  {
 631  629          struct as *as = buf;
 632  630  
 633  631          avl_destroy(&as->a_segtree);
 634  632          mutex_destroy(&as->a_contents);
 635  633          cv_destroy(&as->a_cv);
 636  634          rw_destroy(&as->a_lock);
 637  635  }
 638  636  
 639  637  void
 640  638  as_init(void)
 641  639  {
 642  640          as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
 643  641              as_constructor, as_destructor, NULL, NULL, NULL, 0);
 644  642  }
 645  643  
 646  644  /*
 647  645   * Allocate and initialize an address space data structure.
 648  646   * We call hat_alloc to allow any machine dependent
 649  647   * information in the hat structure to be initialized.
 650  648   */
 651  649  struct as *
 652  650  as_alloc(void)
 653  651  {
 654  652          struct as *as;
 655  653  
 656  654          as = kmem_cache_alloc(as_cache, KM_SLEEP);
 657  655  
 658  656          as->a_flags             = 0;
 659  657          as->a_vbits             = 0;
 660  658          as->a_hrm               = NULL;
 661  659          as->a_seglast           = NULL;
 662  660          as->a_size              = 0;
 663  661          as->a_resvsize          = 0;
 664  662          as->a_updatedir         = 0;
 665  663          gethrestime(&as->a_updatetime);
 666  664          as->a_objectdir         = NULL;
 667  665          as->a_sizedir           = 0;
 668  666          as->a_userlimit         = (caddr_t)USERLIMIT;
 669  667          as->a_lastgap           = NULL;
 670  668          as->a_lastgaphl         = NULL;
 671  669          as->a_callbacks         = NULL;
 672  670          as->a_proc              = NULL;
 673  671  
 674  672          AS_LOCK_ENTER(as, RW_WRITER);
 675  673          as->a_hat = hat_alloc(as);      /* create hat for default system mmu */
 676  674          AS_LOCK_EXIT(as);
 677  675  
 678  676          return (as);
 679  677  }
 680  678  
 681  679  /*
 682  680   * Free an address space data structure.
 683  681   * Need to free the hat first and then
 684  682   * all the segments on this as and finally
 685  683   * the space for the as struct itself.
 686  684   */
 687  685  void
 688  686  as_free(struct as *as)
 689  687  {
 690  688          struct hat *hat = as->a_hat;
 691  689          struct seg *seg, *next;
 692  690          boolean_t free_started = B_FALSE;
 693  691  
 694  692  top:
 695  693          /*
 696  694           * Invoke ALL callbacks. as_do_callbacks will do one callback
 697  695           * per call, and not return (-1) until the callback has completed.
 698  696           * When as_do_callbacks returns zero, all callbacks have completed.
 699  697           */
 700  698          mutex_enter(&as->a_contents);
 701  699          while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 702  700                  ;
 703  701  
 704  702          mutex_exit(&as->a_contents);
 705  703          AS_LOCK_ENTER(as, RW_WRITER);
 706  704  
 707  705          if (!free_started) {
 708  706                  free_started = B_TRUE;
 709  707                  hat_free_start(hat);
 710  708          }
 711  709          for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
 712  710                  int err;
 713  711  
 714  712                  next = AS_SEGNEXT(as, seg);
 715  713  retry:
 716  714                  err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
 717  715                  if (err == EAGAIN) {
 718  716                          mutex_enter(&as->a_contents);
 719  717                          if (as->a_callbacks) {
 720  718                                  AS_LOCK_EXIT(as);
 721  719                          } else if (!AS_ISNOUNMAPWAIT(as)) {
 722  720                                  /*
 723  721                                   * Memory is currently locked. Wait for a
 724  722                                   * cv_signal that it has been unlocked, then
 725  723                                   * try the operation again.
 726  724                                   */
 727  725                                  if (AS_ISUNMAPWAIT(as) == 0)
 728  726                                          cv_broadcast(&as->a_cv);
 729  727                                  AS_SETUNMAPWAIT(as);
 730  728                                  AS_LOCK_EXIT(as);
 731  729                                  while (AS_ISUNMAPWAIT(as))
 732  730                                          cv_wait(&as->a_cv, &as->a_contents);
 733  731                          } else {
 734  732                                  /*
 735  733                                   * We may have raced with
 736  734                                   * segvn_reclaim()/segspt_reclaim(). In this
 737  735                                   * case clean nounmapwait flag and retry since
 738  736                                   * softlockcnt in this segment may be already
 739  737                                   * 0.  We don't drop as writer lock so our
 740  738                                   * number of retries without sleeping should
 741  739                                   * be very small. See segvn_reclaim() for
 742  740                                   * more comments.
 743  741                                   */
 744  742                                  AS_CLRNOUNMAPWAIT(as);
 745  743                                  mutex_exit(&as->a_contents);
 746  744                                  goto retry;
 747  745                          }
 748  746                          mutex_exit(&as->a_contents);
 749  747                          goto top;
 750  748                  } else {
 751  749                          /*
 752  750                           * We do not expect any other error return at this
 753  751                           * time. This is similar to an ASSERT in seg_unmap()
 754  752                           */
 755  753                          ASSERT(err == 0);
 756  754                  }
 757  755          }
 758  756          hat_free_end(hat);
 759  757          AS_LOCK_EXIT(as);
 760  758  
 761  759          /* /proc stuff */
 762  760          ASSERT(avl_numnodes(&as->a_wpage) == 0);
 763  761          if (as->a_objectdir) {
 764  762                  kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
 765  763                  as->a_objectdir = NULL;
 766  764                  as->a_sizedir = 0;
 767  765          }
 768  766  
 769  767          /*
 770  768           * Free the struct as back to kmem.  Assert it has no segments.
 771  769           */
 772  770          ASSERT(avl_numnodes(&as->a_segtree) == 0);
 773  771          kmem_cache_free(as_cache, as);
 774  772  }
 775  773  
 776  774  int
 777  775  as_dup(struct as *as, struct proc *forkedproc)
 778  776  {
 779  777          struct as *newas;
 780  778          struct seg *seg, *newseg;
 781  779          size_t  purgesize = 0;
 782  780          int error;
 783  781  
 784  782          AS_LOCK_ENTER(as, RW_WRITER);
 785  783          as_clearwatch(as);
 786  784          newas = as_alloc();
 787  785          newas->a_userlimit = as->a_userlimit;
 788  786          newas->a_proc = forkedproc;
 789  787  
 790  788          AS_LOCK_ENTER(newas, RW_WRITER);
 791  789  
 792  790          (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
 793  791  
 794  792          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 795  793  
 796  794                  if (seg->s_flags & S_PURGE) {
 797  795                          purgesize += seg->s_size;
 798  796                          continue;
 799  797                  }
 800  798  
 801  799                  newseg = seg_alloc(newas, seg->s_base, seg->s_size);
 802  800                  if (newseg == NULL) {
 803  801                          AS_LOCK_EXIT(newas);
 804  802                          as_setwatch(as);
 805  803                          AS_LOCK_EXIT(as);
 806  804                          as_free(newas);
 807  805                          return (-1);
 808  806                  }
 809  807                  if ((error = SEGOP_DUP(seg, newseg)) != 0) {
 810  808                          /*
 811  809                           * We call seg_free() on the new seg
 812  810                           * because the segment is not set up
 813  811                           * completely; i.e. it has no ops.
 814  812                           */
 815  813                          as_setwatch(as);
 816  814                          AS_LOCK_EXIT(as);
 817  815                          seg_free(newseg);
 818  816                          AS_LOCK_EXIT(newas);
 819  817                          as_free(newas);
 820  818                          return (error);
 821  819                  }
 822  820                  newas->a_size += seg->s_size;
 823  821          }
 824  822          newas->a_resvsize = as->a_resvsize - purgesize;
 825  823  
 826  824          error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
 827  825  
 828  826          AS_LOCK_EXIT(newas);
 829  827  
 830  828          as_setwatch(as);
 831  829          AS_LOCK_EXIT(as);
 832  830          if (error != 0) {
 833  831                  as_free(newas);
 834  832                  return (error);
 835  833          }
 836  834          forkedproc->p_as = newas;
 837  835          return (0);
 838  836  }
 839  837  
 840  838  /*
 841  839   * Handle a ``fault'' at addr for size bytes.
 842  840   */
 843  841  faultcode_t
 844  842  as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
 845  843      enum fault_type type, enum seg_rw rw)
 846  844  {
 847  845          struct seg *seg;
 848  846          caddr_t raddr;                  /* rounded down addr */
 849  847          size_t rsize;                   /* rounded up size */
 850  848          size_t ssize;
 851  849          faultcode_t res = 0;
 852  850          caddr_t addrsav;
 853  851          struct seg *segsav;
 854  852          int as_lock_held;
 855  853          klwp_t *lwp = ttolwp(curthread);
 856  854          zone_t *zonep = curzone;
 857  855  
 858  856  retry:
 859  857          /*
 860  858           * Indicate that the lwp is not to be stopped while waiting for a
 861  859           * pagefault.  This is to avoid deadlock while debugging a process
 862  860           * via /proc over NFS (in particular).
 863  861           */
 864  862          if (lwp != NULL)
 865  863                  lwp->lwp_nostop++;
 866  864  
 867  865          /*
 868  866           * same length must be used when we softlock and softunlock.  We
 869  867           * don't support softunlocking lengths less than the original length
 870  868           * when there is largepage support.  See seg_dev.c for more
 871  869           * comments.
 872  870           */
 873  871          switch (type) {
 874  872  
 875  873          case F_SOFTLOCK:
 876  874                  CPU_STATS_ADD_K(vm, softlock, 1);
 877  875                  break;
 878  876  
 879  877          case F_SOFTUNLOCK:
 880  878                  break;
 881  879  
 882  880          case F_PROT:
 883  881                  CPU_STATS_ADD_K(vm, prot_fault, 1);
  
 884  882                  break;
 885  883  
 886  884          case F_INVAL:
 887  885                  CPU_STATS_ENTER_K();
 888  886                  CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
 889  887                  if (as == &kas)
 890  888                          CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
 891  889                  CPU_STATS_EXIT_K();
 892  890                  if (zonep->zone_pg_flt_delay != 0) {
 893  891                          /*
 894      -                         * The zone in which this process is running is
 895      -                         * currently over it's physical memory cap. Throttle
 896      -                         * page faults to help the user-land memory capper
 897      -                         * catch up. Note that drv_usectohz() rounds up.
      892 +                         * The zone in which this process is running
      893 +                         * is currently over it's physical memory cap.
      894 +                         * Throttle page faults to help the user-land
      895 +                         * memory capper catch up. Note that
      896 +                         * drv_usectohz() rounds up.
 898  897                           */
 899  898                          atomic_add_64(&zonep->zone_pf_throttle, 1);
 900  899                          atomic_add_64(&zonep->zone_pf_throttle_usec,
 901  900                              zonep->zone_pg_flt_delay);
 902      -                        if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) {
      901 +                        if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1))
 903  902                                  drv_usecwait(zonep->zone_pg_flt_delay);
 904      -                        } else {
      903 +                        else
 905  904                                  delay(drv_usectohz(zonep->zone_pg_flt_delay));
 906      -                        }
 907  905                  }
 908  906                  break;
 909  907          }
 910  908  
 911  909          /* Kernel probe */
 912  910          TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
 913  911              tnf_opaque, address,        addr,
 914  912              tnf_fault_type,     fault_type,     type,
 915  913              tnf_seg_access,     access,         rw);
 916  914  
 917  915          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 918  916          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
 919  917              (size_t)raddr;
 920  918  
 921  919          /*
 922  920           * XXX -- Don't grab the as lock for segkmap. We should grab it for
 923  921           * correctness, but then we could be stuck holding this lock for
 924  922           * a LONG time if the fault needs to be resolved on a slow
 925  923           * filesystem, and then no-one will be able to exec new commands,
 926  924           * as exec'ing requires the write lock on the as.
 927  925           */
 928  926          if (as == &kas && segkmap && segkmap->s_base <= raddr &&
 929  927              raddr + size < segkmap->s_base + segkmap->s_size) {
 930  928                  seg = segkmap;
 931  929                  as_lock_held = 0;
 932  930          } else {
 933  931                  AS_LOCK_ENTER(as, RW_READER);
 934  932  
 935  933                  seg = as_segat(as, raddr);
 936  934                  if (seg == NULL) {
 937  935                          AS_LOCK_EXIT(as);
 938  936                          if (lwp != NULL)
 939  937                                  lwp->lwp_nostop--;
 940  938                          return (FC_NOMAP);
 941  939                  }
 942  940  
 943  941                  as_lock_held = 1;
 944  942          }
 945  943  
 946  944          addrsav = raddr;
 947  945          segsav = seg;
 948  946  
 949  947          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
 950  948                  if (raddr >= seg->s_base + seg->s_size) {
 951  949                          seg = AS_SEGNEXT(as, seg);
 952  950                          if (seg == NULL || raddr != seg->s_base) {
 953  951                                  res = FC_NOMAP;
 954  952                                  break;
 955  953                          }
 956  954                  }
 957  955                  if (raddr + rsize > seg->s_base + seg->s_size)
 958  956                          ssize = seg->s_base + seg->s_size - raddr;
 959  957                  else
 960  958                          ssize = rsize;
 961  959  
 962  960                  res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
 963  961                  if (res != 0)
 964  962                          break;
 965  963          }
 966  964  
 967  965          /*
 968  966           * If we were SOFTLOCKing and encountered a failure,
 969  967           * we must SOFTUNLOCK the range we already did. (Maybe we
 970  968           * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
 971  969           * right here...)
 972  970           */
 973  971          if (res != 0 && type == F_SOFTLOCK) {
 974  972                  for (seg = segsav; addrsav < raddr; addrsav += ssize) {
 975  973                          if (addrsav >= seg->s_base + seg->s_size)
 976  974                                  seg = AS_SEGNEXT(as, seg);
 977  975                          ASSERT(seg != NULL);
 978  976                          /*
 979  977                           * Now call the fault routine again to perform the
 980  978                           * unlock using S_OTHER instead of the rw variable
 981  979                           * since we never got a chance to touch the pages.
 982  980                           */
 983  981                          if (raddr > seg->s_base + seg->s_size)
 984  982                                  ssize = seg->s_base + seg->s_size - addrsav;
 985  983                          else
 986  984                                  ssize = raddr - addrsav;
 987  985                          (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
 988  986                              F_SOFTUNLOCK, S_OTHER);
 989  987                  }
 990  988          }
 991  989          if (as_lock_held)
 992  990                  AS_LOCK_EXIT(as);
 993  991          if (lwp != NULL)
 994  992                  lwp->lwp_nostop--;
 995  993  
 996  994          /*
 997  995           * If the lower levels returned EDEADLK for a fault,
  998  996           * it means that we should retry the fault.  Let's wait
 999  997           * a bit also to let the deadlock causing condition clear.
1000  998           * This is part of a gross hack to work around a design flaw
1001  999           * in the ufs/sds logging code and should go away when the
1002 1000           * logging code is re-designed to fix the problem. See bug
1003 1001           * 4125102 for details of the problem.
1004 1002           */
1005 1003          if (FC_ERRNO(res) == EDEADLK) {
1006 1004                  delay(deadlk_wait);
1007 1005                  res = 0;
1008 1006                  goto retry;
1009 1007          }
1010 1008          return (res);
1011 1009  }
1012 1010  
1013 1011  
1014 1012  
1015 1013  /*
1016 1014   * Asynchronous ``fault'' at addr for size bytes.
1017 1015   */
1018 1016  faultcode_t
1019 1017  as_faulta(struct as *as, caddr_t addr, size_t size)
1020 1018  {
1021 1019          struct seg *seg;
1022 1020          caddr_t raddr;                  /* rounded down addr */
1023 1021          size_t rsize;                   /* rounded up size */
1024 1022          faultcode_t res = 0;
1025 1023          klwp_t *lwp = ttolwp(curthread);
1026 1024  
1027 1025  retry:
1028 1026          /*
1029 1027           * Indicate that the lwp is not to be stopped while waiting
1030 1028           * for a pagefault.  This is to avoid deadlock while debugging
1031 1029           * a process via /proc over NFS (in particular).
1032 1030           */
1033 1031          if (lwp != NULL)
1034 1032                  lwp->lwp_nostop++;
1035 1033  
1036 1034          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1037 1035          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1038 1036              (size_t)raddr;
1039 1037  
1040 1038          AS_LOCK_ENTER(as, RW_READER);
1041 1039          seg = as_segat(as, raddr);
1042 1040          if (seg == NULL) {
1043 1041                  AS_LOCK_EXIT(as);
1044 1042                  if (lwp != NULL)
1045 1043                          lwp->lwp_nostop--;
1046 1044                  return (FC_NOMAP);
1047 1045          }
1048 1046  
1049 1047          for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1050 1048                  if (raddr >= seg->s_base + seg->s_size) {
1051 1049                          seg = AS_SEGNEXT(as, seg);
1052 1050                          if (seg == NULL || raddr != seg->s_base) {
1053 1051                                  res = FC_NOMAP;
1054 1052                                  break;
1055 1053                          }
1056 1054                  }
1057 1055                  res = SEGOP_FAULTA(seg, raddr);
1058 1056                  if (res != 0)
1059 1057                          break;
1060 1058          }
1061 1059          AS_LOCK_EXIT(as);
1062 1060          if (lwp != NULL)
1063 1061                  lwp->lwp_nostop--;
1064 1062          /*
1065 1063           * If the lower levels returned EDEADLK for a fault,
 1066 1064           * it means that we should retry the fault.  Let's wait
1067 1065           * a bit also to let the deadlock causing condition clear.
1068 1066           * This is part of a gross hack to work around a design flaw
1069 1067           * in the ufs/sds logging code and should go away when the
1070 1068           * logging code is re-designed to fix the problem. See bug
1071 1069           * 4125102 for details of the problem.
1072 1070           */
1073 1071          if (FC_ERRNO(res) == EDEADLK) {
1074 1072                  delay(deadlk_wait);
1075 1073                  res = 0;
1076 1074                  goto retry;
1077 1075          }
1078 1076          return (res);
1079 1077  }
1080 1078  
1081 1079  /*
1082 1080   * Set the virtual mapping for the interval from [addr : addr + size)
1083 1081   * in address space `as' to have the specified protection.
1084 1082   * It is ok for the range to cross over several segments,
1085 1083   * as long as they are contiguous.
1086 1084   */
1087 1085  int
1088 1086  as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1089 1087  {
1090 1088          struct seg *seg;
1091 1089          struct as_callback *cb;
1092 1090          size_t ssize;
1093 1091          caddr_t raddr;                  /* rounded down addr */
1094 1092          size_t rsize;                   /* rounded up size */
1095 1093          int error = 0, writer = 0;
1096 1094          caddr_t saveraddr;
1097 1095          size_t saversize;
1098 1096  
1099 1097  setprot_top:
1100 1098          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1101 1099          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1102 1100              (size_t)raddr;
1103 1101  
1104 1102          if (raddr + rsize < raddr)              /* check for wraparound */
1105 1103                  return (ENOMEM);
1106 1104  
1107 1105          saveraddr = raddr;
1108 1106          saversize = rsize;
1109 1107  
1110 1108          /*
1111 1109           * Normally we only lock the as as a reader. But
1112 1110           * if due to setprot the segment driver needs to split
1113 1111           * a segment it will return IE_RETRY. Therefore we re-acquire
1114 1112           * the as lock as a writer so the segment driver can change
1115 1113           * the seg list. Also the segment driver will return IE_RETRY
 1116 1114           * after it has changed the segment list, so we keep
 1117 1115           * locking as a writer. Since these operations should be rare we
 1118 1116           * want to only lock as a writer when necessary.
1119 1117           */
1120 1118          if (writer || avl_numnodes(&as->a_wpage) != 0) {
1121 1119                  AS_LOCK_ENTER(as, RW_WRITER);
1122 1120          } else {
1123 1121                  AS_LOCK_ENTER(as, RW_READER);
1124 1122          }
1125 1123  
1126 1124          as_clearwatchprot(as, raddr, rsize);
1127 1125          seg = as_segat(as, raddr);
1128 1126          if (seg == NULL) {
1129 1127                  as_setwatch(as);
1130 1128                  AS_LOCK_EXIT(as);
1131 1129                  return (ENOMEM);
1132 1130          }
1133 1131  
1134 1132          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1135 1133                  if (raddr >= seg->s_base + seg->s_size) {
1136 1134                          seg = AS_SEGNEXT(as, seg);
1137 1135                          if (seg == NULL || raddr != seg->s_base) {
1138 1136                                  error = ENOMEM;
1139 1137                                  break;
1140 1138                          }
1141 1139                  }
1142 1140                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
1143 1141                          ssize = seg->s_base + seg->s_size - raddr;
1144 1142                  else
1145 1143                          ssize = rsize;
1146 1144  retry:
1147 1145                  error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1148 1146  
1149 1147                  if (error == IE_NOMEM) {
1150 1148                          error = EAGAIN;
1151 1149                          break;
1152 1150                  }
1153 1151  
1154 1152                  if (error == IE_RETRY) {
1155 1153                          AS_LOCK_EXIT(as);
1156 1154                          writer = 1;
1157 1155                          goto setprot_top;
1158 1156                  }
1159 1157  
1160 1158                  if (error == EAGAIN) {
1161 1159                          /*
1162 1160                           * Make sure we have a_lock as writer.
1163 1161                           */
1164 1162                          if (writer == 0) {
1165 1163                                  AS_LOCK_EXIT(as);
1166 1164                                  writer = 1;
1167 1165                                  goto setprot_top;
1168 1166                          }
1169 1167  
1170 1168                          /*
1171 1169                           * Memory is currently locked.  It must be unlocked
1172 1170                           * before this operation can succeed through a retry.
1173 1171                           * The possible reasons for locked memory and
1174 1172                           * corresponding strategies for unlocking are:
1175 1173                           * (1) Normal I/O
1176 1174                           *      wait for a signal that the I/O operation
1177 1175                           *      has completed and the memory is unlocked.
1178 1176                           * (2) Asynchronous I/O
1179 1177                           *      The aio subsystem does not unlock pages when
1180 1178                           *      the I/O is completed. Those pages are unlocked
1181 1179                           *      when the application calls aiowait/aioerror.
1182 1180                           *      So, to prevent blocking forever, cv_broadcast()
1183 1181                           *      is done to wake up aio_cleanup_thread.
1184 1182                           *      Subsequently, segvn_reclaim will be called, and
1185 1183                           *      that will do AS_CLRUNMAPWAIT() and wake us up.
1186 1184                           * (3) Long term page locking:
1187 1185                           *      Drivers intending to have pages locked for a
1188 1186                           *      period considerably longer than for normal I/O
1189 1187                           *      (essentially forever) may have registered for a
1190 1188                           *      callback so they may unlock these pages on
1191 1189                           *      request. This is needed to allow this operation
1192 1190                           *      to succeed. Each entry on the callback list is
1193 1191                           *      examined. If the event or address range pertains
1194 1192                           *      the callback is invoked (unless it already is in
1195 1193                           *      progress). The a_contents lock must be dropped
1196 1194                           *      before the callback, so only one callback can
1197 1195                           *      be done at a time. Go to the top and do more
1198 1196                           *      until zero is returned. If zero is returned,
1199 1197                           *      either there were no callbacks for this event
1200 1198                           *      or they were already in progress.
1201 1199                           */
1202 1200                          mutex_enter(&as->a_contents);
1203 1201                          if (as->a_callbacks &&
1204 1202                              (cb = as_find_callback(as, AS_SETPROT_EVENT,
1205 1203                              seg->s_base, seg->s_size))) {
1206 1204                                  AS_LOCK_EXIT(as);
1207 1205                                  as_execute_callback(as, cb, AS_SETPROT_EVENT);
1208 1206                          } else if (!AS_ISNOUNMAPWAIT(as)) {
1209 1207                                  if (AS_ISUNMAPWAIT(as) == 0)
1210 1208                                          cv_broadcast(&as->a_cv);
1211 1209                                  AS_SETUNMAPWAIT(as);
1212 1210                                  AS_LOCK_EXIT(as);
1213 1211                                  while (AS_ISUNMAPWAIT(as))
1214 1212                                          cv_wait(&as->a_cv, &as->a_contents);
1215 1213                          } else {
1216 1214                                  /*
1217 1215                                   * We may have raced with
1218 1216                                   * segvn_reclaim()/segspt_reclaim(). In this
1219 1217                                   * case clean nounmapwait flag and retry since
1220 1218                                   * softlockcnt in this segment may be already
1221 1219                                   * 0.  We don't drop as writer lock so our
1222 1220                                   * number of retries without sleeping should
1223 1221                                   * be very small. See segvn_reclaim() for
1224 1222                                   * more comments.
1225 1223                                   */
1226 1224                                  AS_CLRNOUNMAPWAIT(as);
1227 1225                                  mutex_exit(&as->a_contents);
1228 1226                                  goto retry;
1229 1227                          }
1230 1228                          mutex_exit(&as->a_contents);
1231 1229                          goto setprot_top;
1232 1230                  } else if (error != 0)
1233 1231                          break;
1234 1232          }
1235 1233          if (error != 0) {
1236 1234                  as_setwatch(as);
1237 1235          } else {
1238 1236                  as_setwatchprot(as, saveraddr, saversize, prot);
1239 1237          }
1240 1238          AS_LOCK_EXIT(as);
1241 1239          return (error);
1242 1240  }
1243 1241  
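The EAGAIN path above relies on a simple announce-and-wait protocol: wake any cleanup thread, set the wait flag, then sleep on a_cv until the flag is cleared. A minimal kernel-context sketch of that protocol with invented names (the real code keeps the flag in a_flags under the a_contents mutex and drops the AS lock before sleeping):

#include <sys/types.h>
#include <sys/mutex.h>
#include <sys/condvar.h>

/* Invented stand-ins for a_contents, a_cv and the AS_UNMAPWAIT flag. */
struct waiter_state {
	kmutex_t ws_lock;
	kcondvar_t ws_cv;
	int ws_waiting;
};

/* The retrying side: announce, nudge the cleanup thread, then sleep. */
static void
wait_for_unlock(struct waiter_state *ws)
{
	mutex_enter(&ws->ws_lock);
	if (!ws->ws_waiting)
		cv_broadcast(&ws->ws_cv);	/* wake e.g. an aio cleaner */
	ws->ws_waiting = 1;
	while (ws->ws_waiting)
		cv_wait(&ws->ws_cv, &ws->ws_lock);
	mutex_exit(&ws->ws_lock);
}

/* The reclaiming side: clear the flag and wake every waiter. */
static void
signal_unlocked(struct waiter_state *ws)
{
	mutex_enter(&ws->ws_lock);
	ws->ws_waiting = 0;
	cv_broadcast(&ws->ws_cv);
	mutex_exit(&ws->ws_lock);
}

In the real code the clearing side is segvn_reclaim()/segspt_reclaim() via AS_CLRUNMAPWAIT(), and the waiter additionally drops the AS lock before sleeping and loops back to setprot_top afterwards.
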
1244 1242  /*
1245 1243   * Check to make sure that the interval [addr, addr + size)
1246 1244   * in address space `as' has at least the specified protection.
1247 1245   * It is ok for the range to cross over several segments, as long
1248 1246   * as they are contiguous.
1249 1247   */
1250 1248  int
1251 1249  as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1252 1250  {
1253 1251          struct seg *seg;
1254 1252          size_t ssize;
1255 1253          caddr_t raddr;                  /* rounded down addr */
1256 1254          size_t rsize;                   /* rounded up size */
1257 1255          int error = 0;
1258 1256  
1259 1257          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1260 1258          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1261 1259              (size_t)raddr;
1262 1260  
1263 1261          if (raddr + rsize < raddr)              /* check for wraparound */
1264 1262                  return (ENOMEM);
1265 1263  
1266 1264          /*
1267 1265           * This is ugly as sin...
1268 1266           * Normally, we only acquire the address space readers lock.
1269 1267           * However, if the address space has watchpoints present,
1270 1268           * we must acquire the writer lock on the address space for
1271 1269           * the benefit of as_clearwatchprot() and as_setwatchprot().
1272 1270           */
1273 1271          if (avl_numnodes(&as->a_wpage) != 0)
1274 1272                  AS_LOCK_ENTER(as, RW_WRITER);
1275 1273          else
1276 1274                  AS_LOCK_ENTER(as, RW_READER);
1277 1275          as_clearwatchprot(as, raddr, rsize);
1278 1276          seg = as_segat(as, raddr);
1279 1277          if (seg == NULL) {
1280 1278                  as_setwatch(as);
1281 1279                  AS_LOCK_EXIT(as);
1282 1280                  return (ENOMEM);
1283 1281          }
1284 1282  
1285 1283          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1286 1284                  if (raddr >= seg->s_base + seg->s_size) {
1287 1285                          seg = AS_SEGNEXT(as, seg);
1288 1286                          if (seg == NULL || raddr != seg->s_base) {
1289 1287                                  error = ENOMEM;
1290 1288                                  break;
1291 1289                          }
1292 1290                  }
1293 1291                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
1294 1292                          ssize = seg->s_base + seg->s_size - raddr;
1295 1293                  else
1296 1294                          ssize = rsize;
1297 1295  
1298 1296                  error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1299 1297                  if (error != 0)
1300 1298                          break;
1301 1299          }
1302 1300          as_setwatch(as);
1303 1301          AS_LOCK_EXIT(as);
1304 1302          return (error);
1305 1303  }
1306 1304  
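as_checkprot() begins, like most entry points in this file, by rounding addr down and addr + size up to page boundaries and rejecting wraparound with ENOMEM. A standalone sketch of that arithmetic, assuming a hypothetical 4K page size in place of the kernel's PAGESIZE/PAGEOFFSET/PAGEMASK:

#include <stdint.h>
#include <stdio.h>

#define	PGSIZE		4096UL			/* assumed page size */
#define	PGOFFSET	(PGSIZE - 1)
#define	PGMASK		(~PGOFFSET)

int
main(void)
{
	uintptr_t addr = 0x12345;		/* unaligned start */
	size_t size = 10000;			/* unaligned length */

	/* Round the start down and the end up, as as_checkprot() does. */
	uintptr_t raddr = addr & PGMASK;
	size_t rsize = ((addr + size + PGOFFSET) & PGMASK) - raddr;

	/* The wraparound check corresponds to the kernel's ENOMEM return. */
	if (raddr + rsize < raddr)
		return (1);

	printf("raddr=0x%lx rsize=0x%zx (%zu pages)\n",
	    (unsigned long)raddr, rsize, rsize / PGSIZE);
	return (0);
}

Run as-is this prints raddr=0x12000 rsize=0x3000 (3 pages): the unaligned 10000-byte request spans three 4K pages.
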
1307 1305  int
1308 1306  as_unmap(struct as *as, caddr_t addr, size_t size)
1309 1307  {
1310 1308          struct seg *seg, *seg_next;
1311 1309          struct as_callback *cb;
1312 1310          caddr_t raddr, eaddr;
1313 1311          size_t ssize, rsize = 0;
1314 1312          int err;
1315 1313  
1316 1314  top:
1317 1315          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1318 1316          eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1319 1317              (uintptr_t)PAGEMASK);
1320 1318  
1321 1319          AS_LOCK_ENTER(as, RW_WRITER);
1322 1320  
1323 1321          as->a_updatedir = 1;    /* inform /proc */
1324 1322          gethrestime(&as->a_updatetime);
1325 1323  
1326 1324          /*
1327 1325           * Use as_findseg to find the first segment in the range, then
1328 1326           * step through the segments in order, following s_next.
1329 1327           */
1330 1328          as_clearwatchprot(as, raddr, eaddr - raddr);
1331 1329  
1332 1330          for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1333 1331                  if (eaddr <= seg->s_base)
1334 1332                          break;          /* eaddr was in a gap; all done */
1335 1333  
1336 1334                  /* this is implied by the test above */
1337 1335                  ASSERT(raddr < eaddr);
1338 1336  
1339 1337                  if (raddr < seg->s_base)
1340 1338                          raddr = seg->s_base;    /* raddr was in a gap */
1341 1339  
1342 1340                  if (eaddr > (seg->s_base + seg->s_size))
1343 1341                          ssize = seg->s_base + seg->s_size - raddr;
1344 1342                  else
1345 1343                          ssize = eaddr - raddr;
1346 1344  
1347 1345                  /*
1348 1346                   * Save next segment pointer since seg can be
1349 1347                   * destroyed during the segment unmap operation.
1350 1348                   */
1351 1349                  seg_next = AS_SEGNEXT(as, seg);
1352 1350  
1353 1351                  /*
1354 1352                   * We didn't count /dev/null mappings, so ignore them here.
1355 1353                   * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1356 1354                   * we have to do this check here while we have seg.)
1357 1355                   */
1358 1356                  rsize = 0;
1359 1357                  if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1360 1358                      !SEG_IS_PARTIAL_RESV(seg))
1361 1359                          rsize = ssize;
1362 1360  
1363 1361  retry:
1364 1362                  err = SEGOP_UNMAP(seg, raddr, ssize);
1365 1363                  if (err == EAGAIN) {
1366 1364                          /*
1367 1365                           * Memory is currently locked.  It must be unlocked
1368 1366                           * before this operation can succeed through a retry.
1369 1367                           * The possible reasons for locked memory and
1370 1368                           * corresponding strategies for unlocking are:
1371 1369                           * (1) Normal I/O
1372 1370                           *      wait for a signal that the I/O operation
1373 1371                           *      has completed and the memory is unlocked.
1374 1372                           * (2) Asynchronous I/O
1375 1373                           *      The aio subsystem does not unlock pages when
1376 1374                           *      the I/O is completed. Those pages are unlocked
1377 1375                           *      when the application calls aiowait/aioerror.
1378 1376                           *      So, to prevent blocking forever, cv_broadcast()
1379 1377                           *      is done to wake up aio_cleanup_thread.
1380 1378                           *      Subsequently, segvn_reclaim will be called, and
1381 1379                           *      that will do AS_CLRUNMAPWAIT() and wake us up.
1382 1380                           * (3) Long term page locking:
1383 1381                           *      Drivers intending to have pages locked for a
1384 1382                           *      period considerably longer than for normal I/O
1385 1383                           *      (essentially forever) may have registered for a
1386 1384                           *      callback so they may unlock these pages on
1387 1385                           *      request. This is needed to allow this operation
1388 1386                           *      to succeed. Each entry on the callback list is
1389 1387                           * examined. If the event or address range pertains,
1390 1388                           *      the callback is invoked (unless it already is in
1391 1389                           *      progress). The a_contents lock must be dropped
1392 1390                           *      before the callback, so only one callback can
1393 1391                           *      be done at a time. Go to the top and do more
1394 1392                           *      until zero is returned. If zero is returned,
1395 1393                           *      either there were no callbacks for this event
1396 1394                           *      or they were already in progress.
1397 1395                           */
1398 1396                          mutex_enter(&as->a_contents);
1399 1397                          if (as->a_callbacks &&
1400 1398                              (cb = as_find_callback(as, AS_UNMAP_EVENT,
1401 1399                              seg->s_base, seg->s_size))) {
1402 1400                                  AS_LOCK_EXIT(as);
1403 1401                                  as_execute_callback(as, cb, AS_UNMAP_EVENT);
1404 1402                          } else if (!AS_ISNOUNMAPWAIT(as)) {
1405 1403                                  if (AS_ISUNMAPWAIT(as) == 0)
1406 1404                                          cv_broadcast(&as->a_cv);
1407 1405                                  AS_SETUNMAPWAIT(as);
1408 1406                                  AS_LOCK_EXIT(as);
1409 1407                                  while (AS_ISUNMAPWAIT(as))
1410 1408                                          cv_wait(&as->a_cv, &as->a_contents);
1411 1409                          } else {
1412 1410                                  /*
1413 1411                                   * We may have raced with
1414 1412                                   * segvn_reclaim()/segspt_reclaim(). In this
1415 1413                                   * case clean nounmapwait flag and retry since
1416 1414                                   * softlockcnt in this segment may be already
1417 1415                                   * 0.  We don't drop as writer lock so our
1418 1416                                   * number of retries without sleeping should
1419 1417                                   * be very small. See segvn_reclaim() for
1420 1418                                   * more comments.
1421 1419                                   */
1422 1420                                  AS_CLRNOUNMAPWAIT(as);
1423 1421                                  mutex_exit(&as->a_contents);
1424 1422                                  goto retry;
1425 1423                          }
1426 1424                          mutex_exit(&as->a_contents);
1427 1425                          goto top;
1428 1426                  } else if (err == IE_RETRY) {
1429 1427                          AS_LOCK_EXIT(as);
1430 1428                          goto top;
1431 1429                  } else if (err) {
1432 1430                          as_setwatch(as);
1433 1431                          AS_LOCK_EXIT(as);
1434 1432                          return (-1);
1435 1433                  }
1436 1434  
1437 1435                  as->a_size -= ssize;
1438 1436                  if (rsize)
1439 1437                          as->a_resvsize -= rsize;
1440 1438                  raddr += ssize;
1441 1439          }
1442 1440          AS_LOCK_EXIT(as);
1443 1441          return (0);
1444 1442  }
1445 1443  
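Note the seg_next bookkeeping above: as_unmap() captures the successor before SEGOP_UNMAP() because a successful unmap may free the segment it was just looking at. A minimal user-space sketch of that save-the-successor idiom on an invented singly linked list (none of these names are kernel structures):

#include <stdlib.h>

struct node {
	struct node *n_next;
	int n_doomed;
};

/*
 * Remove doomed nodes from a singly linked list.  The successor is
 * captured *before* the node can be freed, mirroring how as_unmap()
 * captures seg_next before SEGOP_UNMAP() may destroy seg.
 */
static struct node *
purge_doomed(struct node *head)
{
	struct node *cur, *next, **prevp = &head;

	for (cur = head; cur != NULL; cur = next) {
		next = cur->n_next;	/* save before cur may go away */
		if (cur->n_doomed) {
			*prevp = next;
			free(cur);
		} else {
			prevp = &cur->n_next;
		}
	}
	return (head);
}
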
1446 1444  static int
1447 1445  as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1448 1446      int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1449 1447  {
1450 1448          uint_t szc;
1451 1449          uint_t nszc;
1452 1450          int error;
1453 1451          caddr_t a;
1454 1452          caddr_t eaddr;
1455 1453          size_t segsize;
1456 1454          struct seg *seg;
1457 1455          size_t pgsz;
1458 1456          int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1459 1457          uint_t save_szcvec;
1460 1458  
1461 1459          ASSERT(AS_WRITE_HELD(as));
1462 1460          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1463 1461          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1464 1462          ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1465 1463          if (!do_off) {
1466 1464                  vn_a->offset = 0;
1467 1465          }
1468 1466  
1469 1467          if (szcvec <= 1) {
1470 1468                  seg = seg_alloc(as, addr, size);
1471 1469                  if (seg == NULL) {
1472 1470                          return (ENOMEM);
1473 1471                  }
1474 1472                  vn_a->szc = 0;
1475 1473                  error = (*crfp)(seg, vn_a);
1476 1474                  if (error != 0) {
1477 1475                          seg_free(seg);
1478 1476                  } else {
1479 1477                          as->a_size += size;
1480 1478                          as->a_resvsize += size;
1481 1479                  }
1482 1480                  return (error);
1483 1481          }
1484 1482  
1485 1483          eaddr = addr + size;
1486 1484          save_szcvec = szcvec;
1487 1485          szcvec >>= 1;
1488 1486          szc = 0;
1489 1487          nszc = 0;
1490 1488          while (szcvec) {
1491 1489                  if ((szcvec & 0x1) == 0) {
1492 1490                          nszc++;
1493 1491                          szcvec >>= 1;
1494 1492                          continue;
1495 1493                  }
1496 1494                  nszc++;
1497 1495                  pgsz = page_get_pagesize(nszc);
1498 1496                  a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1499 1497                  if (a != addr) {
1500 1498                          ASSERT(a < eaddr);
1501 1499                          segsize = a - addr;
1502 1500                          seg = seg_alloc(as, addr, segsize);
1503 1501                          if (seg == NULL) {
1504 1502                                  return (ENOMEM);
1505 1503                          }
1506 1504                          vn_a->szc = szc;
1507 1505                          error = (*crfp)(seg, vn_a);
1508 1506                          if (error != 0) {
1509 1507                                  seg_free(seg);
1510 1508                                  return (error);
1511 1509                          }
1512 1510                          as->a_size += segsize;
1513 1511                          as->a_resvsize += segsize;
1514 1512                          *segcreated = 1;
1515 1513                          if (do_off) {
1516 1514                                  vn_a->offset += segsize;
1517 1515                          }
1518 1516                          addr = a;
1519 1517                  }
1520 1518                  szc = nszc;
1521 1519                  szcvec >>= 1;
1522 1520          }
1523 1521  
1524 1522          ASSERT(addr < eaddr);
1525 1523          szcvec = save_szcvec | 1; /* add 8K pages */
1526 1524          while (szcvec) {
1527 1525                  a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1528 1526                  ASSERT(a >= addr);
1529 1527                  if (a != addr) {
1530 1528                          segsize = a - addr;
1531 1529                          seg = seg_alloc(as, addr, segsize);
1532 1530                          if (seg == NULL) {
1533 1531                                  return (ENOMEM);
1534 1532                          }
1535 1533                          vn_a->szc = szc;
1536 1534                          error = (*crfp)(seg, vn_a);
1537 1535                          if (error != 0) {
1538 1536                                  seg_free(seg);
1539 1537                                  return (error);
1540 1538                          }
1541 1539                          as->a_size += segsize;
1542 1540                          as->a_resvsize += segsize;
1543 1541                          *segcreated = 1;
1544 1542                          if (do_off) {
1545 1543                                  vn_a->offset += segsize;
1546 1544                          }
1547 1545                          addr = a;
1548 1546                  }
1549 1547                  szcvec &= ~(1 << szc);
1550 1548                  if (szcvec) {
1551 1549                          szc = highbit(szcvec) - 1;
1552 1550                          pgsz = page_get_pagesize(szc);
1553 1551                  }
1554 1552          }
1555 1553          ASSERT(addr == eaddr);
1556 1554  
1557 1555          return (0);
1558 1556  }
1559 1557  
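as_map_segvn_segs() reads szcvec as a bitmap of usable page-size codes: the first pass steps the start address up to each successively larger supported boundary, creating a segment at the previous size code for every advance, and the second pass works back down from the largest size so the tail is covered too. A standalone sketch of that carve-up, assuming a made-up 8K/64K/512K/4M size-code table and using __builtin_clz() in place of the kernel's highbit()/page_get_pagesize():

#include <stdint.h>
#include <stdio.h>

/* Assumed page sizes by size code; not the kernel's table. */
static const size_t pgsz_by_szc[] = { 8192, 65536, 524288, 4194304 };

#define	P2RUP(x, a)	(((x) + ((a) - 1)) & ~((uintptr_t)(a) - 1))
#define	P2ALN(x, a)	((x) & ~((uintptr_t)(a) - 1))

static void
carve(uintptr_t addr, size_t size, unsigned szcvec)
{
	uintptr_t eaddr = addr + size;
	unsigned szc = 0, nszc = 0, save = szcvec;
	size_t pgsz = pgsz_by_szc[0];

	/* First pass: step the start up through the supported sizes. */
	for (szcvec >>= 1; szcvec != 0; szcvec >>= 1) {
		nszc++;
		if ((szcvec & 1) == 0)
			continue;
		pgsz = pgsz_by_szc[nszc];
		uintptr_t a = P2RUP(addr, pgsz);
		if (a != addr) {
			printf("seg [0x%lx, 0x%lx) szc=%u\n",
			    (unsigned long)addr, (unsigned long)a, szc);
			addr = a;
		}
		szc = nszc;
	}

	/* Second pass: step back down so the tail is covered as well. */
	for (szcvec = save | 1; szcvec != 0; ) {
		uintptr_t a = P2ALN(eaddr, pgsz);
		if (a != addr) {
			printf("seg [0x%lx, 0x%lx) szc=%u\n",
			    (unsigned long)addr, (unsigned long)a, szc);
			addr = a;
		}
		szcvec &= ~(1u << szc);
		if (szcvec != 0) {
			szc = 31 - __builtin_clz(szcvec);	/* highbit - 1 */
			pgsz = pgsz_by_szc[szc];
		}
	}
}

int
main(void)
{
	/* 8K, 64K and 4M usable: bits 0, 1 and 3, i.e. szcvec = 0xb. */
	carve(0x10000, 0x900000, 0xb);
	return (0);
}

For this input it emits a 64K-page run up to the first 4M boundary, one 4M page, and a 64K-page tail; no 8K remainder is needed here because the end happens to be 64K-aligned.
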
1560 1558  static int
1561 1559  as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1562 1560      int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1563 1561  {
1564 1562          uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1565 1563          int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1566 1564          uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1567 1565              type, 0);
1568 1566          int error;
1569 1567          struct seg *seg;
1570 1568          struct vattr va;
1571 1569          u_offset_t eoff;
1572 1570          size_t save_size = 0;
1573 1571          extern size_t textrepl_size_thresh;
1574 1572  
1575 1573          ASSERT(AS_WRITE_HELD(as));
1576 1574          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1577 1575          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1578 1576          ASSERT(vn_a->vp != NULL);
1579 1577          ASSERT(vn_a->amp == NULL);
1580 1578  
1581 1579  again:
1582 1580          if (szcvec <= 1) {
1583 1581                  seg = seg_alloc(as, addr, size);
1584 1582                  if (seg == NULL) {
1585 1583                          return (ENOMEM);
1586 1584                  }
1587 1585                  vn_a->szc = 0;
1588 1586                  error = (*crfp)(seg, vn_a);
1589 1587                  if (error != 0) {
1590 1588                          seg_free(seg);
1591 1589                  } else {
1592 1590                          as->a_size += size;
1593 1591                          as->a_resvsize += size;
1594 1592                  }
1595 1593                  return (error);
1596 1594          }
1597 1595  
1598 1596          va.va_mask = AT_SIZE;
1599 1597          if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1600 1598                  szcvec = 0;
1601 1599                  goto again;
1602 1600          }
1603 1601          eoff = vn_a->offset & PAGEMASK;
1604 1602          if (eoff >= va.va_size) {
1605 1603                  szcvec = 0;
1606 1604                  goto again;
1607 1605          }
1608 1606          eoff += size;
1609 1607          if (btopr(va.va_size) < btopr(eoff)) {
1610 1608                  save_size = size;
1611 1609                  size = va.va_size - (vn_a->offset & PAGEMASK);
1612 1610                  size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1613 1611                  szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1614 1612                      type, 0);
1615 1613                  if (szcvec <= 1) {
1616 1614                          size = save_size;
1617 1615                          goto again;
1618 1616                  }
1619 1617          }
1620 1618  
1621 1619          if (size > textrepl_size_thresh) {
1622 1620                  vn_a->flags |= _MAP_TEXTREPL;
1623 1621          }
1624 1622          error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1625 1623              segcreated);
1626 1624          if (error != 0) {
1627 1625                  return (error);
1628 1626          }
1629 1627          if (save_size) {
1630 1628                  addr += size;
1631 1629                  size = save_size - size;
1632 1630                  szcvec = 0;
1633 1631                  goto again;
1634 1632          }
1635 1633          return (0);
1636 1634  }
1637 1635  
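The VOP_GETATTR() branch above trims the large-page request to the file's size: only the file-backed portion is worth mapping with large pages, and the remainder past end-of-file is mapped again with szcvec set to 0. A standalone sketch of just that trimming arithmetic, with made-up numbers and an assumed 8K base page:

#include <stdint.h>
#include <stdio.h>

#define	PAGESZ	8192UL			/* assumed base page size */

int
main(void)
{
	uint64_t va_size = 0x123400;	/* hypothetical file size */
	uint64_t offset = 0x2000;	/* page-aligned mapping offset */
	size_t size = 0x200000;		/* requested mapping length */
	size_t save_size = 0;

	/* Does the mapping run past end-of-file?  (btopr-style compare) */
	if ((va_size + PAGESZ - 1) / PAGESZ <
	    (offset + size + PAGESZ - 1) / PAGESZ) {
		save_size = size;
		/* Large pages are only worthwhile over the backed part. */
		size = va_size - offset;
		size = (size + PAGESZ - 1) & ~(PAGESZ - 1);	/* round up */
	}

	printf("large-page portion:  0x%zx bytes\n", size);
	printf("base-page remainder: 0x%zx bytes\n",
	    save_size != 0 ? save_size - size : (size_t)0);
	return (0);
}
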
1638 1636  /*
1639 1637   * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1640 1638   * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1641 1639   */
1642 1640  static int
1643 1641  as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1644 1642      int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1645 1643  {
1646 1644          uint_t szcvec;
1647 1645          uchar_t type;
1648 1646  
1649 1647          ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1650 1648          if (vn_a->type == MAP_SHARED) {
1651 1649                  type = MAPPGSZC_SHM;
1652 1650          } else if (vn_a->type == MAP_PRIVATE) {
1653 1651                  if (vn_a->szc == AS_MAP_HEAP) {
1654 1652                          type = MAPPGSZC_HEAP;
1655 1653                  } else if (vn_a->szc == AS_MAP_STACK) {
1656 1654                          type = MAPPGSZC_STACK;
1657 1655                  } else {
1658 1656                          type = MAPPGSZC_PRIVM;
1659 1657                  }
1660 1658          }
1661 1659          szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1662 1660              (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1663 1661              (vn_a->flags & MAP_TEXT), type, 0);
1664 1662          ASSERT(AS_WRITE_HELD(as));
1665 1663          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1666 1664          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1667 1665          ASSERT(vn_a->vp == NULL);
1668 1666  
1669 1667          return (as_map_segvn_segs(as, addr, size, szcvec,
1670 1668              crfp, vn_a, segcreated));
1671 1669  }
1672 1670  
1673 1671  int
1674 1672  as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1675 1673  {
1676 1674          AS_LOCK_ENTER(as, RW_WRITER);
1677 1675          return (as_map_locked(as, addr, size, crfp, argsp));
1678 1676  }
1679 1677  
1680 1678  int
1681 1679  as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1682 1680      void *argsp)
1683 1681  {
1684 1682          struct seg *seg = NULL;
1685 1683          caddr_t raddr;                  /* rounded down addr */
1686 1684          size_t rsize;                   /* rounded up size */
1687 1685          int error;
1688 1686          int unmap = 0;
1689 1687          /*
1690 1688           * The use of a_proc is preferred to handle the case where curproc is
1691 1689           * a door_call server and is allocating memory in the client's (a_proc)
1692 1690           * address space.
1693 1691           * When creating a shared memory segment, a_proc will be NULL, so we
1694 1692           * fall back to curproc in that case.
1695 1693           */
1696 1694          struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1697 1695          struct segvn_crargs crargs;
1698 1696  
1699 1697          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1700 1698          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1701 1699              (size_t)raddr;
1702 1700  
1703 1701          /*
1704 1702           * check for wrap around
1705 1703           */
1706 1704          if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1707 1705                  AS_LOCK_EXIT(as);
1708 1706                  return (ENOMEM);
1709 1707          }
1710 1708  
1711 1709          as->a_updatedir = 1;    /* inform /proc */
1712 1710          gethrestime(&as->a_updatetime);
1713 1711  
1714      -        if (as != &kas) {
1715      -                if (as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1716      -                        AS_LOCK_EXIT(as);
     1712 +        if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
     1713 +                AS_LOCK_EXIT(as);
1717 1714  
1718      -                        (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
1719      -                            p->p_rctls, p, RCA_UNSAFE_ALL);
1720      -                        return (ENOMEM);
1721      -                }
     1715 +                (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
     1716 +                    RCA_UNSAFE_ALL);
1722 1717  
1723      -                /*
1724      -                 * Keep the number of segments in a userspace AS constrained to
1725      -                 * a reasonable limit.  Linux enforces a value slightly less
1726      -                 * than 64k in order to avoid ELF limits if/when a process
1727      -                 * dumps core.  While SunOS avoids that specific problem with
1728      -                 * other tricks, the limit is still valuable to keep kernel
1729      -                 * memory consumption in check.
1730      -                 */
1731      -                if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) {
1732      -                        AS_LOCK_EXIT(as);
1733      -                        atomic_inc_32(&p->p_zone->zone_mfseglim);
1734      -                        return (ENOMEM);
1735      -                }
     1718 +                return (ENOMEM);
1736 1719          }
1737 1720  
1738 1721          if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1739 1722                  crargs = *(struct segvn_crargs *)argsp;
1740 1723                  error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1741 1724                  if (error != 0) {
1742 1725                          AS_LOCK_EXIT(as);
1743 1726                          if (unmap) {
1744 1727                                  (void) as_unmap(as, addr, size);
1745 1728                          }
1746 1729                          return (error);
1747 1730                  }
1748 1731          } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1749 1732                  crargs = *(struct segvn_crargs *)argsp;
1750 1733                  error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1751 1734                  if (error != 0) {
1752 1735                          AS_LOCK_EXIT(as);
1753 1736                          if (unmap) {
1754 1737                                  (void) as_unmap(as, addr, size);
1755 1738                          }
1756 1739                          return (error);
1757 1740                  }
1758 1741          } else {
1759 1742                  seg = seg_alloc(as, addr, size);
1760 1743                  if (seg == NULL) {
1761 1744                          AS_LOCK_EXIT(as);
1762 1745                          return (ENOMEM);
1763 1746                  }
1764 1747  
1765 1748                  error = (*crfp)(seg, argsp);
1766 1749                  if (error != 0) {
1767 1750                          seg_free(seg);
1768 1751                          AS_LOCK_EXIT(as);
1769 1752                          return (error);
1770 1753                  }
1771 1754                  /*
1772 1755                   * Add size now so as_unmap will work if as_ctl fails.
1773 1756                   */
1774 1757                  as->a_size += rsize;
1775 1758                  as->a_resvsize += rsize;
1776 1759          }
1777 1760  
1778 1761          as_setwatch(as);
1779 1762  
1780 1763          /*
1781 1764           * If the address space is locked,
1782 1765           * establish memory locks for the new segment.
1783 1766           */
1784 1767          mutex_enter(&as->a_contents);
1785 1768          if (AS_ISPGLCK(as)) {
1786 1769                  mutex_exit(&as->a_contents);
1787 1770                  AS_LOCK_EXIT(as);
1788 1771                  error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1789 1772                  if (error != 0)
1790 1773                          (void) as_unmap(as, addr, size);
1791 1774          } else {
1792 1775                  mutex_exit(&as->a_contents);
1793 1776                  AS_LOCK_EXIT(as);
1794 1777          }
1795 1778          return (error);
1796 1779  }
1797 1780  
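A minimal kernel-context sketch of how a caller drives this layer: pick a hole with as_gap(), then hand it to as_map() with a creation function. Everything prefixed my_ is invented for illustration; the real mmap path holds as_rangelock() around this sequence and passes segvn_create() with a struct segvn_crargs, and the a_userlimit bound used below is assumed from struct as:

#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <vm/as.h>
#include <vm/seg.h>

/* Invented creation argument; the real path uses struct segvn_crargs. */
struct my_crargs {
	uint_t mc_prot;
};

/* Invented stand-in for a creation routine such as segvn_create(). */
static int
my_seg_create(struct seg *seg, void *argsp)
{
	/* A real create routine must set seg->s_ops and seg->s_data. */
	(void) seg;
	(void) argsp;
	return (0);
}

static int
map_anywhere(struct as *as, size_t len, caddr_t *addrp)
{
	caddr_t base = (caddr_t)(uintptr_t)PAGESIZE;	/* skip page zero */
	size_t gaplen = (uintptr_t)as->a_userlimit - PAGESIZE;
	struct my_crargs args = { PROT_READ | PROT_WRITE };
	int error;

	/* Find the lowest hole of at least len bytes. */
	if (as_gap(as, len, &base, &gaplen, AH_LO, NULL) != 0)
		return (ENOMEM);

	/* as_map() takes a_lock as writer and drops it on every path. */
	error = as_map(as, base, len, my_seg_create, &args);
	if (error == 0)
		*addrp = base;
	return (error);
}
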
1798 1781  
1799 1782  /*
1800 1783   * Delete all segments in the address space marked with S_PURGE.
1801 1784   * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1802 1785   * These segments are deleted as a first step before calls to as_gap(), so
1803 1786   * that they don't affect mmap() or shmat().
1804 1787   */
1805 1788  void
1806 1789  as_purge(struct as *as)
1807 1790  {
1808 1791          struct seg *seg;
1809 1792          struct seg *next_seg;
1810 1793  
1811 1794          /*
1812 1795           * the setting of NEEDSPURGE is protected by as_rangelock(), so
1813 1796           * no need to grab a_contents mutex for this check
1814 1797           */
1815 1798          if ((as->a_flags & AS_NEEDSPURGE) == 0)
1816 1799                  return;
1817 1800  
1818 1801          AS_LOCK_ENTER(as, RW_WRITER);
1819 1802          next_seg = NULL;
1820 1803          seg = AS_SEGFIRST(as);
1821 1804          while (seg != NULL) {
1822 1805                  next_seg = AS_SEGNEXT(as, seg);
1823 1806                  if (seg->s_flags & S_PURGE)
1824 1807                          SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1825 1808                  seg = next_seg;
1826 1809          }
1827 1810          AS_LOCK_EXIT(as);
1828 1811  
1829 1812          mutex_enter(&as->a_contents);
1830 1813          as->a_flags &= ~AS_NEEDSPURGE;
1831 1814          mutex_exit(&as->a_contents);
1832 1815  }
1833 1816  
1834 1817  /*
1835 1818   * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1836 1819   * range of addresses at least "minlen" long, where the base of the range is
1837 1820   * at "off" phase from an "align" boundary and there is space for a
1838 1821   * "redzone"-sized redzone on either side of the range.  Thus,
1839 1822   * if align was 4M and off was 16k, the user wants a hole which will start
1840 1823   * 16k into a 4M page.
1841 1824   *
1842 1825   * If flags specifies AH_HI, the hole will have the highest possible address
1843 1826   * in the range.  We use the as->a_lastgap field to figure out where to
1844 1827   * start looking for a gap.
1845 1828   *
1846 1829   * Otherwise, the gap will have the lowest possible address.
1847 1830   *
1848 1831   * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1849 1832   *
1850 1833   * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1851 1834   * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1852 1835   *
1853 1836   * NOTE: This routine is not correct when base+len overflows caddr_t.
1854 1837   */
1855 1838  int
1856 1839  as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1857 1840      uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1858 1841  {
1859 1842          caddr_t lobound = *basep;
1860 1843          caddr_t hibound = lobound + *lenp;
1861 1844          struct seg *lseg, *hseg;
1862 1845          caddr_t lo, hi;
1863 1846          int forward;
1864 1847          caddr_t save_base;
1865 1848          size_t save_len;
1866 1849          size_t save_minlen;
1867 1850          size_t save_redzone;
1868 1851          int fast_path = 1;
1869 1852  
1870 1853          save_base = *basep;
1871 1854          save_len = *lenp;
1872 1855          save_minlen = minlen;
1873 1856          save_redzone = redzone;
1874 1857  
1875 1858          /*
1876 1859           * For the first pass/fast_path, just add align and redzone into
1877 1860           * minlen since if we get an allocation, we can guarantee that it
1878 1861           * will fit the alignment and redzone requested.
1879 1862           * This increases the chance that hibound will be adjusted to
1880 1863           * a_lastgap->s_base which will likely allow us to find an
1881 1864           * acceptable hole in the address space quicker.
1882 1865           * If we can't find a hole with this fast_path, then we look for
1883 1866           * smaller holes in which the alignment and offset may allow
1884 1867           * the allocation to fit.
1885 1868           */
1886 1869          minlen += align;
1887 1870          minlen += 2 * redzone;
1888 1871          redzone = 0;
1889 1872  
1890 1873          AS_LOCK_ENTER(as, RW_READER);
1891 1874          if (AS_SEGFIRST(as) == NULL) {
1892 1875                  if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1893 1876                      align, redzone, off)) {
1894 1877                          AS_LOCK_EXIT(as);
1895 1878                          return (0);
1896 1879                  } else {
1897 1880                          AS_LOCK_EXIT(as);
1898 1881                          *basep = save_base;
1899 1882                          *lenp = save_len;
1900 1883                          return (-1);
1901 1884                  }
1902 1885          }
1903 1886  
1904 1887  retry:
1905 1888          /*
1906 1889           * Set up to iterate over all the inter-segment holes in the given
1907 1890           * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1908 1891           * NULL for the highest-addressed hole.  If moving backwards, we reset
1909 1892           * hseg to denote the highest-addressed segment.
1910 1893           */
1911 1894          forward = (flags & AH_DIR) == AH_LO;
1912 1895          if (forward) {
1913 1896                  hseg = as_findseg(as, lobound, 1);
1914 1897                  lseg = AS_SEGPREV(as, hseg);
1915 1898          } else {
1916 1899  
1917 1900                  /*
1918 1901                   * If allocating at least as much as the last allocation,
1919 1902                   * use a_lastgap's base as a better estimate of hibound.
1920 1903                   */
1921 1904                  if (as->a_lastgap &&
1922 1905                      minlen >= as->a_lastgap->s_size &&
1923 1906                      hibound >= as->a_lastgap->s_base)
1924 1907                          hibound = as->a_lastgap->s_base;
1925 1908  
1926 1909                  hseg = as_findseg(as, hibound, 1);
1927 1910                  if (hseg->s_base + hseg->s_size < hibound) {
1928 1911                          lseg = hseg;
1929 1912                          hseg = NULL;
1930 1913                  } else {
1931 1914                          lseg = AS_SEGPREV(as, hseg);
1932 1915                  }
1933 1916          }
1934 1917  
1935 1918          for (;;) {
1936 1919                  /*
1937 1920                   * Set lo and hi to the hole's boundaries.  (We should really
1938 1921                   * use MAXADDR in place of hibound in the expression below,
1939 1922                   * but can't express it easily; using hibound in its place is
1940 1923                   * harmless.)
1941 1924                   */
1942 1925                  lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1943 1926                  hi = (hseg == NULL) ? hibound : hseg->s_base;
1944 1927                  /*
1945 1928                   * If the iteration has moved past the interval from lobound
1946 1929                   * to hibound it's pointless to continue.
1947 1930                   */
1948 1931                  if ((forward && lo > hibound) || (!forward && hi < lobound))
1949 1932                          break;
1950 1933                  else if (lo > hibound || hi < lobound)
1951 1934                          goto cont;
1952 1935                  /*
1953 1936                   * Candidate hole lies at least partially within the allowable
1954 1937                   * range.  Restrict it to fall completely within that range,
1955 1938                   * i.e., to [max(lo, lobound), min(hi, hibound)].
1956 1939                   */
1957 1940                  if (lo < lobound)
1958 1941                          lo = lobound;
1959 1942                  if (hi > hibound)
1960 1943                          hi = hibound;
1961 1944                  /*
1962 1945                   * Verify that the candidate hole is big enough and meets
1963 1946                   * hardware constraints.  If the hole is too small, no need
1964 1947                   * to do the further checks since they will fail.
1965 1948                   */
1966 1949                  *basep = lo;
1967 1950                  *lenp = hi - lo;
1968 1951                  if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1969 1952                      minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1970 1953                      ((flags & AH_CONTAIN) == 0 ||
1971 1954                      (*basep <= addr && *basep + *lenp > addr))) {
1972 1955                          if (!forward)
1973 1956                                  as->a_lastgap = hseg;
1974 1957                          if (hseg != NULL)
1975 1958                                  as->a_lastgaphl = hseg;
1976 1959                          else
1977 1960                                  as->a_lastgaphl = lseg;
1978 1961                          AS_LOCK_EXIT(as);
1979 1962                          return (0);
1980 1963                  }
1981 1964          cont:
1982 1965                  /*
1983 1966                   * Move to the next hole.
1984 1967                   */
1985 1968                  if (forward) {
1986 1969                          lseg = hseg;
1987 1970                          if (lseg == NULL)
1988 1971                                  break;
1989 1972                          hseg = AS_SEGNEXT(as, hseg);
1990 1973                  } else {
1991 1974                          hseg = lseg;
1992 1975                          if (hseg == NULL)
1993 1976                                  break;
1994 1977                          lseg = AS_SEGPREV(as, lseg);
1995 1978                  }
1996 1979          }
1997 1980          if (fast_path && (align != 0 || save_redzone != 0)) {
1998 1981                  fast_path = 0;
1999 1982                  minlen = save_minlen;
2000 1983                  redzone = save_redzone;
2001 1984                  goto retry;
2002 1985          }
2003 1986          *basep = save_base;
2004 1987          *lenp = save_len;
2005 1988          AS_LOCK_EXIT(as);
2006 1989          return (-1);
2007 1990  }
2008 1991  
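Using the header comment's own example (align of 4M, off of 16K), here is a hedged kernel-context sketch of the call, with an invented search range:

#include <sys/types.h>
#include <vm/as.h>

/*
 * Sketch: find a hole of at least 1 MB whose base sits 16K past a 4 MB
 * boundary, searching [16 MB, 1 GB) from the low end.  The bounds are
 * made up for the example.
 */
static caddr_t
find_4m_phase_hole(struct as *as)
{
	caddr_t base = (caddr_t)(uintptr_t)(16 * 1024 * 1024);
	size_t len = 1008 * 1024 * 1024;

	if (as_gap_aligned(as, 1024 * 1024, &base, &len, AH_LO, NULL,
	    4 * 1024 * 1024,	/* align: 4M boundary */
	    0,			/* redzone: none */
	    16 * 1024) != 0)	/* off: start 16K into the 4M page */
		return (NULL);

	/* base now sits 16K past a 4M boundary with >= 1 MB usable. */
	return (base);
}

On failure as_gap_aligned() restores the caller's *basep and *lenp, so base and len are unchanged when NULL is returned here.
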
2009 1992  /*
2010 1993   * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2011 1994   *
2012 1995   * If flags specifies AH_HI, the hole will have the highest possible address
2013 1996   * in the range.  We use the as->a_lastgap field to figure out where to
2014 1997   * start looking for a gap.
2015 1998   *
2016 1999   * Otherwise, the gap will have the lowest possible address.
2017 2000   *
2018 2001   * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2019 2002   *
2020 2003   * If an adequate hole is found, base and len are set to reflect the part of
2021 2004   * the hole that is within range, and 0 is returned, otherwise,
2022 2005   * -1 is returned.
2023 2006   *
2024 2007   * NOTE: This routine is not correct when base+len overflows caddr_t.
2025 2008   */
2026 2009  int
2027 2010  as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2028 2011      caddr_t addr)
2029 2012  {
2030 2013  
2031 2014          return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2032 2015  }
2033 2016  
2034 2017  /*
2035 2018   * Return the next range within [base, base + len) that is backed
2036 2019   * with "real memory".  Skip holes and non-seg_vn segments.
2037 2020   * We're lazy and only return one segment at a time.
2038 2021   */
2039 2022  int
2040 2023  as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2041 2024  {
2042 2025          extern struct seg_ops segspt_shmops;    /* needs a header file */
2043 2026          struct seg *seg;
2044 2027          caddr_t addr, eaddr;
2045 2028          caddr_t segend;
2046 2029  
2047 2030          AS_LOCK_ENTER(as, RW_READER);
2048 2031  
2049 2032          addr = *basep;
2050 2033          eaddr = addr + *lenp;
2051 2034  
2052 2035          seg = as_findseg(as, addr, 0);
2053 2036          if (seg != NULL)
2054 2037                  addr = MAX(seg->s_base, addr);
2055 2038  
2056 2039          for (;;) {
2057 2040                  if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2058 2041                          AS_LOCK_EXIT(as);
2059 2042                          return (EINVAL);
2060 2043                  }
2061 2044  
2062 2045                  if (seg->s_ops == &segvn_ops) {
2063 2046                          segend = seg->s_base + seg->s_size;
2064 2047                          break;
2065 2048                  }
2066 2049  
2067 2050                  /*
2068 2051                   * We do ISM by looking into the private data
2069 2052                   * to determine the real size of the segment.
2070 2053                   */
2071 2054                  if (seg->s_ops == &segspt_shmops) {
2072 2055                          segend = seg->s_base + spt_realsize(seg);
2073 2056                          if (addr < segend)
2074 2057                                  break;
2075 2058                  }
2076 2059  
2077 2060                  seg = AS_SEGNEXT(as, seg);
2078 2061  
2079 2062                  if (seg != NULL)
2080 2063                          addr = seg->s_base;
2081 2064          }
2082 2065  
2083 2066          *basep = addr;
2084 2067  
2085 2068          if (segend > eaddr)
2086 2069                  *lenp = eaddr - addr;
2087 2070          else
2088 2071                  *lenp = segend - addr;
2089 2072  
2090 2073          AS_LOCK_EXIT(as);
2091 2074          return (0);
2092 2075  }
2093 2076  
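Since as_memory() is "lazy and only return[s] one segment at a time", callers iterate: consume the returned range, advance past it, and ask again until EINVAL. A kernel-context sketch of that loop (count_backed() is invented for the example):

#include <sys/types.h>
#include <vm/as.h>

/*
 * Sketch: total how much of [addr, addr + size) as_memory() reports as
 * backed by real memory.
 */
static size_t
count_backed(struct as *as, caddr_t addr, size_t size)
{
	caddr_t base = addr;
	caddr_t end = addr + size;
	size_t len, total = 0;

	while (base < end) {
		len = (size_t)(end - base);
		if (as_memory(as, &base, &len) != 0)
			break;			/* no more backed ranges */
		total += len;
		base += len;			/* continue past this range */
	}
	return (total);
}
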
2094 2077  /*
2095 2078   * Swap the pages associated with the address space as out to
2096 2079   * secondary storage, returning the number of bytes actually
2097 2080   * swapped.
2098 2081   *
2099 2082   * The value returned is intended to correlate well with the process's
2100 2083   * memory requirements.  Its usefulness for this purpose depends on
2101 2084   * how well the segment-level routines do at returning accurate
2102 2085   * information.
2103 2086   */
2104 2087  size_t
2105 2088  as_swapout(struct as *as)
2106 2089  {
2107 2090          struct seg *seg;
2108 2091          size_t swpcnt = 0;
2109 2092  
2110 2093          /*
2111 2094           * Kernel-only processes have given up their address
2112 2095           * spaces.  Of course, we shouldn't be attempting to
2113 2096           * swap out such processes in the first place...
2114 2097           */
2115 2098          if (as == NULL)
2116 2099                  return (0);
2117 2100  
2118 2101          AS_LOCK_ENTER(as, RW_READER);
2119 2102  
2120 2103          /*
2121 2104           * Free all mapping resources associated with the address
2122 2105           * space.  The segment-level swapout routines capitalize
2123 2106           * on this unmapping by scavenging pages that have become
2124 2107           * unmapped here.
2125 2108           */
2126 2109          hat_swapout(as->a_hat);
2127 2110  
2128 2111          /*
2129 2112           * Call the swapout routines of all segments in the address
2130 2113           * space to do the actual work, accumulating the amount of
2131 2114           * space reclaimed.
2132 2115           */
2133 2116          for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2134 2117                  struct seg_ops *ov = seg->s_ops;
2135 2118  
2136 2119                  /*
2137 2120                   * We have to check to see if the seg has
2138 2121                   * an ops vector because the seg may have
2139 2122                   * been in the middle of being set up when
2140 2123                   * the process was picked for swapout.
2141 2124                   */
2142 2125                  if ((ov != NULL) && (ov->swapout != NULL))
2143 2126                          swpcnt += SEGOP_SWAPOUT(seg);
2144 2127          }
2145 2128          AS_LOCK_EXIT(as);
2146 2129          return (swpcnt);
2147 2130  }
2148 2131  
2149 2132  /*
2150 2133   * Determine whether data from the mappings in interval [addr, addr + size)
2151 2134   * are in the primary memory (core) cache.
2152 2135   */
2153 2136  int
2154 2137  as_incore(struct as *as, caddr_t addr,
2155 2138      size_t size, char *vec, size_t *sizep)
2156 2139  {
2157 2140          struct seg *seg;
2158 2141          size_t ssize;
2159 2142          caddr_t raddr;          /* rounded down addr */
2160 2143          size_t rsize;           /* rounded up size */
2161 2144          size_t isize;                   /* iteration size */
2162 2145          int error = 0;          /* result, assume success */
2163 2146  
2164 2147          *sizep = 0;
2165 2148          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2166 2149          rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2167 2150              (size_t)raddr;
2168 2151  
2169 2152          if (raddr + rsize < raddr)              /* check for wraparound */
2170 2153                  return (ENOMEM);
2171 2154  
2172 2155          AS_LOCK_ENTER(as, RW_READER);
2173 2156          seg = as_segat(as, raddr);
2174 2157          if (seg == NULL) {
2175 2158                  AS_LOCK_EXIT(as);
2176 2159                  return (-1);
2177 2160          }
2178 2161  
2179 2162          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2180 2163                  if (raddr >= seg->s_base + seg->s_size) {
2181 2164                          seg = AS_SEGNEXT(as, seg);
2182 2165                          if (seg == NULL || raddr != seg->s_base) {
2183 2166                                  error = -1;
2184 2167                                  break;
2185 2168                          }
2186 2169                  }
2187 2170                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
2188 2171                          ssize = seg->s_base + seg->s_size - raddr;
2189 2172                  else
2190 2173                          ssize = rsize;
2191 2174                  *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2192 2175                  if (isize != ssize) {
2193 2176                          error = -1;
2194 2177                          break;
2195 2178                  }
2196 2179                  vec += btopr(ssize);
2197 2180          }
2198 2181          AS_LOCK_EXIT(as);
2199 2182          return (error);
2200 2183  }
2201 2184  
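The vec argument is one byte per page, mincore(2)-style, and *sizep reports how many bytes of the range were resolved. A kernel-context sketch of sizing and passing it (probe_incore() is invented; the btopr()-sized allocation is the point of the example):

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <vm/as.h>

/*
 * Sketch: ask which pages of [addr, addr + size) are in core.  The vec
 * buffer is one char per page, as mincore(2) reports to user level.
 */
static int
probe_incore(struct as *as, caddr_t addr, size_t size)
{
	size_t npages = btopr(size);
	size_t resolved;
	char *vec = kmem_zalloc(npages, KM_SLEEP);
	int err;

	err = as_incore(as, addr, size, vec, &resolved);

	/* vec[i] != 0 means page i of the rounded range is resident. */

	kmem_free(vec, npages);
	return (err);
}
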
2202 2185  static void
2203 2186  as_segunlock(struct seg *seg, caddr_t addr, int attr,
2204 2187      ulong_t *bitmap, size_t position, size_t npages)
2205 2188  {
2206 2189          caddr_t range_start;
2207 2190          size_t  pos1 = position;
2208 2191          size_t  pos2;
2209 2192          size_t  size;
2210 2193          size_t  end_pos = npages + position;
2211 2194  
2212 2195          while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2213 2196                  size = ptob((pos2 - pos1));
2214 2197                  range_start = (caddr_t)((uintptr_t)addr +
2215 2198                      ptob(pos1 - position));
2216 2199  
2217 2200                  (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2218 2201                      (ulong_t *)NULL, (size_t)NULL);
2219 2202                  pos1 = pos2;
2220 2203          }
2221 2204  }
2222 2205  
2223 2206  static void
2224 2207  as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2225 2208      caddr_t raddr, size_t rsize)
2226 2209  {
2227 2210          struct seg *seg = as_segat(as, raddr);
2228 2211          size_t ssize;
2229 2212  
2230 2213          while (rsize != 0) {
2231 2214                  if (raddr >= seg->s_base + seg->s_size)
2232 2215                          seg = AS_SEGNEXT(as, seg);
2233 2216  
2234 2217                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
2235 2218                          ssize = seg->s_base + seg->s_size - raddr;
2236 2219                  else
2237 2220                          ssize = rsize;
2238 2221  
2239 2222                  as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2240 2223  
2241 2224                  rsize -= ssize;
2242 2225                  raddr += ssize;
2243 2226          }
2244 2227  }
2245 2228  
2246 2229  /*
2247 2230   * Cache control operations over the interval [addr, addr + size) in
2248 2231   * address space "as".
2249 2232   */
2250 2233  /*ARGSUSED*/
2251 2234  int
2252 2235  as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2253 2236      uintptr_t arg, ulong_t *lock_map, size_t pos)
2254 2237  {
2255 2238          struct seg *seg;        /* working segment */
2256 2239          caddr_t raddr;          /* rounded down addr */
2257 2240          caddr_t initraddr;      /* saved initial rounded down addr */
2258 2241          size_t rsize;           /* rounded up size */
2259 2242          size_t initrsize;       /* saved initial rounded up size */
2260 2243          size_t ssize;           /* size of seg */
2261 2244          int error = 0;                  /* result */
2262 2245          size_t mlock_size;      /* size of bitmap */
2263 2246          ulong_t *mlock_map;     /* pointer to bitmap used */
2264 2247                                  /* to represent the locked */
2265 2248                                  /* pages. */
2266 2249  retry:
2267 2250          if (error == IE_RETRY)
2268 2251                  AS_LOCK_ENTER(as, RW_WRITER);
2269 2252          else
2270 2253                  AS_LOCK_ENTER(as, RW_READER);
2271 2254  
2272 2255          /*
2273 2256           * If these are address space lock/unlock operations, loop over
2274 2257           * all segments in the address space, as appropriate.
2275 2258           */
2276 2259          if (func == MC_LOCKAS) {
2277 2260                  size_t npages, idx;
2278 2261                  size_t rlen = 0;        /* rounded as length */
2279 2262  
2280 2263                  idx = pos;
2281 2264  
2282 2265                  if (arg & MCL_FUTURE) {
2283 2266                          mutex_enter(&as->a_contents);
2284 2267                          AS_SETPGLCK(as);
2285 2268                          mutex_exit(&as->a_contents);
2286 2269                  }
2287 2270                  if ((arg & MCL_CURRENT) == 0) {
2288 2271                          AS_LOCK_EXIT(as);
2289 2272                          return (0);
2290 2273                  }
2291 2274  
2292 2275                  seg = AS_SEGFIRST(as);
2293 2276                  if (seg == NULL) {
2294 2277                          AS_LOCK_EXIT(as);
2295 2278                          return (0);
2296 2279                  }
2297 2280  
2298 2281                  do {
2299 2282                          raddr = (caddr_t)((uintptr_t)seg->s_base &
2300 2283                              (uintptr_t)PAGEMASK);
2301 2284                          rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2302 2285                              PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2303 2286                  } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2304 2287  
2305 2288                  mlock_size = BT_BITOUL(btopr(rlen));
2306 2289                  if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2307 2290                      sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2308 2291                                  AS_LOCK_EXIT(as);
2309 2292                                  return (EAGAIN);
2310 2293                  }
2311 2294  
2312 2295                  for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2313 2296                          error = SEGOP_LOCKOP(seg, seg->s_base,
2314 2297                              seg->s_size, attr, MC_LOCK, mlock_map, pos);
2315 2298                          if (error != 0)
2316 2299                                  break;
2317 2300                          pos += seg_pages(seg);
2318 2301                  }
2319 2302  
2320 2303                  if (error) {
2321 2304                          for (seg = AS_SEGFIRST(as); seg != NULL;
2322 2305                              seg = AS_SEGNEXT(as, seg)) {
2323 2306  
2324 2307                                  raddr = (caddr_t)((uintptr_t)seg->s_base &
2325 2308                                      (uintptr_t)PAGEMASK);
2326 2309                                  npages = seg_pages(seg);
2327 2310                                  as_segunlock(seg, raddr, attr, mlock_map,
2328 2311                                      idx, npages);
2329 2312                                  idx += npages;
2330 2313                          }
2331 2314                  }
2332 2315  
2333 2316                  kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2334 2317                  AS_LOCK_EXIT(as);
2335 2318                  goto lockerr;
2336 2319          } else if (func == MC_UNLOCKAS) {
2337 2320                  mutex_enter(&as->a_contents);
2338 2321                  AS_CLRPGLCK(as);
2339 2322                  mutex_exit(&as->a_contents);
2340 2323  
2341 2324                  for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2342 2325                          error = SEGOP_LOCKOP(seg, seg->s_base,
2343 2326                              seg->s_size, attr, MC_UNLOCK, NULL, 0);
2344 2327                          if (error != 0)
2345 2328                                  break;
2346 2329                  }
2347 2330  
2348 2331                  AS_LOCK_EXIT(as);
2349 2332                  goto lockerr;
2350 2333          }
2351 2334  
2352 2335          /*
2353 2336           * Normalize addresses and sizes.
2354 2337           */
2355 2338          initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2356 2339          initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2357 2340              (size_t)raddr;
2358 2341  
2359 2342          if (raddr + rsize < raddr) {            /* check for wraparound */
2360 2343                  AS_LOCK_EXIT(as);
2361 2344                  return (ENOMEM);
2362 2345          }
2363 2346  
2364 2347          /*
2365 2348           * Get initial segment.
2366 2349           */
2367 2350          if ((seg = as_segat(as, raddr)) == NULL) {
2368 2351                  AS_LOCK_EXIT(as);
2369 2352                  return (ENOMEM);
2370 2353          }
2371 2354  
2372 2355          if (func == MC_LOCK) {
2373 2356                  mlock_size = BT_BITOUL(btopr(rsize));
2374 2357                  if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2375 2358                      sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2376 2359                                  AS_LOCK_EXIT(as);
2377 2360                                  return (EAGAIN);
2378 2361                  }
2379 2362          }
2380 2363  
2381 2364          /*
2382 2365           * Loop over all segments.  If a hole in the address range is
2383 2366           * discovered, then fail.  For each segment, perform the appropriate
2384 2367           * control operation.
2385 2368           */
2386 2369          while (rsize != 0) {
2387 2370  
2388 2371                  /*
2389 2372                   * Make sure there's no hole, calculate the portion
2390 2373                   * of the next segment to be operated over.
2391 2374                   */
2392 2375                  if (raddr >= seg->s_base + seg->s_size) {
2393 2376                          seg = AS_SEGNEXT(as, seg);
2394 2377                          if (seg == NULL || raddr != seg->s_base) {
2395 2378                                  if (func == MC_LOCK) {
2396 2379                                          as_unlockerr(as, attr, mlock_map,
2397 2380                                              initraddr, initrsize - rsize);
2398 2381                                          kmem_free(mlock_map,
2399 2382                                              mlock_size * sizeof (ulong_t));
2400 2383                                  }
2401 2384                                  AS_LOCK_EXIT(as);
2402 2385                                  return (ENOMEM);
2403 2386                          }
2404 2387                  }
2405 2388                  if ((raddr + rsize) > (seg->s_base + seg->s_size))
2406 2389                          ssize = seg->s_base + seg->s_size - raddr;
2407 2390                  else
2408 2391                          ssize = rsize;
2409 2392  
2410 2393                  /*
2411 2394                   * Dispatch on specific function.
2412 2395                   */
2413 2396                  switch (func) {
2414 2397  
2415 2398                  /*
2416 2399                   * Synchronize cached data from mappings with backing
2417 2400                   * objects.
2418 2401                   */
2419 2402                  case MC_SYNC:
2420 2403                          if (error = SEGOP_SYNC(seg, raddr, ssize,
2421 2404                              attr, (uint_t)arg)) {
2422 2405                                  AS_LOCK_EXIT(as);
2423 2406                                  return (error);
2424 2407                          }
2425 2408                          break;
2426 2409  
2427 2410                  /*
2428 2411                   * Lock pages in memory.
2429 2412                   */
2430 2413                  case MC_LOCK:
2431 2414                          if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2432 2415                              attr, func, mlock_map, pos)) {
2433 2416                                  as_unlockerr(as, attr, mlock_map, initraddr,
2434 2417                                      initrsize - rsize + ssize);
2435 2418                                  kmem_free(mlock_map, mlock_size *
2436 2419                                      sizeof (ulong_t));
2437 2420                                  AS_LOCK_EXIT(as);
2438 2421                                  goto lockerr;
2439 2422                          }
2440 2423                          break;
2441 2424  
2442 2425                  /*
2443 2426                   * Unlock mapped pages.
2444 2427                   */
2445 2428                  case MC_UNLOCK:
2446 2429                          (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2447 2430                              (ulong_t *)NULL, (size_t)NULL);
2448 2431                          break;
2449 2432  
2450 2433                  /*
2451 2434                   * Store VM advice for mapped pages in the segment layer.
2452 2435                   */
2453 2436                  case MC_ADVISE:
2454 2437                          error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2455 2438  
2456 2439                          /*
2457 2440                           * Check for regular errors and special retry error
2458 2441                           */
2459 2442                          if (error) {
2460 2443                                  if (error == IE_RETRY) {
2461 2444                                          /*
2462 2445                                           * Need to acquire writers lock, so
2463 2446                                           * have to drop readers lock and start
2464 2447                                           * all over again
2465 2448                                           */
2466 2449                                          AS_LOCK_EXIT(as);
2467 2450                                          goto retry;
2468 2451                                  } else if (error == IE_REATTACH) {
2469 2452                                          /*
2470 2453                                           * Find segment for current address
2471 2454                                           * because current segment just got
2472 2455                                           * split or concatenated
2473 2456                                           */
2474 2457                                          seg = as_segat(as, raddr);
2475 2458                                          if (seg == NULL) {
2476 2459                                                  AS_LOCK_EXIT(as);
2477 2460                                                  return (ENOMEM);
2478 2461                                          }
2479 2462                                  } else {
2480 2463                                          /*
2481 2464                                           * Regular error
2482 2465                                           */
2483 2466                                          AS_LOCK_EXIT(as);
2484 2467                                          return (error);
2485 2468                                  }
2486 2469                          }
2487 2470                          break;
2488 2471  
2489 2472                  case MC_INHERIT_ZERO:
2490 2473                          if (seg->s_ops->inherit == NULL) {
2491 2474                                  error = ENOTSUP;
2492 2475                          } else {
2493 2476                                  error = SEGOP_INHERIT(seg, raddr, ssize,
2494 2477                                      SEGP_INH_ZERO);
2495 2478                          }
2496 2479                          if (error != 0) {
2497 2480                                  AS_LOCK_EXIT(as);
2498 2481                                  return (error);
2499 2482                          }
2500 2483                          break;
2501 2484  
2502 2485                  /*
2503 2486                   * Can't happen.
2504 2487                   */
2505 2488                  default:
2506 2489                          panic("as_ctl: bad operation %d", func);
2507 2490                          /*NOTREACHED*/
2508 2491                  }
2509 2492  
2510 2493                  rsize -= ssize;
2511 2494                  raddr += ssize;
2512 2495          }
2513 2496  
2514 2497          if (func == MC_LOCK)
2515 2498                  kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2516 2499          AS_LOCK_EXIT(as);
2517 2500          return (0);
2518 2501  lockerr:
2519 2502  
2520 2503          /*
2521 2504           * If the lower levels returned EDEADLK for a segment lockop,
2522 2505           * it means that we should retry the operation.  Let's wait
2523 2506           * a bit also to let the deadlock causing condition clear.
2524 2507           * This is part of a gross hack to work around a design flaw
2525 2508           * in the ufs/sds logging code and should go away when the
2526 2509           * logging code is re-designed to fix the problem. See bug
2527 2510           * 4125102 for details of the problem.
2528 2511           */
2529 2512          if (error == EDEADLK) {
2530 2513                  delay(deadlk_wait);
2531 2514                  error = 0;
2532 2515                  goto retry;
2533 2516          }
2534 2517          return (error);
2535 2518  }
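
The MC_LOCK path in as_ctl() sizes mlock_map so there is one bit per page of the rounded-up range: btopr() turns the byte length into a page count and BT_BITOUL() turns that bit count into the number of ulong_t words needed to hold it; as_unlockerr() and as_segunlock() later walk the same bitmap to back out a partially completed lock. Below is a minimal userland sketch of just that sizing arithmetic; the macro definitions are simplified stand-ins written for this example, not the kernel's own.

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-ins for the kernel macros -- illustrative values only. */
#define	PAGESIZE	4096UL
#define	BT_NBIPUL	(8 * sizeof (unsigned long))		/* bits per ulong_t */
#define	btopr(x)	(((x) + PAGESIZE - 1) / PAGESIZE)	/* bytes -> pages, rounded up */
#define	BT_BITOUL(nbits)	(((nbits) + BT_NBIPUL - 1) / BT_NBIPUL)

int
main(void)
{
	size_t rsize = 10 * PAGESIZE + 123;	/* an unaligned MC_LOCK request */
	size_t npages = btopr(rsize);		/* 11 pages each need a lock bit */
	size_t mlock_size = BT_BITOUL(npages);	/* ulongs needed to hold 11 bits */
	unsigned long *mlock_map;

	mlock_map = calloc(mlock_size, sizeof (unsigned long));
	if (mlock_map == NULL)
		return (1);
	printf("%zu bytes -> %zu pages -> %zu ulong(s) of lock bitmap\n",
	    rsize, npages, mlock_size);
	free(mlock_map);
	return (0);
}
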
2536 2519  
2537 2520  int
2538 2521  fc_decode(faultcode_t fault_err)
2539 2522  {
2540 2523          int error = 0;
2541 2524  
2542 2525          switch (FC_CODE(fault_err)) {
2543 2526          case FC_OBJERR:
2544 2527                  error = FC_ERRNO(fault_err);
2545 2528                  break;
2546 2529          case FC_PROT:
2547 2530                  error = EACCES;
2548 2531                  break;
2549 2532          default:
2550 2533                  error = EFAULT;
2551 2534                  break;
2552 2535          }
2553 2536          return (error);
2554 2537  }
2555 2538  
2556 2539  /*
2557 2540   * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2558 2541   * lists from each segment and copy them to one contiguous shadow list (plist)
2559 2542   * as expected by the caller.  Save pointers to per segment shadow lists at
2560 2543   * the tail of plist so that they can be used during as_pageunlock().
2561 2544   */
2562 2545  static int
2563 2546  as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2564 2547      caddr_t addr, size_t size, enum seg_rw rw)
2565 2548  {
2566 2549          caddr_t sv_addr = addr;
2567 2550          size_t sv_size = size;
2568 2551          struct seg *sv_seg = seg;
2569 2552          ulong_t segcnt = 1;
2570 2553          ulong_t cnt;
2571 2554          size_t ssize;
2572 2555          pgcnt_t npages = btop(size);
2573 2556          page_t **plist;
2574 2557          page_t **pl;
2575 2558          int error;
2576 2559          caddr_t eaddr;
2577 2560          faultcode_t fault_err = 0;
2578 2561          pgcnt_t pl_off;
2579 2562          extern struct seg_ops segspt_shmops;
2580 2563  
2581 2564          ASSERT(AS_LOCK_HELD(as));
2582 2565          ASSERT(seg != NULL);
2583 2566          ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2584 2567          ASSERT(addr + size > seg->s_base + seg->s_size);
2585 2568          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2586 2569          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2587 2570  
2588 2571          /*
2589 2572           * Count the number of segments covered by the range we are about to
2590 2573           * lock. The segment count is used to size the shadow list we return
2591 2574           * back to the caller.
2592 2575           */
2593 2576          for (; size != 0; size -= ssize, addr += ssize) {
2594 2577                  if (addr >= seg->s_base + seg->s_size) {
2595 2578  
2596 2579                          seg = AS_SEGNEXT(as, seg);
2597 2580                          if (seg == NULL || addr != seg->s_base) {
2598 2581                                  AS_LOCK_EXIT(as);
2599 2582                                  return (EFAULT);
2600 2583                          }
2601 2584                          /*
2602 2585                           * Do a quick check if subsequent segments
2603 2586                           * will most likely support pagelock.
2604 2587                           */
2605 2588                          if (seg->s_ops == &segvn_ops) {
2606 2589                                  vnode_t *vp;
2607 2590  
2608 2591                                  if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2609 2592                                      vp != NULL) {
2610 2593                                          AS_LOCK_EXIT(as);
2611 2594                                          goto slow;
2612 2595                                  }
2613 2596                          } else if (seg->s_ops != &segspt_shmops) {
2614 2597                                  AS_LOCK_EXIT(as);
2615 2598                                  goto slow;
2616 2599                          }
2617 2600                          segcnt++;
2618 2601                  }
2619 2602                  if (addr + size > seg->s_base + seg->s_size) {
2620 2603                          ssize = seg->s_base + seg->s_size - addr;
2621 2604                  } else {
2622 2605                          ssize = size;
2623 2606                  }
2624 2607          }
2625 2608          ASSERT(segcnt > 1);
2626 2609  
2627 2610          plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2628 2611  
2629 2612          addr = sv_addr;
2630 2613          size = sv_size;
2631 2614          seg = sv_seg;
2632 2615  
2633 2616          for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2634 2617                  if (addr >= seg->s_base + seg->s_size) {
2635 2618                          seg = AS_SEGNEXT(as, seg);
2636 2619                          ASSERT(seg != NULL && addr == seg->s_base);
2637 2620                          cnt++;
2638 2621                          ASSERT(cnt < segcnt);
2639 2622                  }
2640 2623                  if (addr + size > seg->s_base + seg->s_size) {
2641 2624                          ssize = seg->s_base + seg->s_size - addr;
2642 2625                  } else {
2643 2626                          ssize = size;
2644 2627                  }
2645 2628                  pl = &plist[npages + cnt];
2646 2629                  error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2647 2630                      L_PAGELOCK, rw);
2648 2631                  if (error) {
2649 2632                          break;
2650 2633                  }
2651 2634                  ASSERT(plist[npages + cnt] != NULL);
2652 2635                  ASSERT(pl_off + btop(ssize) <= npages);
2653 2636                  bcopy(plist[npages + cnt], &plist[pl_off],
2654 2637                      btop(ssize) * sizeof (page_t *));
2655 2638                  pl_off += btop(ssize);
2656 2639          }
2657 2640  
2658 2641          if (size == 0) {
2659 2642                  AS_LOCK_EXIT(as);
2660 2643                  ASSERT(cnt == segcnt - 1);
2661 2644                  *ppp = plist;
2662 2645                  return (0);
2663 2646          }
2664 2647  
2665 2648          /*
2666 2649           * One of the pagelock calls failed. The error type is in the error
2667 2650           * variable. Unlock what we've locked so far and retry with F_SOFTLOCK
2668 2651           * if the error type is either EFAULT or ENOTSUP. Otherwise just return
2669 2652           * the error back to the caller.
2670 2653           */
2671 2654  
2672 2655          eaddr = addr;
2673 2656          seg = sv_seg;
2674 2657  
2675 2658          for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2676 2659                  if (addr >= seg->s_base + seg->s_size) {
2677 2660                          seg = AS_SEGNEXT(as, seg);
2678 2661                          ASSERT(seg != NULL && addr == seg->s_base);
2679 2662                          cnt++;
2680 2663                          ASSERT(cnt < segcnt);
2681 2664                  }
2682 2665                  if (eaddr > seg->s_base + seg->s_size) {
2683 2666                          ssize = seg->s_base + seg->s_size - addr;
2684 2667                  } else {
2685 2668                          ssize = eaddr - addr;
2686 2669                  }
2687 2670                  pl = &plist[npages + cnt];
2688 2671                  ASSERT(*pl != NULL);
2689 2672                  (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2690 2673                      L_PAGEUNLOCK, rw);
2691 2674          }
2692 2675  
2693 2676          AS_LOCK_EXIT(as);
2694 2677  
2695 2678          kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2696 2679  
2697 2680          if (error != ENOTSUP && error != EFAULT) {
2698 2681                  return (error);
2699 2682          }
2700 2683  
2701 2684  slow:
2702 2685          /*
2703 2686           * If we are here because pagelock failed due to the need to cow-fault
2704 2687           * in the pages we want to lock, F_SOFTLOCK will do that job and the
2705 2688           * next as_pagelock() call for this address range will hopefully
2706 2689           * succeed.
2707 2690           */
2708 2691          fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2709 2692          if (fault_err != 0) {
2710 2693                  return (fc_decode(fault_err));
2711 2694          }
2712 2695          *ppp = NULL;
2713 2696  
2714 2697          return (0);
2715 2698  }
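
The shadow list built by as_pagelock_segs() has a fixed layout: the first npages slots form the flat, contiguous list handed back to the caller, and the final segcnt slots remember each segment's own shadow-list pointer so that as_pageunlock_segs() can hand the right list back to the right segment. The userland sketch below only illustrates that layout; the page type, the two "segments" and their sizes are made up for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct fake_page { int pfn; } page_t;	/* stand-in for the kernel page_t */

int
main(void)
{
	size_t npages = 6;	/* pages covered by the whole locked range */
	size_t segcnt = 2;	/* the range spans two segments */
	page_t pages[6];
	/* per-segment shadow lists, as each segment driver might return them */
	page_t *seg0_list[4] = { &pages[0], &pages[1], &pages[2], &pages[3] };
	page_t *seg1_list[2] = { &pages[4], &pages[5] };
	page_t **plist;

	/* npages flat entries followed by segcnt per-segment pointers */
	plist = calloc(npages + segcnt, sizeof (page_t *));
	if (plist == NULL)
		return (1);

	/* remember each segment's list at the tail (cnt = 0, 1, ...) ... */
	plist[npages + 0] = (page_t *)seg0_list;
	plist[npages + 1] = (page_t *)seg1_list;

	/* ... and copy their contents into the flat head, in address order */
	memcpy(&plist[0], seg0_list, 4 * sizeof (page_t *));
	memcpy(&plist[4], seg1_list, 2 * sizeof (page_t *));

	printf("flat list: %zu entries; tail: %zu per-segment list pointers\n",
	    npages, segcnt);
	free(plist);
	return (0);
}
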
2716 2699  
2717 2700  /*
2718 2701   * lock pages in a given address space. Return shadow list. If
2719 2702   * the list is NULL, the MMU mapping is also locked.
2720 2703   */
2721 2704  int
2722 2705  as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2723 2706      size_t size, enum seg_rw rw)
2724 2707  {
2725 2708          size_t rsize;
2726 2709          caddr_t raddr;
2727 2710          faultcode_t fault_err;
2728 2711          struct seg *seg;
2729 2712          int err;
2730 2713  
2731 2714          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2732 2715              "as_pagelock_start: addr %p size %ld", addr, size);
2733 2716  
2734 2717          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2735 2718          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2736 2719              (size_t)raddr;
2737 2720  
2738 2721          /*
2739 2722           * if the request crosses two segments let
2740 2723           * as_fault handle it.
2741 2724           */
2742 2725          AS_LOCK_ENTER(as, RW_READER);
2743 2726  
2744 2727          seg = as_segat(as, raddr);
2745 2728          if (seg == NULL) {
2746 2729                  AS_LOCK_EXIT(as);
2747 2730                  return (EFAULT);
2748 2731          }
2749 2732          ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2750 2733          if (raddr + rsize > seg->s_base + seg->s_size) {
2751 2734                  return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2752 2735          }
2753 2736          if (raddr + rsize <= raddr) {
2754 2737                  AS_LOCK_EXIT(as);
2755 2738                  return (EFAULT);
2756 2739          }
2757 2740  
2758 2741          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2759 2742              "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2760 2743  
2761 2744          /*
2762 2745           * try to lock pages and pass back shadow list
2763 2746           */
2764 2747          err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2765 2748  
2766 2749          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2767 2750  
2768 2751          AS_LOCK_EXIT(as);
2769 2752  
2770 2753          if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2771 2754                  return (err);
2772 2755          }
2773 2756  
2774 2757          /*
2775 2758           * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2776 2759           * to no pagelock support for this segment or pages need to be cow
2777 2760           * faulted in. If fault is needed F_SOFTLOCK will do this job for
2778 2761           * this as_pagelock() call and in the next as_pagelock() call for the
2779 2762           * same address range the pagelock call will hopefully succeed.
2780 2763           */
2781 2764          fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2782 2765          if (fault_err != 0) {
2783 2766                  return (fc_decode(fault_err));
2784 2767          }
2785 2768          *ppp = NULL;
2786 2769  
2787 2770          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2788 2771          return (0);
2789 2772  }
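
Callers are expected to pair as_pagelock() with as_pageunlock() over the same range and direction, and to treat a NULL shadow list as the F_SOFTLOCK fallback rather than an error. The fragment below is only a hedged sketch of that pairing as a physio-style caller might write it; it is not compilable on its own, and as, uaddr and len are placeholders.

	page_t **pplist;
	int err;

	/* lock the user pages backing [uaddr, uaddr + len) for writing */
	if ((err = as_pagelock(as, &pplist, uaddr, len, S_WRITE)) != 0)
		return (err);

	/*
	 * Do the I/O here.  pplist may be NULL, which means the pages were
	 * locked via as_fault(F_SOFTLOCK) instead of a shadow list.
	 */

	/* always undo with the same range and rw argument */
	as_pageunlock(as, pplist, uaddr, len, S_WRITE);
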
2790 2773  
2791 2774  /*
2792 2775   * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2793 2776   * lists from the end of plist and call pageunlock interface for each segment.
2794 2777   * Drop as lock and free plist.
2795 2778   */
2796 2779  static void
2797 2780  as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2798 2781      struct page **plist, enum seg_rw rw)
2799 2782  {
2800 2783          ulong_t cnt;
2801 2784          caddr_t eaddr = addr + size;
2802 2785          pgcnt_t npages = btop(size);
2803 2786          size_t ssize;
2804 2787          page_t **pl;
2805 2788  
2806 2789          ASSERT(AS_LOCK_HELD(as));
2807 2790          ASSERT(seg != NULL);
2808 2791          ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2809 2792          ASSERT(addr + size > seg->s_base + seg->s_size);
2810 2793          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2811 2794          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2812 2795          ASSERT(plist != NULL);
2813 2796  
2814 2797          for (cnt = 0; addr < eaddr; addr += ssize) {
2815 2798                  if (addr >= seg->s_base + seg->s_size) {
2816 2799                          seg = AS_SEGNEXT(as, seg);
2817 2800                          ASSERT(seg != NULL && addr == seg->s_base);
2818 2801                          cnt++;
2819 2802                  }
2820 2803                  if (eaddr > seg->s_base + seg->s_size) {
2821 2804                          ssize = seg->s_base + seg->s_size - addr;
2822 2805                  } else {
2823 2806                          ssize = eaddr - addr;
2824 2807                  }
2825 2808                  pl = &plist[npages + cnt];
2826 2809                  ASSERT(*pl != NULL);
2827 2810                  (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2828 2811                      L_PAGEUNLOCK, rw);
2829 2812          }
2830 2813          ASSERT(cnt > 0);
2831 2814          AS_LOCK_EXIT(as);
2832 2815  
2833 2816          cnt++;
2834 2817          kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2835 2818  }
2836 2819  
2837 2820  /*
2838 2821   * unlock pages in a given address range
2839 2822   */
2840 2823  void
2841 2824  as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2842 2825      enum seg_rw rw)
2843 2826  {
2844 2827          struct seg *seg;
2845 2828          size_t rsize;
2846 2829          caddr_t raddr;
2847 2830  
2848 2831          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2849 2832              "as_pageunlock_start: addr %p size %ld", addr, size);
2850 2833  
2851 2834          /*
2852 2835           * If the shadow list is NULL, as_pagelock fell back
2853 2836           * to as_fault.
2854 2837           */
2855 2838          if (pp == NULL) {
2856 2839                  (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2857 2840                  return;
2858 2841          }
2859 2842  
2860 2843          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2861 2844          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2862 2845              (size_t)raddr;
2863 2846  
2864 2847          AS_LOCK_ENTER(as, RW_READER);
2865 2848          seg = as_segat(as, raddr);
2866 2849          ASSERT(seg != NULL);
2867 2850  
2868 2851          TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2869 2852              "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2870 2853  
2871 2854          ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2872 2855          if (raddr + rsize <= seg->s_base + seg->s_size) {
2873 2856                  SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2874 2857          } else {
2875 2858                  as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2876 2859                  return;
2877 2860          }
2878 2861          AS_LOCK_EXIT(as);
2879 2862          TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2880 2863  }
2881 2864  
2882 2865  int
2883 2866  as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2884 2867      boolean_t wait)
2885 2868  {
2886 2869          struct seg *seg;
2887 2870          size_t ssize;
2888 2871          caddr_t raddr;                  /* rounded down addr */
2889 2872          size_t rsize;                   /* rounded up size */
2890 2873          int error = 0;
2891 2874          size_t pgsz = page_get_pagesize(szc);
2892 2875  
2893 2876  setpgsz_top:
2894 2877          if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2895 2878                  return (EINVAL);
2896 2879          }
2897 2880  
2898 2881          raddr = addr;
2899 2882          rsize = size;
2900 2883  
2901 2884          if (raddr + rsize < raddr)              /* check for wraparound */
2902 2885                  return (ENOMEM);
2903 2886  
2904 2887          AS_LOCK_ENTER(as, RW_WRITER);
2905 2888          as_clearwatchprot(as, raddr, rsize);
2906 2889          seg = as_segat(as, raddr);
2907 2890          if (seg == NULL) {
2908 2891                  as_setwatch(as);
2909 2892                  AS_LOCK_EXIT(as);
2910 2893                  return (ENOMEM);
2911 2894          }
2912 2895  
2913 2896          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2914 2897                  if (raddr >= seg->s_base + seg->s_size) {
2915 2898                          seg = AS_SEGNEXT(as, seg);
2916 2899                          if (seg == NULL || raddr != seg->s_base) {
2917 2900                                  error = ENOMEM;
2918 2901                                  break;
2919 2902                          }
2920 2903                  }
2921 2904                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2922 2905                          ssize = seg->s_base + seg->s_size - raddr;
2923 2906                  } else {
2924 2907                          ssize = rsize;
2925 2908                  }
2926 2909  
2927 2910  retry:
2928 2911                  error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2929 2912  
2930 2913                  if (error == IE_NOMEM) {
2931 2914                          error = EAGAIN;
2932 2915                          break;
2933 2916                  }
2934 2917  
2935 2918                  if (error == IE_RETRY) {
2936 2919                          AS_LOCK_EXIT(as);
2937 2920                          goto setpgsz_top;
2938 2921                  }
2939 2922  
2940 2923                  if (error == ENOTSUP) {
2941 2924                          error = EINVAL;
2942 2925                          break;
2943 2926                  }
2944 2927  
2945 2928                  if (wait && (error == EAGAIN)) {
2946 2929                          /*
2947 2930                           * Memory is currently locked.  It must be unlocked
2948 2931                           * before this operation can succeed through a retry.
2949 2932                           * The possible reasons for locked memory and
2950 2933                           * corresponding strategies for unlocking are:
2951 2934                           * (1) Normal I/O
2952 2935                           *      wait for a signal that the I/O operation
2953 2936                           *      has completed and the memory is unlocked.
2954 2937                           * (2) Asynchronous I/O
2955 2938                           *      The aio subsystem does not unlock pages when
2956 2939                           *      the I/O is completed. Those pages are unlocked
2957 2940                           *      when the application calls aiowait/aioerror.
2958 2941                           *      So, to prevent blocking forever, cv_broadcast()
2959 2942                           *      is done to wake up aio_cleanup_thread.
2960 2943                           *      Subsequently, segvn_reclaim will be called, and
2961 2944                           *      that will do AS_CLRUNMAPWAIT() and wake us up.
2962 2945                           * (3) Long term page locking:
2963 2946                           *      This is not relevant for as_setpagesize()
2964 2947                           *      because we cannot change the page size for
2965 2948                           *      driver memory. The attempt to do so will
2966 2949                           *      fail with a different error than EAGAIN so
2967 2950                           *      there's no need to trigger as callbacks like
2968 2951                           *      as_unmap, as_setprot or as_free would do.
2969 2952                           */
2970 2953                          mutex_enter(&as->a_contents);
2971 2954                          if (!AS_ISNOUNMAPWAIT(as)) {
2972 2955                                  if (AS_ISUNMAPWAIT(as) == 0) {
2973 2956                                          cv_broadcast(&as->a_cv);
2974 2957                                  }
2975 2958                                  AS_SETUNMAPWAIT(as);
2976 2959                                  AS_LOCK_EXIT(as);
2977 2960                                  while (AS_ISUNMAPWAIT(as)) {
2978 2961                                          cv_wait(&as->a_cv, &as->a_contents);
2979 2962                                  }
2980 2963                          } else {
2981 2964                                  /*
2982 2965                                   * We may have raced with
2983 2966                                   * segvn_reclaim()/segspt_reclaim(). In this
2984 2967                                   * case clean nounmapwait flag and retry since
2985 2968                                   * softlockcnt in this segment may be already
2986 2969                                   * 0.  We don't drop as writer lock so our
2987 2970                                   * number of retries without sleeping should
2988 2971                                   * be very small. See segvn_reclaim() for
2989 2972                                   * more comments.
2990 2973                                   */
2991 2974                                  AS_CLRNOUNMAPWAIT(as);
2992 2975                                  mutex_exit(&as->a_contents);
2993 2976                                  goto retry;
2994 2977                          }
2995 2978                          mutex_exit(&as->a_contents);
2996 2979                          goto setpgsz_top;
2997 2980                  } else if (error != 0) {
2998 2981                          break;
2999 2982                  }
3000 2983          }
3001 2984          as_setwatch(as);
3002 2985          AS_LOCK_EXIT(as);
3003 2986          return (error);
3004 2987  }
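
as_setpagesize() refuses any request whose address or length is not aligned to the page size implied by szc; everything else is retried per segment. The alignment test is plain power-of-two arithmetic. A tiny userland sketch follows; the IS_P2ALIGNED definition is a local stand-in that mirrors the usual sysmacros.h pattern, and the 2MB large-page size is only an example value.

#include <stdio.h>
#include <stdint.h>

/* Local stand-in: aligned if the low bits below 'a' are all zero. */
#define	IS_P2ALIGNED(v, a)	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)

int
main(void)
{
	uintptr_t addr = 0x200000;		/* 2MB-aligned start address */
	size_t size = 6 * 1024 * 1024;		/* 6MB, a multiple of 2MB */
	size_t pgsz = 2 * 1024 * 1024;		/* requested large page size */

	printf("addr aligned: %d, size aligned: %d\n",
	    (int)IS_P2ALIGNED(addr, pgsz), (int)IS_P2ALIGNED(size, pgsz));

	/* an unaligned address fails the same check that returns EINVAL above */
	printf("0x201000 aligned to 2MB: %d\n",
	    (int)IS_P2ALIGNED((uintptr_t)0x201000, pgsz));
	return (0);
}
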
3005 2988  
3006 2989  /*
3007 2990   * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3008 2991   * in its chunk where s_szc is less than the szc we want to set.
3009 2992   */
3010 2993  static int
3011 2994  as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3012 2995      int *retry)
3013 2996  {
3014 2997          struct seg *seg;
3015 2998          size_t ssize;
3016 2999          int error;
3017 3000  
3018 3001          ASSERT(AS_WRITE_HELD(as));
3019 3002  
3020 3003          seg = as_segat(as, raddr);
3021 3004          if (seg == NULL) {
3022 3005                  panic("as_iset3_default_lpsize: no seg");
3023 3006          }
3024 3007  
3025 3008          for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3026 3009                  if (raddr >= seg->s_base + seg->s_size) {
3027 3010                          seg = AS_SEGNEXT(as, seg);
3028 3011                          if (seg == NULL || raddr != seg->s_base) {
3029 3012                                  panic("as_iset3_default_lpsize: as changed");
3030 3013                          }
3031 3014                  }
3032 3015                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3033 3016                          ssize = seg->s_base + seg->s_size - raddr;
3034 3017                  } else {
3035 3018                          ssize = rsize;
3036 3019                  }
3037 3020  
3038 3021                  if (szc > seg->s_szc) {
3039 3022                          error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3040 3023                          /* Only retry on EINVAL segments that have no vnode. */
3041 3024                          if (error == EINVAL) {
3042 3025                                  vnode_t *vp = NULL;
3043 3026                                  if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3044 3027                                      (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3045 3028                                      vp == NULL)) {
3046 3029                                          *retry = 1;
3047 3030                                  } else {
3048 3031                                          *retry = 0;
3049 3032                                  }
3050 3033                          }
3051 3034                          if (error) {
3052 3035                                  return (error);
3053 3036                          }
3054 3037                  }
3055 3038          }
3056 3039          return (0);
3057 3040  }
3058 3041  
3059 3042  /*
3060 3043   * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3061 3044   * pagesize on each segment in its range, but if any fails with EINVAL,
3062 3045   * then it reduces the pagesizes to the next size in the bitmap and
3063 3046   * retries as_iset3_default_lpsize(). The reason why the code retries
3064 3047   * smaller allowed sizes on EINVAL is because (a) the anon offset may not
3065 3048   * match the bigger sizes, and (b) it's hard to get this offset (to begin
3066 3049   * with) to pass to map_pgszcvec().
3067 3050   */
3068 3051  static int
3069 3052  as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3070 3053      uint_t szcvec)
3071 3054  {
3072 3055          int error;
3073 3056          int retry;
3074 3057  
3075 3058          ASSERT(AS_WRITE_HELD(as));
3076 3059  
3077 3060          for (;;) {
3078 3061                  error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3079 3062                  if (error == EINVAL && retry) {
3080 3063                          szcvec &= ~(1 << szc);
3081 3064                          if (szcvec <= 1) {
3082 3065                                  return (EINVAL);
3083 3066                          }
3084 3067                          szc = highbit(szcvec) - 1;
3085 3068                  } else {
3086 3069                          return (error);
3087 3070                  }
3088 3071          }
3089 3072  }
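
The loop in as_iset2_default_lpsize() peels the failed size code out of the allowed-sizes bitmap and falls back to the largest size still set: szcvec &= ~(1 << szc) clears the failed bit and highbit(szcvec) - 1 selects the next candidate, until only the base page size (bit 0) remains. A standalone sketch of that selection logic follows; the local highbit() mirrors the kernel helper's 1-based highest-set-bit behaviour but is written here purely for illustration.

#include <stdio.h>

/* Local stand-in: position of the highest set bit, 1-based; 0 if none set. */
static int
highbit(unsigned long v)
{
	int pos = 0;

	while (v != 0) {
		pos++;
		v >>= 1;
	}
	return (pos);
}

int
main(void)
{
	unsigned int szcvec = 0x0b;	/* size codes 0 (base), 1 and 3 allowed */
	unsigned int szc = highbit(szcvec) - 1;		/* start at szc 3 */

	for (;;) {
		printf("trying szc %u\n", szc);
		/* pretend this szc failed with EINVAL on a vnode-less segment */
		szcvec &= ~(1U << szc);
		if (szcvec <= 1) {
			printf("only the base page size left -> EINVAL\n");
			break;
		}
		szc = highbit(szcvec) - 1;	/* retry at the next largest */
	}
	return (0);
}
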
3090 3073  
3091 3074  /*
3092 3075   * as_iset1_default_lpsize() breaks its chunk into areas where existing
3093 3076   * segments have a smaller szc than we want to set. For each such area,
3094 3077   * it calls as_iset2_default_lpsize()
3095 3078   */
3096 3079  static int
3097 3080  as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3098 3081      uint_t szcvec)
3099 3082  {
3100 3083          struct seg *seg;
3101 3084          size_t ssize;
3102 3085          caddr_t setaddr = raddr;
3103 3086          size_t setsize = 0;
3104 3087          int set;
3105 3088          int error;
3106 3089  
3107 3090          ASSERT(AS_WRITE_HELD(as));
3108 3091  
3109 3092          seg = as_segat(as, raddr);
3110 3093          if (seg == NULL) {
3111 3094                  panic("as_iset1_default_lpsize: no seg");
3112 3095          }
3113 3096          if (seg->s_szc < szc) {
3114 3097                  set = 1;
3115 3098          } else {
3116 3099                  set = 0;
3117 3100          }
3118 3101  
3119 3102          for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3120 3103                  if (raddr >= seg->s_base + seg->s_size) {
3121 3104                          seg = AS_SEGNEXT(as, seg);
3122 3105                          if (seg == NULL || raddr != seg->s_base) {
3123 3106                                  panic("as_iset1_default_lpsize: as changed");
3124 3107                          }
3125 3108                          if (seg->s_szc >= szc && set) {
3126 3109                                  ASSERT(setsize != 0);
3127 3110                                  error = as_iset2_default_lpsize(as,
3128 3111                                      setaddr, setsize, szc, szcvec);
3129 3112                                  if (error) {
3130 3113                                          return (error);
3131 3114                                  }
3132 3115                                  set = 0;
3133 3116                          } else if (seg->s_szc < szc && !set) {
3134 3117                                  setaddr = raddr;
3135 3118                                  setsize = 0;
3136 3119                                  set = 1;
3137 3120                          }
3138 3121                  }
3139 3122                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3140 3123                          ssize = seg->s_base + seg->s_size - raddr;
3141 3124                  } else {
3142 3125                          ssize = rsize;
3143 3126                  }
3144 3127          }
3145 3128          error = 0;
3146 3129          if (set) {
3147 3130                  ASSERT(setsize != 0);
3148 3131                  error = as_iset2_default_lpsize(as, setaddr, setsize,
3149 3132                      szc, szcvec);
3150 3133          }
3151 3134          return (error);
3152 3135  }
3153 3136  
3154 3137  /*
3155 3138   * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3156 3139   * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3157 3140   * chunk to as_iset1_default_lpsize().
3158 3141   */
3159 3142  static int
3160 3143  as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3161 3144      int type)
3162 3145  {
3163 3146          int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3164 3147          uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3165 3148              flags, rtype, 1);
3166 3149          uint_t szc;
3167 3150          uint_t nszc;
3168 3151          int error;
3169 3152          caddr_t a;
3170 3153          caddr_t eaddr;
3171 3154          size_t segsize;
3172 3155          size_t pgsz;
3173 3156          uint_t save_szcvec;
3174 3157  
3175 3158          ASSERT(AS_WRITE_HELD(as));
3176 3159          ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3177 3160          ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3178 3161  
3179 3162          szcvec &= ~1;
3180 3163          if (szcvec <= 1) {      /* skip if base page size */
3181 3164                  return (0);
3182 3165          }
3183 3166  
3184 3167          /* Get the pagesize of the first larger page size. */
3185 3168          szc = lowbit(szcvec) - 1;
3186 3169          pgsz = page_get_pagesize(szc);
3187 3170          eaddr = addr + size;
3188 3171          addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3189 3172          eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3190 3173  
3191 3174          save_szcvec = szcvec;
3192 3175          szcvec >>= (szc + 1);
3193 3176          nszc = szc;
3194 3177          while (szcvec) {
3195 3178                  if ((szcvec & 0x1) == 0) {
3196 3179                          nszc++;
3197 3180                          szcvec >>= 1;
3198 3181                          continue;
3199 3182                  }
3200 3183                  nszc++;
3201 3184                  pgsz = page_get_pagesize(nszc);
3202 3185                  a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3203 3186                  if (a != addr) {
3204 3187                          ASSERT(szc > 0);
3205 3188                          ASSERT(a < eaddr);
3206 3189                          segsize = a - addr;
3207 3190                          error = as_iset1_default_lpsize(as, addr, segsize, szc,
3208 3191                              save_szcvec);
3209 3192                          if (error) {
3210 3193                                  return (error);
3211 3194                          }
3212 3195                          addr = a;
3213 3196                  }
3214 3197                  szc = nszc;
3215 3198                  szcvec >>= 1;
3216 3199          }
3217 3200  
3218 3201          ASSERT(addr < eaddr);
3219 3202          szcvec = save_szcvec;
3220 3203          while (szcvec) {
3221 3204                  a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3222 3205                  ASSERT(a >= addr);
3223 3206                  if (a != addr) {
3224 3207                          ASSERT(szc > 0);
3225 3208                          segsize = a - addr;
3226 3209                          error = as_iset1_default_lpsize(as, addr, segsize, szc,
3227 3210                              save_szcvec);
3228 3211                          if (error) {
3229 3212                                  return (error);
3230 3213                          }
3231 3214                          addr = a;
3232 3215                  }
3233 3216                  szcvec &= ~(1 << szc);
3234 3217                  if (szcvec) {
3235 3218                          szc = highbit(szcvec) - 1;
3236 3219                          pgsz = page_get_pagesize(szc);
3237 3220                  }
3238 3221          }
3239 3222          ASSERT(addr == eaddr);
3240 3223  
3241 3224          return (0);
3242 3225  }
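
as_iset_default_lpsize() carves [addr, eaddr) into chunks whose boundaries are aligned to successively larger page sizes: the first loop walks up through the size codes, rounding addr up to each larger alignment, and the second walks back down from eaddr. The alignment arithmetic is just P2ROUNDUP()/P2ALIGN(). The sketch below shows only the "walk up" half with made-up page sizes; the macros are local stand-ins following the usual sysmacros.h pattern, not the kernel's definitions.

#include <stdio.h>
#include <stdint.h>

/* Local stand-ins for the power-of-two helpers (power-of-two 'a' assumed). */
#define	P2ALIGN(x, a)	((uintptr_t)(x) & ~((uintptr_t)(a) - 1))
#define	P2ROUNDUP(x, a)	P2ALIGN((uintptr_t)(x) + (uintptr_t)(a) - 1, (a))

int
main(void)
{
	/* assumed page sizes for size codes 1..3: 64K, 512K, 4M */
	uintptr_t pgsz[3] = { 0x10000, 0x80000, 0x400000 };
	uintptr_t addr = 0x123000;	/* start of the (page-aligned) range */
	uintptr_t eaddr = 0xa00000;	/* end of the range */
	int i;

	for (i = 0; i < 3; i++) {
		uintptr_t a = P2ROUNDUP(addr, pgsz[i]);

		if (a != addr && a < eaddr) {
			/* [addr, a) can only use the previous, smaller size */
			printf("chunk [%#lx, %#lx) kept at the smaller size\n",
			    (unsigned long)addr, (unsigned long)a);
			addr = a;
		}
	}
	printf("remaining [%#lx, %#lx) is eligible for the largest size\n",
	    (unsigned long)addr, (unsigned long)eaddr);
	return (0);
}
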
3243 3226  
3244 3227  /*
3245 3228   * Set the default large page size for the range. Called via memcntl with
3246 3229   * page size set to 0. as_set_default_lpsize breaks the range down into
3247 3230   * chunks with the same type/flags, ignores non-segvn segments, and passes
3248 3231   * each chunk to as_iset_default_lpsize().
3249 3232   */
3250 3233  int
3251 3234  as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3252 3235  {
3253 3236          struct seg *seg;
3254 3237          caddr_t raddr;
3255 3238          size_t rsize;
3256 3239          size_t ssize;
3257 3240          int rtype, rflags;
3258 3241          int stype, sflags;
3259 3242          int error;
3260 3243          caddr_t setaddr;
3261 3244          size_t setsize;
3262 3245          int segvn;
3263 3246  
3264 3247          if (size == 0)
3265 3248                  return (0);
3266 3249  
3267 3250          AS_LOCK_ENTER(as, RW_WRITER);
3268 3251  again:
3269 3252          error = 0;
3270 3253  
3271 3254          raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3272 3255          rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3273 3256              (size_t)raddr;
3274 3257  
3275 3258          if (raddr + rsize < raddr) {            /* check for wraparound */
3276 3259                  AS_LOCK_EXIT(as);
3277 3260                  return (ENOMEM);
3278 3261          }
3279 3262          as_clearwatchprot(as, raddr, rsize);
3280 3263          seg = as_segat(as, raddr);
3281 3264          if (seg == NULL) {
3282 3265                  as_setwatch(as);
3283 3266                  AS_LOCK_EXIT(as);
3284 3267                  return (ENOMEM);
3285 3268          }
3286 3269          if (seg->s_ops == &segvn_ops) {
3287 3270                  rtype = SEGOP_GETTYPE(seg, addr);
3288 3271                  rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3289 3272                  rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3290 3273                  segvn = 1;
3291 3274          } else {
3292 3275                  segvn = 0;
3293 3276          }
3294 3277          setaddr = raddr;
3295 3278          setsize = 0;
3296 3279  
3297 3280          for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3298 3281                  if (raddr >= (seg->s_base + seg->s_size)) {
3299 3282                          seg = AS_SEGNEXT(as, seg);
3300 3283                          if (seg == NULL || raddr != seg->s_base) {
3301 3284                                  error = ENOMEM;
3302 3285                                  break;
3303 3286                          }
3304 3287                          if (seg->s_ops == &segvn_ops) {
3305 3288                                  stype = SEGOP_GETTYPE(seg, raddr);
3306 3289                                  sflags = stype & (MAP_TEXT | MAP_INITDATA);
3307 3290                                  stype &= (MAP_SHARED | MAP_PRIVATE);
3308 3291                                  if (segvn && (rflags != sflags ||
3309 3292                                      rtype != stype)) {
3310 3293                                          /*
3311 3294                                           * The next segment is also segvn but
3312 3295                                           * has different flags and/or type.
3313 3296                                           */
3314 3297                                          ASSERT(setsize != 0);
3315 3298                                          error = as_iset_default_lpsize(as,
3316 3299                                              setaddr, setsize, rflags, rtype);
3317 3300                                          if (error) {
3318 3301                                                  break;
3319 3302                                          }
3320 3303                                          rflags = sflags;
3321 3304                                          rtype = stype;
3322 3305                                          setaddr = raddr;
3323 3306                                          setsize = 0;
3324 3307                                  } else if (!segvn) {
3325 3308                                          rflags = sflags;
3326 3309                                          rtype = stype;
3327 3310                                          setaddr = raddr;
3328 3311                                          setsize = 0;
3329 3312                                          segvn = 1;
3330 3313                                  }
3331 3314                          } else if (segvn) {
3332 3315                                  /* The next segment is not segvn. */
3333 3316                                  ASSERT(setsize != 0);
3334 3317                                  error = as_iset_default_lpsize(as,
3335 3318                                      setaddr, setsize, rflags, rtype);
3336 3319                                  if (error) {
3337 3320                                          break;
3338 3321                                  }
3339 3322                                  segvn = 0;
3340 3323                          }
3341 3324                  }
3342 3325                  if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3343 3326                          ssize = seg->s_base + seg->s_size - raddr;
3344 3327                  } else {
3345 3328                          ssize = rsize;
3346 3329                  }
3347 3330          }
3348 3331          if (error == 0 && segvn) {
3349 3332                  /* The last chunk when rsize == 0. */
3350 3333                  ASSERT(setsize != 0);
3351 3334                  error = as_iset_default_lpsize(as, setaddr, setsize,
3352 3335                      rflags, rtype);
3353 3336          }
3354 3337  
3355 3338          if (error == IE_RETRY) {
3356 3339                  goto again;
3357 3340          } else if (error == IE_NOMEM) {
3358 3341                  error = EAGAIN;
3359 3342          } else if (error == ENOTSUP) {
3360 3343                  error = EINVAL;
3361 3344          } else if (error == EAGAIN) {
3362 3345                  mutex_enter(&as->a_contents);
3363 3346                  if (!AS_ISNOUNMAPWAIT(as)) {
3364 3347                          if (AS_ISUNMAPWAIT(as) == 0) {
3365 3348                                  cv_broadcast(&as->a_cv);
3366 3349                          }
3367 3350                          AS_SETUNMAPWAIT(as);
3368 3351                          AS_LOCK_EXIT(as);
3369 3352                          while (AS_ISUNMAPWAIT(as)) {
3370 3353                                  cv_wait(&as->a_cv, &as->a_contents);
3371 3354                          }
3372 3355                          mutex_exit(&as->a_contents);
3373 3356                          AS_LOCK_ENTER(as, RW_WRITER);
3374 3357                  } else {
3375 3358                          /*
3376 3359                           * We may have raced with
3377 3360                           * segvn_reclaim()/segspt_reclaim(). In this case
3378 3361                           * clean nounmapwait flag and retry since softlockcnt
3379 3362                           * in this segment may be already 0.  We don't drop as
3380 3363                           * writer lock so our number of retries without
3381 3364                           * sleeping should be very small. See segvn_reclaim()
3382 3365                           * for more comments.
3383 3366                           */
3384 3367                          AS_CLRNOUNMAPWAIT(as);
3385 3368                          mutex_exit(&as->a_contents);
3386 3369                  }
3387 3370                  goto again;
3388 3371          }
3389 3372  
3390 3373          as_setwatch(as);
3391 3374          AS_LOCK_EXIT(as);
3392 3375          return (error);
3393 3376  }
3394 3377  
3395 3378  /*
3396 3379   * Setup all of the uninitialized watched pages that we can.
3397 3380   */
3398 3381  void
3399 3382  as_setwatch(struct as *as)
3400 3383  {
3401 3384          struct watched_page *pwp;
3402 3385          struct seg *seg;
3403 3386          caddr_t vaddr;
3404 3387          uint_t prot;
3405 3388          int  err, retrycnt;
3406 3389  
3407 3390          if (avl_numnodes(&as->a_wpage) == 0)
3408 3391                  return;
3409 3392  
3410 3393          ASSERT(AS_WRITE_HELD(as));
3411 3394  
3412 3395          for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3413 3396              pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3414 3397                  retrycnt = 0;
3415 3398          retry:
3416 3399                  vaddr = pwp->wp_vaddr;
3417 3400                  if (pwp->wp_oprot != 0 ||       /* already set up */
3418 3401                      (seg = as_segat(as, vaddr)) == NULL ||
3419 3402                      SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3420 3403                          continue;
3421 3404  
3422 3405                  pwp->wp_oprot = prot;
3423 3406                  if (pwp->wp_read)
3424 3407                          prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3425 3408                  if (pwp->wp_write)
3426 3409                          prot &= ~PROT_WRITE;
3427 3410                  if (pwp->wp_exec)
3428 3411                          prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3429 3412                  if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3430 3413                          err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3431 3414                          if (err == IE_RETRY) {
3432 3415                                  pwp->wp_oprot = 0;
3433 3416                                  ASSERT(retrycnt == 0);
3434 3417                                  retrycnt++;
3435 3418                                  goto retry;
3436 3419                          }
3437 3420                  }
3438 3421                  pwp->wp_prot = prot;
3439 3422          }
3440 3423  }
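
The protection juggling in as_setwatch() (repeated in as_setwatchprot() further down) is a simple masking rule: the original protections are saved in wp_oprot, then a read or exec watchpoint strips all of read/write/execute while a write watchpoint strips only PROT_WRITE, so any access of the watched kind faults into the watchpoint machinery. A tiny userland sketch of just that masking follows; PROT_* come from <sys/mman.h> and the three flags stand in for the wp_read/wp_write/wp_exec fields.

#include <stdio.h>
#include <sys/mman.h>	/* PROT_READ, PROT_WRITE, PROT_EXEC */

/* Derive the watch-time protections from the saved originals. */
static unsigned int
watch_prot(unsigned int oprot, int wp_read, int wp_write, int wp_exec)
{
	unsigned int prot = oprot;

	if (wp_read)
		prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
	if (wp_write)
		prot &= ~PROT_WRITE;
	if (wp_exec)
		prot &= ~(PROT_READ | PROT_WRITE | PROT_EXEC);
	return (prot);
}

int
main(void)
{
	unsigned int oprot = PROT_READ | PROT_WRITE;

	/* a write watchpoint leaves the page readable but not writable */
	printf("write watch: %#x -> %#x\n", oprot, watch_prot(oprot, 0, 1, 0));

	/* a read watchpoint revokes everything */
	printf("read watch:  %#x -> %#x\n", oprot, watch_prot(oprot, 1, 0, 0));
	return (0);
}
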
3441 3424  
3442 3425  /*
3443 3426   * Clear all of the watched pages in the address space.
3444 3427   */
3445 3428  void
3446 3429  as_clearwatch(struct as *as)
3447 3430  {
3448 3431          struct watched_page *pwp;
3449 3432          struct seg *seg;
3450 3433          caddr_t vaddr;
3451 3434          uint_t prot;
3452 3435          int err, retrycnt;
3453 3436  
3454 3437          if (avl_numnodes(&as->a_wpage) == 0)
3455 3438                  return;
3456 3439  
3457 3440          ASSERT(AS_WRITE_HELD(as));
3458 3441  
3459 3442          for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3460 3443              pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3461 3444                  retrycnt = 0;
3462 3445          retry:
3463 3446                  vaddr = pwp->wp_vaddr;
3464 3447                  if (pwp->wp_oprot == 0 ||       /* not set up */
3465 3448                      (seg = as_segat(as, vaddr)) == NULL)
3466 3449                          continue;
3467 3450  
3468 3451                  if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3469 3452                          err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3470 3453                          if (err == IE_RETRY) {
3471 3454                                  ASSERT(retrycnt == 0);
3472 3455                                  retrycnt++;
3473 3456                                  goto retry;
3474 3457                          }
3475 3458                  }
3476 3459                  pwp->wp_oprot = 0;
3477 3460                  pwp->wp_prot = 0;
3478 3461          }
3479 3462  }
3480 3463  
3481 3464  /*
3482 3465   * Force a new setup for all the watched pages in the range.
3483 3466   */
3484 3467  static void
3485 3468  as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3486 3469  {
3487 3470          struct watched_page *pwp;
3488 3471          struct watched_page tpw;
3489 3472          caddr_t eaddr = addr + size;
3490 3473          caddr_t vaddr;
3491 3474          struct seg *seg;
3492 3475          int err, retrycnt;
3493 3476          uint_t  wprot;
3494 3477          avl_index_t where;
3495 3478  
3496 3479          if (avl_numnodes(&as->a_wpage) == 0)
3497 3480                  return;
3498 3481  
3499 3482          ASSERT(AS_WRITE_HELD(as));
3500 3483  
3501 3484          tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3502 3485          if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3503 3486                  pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3504 3487  
3505 3488          while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3506 3489                  retrycnt = 0;
3507 3490                  vaddr = pwp->wp_vaddr;
3508 3491  
3509 3492                  wprot = prot;
3510 3493                  if (pwp->wp_read)
3511 3494                          wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3512 3495                  if (pwp->wp_write)
3513 3496                          wprot &= ~PROT_WRITE;
3514 3497                  if (pwp->wp_exec)
3515 3498                          wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3516 3499                  if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3517 3500                  retry:
3518 3501                          seg = as_segat(as, vaddr);
3519 3502                          if (seg == NULL) {
3520 3503                                  panic("as_setwatchprot: no seg");
3521 3504                                  /*NOTREACHED*/
3522 3505                          }
3523 3506                          err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3524 3507                          if (err == IE_RETRY) {
3525 3508                                  ASSERT(retrycnt == 0);
3526 3509                                  retrycnt++;
3527 3510                                  goto retry;
3528 3511                          }
3529 3512                  }
3530 3513                  pwp->wp_oprot = prot;
3531 3514                  pwp->wp_prot = wprot;
3532 3515  
3533 3516                  pwp = AVL_NEXT(&as->a_wpage, pwp);
3534 3517          }
3535 3518  }
3536 3519  
3537 3520  /*
3538 3521   * Clear all of the watched pages in the range.
3539 3522   */
3540 3523  static void
3541 3524  as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3542 3525  {
3543 3526          caddr_t eaddr = addr + size;
3544 3527          struct watched_page *pwp;
3545 3528          struct watched_page tpw;
3546 3529          uint_t prot;
3547 3530          struct seg *seg;
3548 3531          int err, retrycnt;
3549 3532          avl_index_t where;
3550 3533  
3551 3534          if (avl_numnodes(&as->a_wpage) == 0)
3552 3535                  return;
3553 3536  
3554 3537          tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3555 3538          if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3556 3539                  pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3557 3540  
3558 3541          ASSERT(AS_WRITE_HELD(as));
3559 3542  
3560 3543          while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3561 3544  
3562 3545                  if ((prot = pwp->wp_oprot) != 0) {
3563 3546                          retrycnt = 0;
3564 3547  
3565 3548                          if (prot != pwp->wp_prot) {
3566 3549                          retry:
3567 3550                                  seg = as_segat(as, pwp->wp_vaddr);
3568 3551                                  if (seg == NULL)
3569 3552                                          continue;
3570 3553                                  err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3571 3554                                      PAGESIZE, prot);
3572 3555                                  if (err == IE_RETRY) {
3573 3556                                          ASSERT(retrycnt == 0);
3574 3557                                          retrycnt++;
3575 3558                                          goto retry;
3576 3559  
3577 3560                                  }
3578 3561                          }
3579 3562                          pwp->wp_oprot = 0;
3580 3563                          pwp->wp_prot = 0;
3581 3564                  }
3582 3565  
3583 3566                  pwp = AVL_NEXT(&as->a_wpage, pwp);
3584 3567          }
3585 3568  }
3586 3569  
3587 3570  void
3588 3571  as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3589 3572  {
3590 3573          struct proc *p;
3591 3574  
3592 3575          mutex_enter(&pidlock);
3593 3576          for (p = practive; p; p = p->p_next) {
3594 3577                  if (p->p_as == as) {
3595 3578                          mutex_enter(&p->p_lock);
3596 3579                          if (p->p_as == as)
3597 3580                                  sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3598 3581                          mutex_exit(&p->p_lock);
3599 3582                  }
3600 3583          }
3601 3584          mutex_exit(&pidlock);
3602 3585  }
3603 3586  
3604 3587  /*
3605 3588   * return memory object ID
3606 3589   */
3607 3590  int
3608 3591  as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3609 3592  {
3610 3593          struct seg      *seg;
3611 3594          int             sts;
3612 3595  
3613 3596          AS_LOCK_ENTER(as, RW_READER);
3614 3597          seg = as_segat(as, addr);
3615 3598          if (seg == NULL) {
3616 3599                  AS_LOCK_EXIT(as);
3617 3600                  return (EFAULT);
3618 3601          }
3619 3602          /*
3620 3603           * catch old drivers which may not support getmemid
3621 3604           */
3622 3605          if (seg->s_ops->getmemid == NULL) {
3623 3606                  AS_LOCK_EXIT(as);
3624 3607                  return (ENODEV);
3625 3608          }
3626 3609  
3627 3610          sts = SEGOP_GETMEMID(seg, addr, memidp);
3628 3611  
3629 3612          AS_LOCK_EXIT(as);
3630 3613          return (sts);
3631 3614  }
  
    | 
      ↓ open down ↓ | 
    1886 lines elided | 
    
      ↑ open up ↑ | 
  