/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015, Joyent, Inc.  All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);


/*
 * Verifying the segment lists is very time-consuming; it may not always be
 * desirable to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define VERIFY_SEGLIST
int do_as_verify = 0;
#endif
/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
        struct as_callback      *current_head, *cb;
        caddr_t                 saddr;
        size_t                  rsize;

        /* callback function and an event are mandatory */
        if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
                return (EINVAL);

        /* Adding a callback after as_free has been called is not allowed */
        if (as == &kas)
                return (ENOMEM);

        /*
         * vaddr = 0 and size = -1 is used to indicate that the callback range
         * is the entire address space so no rounding is done in that case.
         */
        if (size != -1) {
                saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
                rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
                    (size_t)saddr;
                /* check for wraparound */
                if (saddr + rsize < saddr)
                        return (ENOMEM);
        } else {
                if (vaddr != 0)
                        return (EINVAL);
                saddr = vaddr;
                rsize = size;
        }
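
        /*
         * Worked example of the rounding above, assuming 4K pages
         * (PAGEMASK = ~0xfff): vaddr = 0x12345 and size = 0x100 give
         * saddr = 0x12000 and rsize = ((0x12445 + 0xfff) & ~0xfff) -
         * 0x12000 = 0x1000, i.e. the single page containing the range.
         */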

        /* Allocate and initialize a callback entry */
        cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
        if (cb == NULL)
                return (EAGAIN);

        cb->ascb_func = cb_func;
        cb->ascb_arg = arg;
        cb->ascb_events = events;
        cb->ascb_saddr = saddr;
        cb->ascb_len = rsize;

        /* Add the entry to the list */
        mutex_enter(&as->a_contents);
        current_head = as->a_callbacks;
        as->a_callbacks = cb;
        cb->ascb_next = current_head;

        /*
         * The call to this function may lose in a race with
         * a pertinent event - e.g. a thread does long term memory locking
         * but before the callback is added another thread executes as_unmap.
         * A broadcast here resolves that.
         */
        if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
                AS_CLRUNMAPWAIT(as);
                cv_broadcast(&as->a_cv);
        }

        mutex_exit(&as->a_contents);
        return (0);
}
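
/*
 * Usage sketch (hypothetical driver code, not part of this file): a driver
 * that keeps pages locked long term registers an unmap callback so that
 * as_unmap()/as_setprot() can ask it to release the locks.  The names
 * my_unmap_cb, my_state and my_release_locked_pages are invented for
 * illustration.
 *
 *	static void
 *	my_unmap_cb(struct as *as, void *arg, uint_t events)
 *	{
 *		struct my_state *sp = arg;
 *
 *		my_release_locked_pages(sp);
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	if (as_add_callback(as, my_unmap_cb, sp, AS_UNMAP_EVENT,
 *	    vaddr, len, KM_SLEEP) != 0)
 *		return (ENOMEM);
 */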

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 *                      entry will be made in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
        struct as_callback **prevcb = &as->a_callbacks;
        struct as_callback *cb;
        uint_t rc = AS_CALLBACK_NOTFOUND;

        mutex_enter(&as->a_contents);
        for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
                if (cb->ascb_arg != arg)
                        continue;

                /*
                 * If the events indicate AS_CALLBACK_CALLED, just clear
                 * AS_ALL_EVENT in the events field and wake up the thread
                 * that may be waiting in as_do_callbacks.  as_do_callbacks
                 * will take care of removing this entry from the list.  In
                 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
                 * (AS_CALLBACK_CALLED not set), just remove it from the
                 * list, return the memory and return AS_CALLBACK_DELETED.
                 */
                if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
                        /* leave AS_CALLBACK_CALLED */
                        cb->ascb_events &= ~AS_ALL_EVENT;
                        rc = AS_CALLBACK_DELETE_DEFERRED;
                        cv_broadcast(&as->a_cv);
                } else {
                        *prevcb = cb->ascb_next;
                        kmem_free(cb, sizeof (struct as_callback));
                        rc = AS_CALLBACK_DELETED;
                }
                break;
        }
        mutex_exit(&as->a_contents);
        return (rc);
}
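
/*
 * Client-side sketch of the deletion handshake (illustrative only;
 * my_teardown is an invented name).  AS_CALLBACK_DELETE_DEFERRED means
 * as_do_callbacks still owns the entry, so state shared with the callback
 * function must not be reclaimed yet.
 *
 *	switch (as_delete_callback(as, arg)) {
 *	case AS_CALLBACK_DELETED:
 *	case AS_CALLBACK_NOTFOUND:
 *		my_teardown(arg);	(safe to reclaim now)
 *		break;
 *	case AS_CALLBACK_DELETE_DEFERRED:
 *		break;			(as_do_callbacks frees the entry)
 *	}
 */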

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps so it is ok to call it with locks held
 * in addition to the (required) a_contents mutex.
 *
 * See also comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
        struct as_callback      *cb;

        ASSERT(MUTEX_HELD(&as->a_contents));
        for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
                /*
                 * If the callback has not already been called, then
                 * check if events or address range pertains.  An event_len
                 * of zero means do an unconditional callback.
                 */
                if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
                    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
                    (event_addr + event_len < cb->ascb_saddr) ||
                    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
                        continue;
                }
                break;
        }
        return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
        struct as_callback **prevcb;
        void    *cb_arg;

        ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
        cb->ascb_events |= AS_CALLBACK_CALLED;
        mutex_exit(&as->a_contents);
        (*cb->ascb_func)(as, cb->ascb_arg, events);
        mutex_enter(&as->a_contents);
        /*
         * The callback function is required to delete the callback
         * when the callback function determines it is OK for
         * this thread to continue. as_delete_callback will clear
         * the AS_ALL_EVENT in the events field when it is deleted.
         * If the callback function called as_delete_callback,
         * events will already be cleared and there will be no blocking.
         */
        while ((cb->ascb_events & events) != 0) {
                cv_wait(&as->a_cv, &as->a_contents);
        }
        /*
         * This entry needs to be taken off the list. Normally, the
         * callback func itself does that, but unfortunately the list
         * may have changed while the callback was running because the
         * a_contents mutex was dropped and someone other than the
         * callback func itself could have called as_delete_callback,
         * so we have to search to find this entry again.  The entry
         * must have AS_CALLBACK_CALLED, and have the same 'arg'.
         */
        cb_arg = cb->ascb_arg;
        prevcb = &as->a_callbacks;
        for (cb = as->a_callbacks; cb != NULL;
            prevcb = &cb->ascb_next, cb = *prevcb) {
                if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
                    (cb_arg != cb->ascb_arg)) {
                        continue;
                }
                *prevcb = cb->ascb_next;
                kmem_free(cb, sizeof (struct as_callback));
                break;
        }
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - the event is not of interest
 *    - the address range is not of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), only the AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback, have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end up comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
        struct as_callback *cb;

        if ((cb = as_find_callback(as, events, event_addr, event_len))) {
                as_execute_callback(as, cb, events);
                return (-1);
        }
        return (0);
}
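
/*
 * The find/execute split lets a caller that holds a_lock drop it only for
 * the sleeping half.  A sketch of the pattern (this is how the EAGAIN paths
 * in as_setprot and as_unmap below use it):
 *
 *	mutex_enter(&as->a_contents);
 *	if ((cb = as_find_callback(as, AS_UNMAP_EVENT, addr, len)) != NULL) {
 *		AS_LOCK_EXIT(as);	(a_lock must not be held across this)
 *		as_execute_callback(as, cb, AS_UNMAP_EVENT);
 *	}
 *	mutex_exit(&as->a_contents);
 */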

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
        struct seg *seg = as->a_seglast;
        avl_index_t where;

        ASSERT(AS_LOCK_HELD(as));

        if (seg != NULL &&
            seg->s_base <= addr &&
            addr < seg->s_base + seg->s_size)
                return (seg);

        seg = avl_find(&as->a_segtree, &addr, &where);
        if (seg != NULL)
                return (as->a_seglast = seg);

        seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
        if (seg == NULL && tail)
                seg = avl_last(&as->a_segtree);
        return (as->a_seglast = seg);
}
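
/*
 * Worked example of the lookup semantics: given exactly two segments
 * [A, A+len1) and [C, C+len2) with A+len1 < C, as_findseg(as, addr, 0)
 * returns the first segment for any addr below A+len1 (even addr < A),
 * the second segment for addr in [A+len1, C+len2), and NULL for addr at
 * or above C+len2; with tail != 0 that last case returns the second
 * segment instead.
 */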

#ifdef VERIFY_SEGLIST
/*
 * verify that the linked list is coherent
 */
static void
as_verify(struct as *as)
{
        struct seg *seg, *seglast, *p, *n;
        uint_t nsegs = 0;

        if (do_as_verify == 0)
                return;

        seglast = as->a_seglast;

        for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
                ASSERT(seg->s_as == as);
                p = AS_SEGPREV(as, seg);
                n = AS_SEGNEXT(as, seg);
                ASSERT(p == NULL || p->s_as == as);
                ASSERT(p == NULL || p->s_base < seg->s_base);
                ASSERT(n == NULL || n->s_base > seg->s_base);
                ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
                if (seg == seglast)
                        seglast = NULL;
                nsegs++;
        }
        ASSERT(seglast == NULL);
        ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use the last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
        struct seg *seg;
        caddr_t addr;
        caddr_t eaddr;
        avl_index_t where;

        ASSERT(AS_WRITE_HELD(as));

        as->a_updatedir = 1; /* inform /proc */
        gethrestime(&as->a_updatetime);

        if (as->a_lastgaphl != NULL) {
                struct seg *hseg = NULL;
                struct seg *lseg = NULL;

                if (as->a_lastgaphl->s_base > newseg->s_base) {
                        hseg = as->a_lastgaphl;
                        lseg = AVL_PREV(&as->a_segtree, hseg);
                } else {
                        lseg = as->a_lastgaphl;
                        hseg = AVL_NEXT(&as->a_segtree, lseg);
                }

                if (hseg && lseg && lseg->s_base < newseg->s_base &&
                    hseg->s_base > newseg->s_base) {
                        avl_insert_here(&as->a_segtree, newseg, lseg,
                            AVL_AFTER);
                        as->a_lastgaphl = NULL;
                        as->a_seglast = newseg;
                        return (0);
                }
                as->a_lastgaphl = NULL;
        }

        addr = newseg->s_base;
        eaddr = addr + newseg->s_size;
again:

        seg = avl_find(&as->a_segtree, &addr, &where);

        if (seg == NULL)
                seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

        if (seg == NULL)
                seg = avl_last(&as->a_segtree);

        if (seg != NULL) {
                caddr_t base = seg->s_base;

                /*
                 * If top of seg is below the requested address, then
                 * the insertion point is at the end of the linked list,
                 * and seg points to the tail of the list.  Otherwise,
                 * the insertion point is immediately before seg.
                 */
                if (base + seg->s_size > addr) {
                        if (addr >= base || eaddr > base) {
#ifdef __sparc
                                extern struct seg_ops segnf_ops;

                                /*
                                 * no-fault segs must disappear if overlaid.
                                 * XXX need new segment type so
                                 * we don't have to check s_ops
                                 */
                                if (seg->s_ops == &segnf_ops) {
                                        seg_unmap(seg);
                                        goto again;
                                }
#endif
                                return (-1);    /* overlapping segment */
                        }
                }
        }
        as->a_seglast = newseg;
        avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
        as_verify(as);
#endif
        return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
        avl_tree_t *t;

        ASSERT(AS_WRITE_HELD(as));

        as->a_updatedir = 1; /* inform /proc */
        gethrestime(&as->a_updatetime);

        if (seg == NULL)
                return (NULL);

        t = &as->a_segtree;
        if (as->a_seglast == seg)
                as->a_seglast = NULL;
        as->a_lastgaphl = NULL;

        /*
         * if this segment is at an address higher than
         * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
         */
        if (as->a_lastgap &&
            (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
                as->a_lastgap = AVL_NEXT(t, seg);

        /*
         * remove the segment from the seg tree
         */
        avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
        as_verify(as);
#endif
        return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
        struct seg *seg = as->a_seglast;

        ASSERT(AS_LOCK_HELD(as));

        if (seg != NULL && seg->s_base <= addr &&
            addr < seg->s_base + seg->s_size)
                return (seg);

        seg = avl_find(&as->a_segtree, &addr, NULL);
        return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
        mutex_enter(&as->a_contents);
        while (AS_ISCLAIMGAP(as))
                cv_wait(&as->a_cv, &as->a_contents);
        AS_SETCLAIMGAP(as);
        mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
        mutex_enter(&as->a_contents);
        AS_CLRCLAIMGAP(as);
        cv_signal(&as->a_cv);
        mutex_exit(&as->a_contents);
}
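
/*
 * Sketch of the canonical caller pattern (mmap-style paths; the gap search
 * shown as find_free_range is a stand-in for whatever address selection the
 * caller actually does):
 *
 *	as_rangelock(as);
 *	error = find_free_range(as, &addr, len);
 *	if (error == 0)
 *		error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 *
 * Bracketing both steps ensures no other thread can claim the chosen range
 * between the search and the map.
 */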

/*
 * Compare segments (or just an address) by segment address range.
 */
static int
as_segcompar(const void *x, const void *y)
{
        struct seg *a = (struct seg *)x;
        struct seg *b = (struct seg *)y;

        if (a->s_base < b->s_base)
                return (-1);
        if (a->s_base >= b->s_base + b->s_size)
                return (1);
        return (0);
}
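
/*
 * Note on the comparator: callers such as as_segat pass a caddr_t * as the
 * search key (avl_find(&as->a_segtree, &addr, NULL)).  This relies on
 * s_base being the first member of struct seg, so the cast above reads a
 * bare address as if it were a segment base; the range test against
 * b->s_size then makes an address compare equal to any segment that
 * contains it.
 */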

void
as_avlinit(struct as *as)
{
        avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
            offsetof(struct seg, s_tree));
        avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
            offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
        struct as *as = buf;

        mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
        rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
        as_avlinit(as);
        return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
        struct as *as = buf;

        avl_destroy(&as->a_segtree);
        mutex_destroy(&as->a_contents);
        cv_destroy(&as->a_cv);
        rw_destroy(&as->a_lock);
}

void
as_init(void)
{
        as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
            as_constructor, as_destructor, NULL, NULL, NULL, 0);
}
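
/*
 * Note on the cache contract above: as_constructor runs when kmem creates
 * the buffer, not on every as_alloc(), so the mutex, condition variable,
 * rwlock and AVL trees persist across kmem_cache_alloc()/kmem_cache_free()
 * cycles; as_alloc() below therefore only reinitializes the plain fields.
 */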

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
        struct as *as;

        as = kmem_cache_alloc(as_cache, KM_SLEEP);

        as->a_flags     = 0;
        as->a_vbits     = 0;
        as->a_hrm       = NULL;
        as->a_seglast   = NULL;
        as->a_size      = 0;
        as->a_resvsize  = 0;
        as->a_updatedir = 0;
        gethrestime(&as->a_updatetime);
        as->a_objectdir = NULL;
        as->a_sizedir   = 0;
        as->a_userlimit = (caddr_t)USERLIMIT;
        as->a_lastgap   = NULL;
        as->a_lastgaphl = NULL;
        as->a_callbacks = NULL;
        as->a_proc      = NULL;

        AS_LOCK_ENTER(as, RW_WRITER);
        as->a_hat = hat_alloc(as);   /* create hat for default system mmu */
        AS_LOCK_EXIT(as);

        return (as);
}

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
        struct hat *hat = as->a_hat;
        struct seg *seg, *next;
        boolean_t free_started = B_FALSE;

top:
        /*
         * Invoke ALL callbacks. as_do_callbacks will do one callback
         * per call, and not return (-1) until the callback has completed.
         * When as_do_callbacks returns zero, all callbacks have completed.
         */
        mutex_enter(&as->a_contents);
        while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
                ;

        mutex_exit(&as->a_contents);
        AS_LOCK_ENTER(as, RW_WRITER);

        if (!free_started) {
                free_started = B_TRUE;
                hat_free_start(hat);
        }
        for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
                int err;

                next = AS_SEGNEXT(as, seg);
retry:
                err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
                if (err == EAGAIN) {
                        mutex_enter(&as->a_contents);
                        if (as->a_callbacks) {
                                AS_LOCK_EXIT(as);
                        } else if (!AS_ISNOUNMAPWAIT(as)) {
                                /*
                                 * Memory is currently locked. Wait for a
                                 * cv_signal that it has been unlocked, then
                                 * try the operation again.
                                 */
                                if (AS_ISUNMAPWAIT(as) == 0)
                                        cv_broadcast(&as->a_cv);
                                AS_SETUNMAPWAIT(as);
                                AS_LOCK_EXIT(as);
                                while (AS_ISUNMAPWAIT(as))
                                        cv_wait(&as->a_cv, &as->a_contents);
                        } else {
                                /*
                                 * We may have raced with
                                 * segvn_reclaim()/segspt_reclaim(). In this
                                 * case clean nounmapwait flag and retry since
                                 * softlockcnt in this segment may be already
                                 * 0.  We don't drop as writer lock so our
                                 * number of retries without sleeping should
                                 * be very small. See segvn_reclaim() for
                                 * more comments.
                                 */
                                AS_CLRNOUNMAPWAIT(as);
                                mutex_exit(&as->a_contents);
                                goto retry;
                        }
                        mutex_exit(&as->a_contents);
                        goto top;
                } else {
                        /*
                         * We do not expect any other error return at this
                         * time. This is similar to an ASSERT in seg_unmap()
                         */
                        ASSERT(err == 0);
                }
        }
        hat_free_end(hat);
        AS_LOCK_EXIT(as);

        /* /proc stuff */
        ASSERT(avl_numnodes(&as->a_wpage) == 0);
        if (as->a_objectdir) {
                kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
                as->a_objectdir = NULL;
                as->a_sizedir = 0;
        }

        /*
         * Free the struct as back to kmem.  Assert it has no segments.
         */
        ASSERT(avl_numnodes(&as->a_segtree) == 0);
        kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct proc *forkedproc)
{
        struct as *newas;
        struct seg *seg, *newseg;
        size_t  purgesize = 0;
        int error;

        AS_LOCK_ENTER(as, RW_WRITER);
        as_clearwatch(as);
        newas = as_alloc();
        newas->a_userlimit = as->a_userlimit;
        newas->a_proc = forkedproc;

        AS_LOCK_ENTER(newas, RW_WRITER);

        (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

        for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

                if (seg->s_flags & S_PURGE) {
                        purgesize += seg->s_size;
                        continue;
                }

                newseg = seg_alloc(newas, seg->s_base, seg->s_size);
                if (newseg == NULL) {
                        AS_LOCK_EXIT(newas);
                        as_setwatch(as);
                        AS_LOCK_EXIT(as);
                        as_free(newas);
                        return (-1);
                }
                if ((error = SEGOP_DUP(seg, newseg)) != 0) {
                        /*
                         * We call seg_free() on the new seg
                         * because the segment is not set up
                         * completely; i.e. it has no ops.
                         */
                        as_setwatch(as);
                        AS_LOCK_EXIT(as);
                        seg_free(newseg);
                        AS_LOCK_EXIT(newas);
                        as_free(newas);
                        return (error);
                }
                newas->a_size += seg->s_size;
        }
        newas->a_resvsize = as->a_resvsize - purgesize;

        error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

        AS_LOCK_EXIT(newas);

        as_setwatch(as);
        AS_LOCK_EXIT(as);
        if (error != 0) {
                as_free(newas);
                return (error);
        }
        forkedproc->p_as = newas;
        return (0);
}

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
        struct seg *seg;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        size_t ssize;
        faultcode_t res = 0;
        caddr_t addrsav;
        struct seg *segsav;
        int as_lock_held;
        klwp_t *lwp = ttolwp(curthread);

retry:
        /*
         * Indicate that the lwp is not to be stopped while waiting for a
         * pagefault.  This is to avoid deadlock while debugging a process
         * via /proc over NFS (in particular).
         */
        if (lwp != NULL)
                lwp->lwp_nostop++;

        /*
         * The same length must be used when we softlock and softunlock.
         * We don't support softunlocking lengths less than the original
         * length when there is largepage support.  See seg_dev.c for more
         * comments.
         */
        switch (type) {

        case F_SOFTLOCK:
                CPU_STATS_ADD_K(vm, softlock, 1);
                break;

        case F_SOFTUNLOCK:
                break;

        case F_PROT:
                CPU_STATS_ADD_K(vm, prot_fault, 1);
                break;

        case F_INVAL:
                CPU_STATS_ENTER_K();
                CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
                if (as == &kas)
                        CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
                CPU_STATS_EXIT_K();
                break;
        }

        /* Kernel probe */
        TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
            tnf_opaque, address,        addr,
            tnf_fault_type,     fault_type,     type,
            tnf_seg_access,     access,         rw);

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        /*
         * XXX -- Don't grab the as lock for segkmap. We should grab it for
         * correctness, but then we could be stuck holding this lock for
         * a LONG time if the fault needs to be resolved on a slow
         * filesystem, and then no-one will be able to exec new commands,
         * as exec'ing requires the write lock on the as.
         */
        if (as == &kas && segkmap && segkmap->s_base <= raddr &&
            raddr + size < segkmap->s_base + segkmap->s_size) {
                seg = segkmap;
                as_lock_held = 0;
        } else {
                AS_LOCK_ENTER(as, RW_READER);

                seg = as_segat(as, raddr);
                if (seg == NULL) {
                        AS_LOCK_EXIT(as);
                        if (lwp != NULL)
                                lwp->lwp_nostop--;
                        return (FC_NOMAP);
                }

                as_lock_held = 1;
        }

        addrsav = raddr;
        segsav = seg;

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                res = FC_NOMAP;
                                break;
                        }
                }
                if (raddr + rsize > seg->s_base + seg->s_size)
                        ssize = seg->s_base + seg->s_size - raddr;
                else
                        ssize = rsize;

                res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
                if (res != 0)
                        break;
        }

        /*
         * If we were SOFTLOCKing and encountered a failure,
         * we must SOFTUNLOCK the range we already did. (Maybe we
         * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
         * right here...)
         */
        if (res != 0 && type == F_SOFTLOCK) {
                for (seg = segsav; addrsav < raddr; addrsav += ssize) {
                        if (addrsav >= seg->s_base + seg->s_size)
                                seg = AS_SEGNEXT(as, seg);
                        ASSERT(seg != NULL);
                        /*
                         * Now call the fault routine again to perform the
                         * unlock using S_OTHER instead of the rw variable
                         * since we never got a chance to touch the pages.
                         */
                        if (raddr > seg->s_base + seg->s_size)
                                ssize = seg->s_base + seg->s_size - addrsav;
                        else
                                ssize = raddr - addrsav;
                        (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
                            F_SOFTUNLOCK, S_OTHER);
                }
        }
        if (as_lock_held)
                AS_LOCK_EXIT(as);
        if (lwp != NULL)
                lwp->lwp_nostop--;

        /*
         * If the lower levels returned EDEADLK for a fault,
         * it means that we should retry the fault.  Wait a bit
         * as well to let the deadlock-causing condition clear.
         * This is part of a gross hack to work around a design flaw
         * in the ufs/sds logging code and should go away when the
         * logging code is re-designed to fix the problem. See bug
         * 4125102 for details of the problem.
         */
        if (FC_ERRNO(res) == EDEADLK) {
                delay(deadlk_wait);
                res = 0;
                goto retry;
        }
        return (res);
}

/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
        struct seg *seg;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        faultcode_t res = 0;
        klwp_t *lwp = ttolwp(curthread);

retry:
        /*
         * Indicate that the lwp is not to be stopped while waiting
         * for a pagefault.  This is to avoid deadlock while debugging
         * a process via /proc over NFS (in particular).
         */
        if (lwp != NULL)
                lwp->lwp_nostop++;

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                AS_LOCK_EXIT(as);
                if (lwp != NULL)
                        lwp->lwp_nostop--;
                return (FC_NOMAP);
        }

        for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                res = FC_NOMAP;
                                break;
                        }
                }
                res = SEGOP_FAULTA(seg, raddr);
                if (res != 0)
                        break;
        }
        AS_LOCK_EXIT(as);
        if (lwp != NULL)
                lwp->lwp_nostop--;
        /*
         * If the lower levels returned EDEADLK for a fault,
         * it means that we should retry the fault.  Wait a bit
         * as well to let the deadlock-causing condition clear.
         * This is part of a gross hack to work around a design flaw
         * in the ufs/sds logging code and should go away when the
         * logging code is re-designed to fix the problem. See bug
         * 4125102 for details of the problem.
         */
        if (FC_ERRNO(res) == EDEADLK) {
                delay(deadlk_wait);
                res = 0;
                goto retry;
        }
        return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
        struct seg *seg;
        struct as_callback *cb;
        size_t ssize;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        int error = 0, writer = 0;
        caddr_t saveraddr;
        size_t saversize;

setprot_top:
        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        if (raddr + rsize < raddr)           /* check for wraparound */
                return (ENOMEM);

        saveraddr = raddr;
        saversize = rsize;

        /*
         * Normally we only lock the as as a reader. But
         * if due to setprot the segment driver needs to split
         * a segment it will return IE_RETRY. Therefore we re-acquire
         * the as lock as a writer so the segment driver can change
         * the seg list. Also the segment driver will return IE_RETRY
         * after it has changed the segment list, so we keep locking as
         * a writer from then on. Since these operations should be rare,
         * we want to lock as a writer only when necessary.
         */
        if (writer || avl_numnodes(&as->a_wpage) != 0) {
                AS_LOCK_ENTER(as, RW_WRITER);
        } else {
                AS_LOCK_ENTER(as, RW_READER);
        }

        as_clearwatchprot(as, raddr, rsize);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                as_setwatch(as);
                AS_LOCK_EXIT(as);
                return (ENOMEM);
        }

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                error = ENOMEM;
                                break;
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size))
                        ssize = seg->s_base + seg->s_size - raddr;
                else
                        ssize = rsize;
retry:
                error = SEGOP_SETPROT(seg, raddr, ssize, prot);

                if (error == IE_NOMEM) {
                        error = EAGAIN;
                        break;
                }

                if (error == IE_RETRY) {
                        AS_LOCK_EXIT(as);
                        writer = 1;
                        goto setprot_top;
                }

                if (error == EAGAIN) {
                        /*
                         * Make sure we have a_lock as writer.
                         */
                        if (writer == 0) {
                                AS_LOCK_EXIT(as);
                                writer = 1;
                                goto setprot_top;
                        }

                        /*
                         * Memory is currently locked.  It must be unlocked
                         * before this operation can succeed through a retry.
                         * The possible reasons for locked memory and
                         * corresponding strategies for unlocking are:
                         * (1) Normal I/O
                         *      wait for a signal that the I/O operation
                         *      has completed and the memory is unlocked.
                         * (2) Asynchronous I/O
                         *      The aio subsystem does not unlock pages when
                         *      the I/O is completed. Those pages are unlocked
                         *      when the application calls aiowait/aioerror.
                         *      So, to prevent blocking forever, cv_broadcast()
                         *      is done to wake up aio_cleanup_thread.
                         *      Subsequently, segvn_reclaim will be called, and
                         *      that will do AS_CLRUNMAPWAIT() and wake us up.
                         * (3) Long term page locking:
                         *      Drivers intending to have pages locked for a
                         *      period considerably longer than for normal I/O
                         *      (essentially forever) may have registered for a
                         *      callback so they may unlock these pages on
                         *      request. This is needed to allow this operation
                         *      to succeed. Each entry on the callback list is
                         *      examined. If the event or address range
                         *      pertains, the callback is invoked (unless it
                         *      already is in progress). The a_contents lock
                         *      must be dropped before the callback, so only
                         *      one callback can be done at a time. Go to the
                         *      top and do more until zero is returned. If
                         *      zero is returned, either there were no
                         *      callbacks for this event or they were already
                         *      in progress.
                         */
                        mutex_enter(&as->a_contents);
                        if (as->a_callbacks &&
                            (cb = as_find_callback(as, AS_SETPROT_EVENT,
                            seg->s_base, seg->s_size))) {
                                AS_LOCK_EXIT(as);
                                as_execute_callback(as, cb, AS_SETPROT_EVENT);
                        } else if (!AS_ISNOUNMAPWAIT(as)) {
                                if (AS_ISUNMAPWAIT(as) == 0)
                                        cv_broadcast(&as->a_cv);
                                AS_SETUNMAPWAIT(as);
                                AS_LOCK_EXIT(as);
                                while (AS_ISUNMAPWAIT(as))
                                        cv_wait(&as->a_cv, &as->a_contents);
                        } else {
                                /*
                                 * We may have raced with
                                 * segvn_reclaim()/segspt_reclaim(). In this
                                 * case clean nounmapwait flag and retry since
                                 * softlockcnt in this segment may be already
                                 * 0.  We don't drop as writer lock so our
                                 * number of retries without sleeping should
                                 * be very small. See segvn_reclaim() for
                                 * more comments.
                                 */
                                AS_CLRNOUNMAPWAIT(as);
                                mutex_exit(&as->a_contents);
                                goto retry;
                        }
                        mutex_exit(&as->a_contents);
                        goto setprot_top;
                } else if (error != 0)
                        break;
        }
        if (error != 0) {
                as_setwatch(as);
        } else {
                as_setwatchprot(as, saveraddr, saversize, prot);
        }
        AS_LOCK_EXIT(as);
        return (error);
}
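
/*
 * Sketch of a typical caller (the mprotect(2) path, abbreviated; the exact
 * flag plumbing varies):
 *
 *	error = as_setprot(curproc->p_as, (caddr_t)addr, len,
 *	    PROT_READ | PROT_USER);
 *
 * ENOMEM means part of the range was unmapped; EAGAIN means the segment
 * driver hit a resource shortage (IE_NOMEM) while splitting segments.
 */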

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
        struct seg *seg;
        size_t ssize;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        int error = 0;

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        if (raddr + rsize < raddr)           /* check for wraparound */
                return (ENOMEM);

        /*
         * This is ugly as sin...
         * Normally, we only acquire the address space readers lock.
         * However, if the address space has watchpoints present,
         * we must acquire the writer lock on the address space for
         * the benefit of as_clearwatchprot() and as_setwatchprot().
         */
        if (avl_numnodes(&as->a_wpage) != 0)
                AS_LOCK_ENTER(as, RW_WRITER);
        else
                AS_LOCK_ENTER(as, RW_READER);
        as_clearwatchprot(as, raddr, rsize);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                as_setwatch(as);
                AS_LOCK_EXIT(as);
                return (ENOMEM);
        }

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                error = ENOMEM;
                                break;
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size))
                        ssize = seg->s_base + seg->s_size - raddr;
                else
                        ssize = rsize;

                error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
                if (error != 0)
                        break;
        }
        as_setwatch(as);
        AS_LOCK_EXIT(as);
        return (error);
}
1288 
1289 int
1290 as_unmap(struct as *as, caddr_t addr, size_t size)
1291 {
1292         struct seg *seg, *seg_next;
1293         struct as_callback *cb;
1294         caddr_t raddr, eaddr;
1295         size_t ssize, rsize = 0;
1296         int err;
1297 
1298 top:
1299         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1300         eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1301             (uintptr_t)PAGEMASK);
1302 
1303         AS_LOCK_ENTER(as, RW_WRITER);
1304 
1305         as->a_updatedir = 1; /* inform /proc */
1306         gethrestime(&as->a_updatetime);
1307 
1308         /*
1309          * Use as_findseg to find the first segment in the range, then
1310          * step through the segments in order, following s_next.
1311          */
1312         as_clearwatchprot(as, raddr, eaddr - raddr);
1313 
1314         for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1315                 if (eaddr <= seg->s_base)
1316                         break;          /* eaddr was in a gap; all done */
1317 
1318                 /* this is implied by the test above */
1319                 ASSERT(raddr < eaddr);
1320 
1321                 if (raddr < seg->s_base)
1322                         raddr = seg->s_base;         /* raddr was in a gap */
1323 
1324                 if (eaddr > (seg->s_base + seg->s_size))
1325                         ssize = seg->s_base + seg->s_size - raddr;
1326                 else
1327                         ssize = eaddr - raddr;
1328 
1329                 /*
1330                  * Save next segment pointer since seg can be
1331                  * destroyed during the segment unmap operation.
1332                  */
1333                 seg_next = AS_SEGNEXT(as, seg);
1334 
1335                 /*
1336                  * We didn't count /dev/null mappings, so ignore them here.
1337                  * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1338                  * we have to do this check here while we have seg.)
1339                  */
1340                 rsize = 0;
1341                 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1342                     !SEG_IS_PARTIAL_RESV(seg))
1343                         rsize = ssize;
1344 
1345 retry:
1346                 err = SEGOP_UNMAP(seg, raddr, ssize);
1347                 if (err == EAGAIN) {
1348                         /*
1349                          * Memory is currently locked.  It must be unlocked
1350                          * before this operation can succeed through a retry.
1351                          * The possible reasons for locked memory and
1352                          * corresponding strategies for unlocking are:
1353                          * (1) Normal I/O
1354                          *      wait for a signal that the I/O operation
1355                          *      has completed and the memory is unlocked.
1356                          * (2) Asynchronous I/O
1357                          *      The aio subsystem does not unlock pages when
1358                          *      the I/O is completed. Those pages are unlocked
1359                          *      when the application calls aiowait/aioerror.
1360                          *      So, to prevent blocking forever, cv_broadcast()
1361                          *      is done to wake up aio_cleanup_thread.
1362                          *      Subsequently, segvn_reclaim will be called, and
1363                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1364                          * (3) Long term page locking:
1365                          *      Drivers intending to have pages locked for a
1366                          *      period considerably longer than for normal I/O
1367                          *      (essentially forever) may have registered for a
1368                          *      callback so they may unlock these pages on
1369                          *      request. This is needed to allow this operation
1370                          *      to succeed. Each entry on the callback list is
1371                          *      examined. If the event or address range pertains
1372                          *      the callback is invoked (unless it already is in
1373                          *      progress). The a_contents lock must be dropped
1374                          *      before the callback, so only one callback can
1375                          *      be done at a time. Go to the top and do more
1376                          *      until zero is returned. If zero is returned,
1377                          *      either there were no callbacks for this event
1378                          *      or they were already in progress.
1379                          */
1380                         mutex_enter(&as->a_contents);
1381                         if (as->a_callbacks &&
1382                             (cb = as_find_callback(as, AS_UNMAP_EVENT,
1383                             seg->s_base, seg->s_size))) {
1384                                 AS_LOCK_EXIT(as);
1385                                 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1386                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1387                                 if (AS_ISUNMAPWAIT(as) == 0)
1388                                         cv_broadcast(&as->a_cv);
1389                                 AS_SETUNMAPWAIT(as);
1390                                 AS_LOCK_EXIT(as);
1391                                 while (AS_ISUNMAPWAIT(as))
1392                                         cv_wait(&as->a_cv, &as->a_contents);
1393                         } else {
1394                                 /*
1395                                  * We may have raced with
1396                                  * segvn_reclaim()/segspt_reclaim(). In this
1397                                  * case clear the nounmapwait flag and retry,
1398                                  * since softlockcnt in this segment may
1399                                  * already be 0.  We don't drop the as writer
1400                                  * lock, so our number of retries without
1401                                  * sleeping should be very small.  See
1402                                  * segvn_reclaim() for more comments.
1403                                  */
1404                                 AS_CLRNOUNMAPWAIT(as);
1405                                 mutex_exit(&as->a_contents);
1406                                 goto retry;
1407                         }
1408                         mutex_exit(&as->a_contents);
1409                         goto top;
1410                 } else if (err == IE_RETRY) {
1411                         AS_LOCK_EXIT(as);
1412                         goto top;
1413                 } else if (err) {
1414                         as_setwatch(as);
1415                         AS_LOCK_EXIT(as);
1416                         return (-1);
1417                 }
1418 
1419                 as->a_size -= ssize;
1420                 if (rsize)
1421                         as->a_resvsize -= rsize;
1422                 raddr += ssize;
1423         }
1424         AS_LOCK_EXIT(as);
1425         return (0);
1426 }
1427 
1428 static int
1429 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1430     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1431 {
1432         uint_t szc;
1433         uint_t nszc;
1434         int error;
1435         caddr_t a;
1436         caddr_t eaddr;
1437         size_t segsize;
1438         struct seg *seg;
1439         size_t pgsz;
1440         int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1441         uint_t save_szcvec;
1442 
1443         ASSERT(AS_WRITE_HELD(as));
1444         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1445         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1446         ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1447         if (!do_off) {
1448                 vn_a->offset = 0;
1449         }
1450 
1451         if (szcvec <= 1) {
1452                 seg = seg_alloc(as, addr, size);
1453                 if (seg == NULL) {
1454                         return (ENOMEM);
1455                 }
1456                 vn_a->szc = 0;
1457                 error = (*crfp)(seg, vn_a);
1458                 if (error != 0) {
1459                         seg_free(seg);
1460                 } else {
1461                         as->a_size += size;
1462                         as->a_resvsize += size;
1463                 }
1464                 return (error);
1465         }
1466 
1467         eaddr = addr + size;
1468         save_szcvec = szcvec;
1469         szcvec >>= 1;
1470         szc = 0;
1471         nszc = 0;
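        /*
         * First pass: walk szcvec from the smallest large page size code
         * upward.  At each supported size, map just enough of the range
         * (using the previous, smaller size code) to bring addr up to
         * that size's alignment boundary.
         */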
1472         while (szcvec) {
1473                 if ((szcvec & 0x1) == 0) {
1474                         nszc++;
1475                         szcvec >>= 1;
1476                         continue;
1477                 }
1478                 nszc++;
1479                 pgsz = page_get_pagesize(nszc);
1480                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1481                 if (a != addr) {
1482                         ASSERT(a < eaddr);
1483                         segsize = a - addr;
1484                         seg = seg_alloc(as, addr, segsize);
1485                         if (seg == NULL) {
1486                                 return (ENOMEM);
1487                         }
1488                         vn_a->szc = szc;
1489                         error = (*crfp)(seg, vn_a);
1490                         if (error != 0) {
1491                                 seg_free(seg);
1492                                 return (error);
1493                         }
1494                         as->a_size += segsize;
1495                         as->a_resvsize += segsize;
1496                         *segcreated = 1;
1497                         if (do_off) {
1498                                 vn_a->offset += segsize;
1499                         }
1500                         addr = a;
1501                 }
1502                 szc = nszc;
1503                 szcvec >>= 1;
1504         }
1505 
1506         ASSERT(addr < eaddr);
1507         szcvec = save_szcvec | 1; /* add 8K pages */
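        /*
         * Second pass: addr is now aligned to the largest page size that
         * szcvec supports.  Map progressively smaller aligned chunks,
         * working back from eaddr, and finish with PAGESIZE pages for
         * any remainder.
         */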
1508         while (szcvec) {
1509                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1510                 ASSERT(a >= addr);
1511                 if (a != addr) {
1512                         segsize = a - addr;
1513                         seg = seg_alloc(as, addr, segsize);
1514                         if (seg == NULL) {
1515                                 return (ENOMEM);
1516                         }
1517                         vn_a->szc = szc;
1518                         error = (*crfp)(seg, vn_a);
1519                         if (error != 0) {
1520                                 seg_free(seg);
1521                                 return (error);
1522                         }
1523                         as->a_size += segsize;
1524                         as->a_resvsize += segsize;
1525                         *segcreated = 1;
1526                         if (do_off) {
1527                                 vn_a->offset += segsize;
1528                         }
1529                         addr = a;
1530                 }
1531                 szcvec &= ~(1 << szc);
1532                 if (szcvec) {
1533                         szc = highbit(szcvec) - 1;
1534                         pgsz = page_get_pagesize(szc);
1535                 }
1536         }
1537         ASSERT(addr == eaddr);
1538 
1539         return (0);
1540 }
1541 
1542 static int
1543 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1544     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1545 {
1546         uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1547         int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1548         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1549             type, 0);
1550         int error;
1551         struct seg *seg;
1552         struct vattr va;
1553         u_offset_t eoff;
1554         size_t save_size = 0;
1555         extern size_t textrepl_size_thresh;
1556 
1557         ASSERT(AS_WRITE_HELD(as));
1558         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1559         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1560         ASSERT(vn_a->vp != NULL);
1561         ASSERT(vn_a->amp == NULL);
1562 
1563 again:
1564         if (szcvec <= 1) {
1565                 seg = seg_alloc(as, addr, size);
1566                 if (seg == NULL) {
1567                         return (ENOMEM);
1568                 }
1569                 vn_a->szc = 0;
1570                 error = (*crfp)(seg, vn_a);
1571                 if (error != 0) {
1572                         seg_free(seg);
1573                 } else {
1574                         as->a_size += size;
1575                         as->a_resvsize += size;
1576                 }
1577                 return (error);
1578         }
1579 
1580         va.va_mask = AT_SIZE;
1581         if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1582                 szcvec = 0;
1583                 goto again;
1584         }
1585         eoff = vn_a->offset & PAGEMASK;
1586         if (eoff >= va.va_size) {
1587                 szcvec = 0;
1588                 goto again;
1589         }
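        /*
         * If the mapping extends beyond the end of the file, use large
         * pages only for the portion backed by the file; the tail past
         * EOF is mapped separately with PAGESIZE pages via save_size and
         * the "again" loop below.
         */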
1590         eoff += size;
1591         if (btopr(va.va_size) < btopr(eoff)) {
1592                 save_size = size;
1593                 size = va.va_size - (vn_a->offset & PAGEMASK);
1594                 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1595                 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1596                     type, 0);
1597                 if (szcvec <= 1) {
1598                         size = save_size;
1599                         goto again;
1600                 }
1601         }
1602 
1603         if (size > textrepl_size_thresh) {
1604                 vn_a->flags |= _MAP_TEXTREPL;
1605         }
1606         error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1607             segcreated);
1608         if (error != 0) {
1609                 return (error);
1610         }
1611         if (save_size) {
1612                 addr += size;
1613                 size = save_size - size;
1614                 szcvec = 0;
1615                 goto again;
1616         }
1617         return (0);
1618 }
1619 
1620 /*
1621  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1622  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1623  */
1624 static int
1625 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1626     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1627 {
1628         uint_t szcvec;
1629         uchar_t type;
1630 
1631         ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1632         if (vn_a->type == MAP_SHARED) {
1633                 type = MAPPGSZC_SHM;
1634         } else if (vn_a->type == MAP_PRIVATE) {
1635                 if (vn_a->szc == AS_MAP_HEAP) {
1636                         type = MAPPGSZC_HEAP;
1637                 } else if (vn_a->szc == AS_MAP_STACK) {
1638                         type = MAPPGSZC_STACK;
1639                 } else {
1640                         type = MAPPGSZC_PRIVM;
1641                 }
1642         }
1643         szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1644             (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1645             (vn_a->flags & MAP_TEXT), type, 0);
1646         ASSERT(AS_WRITE_HELD(as));
1647         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649         ASSERT(vn_a->vp == NULL);
1650 
1651         return (as_map_segvn_segs(as, addr, size, szcvec,
1652             crfp, vn_a, segcreated));
1653 }
1654 
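/*
 * as_map() takes the address space write lock; as_map_locked() drops it on
 * every return path.
 */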
1655 int
1656 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1657 {
1658         AS_LOCK_ENTER(as, RW_WRITER);
1659         return (as_map_locked(as, addr, size, crfp, argsp));
1660 }
1661 
1662 int
1663 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1664     void *argsp)
1665 {
1666         struct seg *seg = NULL;
1667         caddr_t raddr;                  /* rounded down addr */
1668         size_t rsize;                   /* rounded up size */
1669         int error;
1670         int unmap = 0;
1671         /*
1672          * The use of a_proc is preferred to handle the case where curproc is
1673          * a door_call server and is allocating memory in the client's (a_proc)
1674          * address space.
1675          * When creating a shared memory segment, a_proc will be NULL, so we
1676          * fall back to curproc in that case.
1677          */
1678         struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1679         struct segvn_crargs crargs;
1680 
1681         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1682         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1683             (size_t)raddr;
1684 
1685         /*
1686          * check for wrap around
1687          */
1688         if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1689                 AS_LOCK_EXIT(as);
1690                 return (ENOMEM);
1691         }
1692 
1693         as->a_updatedir = 1; /* inform /proc */
1694         gethrestime(&as->a_updatetime);
1695 
1696         if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1697                 AS_LOCK_EXIT(as);
1698 
1699                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1700                     RCA_UNSAFE_ALL);
1701 
1702                 return (ENOMEM);
1703         }
1704 
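        /*
         * Mappings that may use large pages "out of the box" (vnode-backed
         * or anonymous segvn mappings) are carved into multiple segments by
         * page size; everything else gets a single segment created directly
         * through crfp.
         */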
1705         if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1706                 crargs = *(struct segvn_crargs *)argsp;
1707                 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1708                 if (error != 0) {
1709                         AS_LOCK_EXIT(as);
1710                         if (unmap) {
1711                                 (void) as_unmap(as, addr, size);
1712                         }
1713                         return (error);
1714                 }
1715         } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1716                 crargs = *(struct segvn_crargs *)argsp;
1717                 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1718                 if (error != 0) {
1719                         AS_LOCK_EXIT(as);
1720                         if (unmap) {
1721                                 (void) as_unmap(as, addr, size);
1722                         }
1723                         return (error);
1724                 }
1725         } else {
1726                 seg = seg_alloc(as, addr, size);
1727                 if (seg == NULL) {
1728                         AS_LOCK_EXIT(as);
1729                         return (ENOMEM);
1730                 }
1731 
1732                 error = (*crfp)(seg, argsp);
1733                 if (error != 0) {
1734                         seg_free(seg);
1735                         AS_LOCK_EXIT(as);
1736                         return (error);
1737                 }
1738                 /*
1739                  * Add size now so as_unmap will work if as_ctl fails.
1740                  */
1741                 as->a_size += rsize;
1742                 as->a_resvsize += rsize;
1743         }
1744 
1745         as_setwatch(as);
1746 
1747         /*
1748          * If the address space is locked,
1749          * establish memory locks for the new segment.
1750          */
1751         mutex_enter(&as->a_contents);
1752         if (AS_ISPGLCK(as)) {
1753                 mutex_exit(&as->a_contents);
1754                 AS_LOCK_EXIT(as);
1755                 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1756                 if (error != 0)
1757                         (void) as_unmap(as, addr, size);
1758         } else {
1759                 mutex_exit(&as->a_contents);
1760                 AS_LOCK_EXIT(as);
1761         }
1762         return (error);
1763 }
1764 
1765 
1766 /*
1767  * Delete all segments in the address space marked with S_PURGE.
1768  * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1769  * These segments are deleted as a first step before calls to as_gap(), so
1770  * that they don't affect mmap() or shmat().
1771  */
1772 void
1773 as_purge(struct as *as)
1774 {
1775         struct seg *seg;
1776         struct seg *next_seg;
1777 
1778         /*
1779          * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1780          * there is no need to grab the a_contents mutex for this check.
1781          */
1782         if ((as->a_flags & AS_NEEDSPURGE) == 0)
1783                 return;
1784 
1785         AS_LOCK_ENTER(as, RW_WRITER);
1786         next_seg = NULL;
1787         seg = AS_SEGFIRST(as);
1788         while (seg != NULL) {
1789                 next_seg = AS_SEGNEXT(as, seg);
1790                 if (seg->s_flags & S_PURGE)
1791                         (void) SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1792                 seg = next_seg;
1793         }
1794         AS_LOCK_EXIT(as);
1795 
1796         mutex_enter(&as->a_contents);
1797         as->a_flags &= ~AS_NEEDSPURGE;
1798         mutex_exit(&as->a_contents);
1799 }
1800 
1801 /*
1802  * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1803  * range of addresses at least "minlen" long, where the base of the range is
1804  * at "off" phase from an "align" boundary and there is space for a
1805  * "redzone"-sized redzone on either side of the range.  Thus,
1806  * if align was 4M and off was 16k, the user wants a hole which will start
1807  * 16k into a 4M page.
1808  *
1809  * If flags specifies AH_HI, the hole will have the highest possible address
1810  * in the range.  We use the as->a_lastgap field to figure out where to
1811  * start looking for a gap.
1812  *
1813  * Otherwise, the gap will have the lowest possible address.
1814  *
1815  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1816  *
1817  * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1818  * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1819  *
1820  * NOTE: This routine is not correct when base+len overflows caddr_t.
1821  */
1822 int
1823 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1824     uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1825 {
1826         caddr_t lobound = *basep;
1827         caddr_t hibound = lobound + *lenp;
1828         struct seg *lseg, *hseg;
1829         caddr_t lo, hi;
1830         int forward;
1831         caddr_t save_base;
1832         size_t save_len;
1833         size_t save_minlen;
1834         size_t save_redzone;
1835         int fast_path = 1;
1836 
1837         save_base = *basep;
1838         save_len = *lenp;
1839         save_minlen = minlen;
1840         save_redzone = redzone;
1841 
1842         /*
1843          * For the first pass/fast_path, just add align and redzone into
1844          * minlen since if we get an allocation, we can guarantee that it
1845          * will fit the alignment and redzone requested.
1846          * This increases the chance that hibound will be adjusted to
1847          * a_lastgap->s_base, which will likely allow us to find an
1848          * acceptable hole in the address space more quickly.
1849          * If we can't find a hole with this fast_path, then we look for
1850          * smaller holes in which the alignment and offset may allow
1851          * the allocation to fit.
1852          */
1853         minlen += align;
1854         minlen += 2 * redzone;
1855         redzone = 0;
1856 
1857         AS_LOCK_ENTER(as, RW_READER);
1858         if (AS_SEGFIRST(as) == NULL) {
1859                 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1860                     align, redzone, off)) {
1861                         AS_LOCK_EXIT(as);
1862                         return (0);
1863                 } else {
1864                         AS_LOCK_EXIT(as);
1865                         *basep = save_base;
1866                         *lenp = save_len;
1867                         return (-1);
1868                 }
1869         }
1870 
1871 retry:
1872         /*
1873          * Set up to iterate over all the inter-segment holes in the given
1874          * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1875          * NULL for the highest-addressed hole.  If moving backwards, we begin
1876          * the walk at the segment nearest hibound.
1877          */
1878         forward = (flags & AH_DIR) == AH_LO;
1879         if (forward) {
1880                 hseg = as_findseg(as, lobound, 1);
1881                 lseg = AS_SEGPREV(as, hseg);
1882         } else {
1883 
1884                 /*
1885                  * If allocating at least as much as the last allocation,
1886                  * use a_lastgap's base as a better estimate of hibound.
1887                  */
1888                 if (as->a_lastgap &&
1889                     minlen >= as->a_lastgap->s_size &&
1890                     hibound >= as->a_lastgap->s_base)
1891                         hibound = as->a_lastgap->s_base;
1892 
1893                 hseg = as_findseg(as, hibound, 1);
1894                 if (hseg->s_base + hseg->s_size < hibound) {
1895                         lseg = hseg;
1896                         hseg = NULL;
1897                 } else {
1898                         lseg = AS_SEGPREV(as, hseg);
1899                 }
1900         }
1901 
1902         for (;;) {
1903                 /*
1904                  * Set lo and hi to the hole's boundaries.  (We should really
1905                  * use MAXADDR in place of hibound in the expression below,
1906                  * but can't express it easily; using hibound in its place is
1907                  * harmless.)
1908                  */
1909                 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1910                 hi = (hseg == NULL) ? hibound : hseg->s_base;
1911                 /*
1912                  * If the iteration has moved past the interval from lobound
1913                  * to hibound it's pointless to continue.
1914                  */
1915                 if ((forward && lo > hibound) || (!forward && hi < lobound))
1916                         break;
1917                 else if (lo > hibound || hi < lobound)
1918                         goto cont;
1919                 /*
1920                  * Candidate hole lies at least partially within the allowable
1921                  * range.  Restrict it to fall completely within that range,
1922                  * i.e., to [max(lo, lobound), min(hi, hibound)].
1923                  */
1924                 if (lo < lobound)
1925                         lo = lobound;
1926                 if (hi > hibound)
1927                         hi = hibound;
1928                 /*
1929                  * Verify that the candidate hole is big enough and meets
1930                  * hardware constraints.  If the hole is too small, no need
1931                  * to do the further checks since they will fail.
1932                  */
1933                 *basep = lo;
1934                 *lenp = hi - lo;
1935                 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1936                     minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1937                     ((flags & AH_CONTAIN) == 0 ||
1938                     (*basep <= addr && *basep + *lenp > addr))) {
1939                         if (!forward)
1940                                 as->a_lastgap = hseg;
1941                         if (hseg != NULL)
1942                                 as->a_lastgaphl = hseg;
1943                         else
1944                                 as->a_lastgaphl = lseg;
1945                         AS_LOCK_EXIT(as);
1946                         return (0);
1947                 }
1948         cont:
1949                 /*
1950                  * Move to the next hole.
1951                  */
1952                 if (forward) {
1953                         lseg = hseg;
1954                         if (lseg == NULL)
1955                                 break;
1956                         hseg = AS_SEGNEXT(as, hseg);
1957                 } else {
1958                         hseg = lseg;
1959                         if (hseg == NULL)
1960                                 break;
1961                         lseg = AS_SEGPREV(as, lseg);
1962                 }
1963         }
1964         if (fast_path && (align != 0 || save_redzone != 0)) {
1965                 fast_path = 0;
1966                 minlen = save_minlen;
1967                 redzone = save_redzone;
1968                 goto retry;
1969         }
1970         *basep = save_base;
1971         *lenp = save_len;
1972         AS_LOCK_EXIT(as);
1973         return (-1);
1974 }
1975 
1976 /*
1977  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1978  *
1979  * If flags specifies AH_HI, the hole will have the highest possible address
1980  * in the range.  We use the as->a_lastgap field to figure out where to
1981  * start looking for a gap.
1982  *
1983  * Otherwise, the gap will have the lowest possible address.
1984  *
1985  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1986  *
1987  * If an adequate hole is found, base and len are set to reflect the part of
1988  * the hole that is within range, and 0 is returned, otherwise,
1989  * -1 is returned.
1990  *
1991  * NOTE: This routine is not correct when base+len overflows caddr_t.
1992  */
1993 int
1994 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1995     caddr_t addr)
1996 {
1997 
1998         return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1999 }
2000 
2001 /*
2002  * Return the next range within [base, base + len) that is backed
2003  * with "real memory".  Skip holes and non-seg_vn segments.
2004  * We're lazy and only return one segment at a time.
2005  */
2006 int
2007 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2008 {
2009         extern struct seg_ops segspt_shmops;    /* needs a header file */
2010         struct seg *seg;
2011         caddr_t addr, eaddr;
2012         caddr_t segend;
2013 
2014         AS_LOCK_ENTER(as, RW_READER);
2015 
2016         addr = *basep;
2017         eaddr = addr + *lenp;
2018 
2019         seg = as_findseg(as, addr, 0);
2020         if (seg != NULL)
2021                 addr = MAX(seg->s_base, addr);
2022 
2023         for (;;) {
2024                 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2025                         AS_LOCK_EXIT(as);
2026                         return (EINVAL);
2027                 }
2028 
2029                 if (seg->s_ops == &segvn_ops) {
2030                         segend = seg->s_base + seg->s_size;
2031                         break;
2032                 }
2033 
2034                 /*
2035                  * We do ISM by looking into the private data
2036                  * to determine the real size of the segment.
2037                  */
2038                 if (seg->s_ops == &segspt_shmops) {
2039                         segend = seg->s_base + spt_realsize(seg);
2040                         if (addr < segend)
2041                                 break;
2042                 }
2043 
2044                 seg = AS_SEGNEXT(as, seg);
2045 
2046                 if (seg != NULL)
2047                         addr = seg->s_base;
2048         }
2049 
2050         *basep = addr;
2051 
2052         if (segend > eaddr)
2053                 *lenp = eaddr - addr;
2054         else
2055                 *lenp = segend - addr;
2056 
2057         AS_LOCK_EXIT(as);
2058         return (0);
2059 }
2060 
2061 /*
2062  * Swap the pages associated with the address space as out to
2063  * secondary storage, returning the number of bytes actually
2064  * swapped.
2065  *
2066  * The value returned is intended to correlate well with the process's
2067  * memory requirements.  Its usefulness for this purpose depends on
2068  * how well the segment-level routines do at returning accurate
2069  * information.
2070  */
2071 size_t
2072 as_swapout(struct as *as)
2073 {
2074         struct seg *seg;
2075         size_t swpcnt = 0;
2076 
2077         /*
2078          * Kernel-only processes have given up their address
2079          * spaces.  Of course, we shouldn't be attempting to
2080          * swap out such processes in the first place...
2081          */
2082         if (as == NULL)
2083                 return (0);
2084 
2085         AS_LOCK_ENTER(as, RW_READER);
2086 
2087         /*
2088          * Free all mapping resources associated with the address
2089          * space.  The segment-level swapout routines capitalize
2090  * on this unmapping by scavenging pages that have become
2091          * unmapped here.
2092          */
2093         hat_swapout(as->a_hat);
2094 
2095         /*
2096          * Call the swapout routines of all segments in the address
2097          * space to do the actual work, accumulating the amount of
2098          * space reclaimed.
2099          */
2100         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2101                 struct seg_ops *ov = seg->s_ops;
2102 
2103                 /*
2104                  * We have to check to see if the seg has
2105                  * an ops vector because the seg may have
2106                  * been in the middle of being set up when
2107                  * the process was picked for swapout.
2108                  */
2109                 if ((ov != NULL) && (ov->swapout != NULL))
2110                         swpcnt += SEGOP_SWAPOUT(seg);
2111         }
2112         AS_LOCK_EXIT(as);
2113         return (swpcnt);
2114 }
2115 
2116 /*
2117  * Determine whether data from the mappings in interval [addr, addr + size)
2118  * are in the primary memory (core) cache.
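 * Per-page results are returned in "vec", and *sizep accumulates the
 * bytes accounted for by each segment; -1 is returned if the range
 * includes a hole or a segment reports less than was asked of it.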
2119  */
2120 int
2121 as_incore(struct as *as, caddr_t addr,
2122     size_t size, char *vec, size_t *sizep)
2123 {
2124         struct seg *seg;
2125         size_t ssize;
2126         caddr_t raddr;          /* rounded down addr */
2127         size_t rsize;           /* rounded up size */
2128         size_t isize;                   /* iteration size */
2129         int error = 0;          /* result, assume success */
2130 
2131         *sizep = 0;
2132         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2133         rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2134             (size_t)raddr;
2135 
2136         if (raddr + rsize < raddr)           /* check for wraparound */
2137                 return (ENOMEM);
2138 
2139         AS_LOCK_ENTER(as, RW_READER);
2140         seg = as_segat(as, raddr);
2141         if (seg == NULL) {
2142                 AS_LOCK_EXIT(as);
2143                 return (-1);
2144         }
2145 
2146         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2147                 if (raddr >= seg->s_base + seg->s_size) {
2148                         seg = AS_SEGNEXT(as, seg);
2149                         if (seg == NULL || raddr != seg->s_base) {
2150                                 error = -1;
2151                                 break;
2152                         }
2153                 }
2154                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2155                         ssize = seg->s_base + seg->s_size - raddr;
2156                 else
2157                         ssize = rsize;
2158                 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2159                 if (isize != ssize) {
2160                         error = -1;
2161                         break;
2162                 }
2163                 vec += btopr(ssize);
2164         }
2165         AS_LOCK_EXIT(as);
2166         return (error);
2167 }
2168 
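/*
 * Unlock each run of contiguous pages marked in the given bitmap, issuing
 * one MC_UNLOCK lockop per run.
 */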
2169 static void
2170 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2171     ulong_t *bitmap, size_t position, size_t npages)
2172 {
2173         caddr_t range_start;
2174         size_t  pos1 = position;
2175         size_t  pos2;
2176         size_t  size;
2177         size_t  end_pos = npages + position;
2178 
2179         while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2180                 size = ptob((pos2 - pos1));
2181                 range_start = (caddr_t)((uintptr_t)addr +
2182                     ptob(pos1 - position));
2183 
2184                 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2185                     (ulong_t *)NULL, (size_t)0);
2186                 pos1 = pos2;
2187         }
2188 }
2189 
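/*
 * Back out a partially completed MC_LOCK: walk [raddr, raddr + rsize)
 * segment by segment, unlocking the pages recorded in mlock_map.
 */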
2190 static void
2191 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2192     caddr_t raddr, size_t rsize)
2193 {
2194         struct seg *seg = as_segat(as, raddr);
2195         size_t ssize;
2196 
2197         while (rsize != 0) {
2198                 if (raddr >= seg->s_base + seg->s_size)
2199                         seg = AS_SEGNEXT(as, seg);
2200 
2201                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2202                         ssize = seg->s_base + seg->s_size - raddr;
2203                 else
2204                         ssize = rsize;
2205 
2206                 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2207 
2208                 rsize -= ssize;
2209                 raddr += ssize;
2210         }
2211 }
2212 
2213 /*
2214  * Cache control operations over the interval [addr, addr + size) in
2215  * address space "as".
2216  */
2217 /*ARGSUSED*/
2218 int
2219 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2220     uintptr_t arg, ulong_t *lock_map, size_t pos)
2221 {
2222         struct seg *seg;        /* working segment */
2223         caddr_t raddr;          /* rounded down addr */
2224         caddr_t initraddr;      /* saved initial rounded down addr */
2225         size_t rsize;           /* rounded up size */
2226         size_t initrsize;       /* saved initial rounded up size */
2227         size_t ssize;           /* size of seg */
2228         int error = 0;                  /* result */
2229         size_t mlock_size;      /* size of bitmap */
2230         ulong_t *mlock_map;     /* pointer to bitmap used */
2231                                 /* to represent the locked */
2232                                 /* pages. */
2233 retry:
2234         if (error == IE_RETRY)
2235                 AS_LOCK_ENTER(as, RW_WRITER);
2236         else
2237                 AS_LOCK_ENTER(as, RW_READER);
2238 
2239         /*
2240          * If these are address space lock/unlock operations, loop over
2241          * all segments in the address space, as appropriate.
2242          */
2243         if (func == MC_LOCKAS) {
2244                 size_t npages, idx;
2245                 size_t rlen = 0;        /* rounded as length */
2246 
2247                 idx = pos;
2248 
2249                 if (arg & MCL_FUTURE) {
2250                         mutex_enter(&as->a_contents);
2251                         AS_SETPGLCK(as);
2252                         mutex_exit(&as->a_contents);
2253                 }
2254                 if ((arg & MCL_CURRENT) == 0) {
2255                         AS_LOCK_EXIT(as);
2256                         return (0);
2257                 }
2258 
2259                 seg = AS_SEGFIRST(as);
2260                 if (seg == NULL) {
2261                         AS_LOCK_EXIT(as);
2262                         return (0);
2263                 }
2264 
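                /*
                 * Sum the page-rounded lengths of all segments; this sizes
                 * the bitmap used to record which pages were successfully
                 * locked, so that a partial failure can be backed out.
                 */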
2265                 do {
2266                         raddr = (caddr_t)((uintptr_t)seg->s_base &
2267                             (uintptr_t)PAGEMASK);
2268                         rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2269                             PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2270                 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2271 
2272                 mlock_size = BT_BITOUL(btopr(rlen));
2273                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2274                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2275                         AS_LOCK_EXIT(as);
2276                         return (EAGAIN);
2277                 }
2278 
2279                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2280                         error = SEGOP_LOCKOP(seg, seg->s_base,
2281                             seg->s_size, attr, MC_LOCK, mlock_map, pos);
2282                         if (error != 0)
2283                                 break;
2284                         pos += seg_pages(seg);
2285                 }
2286 
2287                 if (error) {
2288                         for (seg = AS_SEGFIRST(as); seg != NULL;
2289                             seg = AS_SEGNEXT(as, seg)) {
2290 
2291                                 raddr = (caddr_t)((uintptr_t)seg->s_base &
2292                                     (uintptr_t)PAGEMASK);
2293                                 npages = seg_pages(seg);
2294                                 as_segunlock(seg, raddr, attr, mlock_map,
2295                                     idx, npages);
2296                                 idx += npages;
2297                         }
2298                 }
2299 
2300                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2301                 AS_LOCK_EXIT(as);
2302                 goto lockerr;
2303         } else if (func == MC_UNLOCKAS) {
2304                 mutex_enter(&as->a_contents);
2305                 AS_CLRPGLCK(as);
2306                 mutex_exit(&as->a_contents);
2307 
2308                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2309                         error = SEGOP_LOCKOP(seg, seg->s_base,
2310                             seg->s_size, attr, MC_UNLOCK, NULL, 0);
2311                         if (error != 0)
2312                                 break;
2313                 }
2314 
2315                 AS_LOCK_EXIT(as);
2316                 goto lockerr;
2317         }
2318 
2319         /*
2320          * Normalize addresses and sizes.
2321          */
2322         initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2323         initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2324             (size_t)raddr;
2325 
2326         if (raddr + rsize < raddr) {         /* check for wraparound */
2327                 AS_LOCK_EXIT(as);
2328                 return (ENOMEM);
2329         }
2330 
2331         /*
2332          * Get initial segment.
2333          */
2334         if ((seg = as_segat(as, raddr)) == NULL) {
2335                 AS_LOCK_EXIT(as);
2336                 return (ENOMEM);
2337         }
2338 
2339         if (func == MC_LOCK) {
2340                 mlock_size = BT_BITOUL(btopr(rsize));
2341                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2342                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2343                         AS_LOCK_EXIT(as);
2344                         return (EAGAIN);
2345                 }
2346         }
2347 
2348         /*
2349          * Loop over all segments.  If a hole in the address range is
2350          * discovered, then fail.  For each segment, perform the appropriate
2351          * control operation.
2352          */
2353         while (rsize != 0) {
2354 
2355                 /*
2356                  * Make sure there's no hole; calculate the portion
2357                  * of the next segment to be operated over.
2358                  */
2359                 if (raddr >= seg->s_base + seg->s_size) {
2360                         seg = AS_SEGNEXT(as, seg);
2361                         if (seg == NULL || raddr != seg->s_base) {
2362                                 if (func == MC_LOCK) {
2363                                         as_unlockerr(as, attr, mlock_map,
2364                                             initraddr, initrsize - rsize);
2365                                         kmem_free(mlock_map,
2366                                             mlock_size * sizeof (ulong_t));
2367                                 }
2368                                 AS_LOCK_EXIT(as);
2369                                 return (ENOMEM);
2370                         }
2371                 }
2372                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2373                         ssize = seg->s_base + seg->s_size - raddr;
2374                 else
2375                         ssize = rsize;
2376 
2377                 /*
2378                  * Dispatch on specific function.
2379                  */
2380                 switch (func) {
2381 
2382                 /*
2383                  * Synchronize cached data from mappings with backing
2384                  * objects.
2385                  */
2386                 case MC_SYNC:
2387                         if ((error = SEGOP_SYNC(seg, raddr, ssize,
2388                             attr, (uint_t)arg)) != 0) {
2389                                 AS_LOCK_EXIT(as);
2390                                 return (error);
2391                         }
2392                         break;
2393 
2394                 /*
2395                  * Lock pages in memory.
2396                  */
2397                 case MC_LOCK:
2398                         if ((error = SEGOP_LOCKOP(seg, raddr, ssize,
2399                             attr, func, mlock_map, pos)) != 0) {
2400                                 as_unlockerr(as, attr, mlock_map, initraddr,
2401                                     initrsize - rsize + ssize);
2402                                 kmem_free(mlock_map, mlock_size *
2403                                     sizeof (ulong_t));
2404                                 AS_LOCK_EXIT(as);
2405                                 goto lockerr;
2406                         }
2407                         break;
2408 
2409                 /*
2410                  * Unlock mapped pages.
2411                  */
2412                 case MC_UNLOCK:
2413                         (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2414                             (ulong_t *)NULL, (size_t)0);
2415                         break;
2416 
2417                 /*
2418                  * Store VM advise for mapped pages in segment layer.
2419                  */
2420                 case MC_ADVISE:
2421                         error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2422 
2423                         /*
2424                          * Check for regular errors and special retry error
2425                          */
2426                         if (error) {
2427                                 if (error == IE_RETRY) {
2428                                         /*
2429                                          * Need to acquire writers lock, so
2430                                          * have to drop readers lock and start
2431                                          * all over again
2432                                          */
2433                                         AS_LOCK_EXIT(as);
2434                                         goto retry;
2435                                 } else if (error == IE_REATTACH) {
2436                                         /*
2437                                          * Find segment for current address
2438                                          * because current segment just got
2439                                          * split or concatenated
2440                                          */
2441                                         seg = as_segat(as, raddr);
2442                                         if (seg == NULL) {
2443                                                 AS_LOCK_EXIT(as);
2444                                                 return (ENOMEM);
2445                                         }
2446                                 } else {
2447                                         /*
2448                                          * Regular error
2449                                          */
2450                                         AS_LOCK_EXIT(as);
2451                                         return (error);
2452                                 }
2453                         }
2454                         break;
2455 
2456                 case MC_INHERIT_ZERO:
2457                         if (seg->s_ops->inherit == NULL) {
2458                                 error = ENOTSUP;
2459                         } else {
2460                                 error = SEGOP_INHERIT(seg, raddr, ssize,
2461                                     SEGP_INH_ZERO);
2462                         }
2463                         if (error != 0) {
2464                                 AS_LOCK_EXIT(as);
2465                                 return (error);
2466                         }
2467                         break;
2468 
2469                 /*
2470                  * Can't happen.
2471                  */
2472                 default:
2473                         panic("as_ctl: bad operation %d", func);
2474                         /*NOTREACHED*/
2475                 }
2476 
2477                 rsize -= ssize;
2478                 raddr += ssize;
2479         }
2480 
2481         if (func == MC_LOCK)
2482                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2483         AS_LOCK_EXIT(as);
2484         return (0);
2485 lockerr:
2486 
2487         /*
2488          * If the lower levels returned EDEADLK for a segment lockop,
2489          * it means that we should retry the operation.  Let's wait
2490          * a bit also to let the deadlock causing condition clear.
2491          * This is part of a gross hack to work around a design flaw
2492          * in the ufs/sds logging code and should go away when the
2493          * logging code is re-designed to fix the problem. See bug
2494          * 4125102 for details of the problem.
2495          */
2496         if (error == EDEADLK) {
2497                 delay(deadlk_wait);
2498                 error = 0;
2499                 goto retry;
2500         }
2501         return (error);
2502 }
2503 
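/*
 * Translate a fault code returned by as_fault() into an errno value.
 */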
2504 int
2505 fc_decode(faultcode_t fault_err)
2506 {
2507         int error = 0;
2508 
2509         switch (FC_CODE(fault_err)) {
2510         case FC_OBJERR:
2511                 error = FC_ERRNO(fault_err);
2512                 break;
2513         case FC_PROT:
2514                 error = EACCES;
2515                 break;
2516         default:
2517                 error = EFAULT;
2518                 break;
2519         }
2520         return (error);
2521 }
2522 
2523 /*
2524  * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2525  * lists from each segment and copy them to one contiguous shadow list (plist)
2526  * as expected by the caller.  Save pointers to per segment shadow lists at
2527  * the tail of plist so that they can be used during as_pageunlock().
2528  */
2529 static int
2530 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2531     caddr_t addr, size_t size, enum seg_rw rw)
2532 {
2533         caddr_t sv_addr = addr;
2534         size_t sv_size = size;
2535         struct seg *sv_seg = seg;
2536         ulong_t segcnt = 1;
2537         ulong_t cnt;
2538         size_t ssize;
2539         pgcnt_t npages = btop(size);
2540         page_t **plist;
2541         page_t **pl;
2542         int error;
2543         caddr_t eaddr;
2544         faultcode_t fault_err = 0;
2545         pgcnt_t pl_off;
2546         extern struct seg_ops segspt_shmops;
2547 
2548         ASSERT(AS_LOCK_HELD(as));
2549         ASSERT(seg != NULL);
2550         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2551         ASSERT(addr + size > seg->s_base + seg->s_size);
2552         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2553         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2554 
2555         /*
2556          * Count the number of segments covered by the range we are about to
2557          * lock. The segment count is used to size the shadow list we return
2558          * back to the caller.
2559          */
2560         for (; size != 0; size -= ssize, addr += ssize) {
2561                 if (addr >= seg->s_base + seg->s_size) {
2562 
2563                         seg = AS_SEGNEXT(as, seg);
2564                         if (seg == NULL || addr != seg->s_base) {
2565                                 AS_LOCK_EXIT(as);
2566                                 return (EFAULT);
2567                         }
2568                         /*
2569                          * Do a quick check whether the subsequent segments
2570                          * are likely to support pagelock.
2571                          */
2572                         if (seg->s_ops == &segvn_ops) {
2573                                 vnode_t *vp;
2574 
2575                                 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2576                                     vp != NULL) {
2577                                         AS_LOCK_EXIT(as);
2578                                         goto slow;
2579                                 }
2580                         } else if (seg->s_ops != &segspt_shmops) {
2581                                 AS_LOCK_EXIT(as);
2582                                 goto slow;
2583                         }
2584                         segcnt++;
2585                 }
2586                 if (addr + size > seg->s_base + seg->s_size) {
2587                         ssize = seg->s_base + seg->s_size - addr;
2588                 } else {
2589                         ssize = size;
2590                 }
2591         }
2592         ASSERT(segcnt > 1);
2593 
2594         plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
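        /*
         * plist layout: the first npages entries form the contiguous
         * shadow list handed back to the caller; the segcnt entries at
         * the tail save each segment's own shadow list pointer so that
         * as_pageunlock_segs() can find them later.
         */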
2595 
2596         addr = sv_addr;
2597         size = sv_size;
2598         seg = sv_seg;
2599 
2600         for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2601                 if (addr >= seg->s_base + seg->s_size) {
2602                         seg = AS_SEGNEXT(as, seg);
2603                         ASSERT(seg != NULL && addr == seg->s_base);
2604                         cnt++;
2605                         ASSERT(cnt < segcnt);
2606                 }
2607                 if (addr + size > seg->s_base + seg->s_size) {
2608                         ssize = seg->s_base + seg->s_size - addr;
2609                 } else {
2610                         ssize = size;
2611                 }
2612                 pl = &plist[npages + cnt];
2613                 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2614                     L_PAGELOCK, rw);
2615                 if (error) {
2616                         break;
2617                 }
2618                 ASSERT(plist[npages + cnt] != NULL);
2619                 ASSERT(pl_off + btop(ssize) <= npages);
2620                 bcopy(plist[npages + cnt], &plist[pl_off],
2621                     btop(ssize) * sizeof (page_t *));
2622                 pl_off += btop(ssize);
2623         }
2624 
2625         if (size == 0) {
2626                 AS_LOCK_EXIT(as);
2627                 ASSERT(cnt == segcnt - 1);
2628                 *ppp = plist;
2629                 return (0);
2630         }
2631 
2632         /*
2633          * One of the pagelock calls failed; the error type is in the error
2634          * variable.  Unlock what we've locked so far and retry with
2635          * F_SOFTLOCK if the error is either EFAULT or ENOTSUP.  Otherwise
2636          * just return the error back to the caller.
2637          */
2638 
2639         eaddr = addr;
2640         seg = sv_seg;
2641 
2642         for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2643                 if (addr >= seg->s_base + seg->s_size) {
2644                         seg = AS_SEGNEXT(as, seg);
2645                         ASSERT(seg != NULL && addr == seg->s_base);
2646                         cnt++;
2647                         ASSERT(cnt < segcnt);
2648                 }
2649                 if (eaddr > seg->s_base + seg->s_size) {
2650                         ssize = seg->s_base + seg->s_size - addr;
2651                 } else {
2652                         ssize = eaddr - addr;
2653                 }
2654                 pl = &plist[npages + cnt];
2655                 ASSERT(*pl != NULL);
2656                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2657                     L_PAGEUNLOCK, rw);
2658         }
2659 
2660         AS_LOCK_EXIT(as);
2661 
2662         kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2663 
2664         if (error != ENOTSUP && error != EFAULT) {
2665                 return (error);
2666         }
2667 
2668 slow:
2669         /*
2670          * If we are here because pagelock failed due to the need to
2671          * cow-fault in the pages we want to lock, F_SOFTLOCK will do this
2672          * job, and the next as_pagelock() call for this address range
2673          * will hopefully succeed.
2674          */
2675         fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2676         if (fault_err != 0) {
2677                 return (fc_decode(fault_err));
2678         }
2679         *ppp = NULL;
2680 
2681         return (0);
2682 }
2683 
2684 /*
2685  * lock pages in a given address space. Return shadow list. If
2686  * the list is NULL, the MMU mapping is also locked.
2687  */
2688 int
2689 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2690     size_t size, enum seg_rw rw)
2691 {
2692         size_t rsize;
2693         caddr_t raddr;
2694         faultcode_t fault_err;
2695         struct seg *seg;
2696         int err;
2697 
2698         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2699             "as_pagelock_start: addr %p size %ld", addr, size);
2700 
2701         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2702         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2703             (size_t)raddr;
2704 
2705         /*
2706          * If the request spans more than one segment, let
2707          * as_pagelock_segs() handle it.
2708          */
2709         AS_LOCK_ENTER(as, RW_READER);
2710 
2711         seg = as_segat(as, raddr);
2712         if (seg == NULL) {
2713                 AS_LOCK_EXIT(as);
2714                 return (EFAULT);
2715         }
2716         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2717         if (raddr + rsize > seg->s_base + seg->s_size) {
2718                 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2719         }
2720         if (raddr + rsize <= raddr) {
2721                 AS_LOCK_EXIT(as);
2722                 return (EFAULT);
2723         }
2724 
2725         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2726             "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2727 
2728         /*
2729          * try to lock pages and pass back shadow list
2730          */
2731         err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2732 
2733         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2734 
2735         AS_LOCK_EXIT(as);
2736 
2737         if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2738                 return (err);
2739         }
2740 
2741         /*
2742          * Use F_SOFTLOCK to lock the pages because pagelock failed either
2743          * due to no pagelock support for this segment or because pages need
2744          * to be cow-faulted in.  If a fault is needed, F_SOFTLOCK will do
2745          * the job for this as_pagelock() call, and the next as_pagelock()
2746          * call for the same address range will hopefully succeed.
2747          */
2748         fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2749         if (fault_err != 0) {
2750                 return (fc_decode(fault_err));
2751         }
2752         *ppp = NULL;
2753 
2754         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2755         return (0);
2756 }
2757 
2758 /*
2759  * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2760  * lists from the end of plist and call pageunlock interface for each segment.
2761  * Drop as lock and free plist.
2762  */
2763 static void
2764 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2765     struct page **plist, enum seg_rw rw)
2766 {
2767         ulong_t cnt;
2768         caddr_t eaddr = addr + size;
2769         pgcnt_t npages = btop(size);
2770         size_t ssize;
2771         page_t **pl;
2772 
2773         ASSERT(AS_LOCK_HELD(as));
2774         ASSERT(seg != NULL);
2775         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2776         ASSERT(addr + size > seg->s_base + seg->s_size);
2777         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2778         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2779         ASSERT(plist != NULL);
2780 
2781         for (cnt = 0; addr < eaddr; addr += ssize) {
2782                 if (addr >= seg->s_base + seg->s_size) {
2783                         seg = AS_SEGNEXT(as, seg);
2784                         ASSERT(seg != NULL && addr == seg->s_base);
2785                         cnt++;
2786                 }
2787                 if (eaddr > seg->s_base + seg->s_size) {
2788                         ssize = seg->s_base + seg->s_size - addr;
2789                 } else {
2790                         ssize = eaddr - addr;
2791                 }
2792                 pl = &plist[npages + cnt];
2793                 ASSERT(*pl != NULL);
2794                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2795                     L_PAGEUNLOCK, rw);
2796         }
2797         ASSERT(cnt > 0);
2798         AS_LOCK_EXIT(as);
2799 
2800         cnt++;
2801         kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2802 }
2803 
/*
 * Unlock pages in a given address range.
 */
2807 void
2808 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2809     enum seg_rw rw)
2810 {
2811         struct seg *seg;
2812         size_t rsize;
2813         caddr_t raddr;
2814 
2815         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2816             "as_pageunlock_start: addr %p size %ld", addr, size);
2817 
	/*
	 * If the shadow list is NULL, as_pagelock() fell back to
	 * as_fault(), so undo the locking with F_SOFTUNLOCK.
	 */
2822         if (pp == NULL) {
2823                 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2824                 return;
2825         }
2826 
2827         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2828         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2829             (size_t)raddr;
2830 
2831         AS_LOCK_ENTER(as, RW_READER);
2832         seg = as_segat(as, raddr);
2833         ASSERT(seg != NULL);
2834 
2835         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2836             "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2837 
2838         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2839         if (raddr + rsize <= seg->s_base + seg->s_size) {
2840                 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2841         } else {
2842                 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2843                 return;
2844         }
2845         AS_LOCK_EXIT(as);
2846         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2847 }
2848 
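/*
 * Set the page size code of the mappings in the range [addr, addr + size)
 * to szc.  Both addr and size must be aligned to the new page size.  If
 * wait is B_TRUE and the pages are locked (EAGAIN), wait for them to be
 * unlocked and retry; otherwise EAGAIN is returned to the caller.
 */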
2849 int
2850 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2851     boolean_t wait)
2852 {
2853         struct seg *seg;
2854         size_t ssize;
2855         caddr_t raddr;                  /* rounded down addr */
2856         size_t rsize;                   /* rounded up size */
2857         int error = 0;
2858         size_t pgsz = page_get_pagesize(szc);
2859 
2860 setpgsz_top:
2861         if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2862                 return (EINVAL);
2863         }
2864 
2865         raddr = addr;
2866         rsize = size;
2867 
2868         if (raddr + rsize < raddr)           /* check for wraparound */
2869                 return (ENOMEM);
2870 
2871         AS_LOCK_ENTER(as, RW_WRITER);
2872         as_clearwatchprot(as, raddr, rsize);
2873         seg = as_segat(as, raddr);
2874         if (seg == NULL) {
2875                 as_setwatch(as);
2876                 AS_LOCK_EXIT(as);
2877                 return (ENOMEM);
2878         }
2879 
2880         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2881                 if (raddr >= seg->s_base + seg->s_size) {
2882                         seg = AS_SEGNEXT(as, seg);
2883                         if (seg == NULL || raddr != seg->s_base) {
2884                                 error = ENOMEM;
2885                                 break;
2886                         }
2887                 }
2888                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2889                         ssize = seg->s_base + seg->s_size - raddr;
2890                 } else {
2891                         ssize = rsize;
2892                 }
2893 
2894 retry:
2895                 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2896 
2897                 if (error == IE_NOMEM) {
2898                         error = EAGAIN;
2899                         break;
2900                 }
2901 
2902                 if (error == IE_RETRY) {
2903                         AS_LOCK_EXIT(as);
2904                         goto setpgsz_top;
2905                 }
2906 
2907                 if (error == ENOTSUP) {
2908                         error = EINVAL;
2909                         break;
2910                 }
2911 
2912                 if (wait && (error == EAGAIN)) {
2913                         /*
2914                          * Memory is currently locked.  It must be unlocked
2915                          * before this operation can succeed through a retry.
2916                          * The possible reasons for locked memory and
2917                          * corresponding strategies for unlocking are:
2918                          * (1) Normal I/O
2919                          *      wait for a signal that the I/O operation
2920                          *      has completed and the memory is unlocked.
2921                          * (2) Asynchronous I/O
2922                          *      The aio subsystem does not unlock pages when
2923                          *      the I/O is completed. Those pages are unlocked
2924                          *      when the application calls aiowait/aioerror.
2925                          *      So, to prevent blocking forever, cv_broadcast()
2926                          *      is done to wake up aio_cleanup_thread.
2927                          *      Subsequently, segvn_reclaim will be called, and
2928                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
2929                          * (3) Long term page locking:
2930                          *      This is not relevant for as_setpagesize()
2931                          *      because we cannot change the page size for
2932                          *      driver memory. The attempt to do so will
2933                          *      fail with a different error than EAGAIN so
2934                          *      there's no need to trigger as callbacks like
2935                          *      as_unmap, as_setprot or as_free would do.
2936                          */
2937                         mutex_enter(&as->a_contents);
2938                         if (!AS_ISNOUNMAPWAIT(as)) {
2939                                 if (AS_ISUNMAPWAIT(as) == 0) {
2940                                         cv_broadcast(&as->a_cv);
2941                                 }
2942                                 AS_SETUNMAPWAIT(as);
2943                                 AS_LOCK_EXIT(as);
2944                                 while (AS_ISUNMAPWAIT(as)) {
2945                                         cv_wait(&as->a_cv, &as->a_contents);
2946                                 }
2947                         } else {
2948                                 /*
2949                                  * We may have raced with
2950                                  * segvn_reclaim()/segspt_reclaim(). In this
2951                                  * case clean nounmapwait flag and retry since
2952                                  * softlockcnt in this segment may be already
2953                                  * 0.  We don't drop as writer lock so our
2954                                  * number of retries without sleeping should
2955                                  * be very small. See segvn_reclaim() for
2956                                  * more comments.
2957                                  */
2958                                 AS_CLRNOUNMAPWAIT(as);
2959                                 mutex_exit(&as->a_contents);
2960                                 goto retry;
2961                         }
2962                         mutex_exit(&as->a_contents);
2963                         goto setpgsz_top;
2964                 } else if (error != 0) {
2965                         break;
2966                 }
2967         }
2968         as_setwatch(as);
2969         AS_LOCK_EXIT(as);
2970         return (error);
2971 }
2972 
2973 /*
2974  * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2975  * in its chunk where s_szc is less than the szc we want to set.
2976  */
2977 static int
2978 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2979     int *retry)
2980 {
2981         struct seg *seg;
2982         size_t ssize;
2983         int error;
2984 
2985         ASSERT(AS_WRITE_HELD(as));
2986 
2987         seg = as_segat(as, raddr);
2988         if (seg == NULL) {
2989                 panic("as_iset3_default_lpsize: no seg");
2990         }
2991 
2992         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2993                 if (raddr >= seg->s_base + seg->s_size) {
2994                         seg = AS_SEGNEXT(as, seg);
2995                         if (seg == NULL || raddr != seg->s_base) {
2996                                 panic("as_iset3_default_lpsize: as changed");
2997                         }
2998                 }
2999                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3000                         ssize = seg->s_base + seg->s_size - raddr;
3001                 } else {
3002                         ssize = rsize;
3003                 }
3004 
3005                 if (szc > seg->s_szc) {
3006                         error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3007                         /* Only retry on EINVAL segments that have no vnode. */
3008                         if (error == EINVAL) {
3009                                 vnode_t *vp = NULL;
3010                                 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3011                                     (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3012                                     vp == NULL)) {
3013                                         *retry = 1;
3014                                 } else {
3015                                         *retry = 0;
3016                                 }
3017                         }
3018                         if (error) {
3019                                 return (error);
3020                         }
3021                 }
3022         }
3023         return (0);
3024 }
3025 
/*
 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
 * pagesize on each segment in its range, but if any call fails with
 * EINVAL, it reduces the pagesize to the next size in the bitmap and
 * retries as_iset3_default_lpsize(). The code retries smaller allowed
 * sizes on EINVAL because (a) the anon offset may not match the bigger
 * sizes, and (b) it's hard to get this offset (to begin with) to pass
 * to map_pgszcvec().
 */
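/*
 * For example, with szcvec = 0x16 (size codes 4, 2 and 1 allowed) and
 * szc = 4, an EINVAL retry clears bit 4 leaving 0x6 and retries with
 * szc = 2, then with szc = 1, and finally gives up with EINVAL when no
 * size code above the base pagesize remains.
 */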
3035 static int
3036 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3037     uint_t szcvec)
3038 {
3039         int error;
3040         int retry;
3041 
3042         ASSERT(AS_WRITE_HELD(as));
3043 
3044         for (;;) {
3045                 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3046                 if (error == EINVAL && retry) {
3047                         szcvec &= ~(1 << szc);
3048                         if (szcvec <= 1) {
3049                                 return (EINVAL);
3050                         }
3051                         szc = highbit(szcvec) - 1;
3052                 } else {
3053                         return (error);
3054                 }
3055         }
3056 }
3057 
/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than the one we want to set. For each such
 * area, it calls as_iset2_default_lpsize().
 */
3063 static int
3064 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3065     uint_t szcvec)
3066 {
3067         struct seg *seg;
3068         size_t ssize;
3069         caddr_t setaddr = raddr;
3070         size_t setsize = 0;
3071         int set;
3072         int error;
3073 
3074         ASSERT(AS_WRITE_HELD(as));
3075 
3076         seg = as_segat(as, raddr);
3077         if (seg == NULL) {
3078                 panic("as_iset1_default_lpsize: no seg");
3079         }
3080         if (seg->s_szc < szc) {
3081                 set = 1;
3082         } else {
3083                 set = 0;
3084         }
3085 
3086         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3087                 if (raddr >= seg->s_base + seg->s_size) {
3088                         seg = AS_SEGNEXT(as, seg);
3089                         if (seg == NULL || raddr != seg->s_base) {
3090                                 panic("as_iset1_default_lpsize: as changed");
3091                         }
3092                         if (seg->s_szc >= szc && set) {
3093                                 ASSERT(setsize != 0);
3094                                 error = as_iset2_default_lpsize(as,
3095                                     setaddr, setsize, szc, szcvec);
3096                                 if (error) {
3097                                         return (error);
3098                                 }
3099                                 set = 0;
3100                         } else if (seg->s_szc < szc && !set) {
3101                                 setaddr = raddr;
3102                                 setsize = 0;
3103                                 set = 1;
3104                         }
3105                 }
3106                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3107                         ssize = seg->s_base + seg->s_size - raddr;
3108                 } else {
3109                         ssize = rsize;
3110                 }
3111         }
3112         error = 0;
3113         if (set) {
3114                 ASSERT(setsize != 0);
3115                 error = as_iset2_default_lpsize(as, setaddr, setsize,
3116                     szc, szcvec);
3117         }
3118         return (error);
3119 }
3120 
3121 /*
3122  * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3123  * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3124  * chunk to as_iset1_default_lpsize().
3125  */
3126 static int
3127 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3128     int type)
3129 {
3130         int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3131         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3132             flags, rtype, 1);
3133         uint_t szc;
3134         uint_t nszc;
3135         int error;
3136         caddr_t a;
3137         caddr_t eaddr;
3138         size_t segsize;
3139         size_t pgsz;
3140         uint_t save_szcvec;
3141 
3142         ASSERT(AS_WRITE_HELD(as));
3143         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3144         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3145 
	szcvec &= ~1;		/* clear the base pagesize bit */
	if (szcvec <= 1) {	/* nothing larger than the base pagesize */
		return (0);
	}
3150 
3151         /* Get the pagesize of the first larger page size. */
3152         szc = lowbit(szcvec) - 1;
3153         pgsz = page_get_pagesize(szc);
3154         eaddr = addr + size;
3155         addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3156         eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3157 
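	/*
	 * The loop below carves off the leading part of the range: for
	 * each successively larger allowed pagesize, the sub-chunk that
	 * ends at that size's alignment boundary is set to the previous
	 * (smaller) size code.  The second loop then works back down from
	 * the largest size, handling the maximally aligned middle chunk
	 * and the trailing sub-chunks.
	 */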
3158         save_szcvec = szcvec;
3159         szcvec >>= (szc + 1);
3160         nszc = szc;
3161         while (szcvec) {
3162                 if ((szcvec & 0x1) == 0) {
3163                         nszc++;
3164                         szcvec >>= 1;
3165                         continue;
3166                 }
3167                 nszc++;
3168                 pgsz = page_get_pagesize(nszc);
3169                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3170                 if (a != addr) {
3171                         ASSERT(szc > 0);
3172                         ASSERT(a < eaddr);
3173                         segsize = a - addr;
3174                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3175                             save_szcvec);
3176                         if (error) {
3177                                 return (error);
3178                         }
3179                         addr = a;
3180                 }
3181                 szc = nszc;
3182                 szcvec >>= 1;
3183         }
3184 
3185         ASSERT(addr < eaddr);
3186         szcvec = save_szcvec;
3187         while (szcvec) {
3188                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3189                 ASSERT(a >= addr);
3190                 if (a != addr) {
3191                         ASSERT(szc > 0);
3192                         segsize = a - addr;
3193                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3194                             save_szcvec);
3195                         if (error) {
3196                                 return (error);
3197                         }
3198                         addr = a;
3199                 }
3200                 szcvec &= ~(1 << szc);
3201                 if (szcvec) {
3202                         szc = highbit(szcvec) - 1;
3203                         pgsz = page_get_pagesize(szc);
3204                 }
3205         }
3206         ASSERT(addr == eaddr);
3207 
3208         return (0);
3209 }
3210 
/*
 * Set the default large page size for the range. Called via memcntl with
 * page size set to 0. as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
3217 int
3218 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3219 {
3220         struct seg *seg;
3221         caddr_t raddr;
3222         size_t rsize;
3223         size_t ssize;
3224         int rtype, rflags;
3225         int stype, sflags;
3226         int error;
3227         caddr_t setaddr;
3228         size_t setsize;
3229         int segvn;
3230 
3231         if (size == 0)
3232                 return (0);
3233 
3234         AS_LOCK_ENTER(as, RW_WRITER);
3235 again:
3236         error = 0;
3237 
3238         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3239         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3240             (size_t)raddr;
3241 
3242         if (raddr + rsize < raddr) {         /* check for wraparound */
3243                 AS_LOCK_EXIT(as);
3244                 return (ENOMEM);
3245         }
3246         as_clearwatchprot(as, raddr, rsize);
3247         seg = as_segat(as, raddr);
3248         if (seg == NULL) {
3249                 as_setwatch(as);
3250                 AS_LOCK_EXIT(as);
3251                 return (ENOMEM);
3252         }
3253         if (seg->s_ops == &segvn_ops) {
3254                 rtype = SEGOP_GETTYPE(seg, addr);
3255                 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3256                 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3257                 segvn = 1;
3258         } else {
3259                 segvn = 0;
3260         }
3261         setaddr = raddr;
3262         setsize = 0;
3263 
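	/*
	 * Walk the segments, accumulating runs of adjacent segvn segments
	 * that share the same type and flags in [setaddr, setaddr +
	 * setsize).  Each completed run is passed to
	 * as_iset_default_lpsize(); non-segvn segments end a run and are
	 * skipped.
	 */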
3264         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3265                 if (raddr >= (seg->s_base + seg->s_size)) {
3266                         seg = AS_SEGNEXT(as, seg);
3267                         if (seg == NULL || raddr != seg->s_base) {
3268                                 error = ENOMEM;
3269                                 break;
3270                         }
3271                         if (seg->s_ops == &segvn_ops) {
3272                                 stype = SEGOP_GETTYPE(seg, raddr);
3273                                 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3274                                 stype &= (MAP_SHARED | MAP_PRIVATE);
3275                                 if (segvn && (rflags != sflags ||
3276                                     rtype != stype)) {
3277                                         /*
3278                                          * The next segment is also segvn but
3279                                          * has different flags and/or type.
3280                                          */
3281                                         ASSERT(setsize != 0);
3282                                         error = as_iset_default_lpsize(as,
3283                                             setaddr, setsize, rflags, rtype);
3284                                         if (error) {
3285                                                 break;
3286                                         }
3287                                         rflags = sflags;
3288                                         rtype = stype;
3289                                         setaddr = raddr;
3290                                         setsize = 0;
3291                                 } else if (!segvn) {
3292                                         rflags = sflags;
3293                                         rtype = stype;
3294                                         setaddr = raddr;
3295                                         setsize = 0;
3296                                         segvn = 1;
3297                                 }
3298                         } else if (segvn) {
3299                                 /* The next segment is not segvn. */
3300                                 ASSERT(setsize != 0);
3301                                 error = as_iset_default_lpsize(as,
3302                                     setaddr, setsize, rflags, rtype);
3303                                 if (error) {
3304                                         break;
3305                                 }
3306                                 segvn = 0;
3307                         }
3308                 }
3309                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3310                         ssize = seg->s_base + seg->s_size - raddr;
3311                 } else {
3312                         ssize = rsize;
3313                 }
3314         }
3315         if (error == 0 && segvn) {
3316                 /* The last chunk when rsize == 0. */
3317                 ASSERT(setsize != 0);
3318                 error = as_iset_default_lpsize(as, setaddr, setsize,
3319                     rflags, rtype);
3320         }
3321 
3322         if (error == IE_RETRY) {
3323                 goto again;
3324         } else if (error == IE_NOMEM) {
3325                 error = EAGAIN;
3326         } else if (error == ENOTSUP) {
3327                 error = EINVAL;
3328         } else if (error == EAGAIN) {
3329                 mutex_enter(&as->a_contents);
3330                 if (!AS_ISNOUNMAPWAIT(as)) {
3331                         if (AS_ISUNMAPWAIT(as) == 0) {
3332                                 cv_broadcast(&as->a_cv);
3333                         }
3334                         AS_SETUNMAPWAIT(as);
3335                         AS_LOCK_EXIT(as);
3336                         while (AS_ISUNMAPWAIT(as)) {
3337                                 cv_wait(&as->a_cv, &as->a_contents);
3338                         }
3339                         mutex_exit(&as->a_contents);
3340                         AS_LOCK_ENTER(as, RW_WRITER);
3341                 } else {
3342                         /*
3343                          * We may have raced with
3344                          * segvn_reclaim()/segspt_reclaim(). In this case
3345                          * clean nounmapwait flag and retry since softlockcnt
3346                          * in this segment may be already 0.  We don't drop as
3347                          * writer lock so our number of retries without
3348                          * sleeping should be very small. See segvn_reclaim()
3349                          * for more comments.
3350                          */
3351                         AS_CLRNOUNMAPWAIT(as);
3352                         mutex_exit(&as->a_contents);
3353                 }
3354                 goto again;
3355         }
3356 
3357         as_setwatch(as);
3358         AS_LOCK_EXIT(as);
3359         return (error);
3360 }
3361 
/*
 * Set up all of the uninitialized watched pages that we can.
 */
3365 void
3366 as_setwatch(struct as *as)
3367 {
3368         struct watched_page *pwp;
3369         struct seg *seg;
3370         caddr_t vaddr;
3371         uint_t prot;
3372         int  err, retrycnt;
3373 
3374         if (avl_numnodes(&as->a_wpage) == 0)
3375                 return;
3376 
3377         ASSERT(AS_WRITE_HELD(as));
3378 
3379         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3380             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3381                 retrycnt = 0;
3382         retry:
3383                 vaddr = pwp->wp_vaddr;
3384                 if (pwp->wp_oprot != 0 ||    /* already set up */
3385                     (seg = as_segat(as, vaddr)) == NULL ||
3386                     SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3387                         continue;
3388 
3389                 pwp->wp_oprot = prot;
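		/*
		 * Compute the watched protections.  A read or execute
		 * watchpoint must see every reference, so all of
		 * PROT_READ|PROT_WRITE|PROT_EXEC are removed; a write
		 * watchpoint only needs PROT_WRITE removed.
		 */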
3390                 if (pwp->wp_read)
3391                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3392                 if (pwp->wp_write)
3393                         prot &= ~PROT_WRITE;
3394                 if (pwp->wp_exec)
3395                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3396                 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3397                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3398                         if (err == IE_RETRY) {
3399                                 pwp->wp_oprot = 0;
3400                                 ASSERT(retrycnt == 0);
3401                                 retrycnt++;
3402                                 goto retry;
3403                         }
3404                 }
3405                 pwp->wp_prot = prot;
3406         }
3407 }
3408 
3409 /*
3410  * Clear all of the watched pages in the address space.
3411  */
3412 void
3413 as_clearwatch(struct as *as)
3414 {
3415         struct watched_page *pwp;
3416         struct seg *seg;
3417         caddr_t vaddr;
3418         uint_t prot;
3419         int err, retrycnt;
3420 
3421         if (avl_numnodes(&as->a_wpage) == 0)
3422                 return;
3423 
3424         ASSERT(AS_WRITE_HELD(as));
3425 
3426         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3427             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3428                 retrycnt = 0;
3429         retry:
3430                 vaddr = pwp->wp_vaddr;
3431                 if (pwp->wp_oprot == 0 ||    /* not set up */
3432                     (seg = as_segat(as, vaddr)) == NULL)
3433                         continue;
3434 
3435                 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3436                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3437                         if (err == IE_RETRY) {
3438                                 ASSERT(retrycnt == 0);
3439                                 retrycnt++;
3440                                 goto retry;
3441                         }
3442                 }
3443                 pwp->wp_oprot = 0;
3444                 pwp->wp_prot = 0;
3445         }
3446 }
3447 
3448 /*
3449  * Force a new setup for all the watched pages in the range.
3450  */
3451 static void
3452 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3453 {
3454         struct watched_page *pwp;
3455         struct watched_page tpw;
3456         caddr_t eaddr = addr + size;
3457         caddr_t vaddr;
3458         struct seg *seg;
3459         int err, retrycnt;
3460         uint_t  wprot;
3461         avl_index_t where;
3462 
3463         if (avl_numnodes(&as->a_wpage) == 0)
3464                 return;
3465 
3466         ASSERT(AS_WRITE_HELD(as));
3467 
3468         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3469         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3470                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3471 
3472         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3473                 retrycnt = 0;
3474                 vaddr = pwp->wp_vaddr;
3475 
3476                 wprot = prot;
3477                 if (pwp->wp_read)
3478                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3479                 if (pwp->wp_write)
3480                         wprot &= ~PROT_WRITE;
3481                 if (pwp->wp_exec)
3482                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3483                 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3484                 retry:
3485                         seg = as_segat(as, vaddr);
3486                         if (seg == NULL) {
3487                                 panic("as_setwatchprot: no seg");
3488                                 /*NOTREACHED*/
3489                         }
3490                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3491                         if (err == IE_RETRY) {
3492                                 ASSERT(retrycnt == 0);
3493                                 retrycnt++;
3494                                 goto retry;
3495                         }
3496                 }
3497                 pwp->wp_oprot = prot;
3498                 pwp->wp_prot = wprot;
3499 
3500                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3501         }
3502 }
3503 
3504 /*
3505  * Clear all of the watched pages in the range.
3506  */
3507 static void
3508 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3509 {
3510         caddr_t eaddr = addr + size;
3511         struct watched_page *pwp;
3512         struct watched_page tpw;
3513         uint_t prot;
3514         struct seg *seg;
3515         int err, retrycnt;
3516         avl_index_t where;
3517 
3518         if (avl_numnodes(&as->a_wpage) == 0)
3519                 return;
3520 
3521         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3522         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3523                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3524 
3525         ASSERT(AS_WRITE_HELD(as));
3526 
3527         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3528 
3529                 if ((prot = pwp->wp_oprot) != 0) {
3530                         retrycnt = 0;
3531 
3532                         if (prot != pwp->wp_prot) {
3533                         retry:
				seg = as_segat(as, pwp->wp_vaddr);
				if (seg == NULL) {
					/*
					 * The segment is gone; advance to
					 * the next watched page before
					 * continuing, since a bare continue
					 * here would respin on the same pwp
					 * forever.
					 */
					pwp = AVL_NEXT(&as->a_wpage, pwp);
					continue;
				}
3537                                 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3538                                     PAGESIZE, prot);
3539                                 if (err == IE_RETRY) {
3540                                         ASSERT(retrycnt == 0);
3541                                         retrycnt++;
3542                                         goto retry;
3543 
3544                                 }
3545                         }
3546                         pwp->wp_oprot = 0;
3547                         pwp->wp_prot = 0;
3548                 }
3549 
3550                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3551         }
3552 }
3553 
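/*
 * Deliver siginfo to every process whose address space is as.
 */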
3554 void
3555 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3556 {
3557         struct proc *p;
3558 
3559         mutex_enter(&pidlock);
3560         for (p = practive; p; p = p->p_next) {
3561                 if (p->p_as == as) {
3562                         mutex_enter(&p->p_lock);
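			/*
			 * Recheck p_as under p_lock; it may have changed
			 * since the unlocked check above.
			 */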
3563                         if (p->p_as == as)
3564                                 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3565                         mutex_exit(&p->p_lock);
3566                 }
3567         }
3568         mutex_exit(&pidlock);
3569 }
3570 
/*
 * Return the memory object ID for the segment containing addr.
 */
3574 int
3575 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3576 {
3577         struct seg      *seg;
3578         int             sts;
3579 
3580         AS_LOCK_ENTER(as, RW_READER);
3581         seg = as_segat(as, addr);
3582         if (seg == NULL) {
3583                 AS_LOCK_EXIT(as);
3584                 return (EFAULT);
3585         }
3586         /*
3587          * catch old drivers which may not support getmemid
3588          */
3589         if (seg->s_ops->getmemid == NULL) {
3590                 AS_LOCK_EXIT(as);
3591                 return (ENODEV);
3592         }
3593 
3594         sts = SEGOP_GETMEMID(seg, addr, memidp);
3595 
3596         AS_LOCK_EXIT(as);
3597         return (sts);
3598 }