1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2015, Joyent, Inc.  All rights reserved.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  29 /*        All Rights Reserved   */
  30 
  31 /*
  32  * University Copyright- Copyright (c) 1982, 1986, 1988
  33  * The Regents of the University of California
  34  * All Rights Reserved
  35  *
  36  * University Acknowledgment- Portions of this document are derived from
  37  * software developed by the University of California, Berkeley, and its
  38  * contributors.
  39  */
  40 
  41 /*
  42  * VM - address spaces.
  43  */
  44 
  45 #include <sys/types.h>
  46 #include <sys/t_lock.h>
  47 #include <sys/param.h>
  48 #include <sys/errno.h>
  49 #include <sys/systm.h>
  50 #include <sys/mman.h>
  51 #include <sys/sysmacros.h>
  52 #include <sys/cpuvar.h>
  53 #include <sys/sysinfo.h>
  54 #include <sys/kmem.h>
  55 #include <sys/vnode.h>
  56 #include <sys/vmsystm.h>
  57 #include <sys/cmn_err.h>
  58 #include <sys/debug.h>
  59 #include <sys/tnf_probe.h>
  60 #include <sys/vtrace.h>
  61 #include <sys/ddi.h>
  62 
  63 #include <vm/hat.h>
  64 #include <vm/as.h>
  65 #include <vm/seg.h>
  66 #include <vm/seg_vn.h>
  67 #include <vm/seg_dev.h>
  68 #include <vm/seg_kmem.h>
  69 #include <vm/seg_map.h>
  70 #include <vm/seg_spt.h>
  71 #include <vm/page.h>
  72 
  73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
  74 
  75 static struct kmem_cache *as_cache;
  76 
  77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
  78 static void as_clearwatchprot(struct as *, caddr_t, size_t);
  79 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
  80 
  81 
  82 /*
  83  * Verifying the segment lists is very time-consuming; it may not always
  84  * be desirable to define VERIFY_SEGLIST when DEBUG is set.
  85  */
  86 #ifdef DEBUG
  87 #define VERIFY_SEGLIST
  88 int do_as_verify = 0;
  89 #endif
  90 
  91 /*
  92  * Allocate a new callback data structure entry and fill in the events of
  93  * interest, the address range of interest, and the callback argument.
  94  * Link the entry on the as->a_callbacks list. A callback entry for the
  95  * entire address space may be specified with vaddr = 0 and size = -1.
  96  *
  97  * CALLER'S RESPONSIBILITY: If not calling from within the process context
  98  * for the specified as, the caller must guarantee persistence of the
  99  * specified as for the duration of this function (e.g. pages being locked
 100  * within the as will guarantee persistence).
 101  */
 102 int
 103 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
 104     caddr_t vaddr, size_t size, int sleepflag)
 105 {
 106         struct as_callback      *current_head, *cb;
 107         caddr_t                 saddr;
 108         size_t                  rsize;
 109 
 110         /* callback function and an event are mandatory */
 111         if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
 112                 return (EINVAL);
 113 
 114         /* Adding a callback after as_free has been called is not allowed */
 115         if (as == &kas)
 116                 return (ENOMEM);
 117 
 118         /*
 119          * vaddr = 0 and size = -1 are used to indicate that the callback
 120          * range is the entire address space, so no rounding is done in that case.
 121          */
 122         if (size != -1) {
 123                 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
 124                 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
 125                     (size_t)saddr;
 126                 /* check for wraparound */
 127                 if (saddr + rsize < saddr)
 128                         return (ENOMEM);
 129         } else {
 130                 if (vaddr != 0)
 131                         return (EINVAL);
 132                 saddr = vaddr;
 133                 rsize = size;
 134         }
 135 
 136         /* Allocate and initialize a callback entry */
 137         cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
 138         if (cb == NULL)
 139                 return (EAGAIN);
 140 
 141         cb->ascb_func = cb_func;
 142         cb->ascb_arg = arg;
 143         cb->ascb_events = events;
 144         cb->ascb_saddr = saddr;
 145         cb->ascb_len = rsize;
 146 
 147         /* Add the entry to the list */
 148         mutex_enter(&as->a_contents);
 149         current_head = as->a_callbacks;
 150         as->a_callbacks = cb;
 151         cb->ascb_next = current_head;
 152 
 153         /*
 154          * The call to this function may lose a race with a pertinent
 155          * event - e.g. a thread does long-term memory locking, but
 156          * another thread executes as_unmap before the callback is added.
 157          * A broadcast here resolves that.
 158          */
 159         if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
 160                 AS_CLRUNMAPWAIT(as);
 161                 cv_broadcast(&as->a_cv);
 162         }
 163 
 164         mutex_exit(&as->a_contents);
 165         return (0);
 166 }
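
/*
 * For illustration only (hypothetical driver code, not part of this file):
 * a driver that holds pages locked for a long time might register for
 * unmap/setprot events on the locked range roughly as follows, and later
 * unregister with as_delete_callback() using the same 'arg'.
 *
 *	static void
 *	xx_as_callback(struct as *cb_as, void *arg, uint_t events)
 *	{
 *		struct xx_lock *xlp = arg;
 *
 *		xx_unlock_pages(xlp);
 *		(void) as_delete_callback(cb_as, arg);
 *	}
 *
 *	error = as_add_callback(as, xx_as_callback, xlp,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, xlp->xl_vaddr, xlp->xl_len,
 *	    KM_SLEEP);
 *
 * xx_as_callback, struct xx_lock, xx_unlock_pages and the xl_* fields are
 * assumed names for the driver's own callback, state and unlock routine.
 */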
 167 
 168 /*
 169  * Search the callback list for an entry which pertains to arg.
 170  *
 171  * This is called from within the client upon completion of the callback.
 172  * RETURN VALUES:
 173  *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 174  *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 175  *      AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
 176  *                      entry will be made in as_do_callbacks)
 177  *
 178  * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 179  * set, it indicates that as_do_callbacks is processing this entry.  The
 180  * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 181  * to unblock as_do_callbacks, in case it is blocked.
 182  *
 183  * CALLER'S RESPONSIBILITY: If not calling from within the process context
 184  * for the specified as, the caller must guarantee persistence of the
 185  * specified as for the duration of this function (e.g. pages being locked
 186  * within the as will guarantee persistence).
 187  */
 188 uint_t
 189 as_delete_callback(struct as *as, void *arg)
 190 {
 191         struct as_callback **prevcb = &as->a_callbacks;
 192         struct as_callback *cb;
 193         uint_t rc = AS_CALLBACK_NOTFOUND;
 194 
 195         mutex_enter(&as->a_contents);
 196         for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
 197                 if (cb->ascb_arg != arg)
 198                         continue;
 199 
 200                 /*
 201                  * If the events indicate AS_CALLBACK_CALLED, just clear
 202                  * AS_ALL_EVENT in the events field and wakeup the thread
 203                  * that may be waiting in as_do_callbacks.  as_do_callbacks
 204                  * will take care of removing this entry from the list.  In
 205                  * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
 206                  * (AS_CALLBACK_CALLED not set), just remove it from the
 207                  * list, return the memory and return AS_CALLBACK_DELETED.
 208                  */
 209                 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
 210                         /* leave AS_CALLBACK_CALLED */
 211                         cb->ascb_events &= ~AS_ALL_EVENT;
 212                         rc = AS_CALLBACK_DELETE_DEFERRED;
 213                         cv_broadcast(&as->a_cv);
 214                 } else {
 215                         *prevcb = cb->ascb_next;
 216                         kmem_free(cb, sizeof (struct as_callback));
 217                         rc = AS_CALLBACK_DELETED;
 218                 }
 219                 break;
 220         }
 221         mutex_exit(&as->a_contents);
 222         return (rc);
 223 }
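
/*
 * Illustrative sketch only (hypothetical caller code): a caller usually
 * needs to distinguish only the deferred case, in which as_do_callbacks()
 * will remove and free the entry once the in-progress callback completes:
 *
 *	uint_t rc = as_delete_callback(as, xlp);
 *
 *	if (rc == AS_CALLBACK_DELETE_DEFERRED)
 *		xx_wait_for_callback_done(xlp);
 *
 * xlp and xx_wait_for_callback_done are assumed names for the original
 * callback argument and a driver-private synchronization routine.
 */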
 224 
 225 /*
 226  * Searches the as callback list for a matching entry.
 227  * Returns a pointer to the first matching callback, or NULL if
 228  * nothing is found.
 229  * This function never sleeps so it is ok to call it with locks held
 230  * in addition to the (required) a_contents mutex.
 231  *
 232  * See also comment on as_do_callbacks below.
 233  */
 234 static struct as_callback *
 235 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
 236     size_t event_len)
 237 {
 238         struct as_callback      *cb;
 239 
 240         ASSERT(MUTEX_HELD(&as->a_contents));
 241         for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
 242                 /*
 243                  * If the callback has not already been called, then
 244                  * check if events or address range pertains.  An event_len
 245                  * of zero means do an unconditional callback.
 246                  */
 247                 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
 248                     ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
 249                     (event_addr + event_len < cb->ascb_saddr) ||
 250                     (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
 251                         continue;
 252                 }
 253                 break;
 254         }
 255         return (cb);
 256 }
 257 
 258 /*
 259  * Executes a given callback and removes it from the callback list for
 260  * this address space.
 261  * This function may sleep so the caller must drop all locks except
 262  * a_contents before calling this function.
 263  *
 264  * See also comments on as_do_callbacks below.
 265  */
 266 static void
 267 as_execute_callback(struct as *as, struct as_callback *cb,
 268     uint_t events)
 269 {
 270         struct as_callback **prevcb;
 271         void    *cb_arg;
 272 
 273         ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
 274         cb->ascb_events |= AS_CALLBACK_CALLED;
 275         mutex_exit(&as->a_contents);
 276         (*cb->ascb_func)(as, cb->ascb_arg, events);
 277         mutex_enter(&as->a_contents);
 278         /*
 279          * The callback function is required to delete the callback
 280          * when it determines that it is OK for
 281          * this thread to continue. as_delete_callback will clear
 282          * the AS_ALL_EVENT bits in the events field when it is deleted.
 283          * If the callback function called as_delete_callback,
 284          * events will already be cleared and there will be no blocking.
 285          */
 286         while ((cb->ascb_events & events) != 0) {
 287                 cv_wait(&as->a_cv, &as->a_contents);
 288         }
 289         /*
 290          * This entry needs to be taken off the list. Normally, the
 291          * callback func itself does that, but unfortunately the list
 292          * may have changed while the callback was running because the
 293          * a_contents mutex was dropped and someone other than the
 294          * callback func itself could have called as_delete_callback,
 295          * so we have to search to find this entry again.  The entry
 296          * must have AS_CALLBACK_CALLED, and have the same 'arg'.
 297          */
 298         cb_arg = cb->ascb_arg;
 299         prevcb = &as->a_callbacks;
 300         for (cb = as->a_callbacks; cb != NULL;
 301             prevcb = &cb->ascb_next, cb = *prevcb) {
 302                 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
 303                     (cb_arg != cb->ascb_arg)) {
 304                         continue;
 305                 }
 306                 *prevcb = cb->ascb_next;
 307                 kmem_free(cb, sizeof (struct as_callback));
 308                 break;
 309         }
 310 }
 311 
 312 /*
 313  * Check the callback list for a matching event and intersection of
 314  * address range. If there is a match, invoke the callback.  Skip an entry if:
 315  *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 316  *    - the event is not of interest
 317  *    - the address range is not of interest
 318  *
 319  * An event_len of zero indicates a request for an unconditional callback
 320  * (regardless of event); only AS_CALLBACK_CALLED is checked.  The
 321  * a_contents lock must be dropped before a callback, so only one callback
 322  * can be done before returning. Return -1 (true) if a callback was
 323  * executed and removed from the list, else return 0 (false).
 324  *
 325  * The logically separate parts, i.e. finding a matching callback and
 326  * executing a given callback have been separated into two functions
 327  * so that they can be called with different sets of locks held beyond
 328  * the always-required a_contents. as_find_callback does not sleep so
 329  * it is ok to call it if more locks than a_contents (i.e. the a_lock
 330  * rwlock) are held. as_execute_callback on the other hand may sleep
 331  * so all locks beyond a_contents must be dropped by the caller if one
 332  * does not want to end up comatose.
 333  */
 334 static int
 335 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
 336     size_t event_len)
 337 {
 338         struct as_callback *cb;
 339 
 340         if ((cb = as_find_callback(as, events, event_addr, event_len))) {
 341                 as_execute_callback(as, cb, events);
 342                 return (-1);
 343         }
 344         return (0);
 345 }
 346 
 347 /*
 348  * Search for the segment containing addr. If a segment containing addr
 349  * exists, that segment is returned.  If no such segment exists, and
 350  * the list spans addresses greater than addr, then the first segment
 351  * whose base is greater than addr is returned; otherwise, NULL is
 352  * returned unless tail is true, in which case the last element of the
 353  * list is returned.
 354  *
 355  * a_seglast is used to cache the last found segment for repeated
 356  * searches to the same addr (which happens frequently).
 357  */
 358 struct seg *
 359 as_findseg(struct as *as, caddr_t addr, int tail)
 360 {
 361         struct seg *seg = as->a_seglast;
 362         avl_index_t where;
 363 
 364         ASSERT(AS_LOCK_HELD(as));
 365 
 366         if (seg != NULL &&
 367             seg->s_base <= addr &&
 368             addr < seg->s_base + seg->s_size)
 369                 return (seg);
 370 
 371         seg = avl_find(&as->a_segtree, &addr, &where);
 372         if (seg != NULL)
 373                 return (as->a_seglast = seg);
 374 
 375         seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 376         if (seg == NULL && tail)
 377                 seg = avl_last(&as->a_segtree);
 378         return (as->a_seglast = seg);
 379 }
 380 
 381 #ifdef VERIFY_SEGLIST
 382 /*
 383  * verify that the segment list is coherent
 384  */
 385 static void
 386 as_verify(struct as *as)
 387 {
 388         struct seg *seg, *seglast, *p, *n;
 389         uint_t nsegs = 0;
 390 
 391         if (do_as_verify == 0)
 392                 return;
 393 
 394         seglast = as->a_seglast;
 395 
 396         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 397                 ASSERT(seg->s_as == as);
 398                 p = AS_SEGPREV(as, seg);
 399                 n = AS_SEGNEXT(as, seg);
 400                 ASSERT(p == NULL || p->s_as == as);
 401                 ASSERT(p == NULL || p->s_base < seg->s_base);
 402                 ASSERT(n == NULL || n->s_base > seg->s_base);
 403                 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
 404                 if (seg == seglast)
 405                         seglast = NULL;
 406                 nsegs++;
 407         }
 408         ASSERT(seglast == NULL);
 409         ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
 410 }
 411 #endif /* VERIFY_SEGLIST */
 412 
 413 /*
 414  * Add a new segment to the address space. The avl_find()
 415  * may be expensive so we attempt to use the last segment accessed
 416  * in as_gap() as an insertion point.
 417  */
 418 int
 419 as_addseg(struct as  *as, struct seg *newseg)
 420 {
 421         struct seg *seg;
 422         caddr_t addr;
 423         caddr_t eaddr;
 424         avl_index_t where;
 425 
 426         ASSERT(AS_WRITE_HELD(as));
 427 
 428         as->a_updatedir = 1; /* inform /proc */
 429         gethrestime(&as->a_updatetime);
 430 
 431         if (as->a_lastgaphl != NULL) {
 432                 struct seg *hseg = NULL;
 433                 struct seg *lseg = NULL;
 434 
 435                 if (as->a_lastgaphl->s_base > newseg->s_base) {
 436                         hseg = as->a_lastgaphl;
 437                         lseg = AVL_PREV(&as->a_segtree, hseg);
 438                 } else {
 439                         lseg = as->a_lastgaphl;
 440                         hseg = AVL_NEXT(&as->a_segtree, lseg);
 441                 }
 442 
 443                 if (hseg && lseg && lseg->s_base < newseg->s_base &&
 444                     hseg->s_base > newseg->s_base) {
 445                         avl_insert_here(&as->a_segtree, newseg, lseg,
 446                             AVL_AFTER);
 447                         as->a_lastgaphl = NULL;
 448                         as->a_seglast = newseg;
 449                         return (0);
 450                 }
 451                 as->a_lastgaphl = NULL;
 452         }
 453 
 454         addr = newseg->s_base;
 455         eaddr = addr + newseg->s_size;
 456 again:
 457 
 458         seg = avl_find(&as->a_segtree, &addr, &where);
 459 
 460         if (seg == NULL)
 461                 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
 462 
 463         if (seg == NULL)
 464                 seg = avl_last(&as->a_segtree);
 465 
 466         if (seg != NULL) {
 467                 caddr_t base = seg->s_base;
 468 
 469                 /*
 470                  * If top of seg is below the requested address, then
 471                  * the insertion point is at the end of the linked list,
 472                  * and seg points to the tail of the list.  Otherwise,
 473                  * the insertion point is immediately before seg.
 474                  */
 475                 if (base + seg->s_size > addr) {
 476                         if (addr >= base || eaddr > base) {
 477 #ifdef __sparc
 478                                 extern struct seg_ops segnf_ops;
 479 
 480                                 /*
 481                                  * no-fault segs must disappear if overlaid.
 482                                  * XXX need new segment type so
 483                                  * we don't have to check s_ops
 484                                  */
 485                                 if (seg->s_ops == &segnf_ops) {
 486                                         seg_unmap(seg);
 487                                         goto again;
 488                                 }
 489 #endif
 490                                 return (-1);    /* overlapping segment */
 491                         }
 492                 }
 493         }
 494         as->a_seglast = newseg;
 495         avl_insert(&as->a_segtree, newseg, where);
 496 
 497 #ifdef VERIFY_SEGLIST
 498         as_verify(as);
 499 #endif
 500         return (0);
 501 }
 502 
 503 struct seg *
 504 as_removeseg(struct as *as, struct seg *seg)
 505 {
 506         avl_tree_t *t;
 507 
 508         ASSERT(AS_WRITE_HELD(as));
 509 
 510         as->a_updatedir = 1; /* inform /proc */
 511         gethrestime(&as->a_updatetime);
 512 
 513         if (seg == NULL)
 514                 return (NULL);
 515 
 516         t = &as->a_segtree;
 517         if (as->a_seglast == seg)
 518                 as->a_seglast = NULL;
 519         as->a_lastgaphl = NULL;
 520 
 521         /*
 522          * if this segment is at an address higher than
 523          * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
 524          */
 525         if (as->a_lastgap &&
 526             (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
 527                 as->a_lastgap = AVL_NEXT(t, seg);
 528 
 529         /*
 530          * remove the segment from the seg tree
 531          */
 532         avl_remove(t, seg);
 533 
 534 #ifdef VERIFY_SEGLIST
 535         as_verify(as);
 536 #endif
 537         return (seg);
 538 }
 539 
 540 /*
 541  * Find a segment containing addr.
 542  */
 543 struct seg *
 544 as_segat(struct as *as, caddr_t addr)
 545 {
 546         struct seg *seg = as->a_seglast;
 547 
 548         ASSERT(AS_LOCK_HELD(as));
 549 
 550         if (seg != NULL && seg->s_base <= addr &&
 551             addr < seg->s_base + seg->s_size)
 552                 return (seg);
 553 
 554         seg = avl_find(&as->a_segtree, &addr, NULL);
 555         return (seg);
 556 }
 557 
 558 /*
 559  * Serialize all searches for holes in an address space to
 560  * prevent two or more threads from allocating the same virtual
 561  * address range.  The address space must not be "read/write"
 562  * locked by the caller since we may block.
 563  */
 564 void
 565 as_rangelock(struct as *as)
 566 {
 567         mutex_enter(&as->a_contents);
 568         while (AS_ISCLAIMGAP(as))
 569                 cv_wait(&as->a_cv, &as->a_contents);
 570         AS_SETCLAIMGAP(as);
 571         mutex_exit(&as->a_contents);
 572 }
 573 
 574 /*
 575  * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 576  */
 577 void
 578 as_rangeunlock(struct as *as)
 579 {
 580         mutex_enter(&as->a_contents);
 581         AS_CLRCLAIMGAP(as);
 582         cv_signal(&as->a_cv);
 583         mutex_exit(&as->a_contents);
 584 }
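
/*
 * Illustrative sketch only (not part of this file): mmap-style address
 * allocation typically brackets the hole search and the subsequent as_map()
 * with this lock so that no other thread can claim the same range in
 * between, roughly:
 *
 *	as_rangelock(as);
 *	map_addr(&addr, len, (offset_t)off, vacalign, flags);
 *	if (addr == NULL) {
 *		as_rangeunlock(as);
 *		return (ENOMEM);
 *	}
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 *
 * The exact map_addr()/as_map() arguments above are only indicative of the
 * pattern used by callers such as mmap().
 */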
 585 
 586 /*
 587  * compare segments (or just an address) by segment address range
 588  */
 589 static int
 590 as_segcompar(const void *x, const void *y)
 591 {
 592         struct seg *a = (struct seg *)x;
 593         struct seg *b = (struct seg *)y;
 594 
 595         if (a->s_base < b->s_base)
 596                 return (-1);
 597         if (a->s_base >= b->s_base + b->s_size)
 598                 return (1);
 599         return (0);
 600 }
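
/*
 * Note that lookups such as as_segat() pass a pointer to a bare address
 * (a caddr_t *) as the search value.  This works because s_base is the
 * first member of struct seg, so the comparator above only ever reads the
 * address through the search key, and it returns 0 (a match) whenever that
 * address falls within [s_base, s_base + s_size) of a tree node.
 */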
 601 
 602 
 603 void
 604 as_avlinit(struct as *as)
 605 {
 606         avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
 607             offsetof(struct seg, s_tree));
 608         avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
 609             offsetof(struct watched_page, wp_link));
 610 }
 611 
 612 /*ARGSUSED*/
 613 static int
 614 as_constructor(void *buf, void *cdrarg, int kmflags)
 615 {
 616         struct as *as = buf;
 617 
 618         mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
 619         cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
 620         rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
 621         as_avlinit(as);
 622         return (0);
 623 }
 624 
 625 /*ARGSUSED1*/
 626 static void
 627 as_destructor(void *buf, void *cdrarg)
 628 {
 629         struct as *as = buf;
 630 
 631         avl_destroy(&as->a_segtree);
 632         mutex_destroy(&as->a_contents);
 633         cv_destroy(&as->a_cv);
 634         rw_destroy(&as->a_lock);
 635 }
 636 
 637 void
 638 as_init(void)
 639 {
 640         as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
 641             as_constructor, as_destructor, NULL, NULL, NULL, 0);
 642 }
 643 
 644 /*
 645  * Allocate and initialize an address space data structure.
 646  * We call hat_alloc to allow any machine-dependent
 647  * information in the hat structure to be initialized.
 648  */
 649 struct as *
 650 as_alloc(void)
 651 {
 652         struct as *as;
 653 
 654         as = kmem_cache_alloc(as_cache, KM_SLEEP);
 655 
 656         as->a_flags          = 0;
 657         as->a_vbits          = 0;
 658         as->a_hrm            = NULL;
 659         as->a_seglast                = NULL;
 660         as->a_size           = 0;
 661         as->a_resvsize               = 0;
 662         as->a_updatedir              = 0;
 663         gethrestime(&as->a_updatetime);
 664         as->a_objectdir              = NULL;
 665         as->a_sizedir                = 0;
 666         as->a_userlimit              = (caddr_t)USERLIMIT;
 667         as->a_lastgap                = NULL;
 668         as->a_lastgaphl              = NULL;
 669         as->a_callbacks              = NULL;
 670         as->a_proc           = NULL;
 671 
 672         AS_LOCK_ENTER(as, RW_WRITER);
 673         as->a_hat = hat_alloc(as);   /* create hat for default system mmu */
 674         AS_LOCK_EXIT(as);
 675 
 676         return (as);
 677 }
 678 
 679 /*
 680  * Free an address space data structure.
 681  * We need to free the hat first, then
 682  * all the segments on this as, and finally
 683  * the space for the as struct itself.
 684  */
 685 void
 686 as_free(struct as *as)
 687 {
 688         struct hat *hat = as->a_hat;
 689         struct seg *seg, *next;
 690         boolean_t free_started = B_FALSE;
 691 
 692 top:
 693         /*
 694          * Invoke ALL callbacks. as_do_callbacks will do one callback
 695          * per call, and not return (-1) until the callback has completed.
 696          * When as_do_callbacks returns zero, all callbacks have completed.
 697          */
 698         mutex_enter(&as->a_contents);
 699         while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 700                 ;
 701 
 702         mutex_exit(&as->a_contents);
 703         AS_LOCK_ENTER(as, RW_WRITER);
 704 
 705         if (!free_started) {
 706                 free_started = B_TRUE;
 707                 hat_free_start(hat);
 708         }
 709         for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
 710                 int err;
 711 
 712                 next = AS_SEGNEXT(as, seg);
 713 retry:
 714                 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
 715                 if (err == EAGAIN) {
 716                         mutex_enter(&as->a_contents);
 717                         if (as->a_callbacks) {
 718                                 AS_LOCK_EXIT(as);
 719                         } else if (!AS_ISNOUNMAPWAIT(as)) {
 720                                 /*
 721                                  * Memory is currently locked. Wait for a
 722                                  * cv_signal that it has been unlocked, then
 723                                  * try the operation again.
 724                                  */
 725                                 if (AS_ISUNMAPWAIT(as) == 0)
 726                                         cv_broadcast(&as->a_cv);
 727                                 AS_SETUNMAPWAIT(as);
 728                                 AS_LOCK_EXIT(as);
 729                                 while (AS_ISUNMAPWAIT(as))
 730                                         cv_wait(&as->a_cv, &as->a_contents);
 731                         } else {
 732                                 /*
 733                                  * We may have raced with
 734                                  * segvn_reclaim()/segspt_reclaim(). In this
 735                                  * case clear the nounmapwait flag and retry since
 736                                  * softlockcnt in this segment may already be
 737                                  * 0.  We don't drop the as writer lock so our
 738                                  * number of retries without sleeping should
 739                                  * be very small. See segvn_reclaim() for
 740                                  * more comments.
 741                                  */
 742                                 AS_CLRNOUNMAPWAIT(as);
 743                                 mutex_exit(&as->a_contents);
 744                                 goto retry;
 745                         }
 746                         mutex_exit(&as->a_contents);
 747                         goto top;
 748                 } else {
 749                         /*
 750                          * We do not expect any other error return at this
 751                          * time. This is similar to an ASSERT in seg_unmap()
 752                          */
 753                         ASSERT(err == 0);
 754                 }
 755         }
 756         hat_free_end(hat);
 757         AS_LOCK_EXIT(as);
 758 
 759         /* /proc stuff */
 760         ASSERT(avl_numnodes(&as->a_wpage) == 0);
 761         if (as->a_objectdir) {
 762                 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
 763                 as->a_objectdir = NULL;
 764                 as->a_sizedir = 0;
 765         }
 766 
 767         /*
 768          * Free the struct as back to kmem.  Assert it has no segments.
 769          */
 770         ASSERT(avl_numnodes(&as->a_segtree) == 0);
 771         kmem_cache_free(as_cache, as);
 772 }
 773 
 774 int
 775 as_dup(struct as *as, struct proc *forkedproc)
 776 {
 777         struct as *newas;
 778         struct seg *seg, *newseg;
 779         size_t  purgesize = 0;
 780         int error;
 781 
 782         AS_LOCK_ENTER(as, RW_WRITER);
 783         as_clearwatch(as);
 784         newas = as_alloc();
 785         newas->a_userlimit = as->a_userlimit;
 786         newas->a_proc = forkedproc;
 787 
 788         AS_LOCK_ENTER(newas, RW_WRITER);
 789 
 790         (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
 791 
 792         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
 793 
 794                 if (seg->s_flags & S_PURGE) {
 795                         purgesize += seg->s_size;
 796                         continue;
 797                 }
 798 
 799                 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
 800                 if (newseg == NULL) {
 801                         AS_LOCK_EXIT(newas);
 802                         as_setwatch(as);
 803                         AS_LOCK_EXIT(as);
 804                         as_free(newas);
 805                         return (-1);
 806                 }
 807                 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
 808                         /*
 809                          * We call seg_free() on the new seg
 810                          * because the segment is not set up
 811                          * completely; i.e. it has no ops.
 812                          */
 813                         as_setwatch(as);
 814                         AS_LOCK_EXIT(as);
 815                         seg_free(newseg);
 816                         AS_LOCK_EXIT(newas);
 817                         as_free(newas);
 818                         return (error);
 819                 }
 820                 newas->a_size += seg->s_size;
 821         }
 822         newas->a_resvsize = as->a_resvsize - purgesize;
 823 
 824         error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
 825 
 826         AS_LOCK_EXIT(newas);
 827 
 828         as_setwatch(as);
 829         AS_LOCK_EXIT(as);
 830         if (error != 0) {
 831                 as_free(newas);
 832                 return (error);
 833         }
 834         forkedproc->p_as = newas;
 835         return (0);
 836 }
 837 
 838 /*
 839  * Handle a ``fault'' at addr for size bytes.
 840  */
 841 faultcode_t
 842 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
 843     enum fault_type type, enum seg_rw rw)
 844 {
 845         struct seg *seg;
 846         caddr_t raddr;                  /* rounded down addr */
 847         size_t rsize;                   /* rounded up size */
 848         size_t ssize;
 849         faultcode_t res = 0;
 850         caddr_t addrsav;
 851         struct seg *segsav;
 852         int as_lock_held;
 853         klwp_t *lwp = ttolwp(curthread);
 854         zone_t *zonep = curzone;
 855 
 856 retry:
 857         /*
 858          * Indicate that the lwp is not to be stopped while waiting for a
 859          * pagefault.  This is to avoid deadlock while debugging a process
 860          * via /proc over NFS (in particular).
 861          */
 862         if (lwp != NULL)
 863                 lwp->lwp_nostop++;
 864 
 865         /*
 866          * The same length must be used when we softlock and softunlock.  We
 867          * don't support softunlocking lengths less than the original length
 868          * when there is largepage support.  See seg_dev.c for more
 869          * comments.
 870          */
 871         switch (type) {
 872 
 873         case F_SOFTLOCK:
 874                 CPU_STATS_ADD_K(vm, softlock, 1);
 875                 break;
 876 
 877         case F_SOFTUNLOCK:
 878                 break;
 879 
 880         case F_PROT:
 881                 CPU_STATS_ADD_K(vm, prot_fault, 1);
 882                 break;
 883 
 884         case F_INVAL:
 885                 CPU_STATS_ENTER_K();
 886                 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
 887                 if (as == &kas)
 888                         CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
 889                 CPU_STATS_EXIT_K();
 890                 if (zonep->zone_pg_flt_delay != 0) {
 891                         /*
 892                          * The zone in which this process is running
 893                          * is currently over its physical memory cap.
 894                          * Throttle page faults to help the user-land
 895                          * memory capper catch up. Note that
 896                          * drv_usectohz() rounds up.
 897                          */
 898                         atomic_add_64(&zonep->zone_pf_throttle, 1);
 899                         atomic_add_64(&zonep->zone_pf_throttle_usec,
 900                             zonep->zone_pg_flt_delay);
 901                         if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1))
 902                                 drv_usecwait(zonep->zone_pg_flt_delay);
 903                         else
 904                                 delay(drv_usectohz(zonep->zone_pg_flt_delay));
 905                 }
 906                 break;
 907         }
 908 
 909         /* Kernel probe */
 910         TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
 911             tnf_opaque, address,        addr,
 912             tnf_fault_type,     fault_type,     type,
 913             tnf_seg_access,     access,         rw);
 914 
 915         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
 916         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
 917             (size_t)raddr;
 918 
 919         /*
 920          * XXX -- Don't grab the as lock for segkmap. We should grab it for
 921          * correctness, but then we could be stuck holding this lock for
 922          * a LONG time if the fault needs to be resolved on a slow
 923          * filesystem, and then no-one will be able to exec new commands,
 924          * as exec'ing requires the write lock on the as.
 925          */
 926         if (as == &kas && segkmap && segkmap->s_base <= raddr &&
 927             raddr + size < segkmap->s_base + segkmap->s_size) {
 928                 seg = segkmap;
 929                 as_lock_held = 0;
 930         } else {
 931                 AS_LOCK_ENTER(as, RW_READER);
 932 
 933                 seg = as_segat(as, raddr);
 934                 if (seg == NULL) {
 935                         AS_LOCK_EXIT(as);
 936                         if (lwp != NULL)
 937                                 lwp->lwp_nostop--;
 938                         return (FC_NOMAP);
 939                 }
 940 
 941                 as_lock_held = 1;
 942         }
 943 
 944         addrsav = raddr;
 945         segsav = seg;
 946 
 947         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
 948                 if (raddr >= seg->s_base + seg->s_size) {
 949                         seg = AS_SEGNEXT(as, seg);
 950                         if (seg == NULL || raddr != seg->s_base) {
 951                                 res = FC_NOMAP;
 952                                 break;
 953                         }
 954                 }
 955                 if (raddr + rsize > seg->s_base + seg->s_size)
 956                         ssize = seg->s_base + seg->s_size - raddr;
 957                 else
 958                         ssize = rsize;
 959 
 960                 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
 961                 if (res != 0)
 962                         break;
 963         }
 964 
 965         /*
 966          * If we were SOFTLOCKing and encountered a failure,
 967          * we must SOFTUNLOCK the range we already locked. (Maybe we
 968          * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
 969          * right here...)
 970          */
 971         if (res != 0 && type == F_SOFTLOCK) {
 972                 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
 973                         if (addrsav >= seg->s_base + seg->s_size)
 974                                 seg = AS_SEGNEXT(as, seg);
 975                         ASSERT(seg != NULL);
 976                         /*
 977                          * Now call the fault routine again to perform the
 978                          * unlock using S_OTHER instead of the rw variable
 979                          * since we never got a chance to touch the pages.
 980                          */
 981                         if (raddr > seg->s_base + seg->s_size)
 982                                 ssize = seg->s_base + seg->s_size - addrsav;
 983                         else
 984                                 ssize = raddr - addrsav;
 985                         (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
 986                             F_SOFTUNLOCK, S_OTHER);
 987                 }
 988         }
 989         if (as_lock_held)
 990                 AS_LOCK_EXIT(as);
 991         if (lwp != NULL)
 992                 lwp->lwp_nostop--;
 993 
 994         /*
 995          * If the lower levels returned EDEADLK for a fault,
 996          * it means that we should retry the fault.  We also wait
 997          * a bit to let the deadlock-causing condition clear.
 998          * This is part of a gross hack to work around a design flaw
 999          * in the ufs/sds logging code and should go away when the
1000          * logging code is re-designed to fix the problem. See bug
1001          * 4125102 for details of the problem.
1002          */
1003         if (FC_ERRNO(res) == EDEADLK) {
1004                 delay(deadlk_wait);
1005                 res = 0;
1006                 goto retry;
1007         }
1008         return (res);
1009 }
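
/*
 * Illustrative sketch only (hypothetical caller code): F_SOFTLOCK and
 * F_SOFTUNLOCK are used as a pair, and per the comment above the same
 * length must be used for both:
 *
 *	if (as_fault(as->a_hat, as, addr, len, F_SOFTLOCK, S_WRITE) == 0) {
 *		... access the locked pages ...
 *		(void) as_fault(as->a_hat, as, addr, len,
 *		    F_SOFTUNLOCK, S_WRITE);
 *	}
 */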
1010 
1011 
1012 
1013 /*
1014  * Asynchronous ``fault'' at addr for size bytes.
1015  */
1016 faultcode_t
1017 as_faulta(struct as *as, caddr_t addr, size_t size)
1018 {
1019         struct seg *seg;
1020         caddr_t raddr;                  /* rounded down addr */
1021         size_t rsize;                   /* rounded up size */
1022         faultcode_t res = 0;
1023         klwp_t *lwp = ttolwp(curthread);
1024 
1025 retry:
1026         /*
1027          * Indicate that the lwp is not to be stopped while waiting
1028          * for a pagefault.  This is to avoid deadlock while debugging
1029          * a process via /proc over NFS (in particular).
1030          */
1031         if (lwp != NULL)
1032                 lwp->lwp_nostop++;
1033 
1034         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1035         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1036             (size_t)raddr;
1037 
1038         AS_LOCK_ENTER(as, RW_READER);
1039         seg = as_segat(as, raddr);
1040         if (seg == NULL) {
1041                 AS_LOCK_EXIT(as);
1042                 if (lwp != NULL)
1043                         lwp->lwp_nostop--;
1044                 return (FC_NOMAP);
1045         }
1046 
1047         for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1048                 if (raddr >= seg->s_base + seg->s_size) {
1049                         seg = AS_SEGNEXT(as, seg);
1050                         if (seg == NULL || raddr != seg->s_base) {
1051                                 res = FC_NOMAP;
1052                                 break;
1053                         }
1054                 }
1055                 res = SEGOP_FAULTA(seg, raddr);
1056                 if (res != 0)
1057                         break;
1058         }
1059         AS_LOCK_EXIT(as);
1060         if (lwp != NULL)
1061                 lwp->lwp_nostop--;
1062         /*
1063          * If the lower levels returned EDEADLK for a fault,
1064          * it means that we should retry the fault.  We also wait
1065          * a bit to let the deadlock-causing condition clear.
1066          * This is part of a gross hack to work around a design flaw
1067          * in the ufs/sds logging code and should go away when the
1068          * logging code is re-designed to fix the problem. See bug
1069          * 4125102 for details of the problem.
1070          */
1071         if (FC_ERRNO(res) == EDEADLK) {
1072                 delay(deadlk_wait);
1073                 res = 0;
1074                 goto retry;
1075         }
1076         return (res);
1077 }
1078 
1079 /*
1080  * Set the virtual mapping for the interval from [addr : addr + size)
1081  * in address space `as' to have the specified protection.
1082  * It is ok for the range to cross over several segments,
1083  * as long as they are contiguous.
1084  */
1085 int
1086 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1087 {
1088         struct seg *seg;
1089         struct as_callback *cb;
1090         size_t ssize;
1091         caddr_t raddr;                  /* rounded down addr */
1092         size_t rsize;                   /* rounded up size */
1093         int error = 0, writer = 0;
1094         caddr_t saveraddr;
1095         size_t saversize;
1096 
1097 setprot_top:
1098         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1099         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1100             (size_t)raddr;
1101 
1102         if (raddr + rsize < raddr)           /* check for wraparound */
1103                 return (ENOMEM);
1104 
1105         saveraddr = raddr;
1106         saversize = rsize;
1107 
1108         /*
1109          * Normally we only lock the as as a reader. But
1110          * if due to setprot the segment driver needs to split
1111          * a segment it will return IE_RETRY. Therefore we re-acquire
1112          * the as lock as a writer so the segment driver can change
1113          * the seg list. Also the segment driver will return IE_RETRY
1114          * after it has changed the segment list, so we keep
1115          * locking as a writer. Since these operations should be rare,
1116          * we want to lock as a writer only when necessary.
1117          */
1118         if (writer || avl_numnodes(&as->a_wpage) != 0) {
1119                 AS_LOCK_ENTER(as, RW_WRITER);
1120         } else {
1121                 AS_LOCK_ENTER(as, RW_READER);
1122         }
1123 
1124         as_clearwatchprot(as, raddr, rsize);
1125         seg = as_segat(as, raddr);
1126         if (seg == NULL) {
1127                 as_setwatch(as);
1128                 AS_LOCK_EXIT(as);
1129                 return (ENOMEM);
1130         }
1131 
1132         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1133                 if (raddr >= seg->s_base + seg->s_size) {
1134                         seg = AS_SEGNEXT(as, seg);
1135                         if (seg == NULL || raddr != seg->s_base) {
1136                                 error = ENOMEM;
1137                                 break;
1138                         }
1139                 }
1140                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1141                         ssize = seg->s_base + seg->s_size - raddr;
1142                 else
1143                         ssize = rsize;
1144 retry:
1145                 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1146 
1147                 if (error == IE_NOMEM) {
1148                         error = EAGAIN;
1149                         break;
1150                 }
1151 
1152                 if (error == IE_RETRY) {
1153                         AS_LOCK_EXIT(as);
1154                         writer = 1;
1155                         goto setprot_top;
1156                 }
1157 
1158                 if (error == EAGAIN) {
1159                         /*
1160                          * Make sure we have a_lock as writer.
1161                          */
1162                         if (writer == 0) {
1163                                 AS_LOCK_EXIT(as);
1164                                 writer = 1;
1165                                 goto setprot_top;
1166                         }
1167 
1168                         /*
1169                          * Memory is currently locked.  It must be unlocked
1170                          * before this operation can succeed through a retry.
1171                          * The possible reasons for locked memory and
1172                          * corresponding strategies for unlocking are:
1173                          * (1) Normal I/O
1174                          *      wait for a signal that the I/O operation
1175                          *      has completed and the memory is unlocked.
1176                          * (2) Asynchronous I/O
1177                          *      The aio subsystem does not unlock pages when
1178                          *      the I/O is completed. Those pages are unlocked
1179                          *      when the application calls aiowait/aioerror.
1180                          *      So, to prevent blocking forever, cv_broadcast()
1181                          *      is done to wake up aio_cleanup_thread.
1182                          *      Subsequently, segvn_reclaim will be called, and
1183                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1184                          * (3) Long term page locking:
1185                          *      Drivers intending to have pages locked for a
1186                          *      period considerably longer than for normal I/O
1187                          *      (essentially forever) may have registered for a
1188                          *      callback so they may unlock these pages on
1189                          *      request. This is needed to allow this operation
1190                          *      to succeed. Each entry on the callback list is
1191                          *      examined. If the event or address range pertains,
1192                          *      the callback is invoked (unless it already is in
1193                          *      progress). The a_contents lock must be dropped
1194                          *      before the callback, so only one callback can
1195                          *      be done at a time. Go to the top and do more
1196                          *      until zero is returned. If zero is returned,
1197                          *      either there were no callbacks for this event
1198                          *      or they were already in progress.
1199                          */
1200                         mutex_enter(&as->a_contents);
1201                         if (as->a_callbacks &&
1202                             (cb = as_find_callback(as, AS_SETPROT_EVENT,
1203                             seg->s_base, seg->s_size))) {
1204                                 AS_LOCK_EXIT(as);
1205                                 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1206                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1207                                 if (AS_ISUNMAPWAIT(as) == 0)
1208                                         cv_broadcast(&as->a_cv);
1209                                 AS_SETUNMAPWAIT(as);
1210                                 AS_LOCK_EXIT(as);
1211                                 while (AS_ISUNMAPWAIT(as))
1212                                         cv_wait(&as->a_cv, &as->a_contents);
1213                         } else {
1214                                 /*
1215                                  * We may have raced with
1216                                  * segvn_reclaim()/segspt_reclaim(). In this
1217                                  * case clear the nounmapwait flag and retry since
1218                                  * softlockcnt in this segment may already be
1219                                  * 0.  We don't drop the as writer lock so our
1220                                  * number of retries without sleeping should
1221                                  * be very small. See segvn_reclaim() for
1222                                  * more comments.
1223                                  */
1224                                 AS_CLRNOUNMAPWAIT(as);
1225                                 mutex_exit(&as->a_contents);
1226                                 goto retry;
1227                         }
1228                         mutex_exit(&as->a_contents);
1229                         goto setprot_top;
1230                 } else if (error != 0)
1231                         break;
1232         }
1233         if (error != 0) {
1234                 as_setwatch(as);
1235         } else {
1236                 as_setwatchprot(as, saveraddr, saversize, prot);
1237         }
1238         AS_LOCK_EXIT(as);
1239         return (error);
1240 }
1241 
1242 /*
1243  * Check to make sure that the interval [addr, addr + size)
1244  * in address space `as' has at least the specified protection.
1245  * It is ok for the range to cross over several segments, as long
1246  * as they are contiguous.
1247  */
1248 int
1249 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1250 {
1251         struct seg *seg;
1252         size_t ssize;
1253         caddr_t raddr;                  /* rounded down addr */
1254         size_t rsize;                   /* rounded up size */
1255         int error = 0;
1256 
1257         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1258         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1259             (size_t)raddr;
1260 
1261         if (raddr + rsize < raddr)           /* check for wraparound */
1262                 return (ENOMEM);
1263 
1264         /*
1265          * This is ugly as sin...
1266          * Normally, we only acquire the address space readers lock.
1267          * However, if the address space has watchpoints present,
1268          * we must acquire the writer lock on the address space for
1269          * the benefit of as_clearwatchprot() and as_setwatchprot().
1270          */
1271         if (avl_numnodes(&as->a_wpage) != 0)
1272                 AS_LOCK_ENTER(as, RW_WRITER);
1273         else
1274                 AS_LOCK_ENTER(as, RW_READER);
1275         as_clearwatchprot(as, raddr, rsize);
1276         seg = as_segat(as, raddr);
1277         if (seg == NULL) {
1278                 as_setwatch(as);
1279                 AS_LOCK_EXIT(as);
1280                 return (ENOMEM);
1281         }
1282 
1283         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1284                 if (raddr >= seg->s_base + seg->s_size) {
1285                         seg = AS_SEGNEXT(as, seg);
1286                         if (seg == NULL || raddr != seg->s_base) {
1287                                 error = ENOMEM;
1288                                 break;
1289                         }
1290                 }
1291                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1292                         ssize = seg->s_base + seg->s_size - raddr;
1293                 else
1294                         ssize = rsize;
1295 
1296                 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1297                 if (error != 0)
1298                         break;
1299         }
1300         as_setwatch(as);
1301         AS_LOCK_EXIT(as);
1302         return (error);
1303 }
1304 
1305 int
1306 as_unmap(struct as *as, caddr_t addr, size_t size)
1307 {
1308         struct seg *seg, *seg_next;
1309         struct as_callback *cb;
1310         caddr_t raddr, eaddr;
1311         size_t ssize, rsize = 0;
1312         int err;
1313 
1314 top:
1315         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1316         eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1317             (uintptr_t)PAGEMASK);
1318 
1319         AS_LOCK_ENTER(as, RW_WRITER);
1320 
1321         as->a_updatedir = 1; /* inform /proc */
1322         gethrestime(&as->a_updatetime);
1323 
1324         /*
1325          * Use as_findseg to find the first segment in the range, then
1326          * step through the following segments in address order.
1327          */
1328         as_clearwatchprot(as, raddr, eaddr - raddr);
1329 
1330         for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1331                 if (eaddr <= seg->s_base)
1332                         break;          /* eaddr was in a gap; all done */
1333 
1334                 /* this is implied by the test above */
1335                 ASSERT(raddr < eaddr);
1336 
1337                 if (raddr < seg->s_base)
1338                         raddr = seg->s_base;         /* raddr was in a gap */
1339 
1340                 if (eaddr > (seg->s_base + seg->s_size))
1341                         ssize = seg->s_base + seg->s_size - raddr;
1342                 else
1343                         ssize = eaddr - raddr;
1344 
1345                 /*
1346                  * Save next segment pointer since seg can be
1347                  * destroyed during the segment unmap operation.
1348                  */
1349                 seg_next = AS_SEGNEXT(as, seg);
1350 
1351                 /*
1352                  * We didn't count /dev/null mappings, so ignore them here.
1353                  * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1354                  * we have to do this check here while we have seg.)
1355                  */
1356                 rsize = 0;
1357                 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1358                     !SEG_IS_PARTIAL_RESV(seg))
1359                         rsize = ssize;
1360 
1361 retry:
1362                 err = SEGOP_UNMAP(seg, raddr, ssize);
1363                 if (err == EAGAIN) {
1364                         /*
1365                          * Memory is currently locked.  It must be unlocked
1366                          * before this operation can succeed through a retry.
1367                          * The possible reasons for locked memory and
1368                          * corresponding strategies for unlocking are:
1369                          * (1) Normal I/O
1370                          *      wait for a signal that the I/O operation
1371                          *      has completed and the memory is unlocked.
1372                          * (2) Asynchronous I/O
1373                          *      The aio subsystem does not unlock pages when
1374                          *      the I/O is completed. Those pages are unlocked
1375                          *      when the application calls aiowait/aioerror.
1376                          *      So, to prevent blocking forever, cv_broadcast()
1377                          *      is done to wake up aio_cleanup_thread.
1378                          *      Subsequently, segvn_reclaim will be called, and
1379                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1380                          * (3) Long term page locking:
1381                          *      Drivers intending to have pages locked for a
1382                          *      period considerably longer than for normal I/O
1383                          *      (essentially forever) may have registered for a
1384                          *      callback so they may unlock these pages on
1385                          *      request. This is needed to allow this operation
1386                          *      to succeed. Each entry on the callback list is
1387                          * examined. If the event or address range pertains,
1388                          *      the callback is invoked (unless it already is in
1389                          *      progress). The a_contents lock must be dropped
1390                          *      before the callback, so only one callback can
1391                          *      be done at a time. Go to the top and do more
1392                          *      until zero is returned. If zero is returned,
1393                          *      either there were no callbacks for this event
1394                          *      or they were already in progress.
1395                          */
1396                         mutex_enter(&as->a_contents);
1397                         if (as->a_callbacks &&
1398                             (cb = as_find_callback(as, AS_UNMAP_EVENT,
1399                             seg->s_base, seg->s_size))) {
1400                                 AS_LOCK_EXIT(as);
1401                                 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1402                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1403                                 if (AS_ISUNMAPWAIT(as) == 0)
1404                                         cv_broadcast(&as->a_cv);
1405                                 AS_SETUNMAPWAIT(as);
1406                                 AS_LOCK_EXIT(as);
1407                                 while (AS_ISUNMAPWAIT(as))
1408                                         cv_wait(&as->a_cv, &as->a_contents);
1409                         } else {
1410                                 /*
1411                                  * We may have raced with
1412                                  * segvn_reclaim()/segspt_reclaim(). In this
1413                                  * case clean nounmapwait flag and retry since
1414                                  * softlockcnt in this segment may be already
1415                                  * 0.  We don't drop as writer lock so our
1416                                  * number of retries without sleeping should
1417                                  * be very small. See segvn_reclaim() for
1418                                  * more comments.
1419                                  */
1420                                 AS_CLRNOUNMAPWAIT(as);
1421                                 mutex_exit(&as->a_contents);
1422                                 goto retry;
1423                         }
1424                         mutex_exit(&as->a_contents);
1425                         goto top;
1426                 } else if (err == IE_RETRY) {
1427                         AS_LOCK_EXIT(as);
1428                         goto top;
1429                 } else if (err) {
1430                         as_setwatch(as);
1431                         AS_LOCK_EXIT(as);
1432                         return (-1);
1433                 }
1434 
1435                 as->a_size -= ssize;
1436                 if (rsize)
1437                         as->a_resvsize -= rsize;
1438                 raddr += ssize;
1439         }
1440         AS_LOCK_EXIT(as);
1441         return (0);
1442 }
1443 
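     /*
      * Carve [addr, addr + size) into segvn segments so that each piece can
      * use the largest page size its alignment allows.  szcvec is a bit
      * vector of the usable page size codes.  The first loop below walks up
      * through the enabled sizes and maps the leading, less-aligned pieces;
      * the second loop works back from eaddr, stepping down through the
      * sizes (ending with base pages) until the range is covered.  On
      * success a_size/a_resvsize are updated and *segcreated records that
      * segments were created.
      */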
1444 static int
1445 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1446     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1447 {
1448         uint_t szc;
1449         uint_t nszc;
1450         int error;
1451         caddr_t a;
1452         caddr_t eaddr;
1453         size_t segsize;
1454         struct seg *seg;
1455         size_t pgsz;
1456         int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1457         uint_t save_szcvec;
1458 
1459         ASSERT(AS_WRITE_HELD(as));
1460         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1461         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1462         ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1463         if (!do_off) {
1464                 vn_a->offset = 0;
1465         }
1466 
1467         if (szcvec <= 1) {
1468                 seg = seg_alloc(as, addr, size);
1469                 if (seg == NULL) {
1470                         return (ENOMEM);
1471                 }
1472                 vn_a->szc = 0;
1473                 error = (*crfp)(seg, vn_a);
1474                 if (error != 0) {
1475                         seg_free(seg);
1476                 } else {
1477                         as->a_size += size;
1478                         as->a_resvsize += size;
1479                 }
1480                 return (error);
1481         }
1482 
1483         eaddr = addr + size;
1484         save_szcvec = szcvec;
1485         szcvec >>= 1;
1486         szc = 0;
1487         nszc = 0;
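             /*
              * First pass: for each enabled page size above PAGESIZE, map
              * the region from addr up to the next boundary of that size
              * using the previous (smaller) size class, then continue from
              * that boundary.
              */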
1488         while (szcvec) {
1489                 if ((szcvec & 0x1) == 0) {
1490                         nszc++;
1491                         szcvec >>= 1;
1492                         continue;
1493                 }
1494                 nszc++;
1495                 pgsz = page_get_pagesize(nszc);
1496                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1497                 if (a != addr) {
1498                         ASSERT(a < eaddr);
1499                         segsize = a - addr;
1500                         seg = seg_alloc(as, addr, segsize);
1501                         if (seg == NULL) {
1502                                 return (ENOMEM);
1503                         }
1504                         vn_a->szc = szc;
1505                         error = (*crfp)(seg, vn_a);
1506                         if (error != 0) {
1507                                 seg_free(seg);
1508                                 return (error);
1509                         }
1510                         as->a_size += segsize;
1511                         as->a_resvsize += segsize;
1512                         *segcreated = 1;
1513                         if (do_off) {
1514                                 vn_a->offset += segsize;
1515                         }
1516                         addr = a;
1517                 }
1518                 szc = nszc;
1519                 szcvec >>= 1;
1520         }
1521 
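             /*
              * Second pass: work back from eaddr.  For each remaining size
              * class, largest first and PAGESIZE last, map [addr, eaddr
              * rounded down to that page size) with that class until addr
              * reaches eaddr.
              */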
1522         ASSERT(addr < eaddr);
1523         szcvec = save_szcvec | 1; /* add 8K pages */
1524         while (szcvec) {
1525                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1526                 ASSERT(a >= addr);
1527                 if (a != addr) {
1528                         segsize = a - addr;
1529                         seg = seg_alloc(as, addr, segsize);
1530                         if (seg == NULL) {
1531                                 return (ENOMEM);
1532                         }
1533                         vn_a->szc = szc;
1534                         error = (*crfp)(seg, vn_a);
1535                         if (error != 0) {
1536                                 seg_free(seg);
1537                                 return (error);
1538                         }
1539                         as->a_size += segsize;
1540                         as->a_resvsize += segsize;
1541                         *segcreated = 1;
1542                         if (do_off) {
1543                                 vn_a->offset += segsize;
1544                         }
1545                         addr = a;
1546                 }
1547                 szcvec &= ~(1 << szc);
1548                 if (szcvec) {
1549                         szc = highbit(szcvec) - 1;
1550                         pgsz = page_get_pagesize(szc);
1551                 }
1552         }
1553         ASSERT(addr == eaddr);
1554 
1555         return (0);
1556 }
1557 
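     /*
      * Map a vnode-backed range with large pages where possible.  When the
      * request extends past end-of-file, the page size vector is recomputed
      * for just the file-backed portion and the remainder is then mapped
      * separately with base pages.
      */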
1558 static int
1559 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1560     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1561 {
1562         uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1563         int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1564         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1565             type, 0);
1566         int error;
1567         struct seg *seg;
1568         struct vattr va;
1569         u_offset_t eoff;
1570         size_t save_size = 0;
1571         extern size_t textrepl_size_thresh;
1572 
1573         ASSERT(AS_WRITE_HELD(as));
1574         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1575         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1576         ASSERT(vn_a->vp != NULL);
1577         ASSERT(vn_a->amp == NULL);
1578 
1579 again:
1580         if (szcvec <= 1) {
1581                 seg = seg_alloc(as, addr, size);
1582                 if (seg == NULL) {
1583                         return (ENOMEM);
1584                 }
1585                 vn_a->szc = 0;
1586                 error = (*crfp)(seg, vn_a);
1587                 if (error != 0) {
1588                         seg_free(seg);
1589                 } else {
1590                         as->a_size += size;
1591                         as->a_resvsize += size;
1592                 }
1593                 return (error);
1594         }
1595 
1596         va.va_mask = AT_SIZE;
1597         if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1598                 szcvec = 0;
1599                 goto again;
1600         }
1601         eoff = vn_a->offset & PAGEMASK;
1602         if (eoff >= va.va_size) {
1603                 szcvec = 0;
1604                 goto again;
1605         }
1606         eoff += size;
1607         if (btopr(va.va_size) < btopr(eoff)) {
1608                 save_size = size;
1609                 size = va.va_size - (vn_a->offset & PAGEMASK);
1610                 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1611                 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1612                     type, 0);
1613                 if (szcvec <= 1) {
1614                         size = save_size;
1615                         goto again;
1616                 }
1617         }
1618 
1619         if (size > textrepl_size_thresh) {
1620                 vn_a->flags |= _MAP_TEXTREPL;
1621         }
1622         error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1623             segcreated);
1624         if (error != 0) {
1625                 return (error);
1626         }
1627         if (save_size) {
1628                 addr += size;
1629                 size = save_size - size;
1630                 szcvec = 0;
1631                 goto again;
1632         }
1633         return (0);
1634 }
1635 
1636 /*
1637  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
1638  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1639  */
1640 static int
1641 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1642     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1643 {
1644         uint_t szcvec;
1645         uchar_t type;
1646 
1647         ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1648         if (vn_a->type == MAP_SHARED) {
1649                 type = MAPPGSZC_SHM;
1650         } else if (vn_a->type == MAP_PRIVATE) {
1651                 if (vn_a->szc == AS_MAP_HEAP) {
1652                         type = MAPPGSZC_HEAP;
1653                 } else if (vn_a->szc == AS_MAP_STACK) {
1654                         type = MAPPGSZC_STACK;
1655                 } else {
1656                         type = MAPPGSZC_PRIVM;
1657                 }
1658         }
1659         szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1660             (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1661             (vn_a->flags & MAP_TEXT), type, 0);
1662         ASSERT(AS_WRITE_HELD(as));
1663         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1664         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1665         ASSERT(vn_a->vp == NULL);
1666 
1667         return (as_map_segvn_segs(as, addr, size, szcvec,
1668             crfp, vn_a, segcreated));
1669 }
1670 
1671 int
1672 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1673 {
1674         AS_LOCK_ENTER(as, RW_WRITER);
1675         return (as_map_locked(as, addr, size, crfp, argsp));
1676 }
1677 
1678 int
1679 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1680     void *argsp)
1681 {
1682         struct seg *seg = NULL;
1683         caddr_t raddr;                  /* rounded down addr */
1684         size_t rsize;                   /* rounded up size */
1685         int error;
1686         int unmap = 0;
1687         /*
1688          * The use of a_proc is preferred to handle the case where curproc is
1689          * a door_call server and is allocating memory in the client's (a_proc)
1690          * address space.
1691          * When creating a shared memory segment, a_proc will be NULL so we
1692          * fall back to curproc in that case.
1693          */
1694         struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1695         struct segvn_crargs crargs;
1696 
1697         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1698         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1699             (size_t)raddr;
1700 
1701         /*
1702          * check for wrap around
1703          */
1704         if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1705                 AS_LOCK_EXIT(as);
1706                 return (ENOMEM);
1707         }
1708 
1709         as->a_updatedir = 1; /* inform /proc */
1710         gethrestime(&as->a_updatetime);
1711 
1712         if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1713                 AS_LOCK_EXIT(as);
1714 
1715                 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1716                     RCA_UNSAFE_ALL);
1717 
1718                 return (ENOMEM);
1719         }
1720 
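             /*
              * segvn mappings that may use large pages out of the box (the
              * *_LPOOB checks) are split into multiple segments by
              * as_map_vnsegs()/as_map_ansegs(); anything else becomes a
              * single segment created directly by the supplied create
              * routine.
              */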
1721         if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1722                 crargs = *(struct segvn_crargs *)argsp;
1723                 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1724                 if (error != 0) {
1725                         AS_LOCK_EXIT(as);
1726                         if (unmap) {
1727                                 (void) as_unmap(as, addr, size);
1728                         }
1729                         return (error);
1730                 }
1731         } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1732                 crargs = *(struct segvn_crargs *)argsp;
1733                 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1734                 if (error != 0) {
1735                         AS_LOCK_EXIT(as);
1736                         if (unmap) {
1737                                 (void) as_unmap(as, addr, size);
1738                         }
1739                         return (error);
1740                 }
1741         } else {
1742                 seg = seg_alloc(as, addr, size);
1743                 if (seg == NULL) {
1744                         AS_LOCK_EXIT(as);
1745                         return (ENOMEM);
1746                 }
1747 
1748                 error = (*crfp)(seg, argsp);
1749                 if (error != 0) {
1750                         seg_free(seg);
1751                         AS_LOCK_EXIT(as);
1752                         return (error);
1753                 }
1754                 /*
1755                  * Add size now so as_unmap will work if as_ctl fails.
1756                  */
1757                 as->a_size += rsize;
1758                 as->a_resvsize += rsize;
1759         }
1760 
1761         as_setwatch(as);
1762 
1763         /*
1764          * If the address space is locked,
1765          * establish memory locks for the new segment.
1766          */
1767         mutex_enter(&as->a_contents);
1768         if (AS_ISPGLCK(as)) {
1769                 mutex_exit(&as->a_contents);
1770                 AS_LOCK_EXIT(as);
1771                 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1772                 if (error != 0)
1773                         (void) as_unmap(as, addr, size);
1774         } else {
1775                 mutex_exit(&as->a_contents);
1776                 AS_LOCK_EXIT(as);
1777         }
1778         return (error);
1779 }
1780 
1781 
1782 /*
1783  * Delete all segments in the address space marked with S_PURGE.
1784  * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1785  * These segments are deleted as a first step before calls to as_gap(), so
1786  * that they don't affect mmap() or shmat().
1787  */
1788 void
1789 as_purge(struct as *as)
1790 {
1791         struct seg *seg;
1792         struct seg *next_seg;
1793 
1794         /*
1795          * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1796          * no need to grab a_contents mutex for this check
1797          */
1798         if ((as->a_flags & AS_NEEDSPURGE) == 0)
1799                 return;
1800 
1801         AS_LOCK_ENTER(as, RW_WRITER);
1802         next_seg = NULL;
1803         seg = AS_SEGFIRST(as);
1804         while (seg != NULL) {
1805                 next_seg = AS_SEGNEXT(as, seg);
1806                 if (seg->s_flags & S_PURGE)
1807                         SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1808                 seg = next_seg;
1809         }
1810         AS_LOCK_EXIT(as);
1811 
1812         mutex_enter(&as->a_contents);
1813         as->a_flags &= ~AS_NEEDSPURGE;
1814         mutex_exit(&as->a_contents);
1815 }
1816 
1817 /*
1818  * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1819  * range of addresses at least "minlen" long, where the base of the range is
1820  * at "off" phase from an "align" boundary and there is space for a
1821  * "redzone"-sized redzone on either side of the range.  Thus,
1822  * if align was 4M and off was 16k, the user wants a hole which will start
1823  * 16k into a 4M page.
1824  *
1825  * If flags specifies AH_HI, the hole will have the highest possible address
1826  * in the range.  We use the as->a_lastgap field to figure out where to
1827  * start looking for a gap.
1828  *
1829  * Otherwise, the gap will have the lowest possible address.
1830  *
1831  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1832  *
1833  * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1834  * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1835  *
1836  * NOTE: This routine is not correct when base+len overflows caddr_t.
1837  */
1838 int
1839 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1840     uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1841 {
1842         caddr_t lobound = *basep;
1843         caddr_t hibound = lobound + *lenp;
1844         struct seg *lseg, *hseg;
1845         caddr_t lo, hi;
1846         int forward;
1847         caddr_t save_base;
1848         size_t save_len;
1849         size_t save_minlen;
1850         size_t save_redzone;
1851         int fast_path = 1;
1852 
1853         save_base = *basep;
1854         save_len = *lenp;
1855         save_minlen = minlen;
1856         save_redzone = redzone;
1857 
1858         /*
1859          * For the first pass/fast_path, just add align and redzone into
1860          * minlen since if we get an allocation, we can guarantee that it
1861          * will fit the alignment and redzone requested.
1862          * This increases the chance that hibound will be adjusted to
1863          * a_lastgap->s_base which will likely allow us to find an
1864          * acceptable hole in the address space quicker.
1865          * If we can't find a hole with this fast_path, then we look for
1866          * smaller holes in which the alignment and offset may allow
1867          * the allocation to fit.
1868          */
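             /*
              * For example (illustrative numbers): a request with
              * minlen = 1M, align = 4M and redzone = 8K is first searched
              * for as a plain hole of 1M + 4M + 2 * 8K with no alignment or
              * redzone constraint; only if that fails do we retry below
              * with the original minlen and redzone.
              */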
1869         minlen += align;
1870         minlen += 2 * redzone;
1871         redzone = 0;
1872 
1873         AS_LOCK_ENTER(as, RW_READER);
1874         if (AS_SEGFIRST(as) == NULL) {
1875                 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1876                     align, redzone, off)) {
1877                         AS_LOCK_EXIT(as);
1878                         return (0);
1879                 } else {
1880                         AS_LOCK_EXIT(as);
1881                         *basep = save_base;
1882                         *lenp = save_len;
1883                         return (-1);
1884                 }
1885         }
1886 
1887 retry:
1888         /*
1889          * Set up to iterate over all the inter-segment holes in the given
1890          * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1891          * NULL for the highest-addressed hole.  If moving backwards, we start
1892          * the scan from the segment nearest hibound.
1893          */
1894         forward = (flags & AH_DIR) == AH_LO;
1895         if (forward) {
1896                 hseg = as_findseg(as, lobound, 1);
1897                 lseg = AS_SEGPREV(as, hseg);
1898         } else {
1899 
1900                 /*
1901                  * If allocating at least as much as the last allocation,
1902                  * use a_lastgap's base as a better estimate of hibound.
1903                  */
1904                 if (as->a_lastgap &&
1905                     minlen >= as->a_lastgap->s_size &&
1906                     hibound >= as->a_lastgap->s_base)
1907                         hibound = as->a_lastgap->s_base;
1908 
1909                 hseg = as_findseg(as, hibound, 1);
1910                 if (hseg->s_base + hseg->s_size < hibound) {
1911                         lseg = hseg;
1912                         hseg = NULL;
1913                 } else {
1914                         lseg = AS_SEGPREV(as, hseg);
1915                 }
1916         }
1917 
1918         for (;;) {
1919                 /*
1920                  * Set lo and hi to the hole's boundaries.  (We should really
1921                  * use MAXADDR in place of hibound in the expression below,
1922                  * but can't express it easily; using hibound in its place is
1923                  * harmless.)
1924                  */
1925                 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1926                 hi = (hseg == NULL) ? hibound : hseg->s_base;
1927                 /*
1928                  * If the iteration has moved past the interval from lobound
1929                  * to hibound it's pointless to continue.
1930                  */
1931                 if ((forward && lo > hibound) || (!forward && hi < lobound))
1932                         break;
1933                 else if (lo > hibound || hi < lobound)
1934                         goto cont;
1935                 /*
1936                  * Candidate hole lies at least partially within the allowable
1937                  * range.  Restrict it to fall completely within that range,
1938                  * i.e., to [max(lo, lobound), min(hi, hibound)].
1939                  */
1940                 if (lo < lobound)
1941                         lo = lobound;
1942                 if (hi > hibound)
1943                         hi = hibound;
1944                 /*
1945                  * Verify that the candidate hole is big enough and meets
1946                  * hardware constraints.  If the hole is too small, no need
1947                  * to do the further checks since they will fail.
1948                  */
1949                 *basep = lo;
1950                 *lenp = hi - lo;
1951                 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1952                     minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1953                     ((flags & AH_CONTAIN) == 0 ||
1954                     (*basep <= addr && *basep + *lenp > addr))) {
1955                         if (!forward)
1956                                 as->a_lastgap = hseg;
1957                         if (hseg != NULL)
1958                                 as->a_lastgaphl = hseg;
1959                         else
1960                                 as->a_lastgaphl = lseg;
1961                         AS_LOCK_EXIT(as);
1962                         return (0);
1963                 }
1964         cont:
1965                 /*
1966                  * Move to the next hole.
1967                  */
1968                 if (forward) {
1969                         lseg = hseg;
1970                         if (lseg == NULL)
1971                                 break;
1972                         hseg = AS_SEGNEXT(as, hseg);
1973                 } else {
1974                         hseg = lseg;
1975                         if (hseg == NULL)
1976                                 break;
1977                         lseg = AS_SEGPREV(as, lseg);
1978                 }
1979         }
1980         if (fast_path && (align != 0 || save_redzone != 0)) {
1981                 fast_path = 0;
1982                 minlen = save_minlen;
1983                 redzone = save_redzone;
1984                 goto retry;
1985         }
1986         *basep = save_base;
1987         *lenp = save_len;
1988         AS_LOCK_EXIT(as);
1989         return (-1);
1990 }
1991 
1992 /*
1993  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1994  *
1995  * If flags specifies AH_HI, the hole will have the highest possible address
1996  * in the range.  We use the as->a_lastgap field to figure out where to
1997  * start looking for a gap.
1998  *
1999  * Otherwise, the gap will have the lowest possible address.
2000  *
2001  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2002  *
2003  * If an adequate hole is found, base and len are set to reflect the part of
2004  * the hole that is within range, and 0 is returned, otherwise,
2005  * -1 is returned.
2006  *
2007  * NOTE: This routine is not correct when base+len overflows caddr_t.
2008  */
2009 int
2010 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2011     caddr_t addr)
2012 {
2013 
2014         return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2015 }
2016 
2017 /*
2018  * Return the next range within [base, base + len) that is backed
2019  * with "real memory".  Skip holes and non-seg_vn segments.
2020  * We're lazy and only return one segment at a time.
2021  */
2022 int
2023 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2024 {
2025         extern struct seg_ops segspt_shmops;    /* needs a header file */
2026         struct seg *seg;
2027         caddr_t addr, eaddr;
2028         caddr_t segend;
2029 
2030         AS_LOCK_ENTER(as, RW_READER);
2031 
2032         addr = *basep;
2033         eaddr = addr + *lenp;
2034 
2035         seg = as_findseg(as, addr, 0);
2036         if (seg != NULL)
2037                 addr = MAX(seg->s_base, addr);
2038 
2039         for (;;) {
2040                 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2041                         AS_LOCK_EXIT(as);
2042                         return (EINVAL);
2043                 }
2044 
2045                 if (seg->s_ops == &segvn_ops) {
2046                         segend = seg->s_base + seg->s_size;
2047                         break;
2048                 }
2049 
2050                 /*
2051                  * We do ISM by looking into the private data
2052                  * to determine the real size of the segment.
2053                  */
2054                 if (seg->s_ops == &segspt_shmops) {
2055                         segend = seg->s_base + spt_realsize(seg);
2056                         if (addr < segend)
2057                                 break;
2058                 }
2059 
2060                 seg = AS_SEGNEXT(as, seg);
2061 
2062                 if (seg != NULL)
2063                         addr = seg->s_base;
2064         }
2065 
2066         *basep = addr;
2067 
2068         if (segend > eaddr)
2069                 *lenp = eaddr - addr;
2070         else
2071                 *lenp = segend - addr;
2072 
2073         AS_LOCK_EXIT(as);
2074         return (0);
2075 }
2076 
2077 /*
2078  * Swap the pages associated with the address space as out to
2079  * secondary storage, returning the number of bytes actually
2080  * swapped.
2081  *
2082  * The value returned is intended to correlate well with the process's
2083  * memory requirements.  Its usefulness for this purpose depends on
2084  * how well the segment-level routines do at returning accurate
2085  * information.
2086  */
2087 size_t
2088 as_swapout(struct as *as)
2089 {
2090         struct seg *seg;
2091         size_t swpcnt = 0;
2092 
2093         /*
2094          * Kernel-only processes have given up their address
2095          * spaces.  Of course, we shouldn't be attempting to
2096          * swap out such processes in the first place...
2097          */
2098         if (as == NULL)
2099                 return (0);
2100 
2101         AS_LOCK_ENTER(as, RW_READER);
2102 
2103         /*
2104          * Free all mapping resources associated with the address
2105          * space.  The segment-level swapout routines capitalize
2106          * on this unmapping by scavenging pages that have become
2107          * unmapped here.
2108          */
2109         hat_swapout(as->a_hat);
2110 
2111         /*
2112          * Call the swapout routines of all segments in the address
2113          * space to do the actual work, accumulating the amount of
2114          * space reclaimed.
2115          */
2116         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2117                 struct seg_ops *ov = seg->s_ops;
2118 
2119                 /*
2120                  * We have to check to see if the seg has
2121                  * an ops vector because the seg may have
2122                  * been in the middle of being set up when
2123                  * the process was picked for swapout.
2124                  */
2125                 if ((ov != NULL) && (ov->swapout != NULL))
2126                         swpcnt += SEGOP_SWAPOUT(seg);
2127         }
2128         AS_LOCK_EXIT(as);
2129         return (swpcnt);
2130 }
2131 
2132 /*
2133  * Determine whether data from the mappings in interval [addr, addr + size)
2134  * are in the primary memory (core) cache.
2135  */
2136 int
2137 as_incore(struct as *as, caddr_t addr,
2138     size_t size, char *vec, size_t *sizep)
2139 {
2140         struct seg *seg;
2141         size_t ssize;
2142         caddr_t raddr;          /* rounded down addr */
2143         size_t rsize;           /* rounded up size */
2144         size_t isize;                   /* iteration size */
2145         int error = 0;          /* result, assume success */
2146 
2147         *sizep = 0;
2148         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2149         rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2150             (size_t)raddr;
2151 
2152         if (raddr + rsize < raddr)           /* check for wraparound */
2153                 return (ENOMEM);
2154 
2155         AS_LOCK_ENTER(as, RW_READER);
2156         seg = as_segat(as, raddr);
2157         if (seg == NULL) {
2158                 AS_LOCK_EXIT(as);
2159                 return (-1);
2160         }
2161 
2162         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2163                 if (raddr >= seg->s_base + seg->s_size) {
2164                         seg = AS_SEGNEXT(as, seg);
2165                         if (seg == NULL || raddr != seg->s_base) {
2166                                 error = -1;
2167                                 break;
2168                         }
2169                 }
2170                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2171                         ssize = seg->s_base + seg->s_size - raddr;
2172                 else
2173                         ssize = rsize;
2174                 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2175                 if (isize != ssize) {
2176                         error = -1;
2177                         break;
2178                 }
2179                 vec += btopr(ssize);
2180         }
2181         AS_LOCK_EXIT(as);
2182         return (error);
2183 }
2184 
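     /*
      * Unlock each run of pages that is marked in bitmap within the given
      * segment, starting at addr (which corresponds to bit "position").
      */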
2185 static void
2186 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2187     ulong_t *bitmap, size_t position, size_t npages)
2188 {
2189         caddr_t range_start;
2190         size_t  pos1 = position;
2191         size_t  pos2;
2192         size_t  size;
2193         size_t  end_pos = npages + position;
2194 
2195         while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2196                 size = ptob((pos2 - pos1));
2197                 range_start = (caddr_t)((uintptr_t)addr +
2198                     ptob(pos1 - position));
2199 
2200                 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2201                     (ulong_t *)NULL, (size_t)NULL);
2202                 pos1 = pos2;
2203         }
2204 }
2205 
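     /*
      * Error path for MC_LOCK: walk [raddr, raddr + rsize) and unlock the
      * pages recorded as locked in mlock_map before the failure.
      */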
2206 static void
2207 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2208     caddr_t raddr, size_t rsize)
2209 {
2210         struct seg *seg = as_segat(as, raddr);
2211         size_t ssize;
2212 
2213         while (rsize != 0) {
2214                 if (raddr >= seg->s_base + seg->s_size)
2215                         seg = AS_SEGNEXT(as, seg);
2216 
2217                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2218                         ssize = seg->s_base + seg->s_size - raddr;
2219                 else
2220                         ssize = rsize;
2221 
2222                 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2223 
2224                 rsize -= ssize;
2225                 raddr += ssize;
2226         }
2227 }
2228 
2229 /*
2230  * Cache control operations over the interval [addr, addr + size) in
2231  * address space "as".
2232  */
2233 /*ARGSUSED*/
2234 int
2235 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2236     uintptr_t arg, ulong_t *lock_map, size_t pos)
2237 {
2238         struct seg *seg;        /* working segment */
2239         caddr_t raddr;          /* rounded down addr */
2240         caddr_t initraddr;      /* saved initial rounded down addr */
2241         size_t rsize;           /* rounded up size */
2242         size_t initrsize;       /* saved initial rounded up size */
2243         size_t ssize;           /* size of seg */
2244         int error = 0;                  /* result */
2245         size_t mlock_size;      /* size of bitmap */
2246         ulong_t *mlock_map;     /* pointer to bitmap used */
2247                                 /* to represent the locked */
2248                                 /* pages. */
2249 retry:
2250         if (error == IE_RETRY)
2251                 AS_LOCK_ENTER(as, RW_WRITER);
2252         else
2253                 AS_LOCK_ENTER(as, RW_READER);
2254 
2255         /*
2256          * If these are address space lock/unlock operations, loop over
2257          * all segments in the address space, as appropriate.
2258          */
2259         if (func == MC_LOCKAS) {
2260                 size_t npages, idx;
2261                 size_t rlen = 0;        /* rounded as length */
2262 
2263                 idx = pos;
2264 
2265                 if (arg & MCL_FUTURE) {
2266                         mutex_enter(&as->a_contents);
2267                         AS_SETPGLCK(as);
2268                         mutex_exit(&as->a_contents);
2269                 }
2270                 if ((arg & MCL_CURRENT) == 0) {
2271                         AS_LOCK_EXIT(as);
2272                         return (0);
2273                 }
2274 
2275                 seg = AS_SEGFIRST(as);
2276                 if (seg == NULL) {
2277                         AS_LOCK_EXIT(as);
2278                         return (0);
2279                 }
2280 
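                     /*
                      * Total the page-rounded lengths of all segments so
                      * that the mlock_map bitmap below covers the entire
                      * address space.
                      */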
2281                 do {
2282                         raddr = (caddr_t)((uintptr_t)seg->s_base &
2283                             (uintptr_t)PAGEMASK);
2284                         rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2285                             PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2286                 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2287 
2288                 mlock_size = BT_BITOUL(btopr(rlen));
2289                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2290                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2291                         AS_LOCK_EXIT(as);
2292                         return (EAGAIN);
2293                 }
2294 
2295                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2296                         error = SEGOP_LOCKOP(seg, seg->s_base,
2297                             seg->s_size, attr, MC_LOCK, mlock_map, pos);
2298                         if (error != 0)
2299                                 break;
2300                         pos += seg_pages(seg);
2301                 }
2302 
2303                 if (error) {
2304                         for (seg = AS_SEGFIRST(as); seg != NULL;
2305                             seg = AS_SEGNEXT(as, seg)) {
2306 
2307                                 raddr = (caddr_t)((uintptr_t)seg->s_base &
2308                                     (uintptr_t)PAGEMASK);
2309                                 npages = seg_pages(seg);
2310                                 as_segunlock(seg, raddr, attr, mlock_map,
2311                                     idx, npages);
2312                                 idx += npages;
2313                         }
2314                 }
2315 
2316                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2317                 AS_LOCK_EXIT(as);
2318                 goto lockerr;
2319         } else if (func == MC_UNLOCKAS) {
2320                 mutex_enter(&as->a_contents);
2321                 AS_CLRPGLCK(as);
2322                 mutex_exit(&as->a_contents);
2323 
2324                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2325                         error = SEGOP_LOCKOP(seg, seg->s_base,
2326                             seg->s_size, attr, MC_UNLOCK, NULL, 0);
2327                         if (error != 0)
2328                                 break;
2329                 }
2330 
2331                 AS_LOCK_EXIT(as);
2332                 goto lockerr;
2333         }
2334 
2335         /*
2336          * Normalize addresses and sizes.
2337          */
2338         initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2339         initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2340             (size_t)raddr;
2341 
2342         if (raddr + rsize < raddr) {         /* check for wraparound */
2343                 AS_LOCK_EXIT(as);
2344                 return (ENOMEM);
2345         }
2346 
2347         /*
2348          * Get initial segment.
2349          */
2350         if ((seg = as_segat(as, raddr)) == NULL) {
2351                 AS_LOCK_EXIT(as);
2352                 return (ENOMEM);
2353         }
2354 
2355         if (func == MC_LOCK) {
2356                 mlock_size = BT_BITOUL(btopr(rsize));
2357                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2358                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2359                         AS_LOCK_EXIT(as);
2360                         return (EAGAIN);
2361                 }
2362         }
2363 
2364         /*
2365          * Loop over all segments.  If a hole in the address range is
2366          * discovered, then fail.  For each segment, perform the appropriate
2367          * control operation.
2368          */
2369         while (rsize != 0) {
2370 
2371                 /*
2372                  * Make sure there's no hole, calculate the portion
2373                  * of the next segment to be operated over.
2374                  */
2375                 if (raddr >= seg->s_base + seg->s_size) {
2376                         seg = AS_SEGNEXT(as, seg);
2377                         if (seg == NULL || raddr != seg->s_base) {
2378                                 if (func == MC_LOCK) {
2379                                         as_unlockerr(as, attr, mlock_map,
2380                                             initraddr, initrsize - rsize);
2381                                         kmem_free(mlock_map,
2382                                             mlock_size * sizeof (ulong_t));
2383                                 }
2384                                 AS_LOCK_EXIT(as);
2385                                 return (ENOMEM);
2386                         }
2387                 }
2388                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2389                         ssize = seg->s_base + seg->s_size - raddr;
2390                 else
2391                         ssize = rsize;
2392 
2393                 /*
2394                  * Dispatch on specific function.
2395                  */
2396                 switch (func) {
2397 
2398                 /*
2399                  * Synchronize cached data from mappings with backing
2400                  * objects.
2401                  */
2402                 case MC_SYNC:
2403                         if (error = SEGOP_SYNC(seg, raddr, ssize,
2404                             attr, (uint_t)arg)) {
2405                                 AS_LOCK_EXIT(as);
2406                                 return (error);
2407                         }
2408                         break;
2409 
2410                 /*
2411                  * Lock pages in memory.
2412                  */
2413                 case MC_LOCK:
2414                         if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2415                             attr, func, mlock_map, pos)) {
2416                                 as_unlockerr(as, attr, mlock_map, initraddr,
2417                                     initrsize - rsize + ssize);
2418                                 kmem_free(mlock_map, mlock_size *
2419                                     sizeof (ulong_t));
2420                                 AS_LOCK_EXIT(as);
2421                                 goto lockerr;
2422                         }
2423                         break;
2424 
2425                 /*
2426                  * Unlock mapped pages.
2427                  */
2428                 case MC_UNLOCK:
2429                         (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2430                             (ulong_t *)NULL, (size_t)NULL);
2431                         break;
2432 
2433                 /*
2434                  * Store VM advise for mapped pages in segment layer.
2435                  */
2436                 case MC_ADVISE:
2437                         error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2438 
2439                         /*
2440                          * Check for regular errors and special retry error
2441                          */
2442                         if (error) {
2443                                 if (error == IE_RETRY) {
2444                                         /*
2445                                          * Need to acquire writers lock, so
2446                                          * have to drop readers lock and start
2447                                          * all over again
2448                                          */
2449                                         AS_LOCK_EXIT(as);
2450                                         goto retry;
2451                                 } else if (error == IE_REATTACH) {
2452                                         /*
2453                                          * Find segment for current address
2454                                          * because current segment just got
2455                                          * split or concatenated
2456                                          */
2457                                         seg = as_segat(as, raddr);
2458                                         if (seg == NULL) {
2459                                                 AS_LOCK_EXIT(as);
2460                                                 return (ENOMEM);
2461                                         }
2462                                 } else {
2463                                         /*
2464                                          * Regular error
2465                                          */
2466                                         AS_LOCK_EXIT(as);
2467                                         return (error);
2468                                 }
2469                         }
2470                         break;
2471 
2472                 case MC_INHERIT_ZERO:
2473                         if (seg->s_ops->inherit == NULL) {
2474                                 error = ENOTSUP;
2475                         } else {
2476                                 error = SEGOP_INHERIT(seg, raddr, ssize,
2477                                     SEGP_INH_ZERO);
2478                         }
2479                         if (error != 0) {
2480                                 AS_LOCK_EXIT(as);
2481                                 return (error);
2482                         }
2483                         break;
2484 
2485                 /*
2486                  * Can't happen.
2487                  */
2488                 default:
2489                         panic("as_ctl: bad operation %d", func);
2490                         /*NOTREACHED*/
2491                 }
2492 
2493                 rsize -= ssize;
2494                 raddr += ssize;
2495         }
2496 
2497         if (func == MC_LOCK)
2498                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2499         AS_LOCK_EXIT(as);
2500         return (0);
2501 lockerr:
2502 
2503         /*
2504          * If the lower levels returned EDEADLK for a segment lockop,
2505          * it means that we should retry the operation.  Let's wait
2506          * a bit as well, to let the deadlock-causing condition clear.
2507          * This is part of a gross hack to work around a design flaw
2508          * in the ufs/sds logging code and should go away when the
2509          * logging code is re-designed to fix the problem. See bug
2510          * 4125102 for details of the problem.
2511          */
2512         if (error == EDEADLK) {
2513                 delay(deadlk_wait);
2514                 error = 0;
2515                 goto retry;
2516         }
2517         return (error);
2518 }
2519 
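     /*
      * Translate a faultcode_t (e.g. from as_fault()) into an errno value.
      */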
2520 int
2521 fc_decode(faultcode_t fault_err)
2522 {
2523         int error = 0;
2524 
2525         switch (FC_CODE(fault_err)) {
2526         case FC_OBJERR:
2527                 error = FC_ERRNO(fault_err);
2528                 break;
2529         case FC_PROT:
2530                 error = EACCES;
2531                 break;
2532         default:
2533                 error = EFAULT;
2534                 break;
2535         }
2536         return (error);
2537 }
2538 
2539 /*
2540  * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2541  * lists from each segment and copy them to one contiguous shadow list (plist)
2542  * as expected by the caller.  Save pointers to per segment shadow lists at
2543  * the tail of plist so that they can be used during as_pageunlock().
2544  */
2545 static int
2546 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2547     caddr_t addr, size_t size, enum seg_rw rw)
2548 {
2549         caddr_t sv_addr = addr;
2550         size_t sv_size = size;
2551         struct seg *sv_seg = seg;
2552         ulong_t segcnt = 1;
2553         ulong_t cnt;
2554         size_t ssize;
2555         pgcnt_t npages = btop(size);
2556         page_t **plist;
2557         page_t **pl;
2558         int error;
2559         caddr_t eaddr;
2560         faultcode_t fault_err = 0;
2561         pgcnt_t pl_off;
2562         extern struct seg_ops segspt_shmops;
2563 
2564         ASSERT(AS_LOCK_HELD(as));
2565         ASSERT(seg != NULL);
2566         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2567         ASSERT(addr + size > seg->s_base + seg->s_size);
2568         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2569         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2570 
2571         /*
2572          * Count the number of segments covered by the range we are about to
2573          * lock. The segment count is used to size the shadow list we return
2574          * back to the caller.
2575          */
2576         for (; size != 0; size -= ssize, addr += ssize) {
2577                 if (addr >= seg->s_base + seg->s_size) {
2578 
2579                         seg = AS_SEGNEXT(as, seg);
2580                         if (seg == NULL || addr != seg->s_base) {
2581                                 AS_LOCK_EXIT(as);
2582                                 return (EFAULT);
2583                         }
2584                         /*
2585                          * Do a quick check if subsequent segments
2586                          * will most likely support pagelock.
2587                          */
2588                         if (seg->s_ops == &segvn_ops) {
2589                                 vnode_t *vp;
2590 
2591                                 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2592                                     vp != NULL) {
2593                                         AS_LOCK_EXIT(as);
2594                                         goto slow;
2595                                 }
2596                         } else if (seg->s_ops != &segspt_shmops) {
2597                                 AS_LOCK_EXIT(as);
2598                                 goto slow;
2599                         }
2600                         segcnt++;
2601                 }
2602                 if (addr + size > seg->s_base + seg->s_size) {
2603                         ssize = seg->s_base + seg->s_size - addr;
2604                 } else {
2605                         ssize = size;
2606                 }
2607         }
2608         ASSERT(segcnt > 1);
2609 
2610         plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2611 
2612         addr = sv_addr;
2613         size = sv_size;
2614         seg = sv_seg;
2615 
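             /*
              * Second pass: pagelock each segment's piece of the range,
              * saving the per-segment shadow list at the tail of plist and
              * copying its entries into the contiguous part at the front.
              */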
2616         for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2617                 if (addr >= seg->s_base + seg->s_size) {
2618                         seg = AS_SEGNEXT(as, seg);
2619                         ASSERT(seg != NULL && addr == seg->s_base);
2620                         cnt++;
2621                         ASSERT(cnt < segcnt);
2622                 }
2623                 if (addr + size > seg->s_base + seg->s_size) {
2624                         ssize = seg->s_base + seg->s_size - addr;
2625                 } else {
2626                         ssize = size;
2627                 }
2628                 pl = &plist[npages + cnt];
2629                 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2630                     L_PAGELOCK, rw);
2631                 if (error) {
2632                         break;
2633                 }
2634                 ASSERT(plist[npages + cnt] != NULL);
2635                 ASSERT(pl_off + btop(ssize) <= npages);
2636                 bcopy(plist[npages + cnt], &plist[pl_off],
2637                     btop(ssize) * sizeof (page_t *));
2638                 pl_off += btop(ssize);
2639         }
2640 
2641         if (size == 0) {
2642                 AS_LOCK_EXIT(as);
2643                 ASSERT(cnt == segcnt - 1);
2644                 *ppp = plist;
2645                 return (0);
2646         }
2647 
2648         /*
2649          * One of the pagelock calls failed; the error type is in the error
2650          * variable.  Unlock what we've locked so far and retry with F_SOFTLOCK
2651          * if the error is either EFAULT or ENOTSUP.  Otherwise just return
2652          * the error back to the caller.
2653          */
2654 
2655         eaddr = addr;
2656         seg = sv_seg;
2657 
2658         for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2659                 if (addr >= seg->s_base + seg->s_size) {
2660                         seg = AS_SEGNEXT(as, seg);
2661                         ASSERT(seg != NULL && addr == seg->s_base);
2662                         cnt++;
2663                         ASSERT(cnt < segcnt);
2664                 }
2665                 if (eaddr > seg->s_base + seg->s_size) {
2666                         ssize = seg->s_base + seg->s_size - addr;
2667                 } else {
2668                         ssize = eaddr - addr;
2669                 }
2670                 pl = &plist[npages + cnt];
2671                 ASSERT(*pl != NULL);
2672                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2673                     L_PAGEUNLOCK, rw);
2674         }
2675 
2676         AS_LOCK_EXIT(as);
2677 
2678         kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2679 
2680         if (error != ENOTSUP && error != EFAULT) {
2681                 return (error);
2682         }
2683 
2684 slow:
2685         /*
2686          * If we are here because pagelock failed due to the need to cow-fault
2687          * in the pages we want to lock, F_SOFTLOCK will do that job, and the
2688          * next as_pagelock() call for this address range will hopefully
2689          * succeed.
2690          */
2691         fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2692         if (fault_err != 0) {
2693                 return (fc_decode(fault_err));
2694         }
2695         *ppp = NULL;
2696 
2697         return (0);
2698 }
2699 
2700 /*
2701  * lock pages in a given address space. Return shadow list. If
2702  * the list is NULL, the MMU mapping is also locked.
2703  */
2704 int
2705 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2706     size_t size, enum seg_rw rw)
2707 {
2708         size_t rsize;
2709         caddr_t raddr;
2710         faultcode_t fault_err;
2711         struct seg *seg;
2712         int err;
2713 
2714         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2715             "as_pagelock_start: addr %p size %ld", addr, size);
2716 
2717         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2718         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2719             (size_t)raddr;
2720 
2721         /*
2722          * If the request crosses segment boundaries, let
2723          * as_pagelock_segs() handle it.
2724          */
2725         AS_LOCK_ENTER(as, RW_READER);
2726 
2727         seg = as_segat(as, raddr);
2728         if (seg == NULL) {
2729                 AS_LOCK_EXIT(as);
2730                 return (EFAULT);
2731         }
2732         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2733         if (raddr + rsize > seg->s_base + seg->s_size) {
2734                 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2735         }
2736         if (raddr + rsize <= raddr) {
2737                 AS_LOCK_EXIT(as);
2738                 return (EFAULT);
2739         }
2740 
2741         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2742             "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2743 
2744         /*
2745          * try to lock pages and pass back shadow list
2746          */
2747         err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2748 
2749         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2750 
2751         AS_LOCK_EXIT(as);
2752 
2753         if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2754                 return (err);
2755         }
2756 
2757         /*
2758          * Use F_SOFTLOCK to lock the pages because pagelock failed, either
2759          * due to no pagelock support for this segment or because pages need
2760          * to be cow-faulted in.  If a fault is needed, F_SOFTLOCK will do
2761          * that job for this as_pagelock() call, and the next as_pagelock()
2762          * call for the same address range will hopefully succeed.
2763          */
2764         fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2765         if (fault_err != 0) {
2766                 return (fc_decode(fault_err));
2767         }
2768         *ppp = NULL;
2769 
2770         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2771         return (0);
2772 }
2773 
2774 /*
2775  * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
2776  * lists from the end of plist and call pageunlock interface for each segment.
2777  * Drop as lock and free plist.
2778  */
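/*
 * A sketch of the plist layout assumed here (as built by the pagelock path
 * for multi-segment requests): the first btop(size) entries are the page
 * pointers for the whole range, followed by one shadow-list pointer per
 * spanned segment:
 *
 *	plist[0 .. npages - 1]	locked pages for [addr, addr + size)
 *	plist[npages + 0]	shadow list returned by the first segment
 *	plist[npages + 1]	shadow list returned by the second segment
 *	...
 *
 * This is why the loop below indexes &plist[npages + cnt] and the final
 * kmem_free() releases (npages + cnt) pointers once cnt has been advanced
 * to the number of segments.
 */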
2779 static void
2780 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2781     struct page **plist, enum seg_rw rw)
2782 {
2783         ulong_t cnt;
2784         caddr_t eaddr = addr + size;
2785         pgcnt_t npages = btop(size);
2786         size_t ssize;
2787         page_t **pl;
2788 
2789         ASSERT(AS_LOCK_HELD(as));
2790         ASSERT(seg != NULL);
2791         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2792         ASSERT(addr + size > seg->s_base + seg->s_size);
2793         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2794         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2795         ASSERT(plist != NULL);
2796 
2797         for (cnt = 0; addr < eaddr; addr += ssize) {
2798                 if (addr >= seg->s_base + seg->s_size) {
2799                         seg = AS_SEGNEXT(as, seg);
2800                         ASSERT(seg != NULL && addr == seg->s_base);
2801                         cnt++;
2802                 }
2803                 if (eaddr > seg->s_base + seg->s_size) {
2804                         ssize = seg->s_base + seg->s_size - addr;
2805                 } else {
2806                         ssize = eaddr - addr;
2807                 }
2808                 pl = &plist[npages + cnt];
2809                 ASSERT(*pl != NULL);
2810                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2811                     L_PAGEUNLOCK, rw);
2812         }
2813         ASSERT(cnt > 0);
2814         AS_LOCK_EXIT(as);
2815 
2816         cnt++;
2817         kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2818 }
2819 
2820 /*
2821  * unlock pages in a given address range
2822  */
2823 void
2824 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2825     enum seg_rw rw)
2826 {
2827         struct seg *seg;
2828         size_t rsize;
2829         caddr_t raddr;
2830 
2831         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2832             "as_pageunlock_start: addr %p size %ld", addr, size);
2833 
        /*
         * if the shadow list is NULL, as_pagelock fell back
         * to as_fault, so undo the lock with F_SOFTUNLOCK
         */
2838         if (pp == NULL) {
2839                 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2840                 return;
2841         }
2842 
2843         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2844         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2845             (size_t)raddr;
2846 
2847         AS_LOCK_ENTER(as, RW_READER);
2848         seg = as_segat(as, raddr);
2849         ASSERT(seg != NULL);
2850 
2851         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2852             "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2853 
2854         ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2855         if (raddr + rsize <= seg->s_base + seg->s_size) {
2856                 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2857         } else {
2858                 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2859                 return;
2860         }
2861         AS_LOCK_EXIT(as);
2862         TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2863 }
2864 
2865 int
2866 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2867     boolean_t wait)
2868 {
2869         struct seg *seg;
2870         size_t ssize;
2871         caddr_t raddr;                  /* rounded down addr */
2872         size_t rsize;                   /* rounded up size */
2873         int error = 0;
2874         size_t pgsz = page_get_pagesize(szc);
2875 
2876 setpgsz_top:
2877         if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2878                 return (EINVAL);
2879         }
2880 
2881         raddr = addr;
2882         rsize = size;
2883 
2884         if (raddr + rsize < raddr)           /* check for wraparound */
2885                 return (ENOMEM);
2886 
2887         AS_LOCK_ENTER(as, RW_WRITER);
2888         as_clearwatchprot(as, raddr, rsize);
2889         seg = as_segat(as, raddr);
2890         if (seg == NULL) {
2891                 as_setwatch(as);
2892                 AS_LOCK_EXIT(as);
2893                 return (ENOMEM);
2894         }
2895 
2896         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2897                 if (raddr >= seg->s_base + seg->s_size) {
2898                         seg = AS_SEGNEXT(as, seg);
2899                         if (seg == NULL || raddr != seg->s_base) {
2900                                 error = ENOMEM;
2901                                 break;
2902                         }
2903                 }
2904                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2905                         ssize = seg->s_base + seg->s_size - raddr;
2906                 } else {
2907                         ssize = rsize;
2908                 }
2909 
2910 retry:
2911                 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2912 
2913                 if (error == IE_NOMEM) {
2914                         error = EAGAIN;
2915                         break;
2916                 }
2917 
2918                 if (error == IE_RETRY) {
2919                         AS_LOCK_EXIT(as);
2920                         goto setpgsz_top;
2921                 }
2922 
2923                 if (error == ENOTSUP) {
2924                         error = EINVAL;
2925                         break;
2926                 }
2927 
2928                 if (wait && (error == EAGAIN)) {
2929                         /*
2930                          * Memory is currently locked.  It must be unlocked
2931                          * before this operation can succeed through a retry.
2932                          * The possible reasons for locked memory and
2933                          * corresponding strategies for unlocking are:
2934                          * (1) Normal I/O
2935                          *      wait for a signal that the I/O operation
2936                          *      has completed and the memory is unlocked.
2937                          * (2) Asynchronous I/O
2938                          *      The aio subsystem does not unlock pages when
2939                          *      the I/O is completed. Those pages are unlocked
2940                          *      when the application calls aiowait/aioerror.
2941                          *      So, to prevent blocking forever, cv_broadcast()
2942                          *      is done to wake up aio_cleanup_thread.
2943                          *      Subsequently, segvn_reclaim will be called, and
2944                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
2945                          * (3) Long term page locking:
2946                          *      This is not relevant for as_setpagesize()
2947                          *      because we cannot change the page size for
2948                          *      driver memory. The attempt to do so will
2949                          *      fail with a different error than EAGAIN so
2950                          *      there's no need to trigger as callbacks like
2951                          *      as_unmap, as_setprot or as_free would do.
2952                          */
2953                         mutex_enter(&as->a_contents);
2954                         if (!AS_ISNOUNMAPWAIT(as)) {
2955                                 if (AS_ISUNMAPWAIT(as) == 0) {
2956                                         cv_broadcast(&as->a_cv);
2957                                 }
2958                                 AS_SETUNMAPWAIT(as);
2959                                 AS_LOCK_EXIT(as);
2960                                 while (AS_ISUNMAPWAIT(as)) {
2961                                         cv_wait(&as->a_cv, &as->a_contents);
2962                                 }
2963                         } else {
2964                                 /*
2965                                  * We may have raced with
2966                                  * segvn_reclaim()/segspt_reclaim(). In this
2967                                  * case clean nounmapwait flag and retry since
2968                                  * softlockcnt in this segment may be already
2969                                  * 0.  We don't drop as writer lock so our
2970                                  * number of retries without sleeping should
2971                                  * be very small. See segvn_reclaim() for
2972                                  * more comments.
2973                                  */
2974                                 AS_CLRNOUNMAPWAIT(as);
2975                                 mutex_exit(&as->a_contents);
2976                                 goto retry;
2977                         }
2978                         mutex_exit(&as->a_contents);
2979                         goto setpgsz_top;
2980                 } else if (error != 0) {
2981                         break;
2982                 }
2983         }
2984         as_setwatch(as);
2985         AS_LOCK_EXIT(as);
2986         return (error);
2987 }
2988 
2989 /*
2990  * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2991  * in its chunk where s_szc is less than the szc we want to set.
2992  */
2993 static int
2994 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2995     int *retry)
2996 {
2997         struct seg *seg;
2998         size_t ssize;
2999         int error;
3000 
3001         ASSERT(AS_WRITE_HELD(as));
3002 
3003         seg = as_segat(as, raddr);
3004         if (seg == NULL) {
3005                 panic("as_iset3_default_lpsize: no seg");
3006         }
3007 
3008         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3009                 if (raddr >= seg->s_base + seg->s_size) {
3010                         seg = AS_SEGNEXT(as, seg);
3011                         if (seg == NULL || raddr != seg->s_base) {
3012                                 panic("as_iset3_default_lpsize: as changed");
3013                         }
3014                 }
3015                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3016                         ssize = seg->s_base + seg->s_size - raddr;
3017                 } else {
3018                         ssize = rsize;
3019                 }
3020 
3021                 if (szc > seg->s_szc) {
3022                         error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3023                         /* Only retry on EINVAL segments that have no vnode. */
3024                         if (error == EINVAL) {
3025                                 vnode_t *vp = NULL;
3026                                 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3027                                     (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3028                                     vp == NULL)) {
3029                                         *retry = 1;
3030                                 } else {
3031                                         *retry = 0;
3032                                 }
3033                         }
3034                         if (error) {
3035                                 return (error);
3036                         }
3037                 }
3038         }
3039         return (0);
3040 }
3041 
/*
 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
 * pagesize on each segment in its range, but if any call fails with EINVAL,
 * it reduces the pagesize to the next smaller size in the bitmap and retries
 * as_iset3_default_lpsize(). The code retries smaller allowed sizes on
 * EINVAL because (a) the anon offset may not match the bigger sizes, and
 * (b) it's hard to get this offset (to begin with) to pass to
 * map_pgszcvec().
 */
3051 static int
3052 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3053     uint_t szcvec)
3054 {
3055         int error;
3056         int retry;
3057 
3058         ASSERT(AS_WRITE_HELD(as));
3059 
3060         for (;;) {
3061                 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3062                 if (error == EINVAL && retry) {
3063                         szcvec &= ~(1 << szc);
3064                         if (szcvec <= 1) {
3065                                 return (EINVAL);
3066                         }
3067                         szc = highbit(szcvec) - 1;
3068                 } else {
3069                         return (error);
3070                 }
3071         }
3072 }
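
/*
 * For illustration, a worked example of the retry loop above, assuming a
 * hypothetical szcvec of 0x16 (size codes 1, 2 and 4 supported) and an
 * initial szc of 4. After a retryable EINVAL:
 *
 *	szcvec &= ~(1 << 4);		szcvec becomes 0x06
 *	szc = highbit(0x06) - 1;	the next attempt uses szc 2
 *
 * and after another retryable EINVAL:
 *
 *	szcvec &= ~(1 << 2);		szcvec becomes 0x02
 *	szc = highbit(0x02) - 1;	the next attempt uses szc 1
 *
 * Once szcvec drops to 1 or below, only the base page size remains and
 * EINVAL is returned to the caller.
 */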
3073 
/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than we want to set. For each such area,
 * it calls as_iset2_default_lpsize().
 */
3079 static int
3080 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3081     uint_t szcvec)
3082 {
3083         struct seg *seg;
3084         size_t ssize;
3085         caddr_t setaddr = raddr;
3086         size_t setsize = 0;
3087         int set;
3088         int error;
3089 
3090         ASSERT(AS_WRITE_HELD(as));
3091 
3092         seg = as_segat(as, raddr);
3093         if (seg == NULL) {
3094                 panic("as_iset1_default_lpsize: no seg");
3095         }
3096         if (seg->s_szc < szc) {
3097                 set = 1;
3098         } else {
3099                 set = 0;
3100         }
3101 
3102         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3103                 if (raddr >= seg->s_base + seg->s_size) {
3104                         seg = AS_SEGNEXT(as, seg);
3105                         if (seg == NULL || raddr != seg->s_base) {
3106                                 panic("as_iset1_default_lpsize: as changed");
3107                         }
3108                         if (seg->s_szc >= szc && set) {
3109                                 ASSERT(setsize != 0);
3110                                 error = as_iset2_default_lpsize(as,
3111                                     setaddr, setsize, szc, szcvec);
3112                                 if (error) {
3113                                         return (error);
3114                                 }
3115                                 set = 0;
3116                         } else if (seg->s_szc < szc && !set) {
3117                                 setaddr = raddr;
3118                                 setsize = 0;
3119                                 set = 1;
3120                         }
3121                 }
3122                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3123                         ssize = seg->s_base + seg->s_size - raddr;
3124                 } else {
3125                         ssize = rsize;
3126                 }
3127         }
3128         error = 0;
3129         if (set) {
3130                 ASSERT(setsize != 0);
3131                 error = as_iset2_default_lpsize(as, setaddr, setsize,
3132                     szc, szcvec);
3133         }
3134         return (error);
3135 }
3136 
3137 /*
3138  * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3139  * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3140  * chunk to as_iset1_default_lpsize().
3141  */
3142 static int
3143 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3144     int type)
3145 {
3146         int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3147         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3148             flags, rtype, 1);
3149         uint_t szc;
3150         uint_t nszc;
3151         int error;
3152         caddr_t a;
3153         caddr_t eaddr;
3154         size_t segsize;
3155         size_t pgsz;
3156         uint_t save_szcvec;
3157 
3158         ASSERT(AS_WRITE_HELD(as));
3159         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3160         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3161 
3162         szcvec &= ~1;
3163         if (szcvec <= 1) {   /* skip if base page size */
3164                 return (0);
3165         }
3166 
        /* Get the size of the smallest supported large page. */
3168         szc = lowbit(szcvec) - 1;
3169         pgsz = page_get_pagesize(szc);
3170         eaddr = addr + size;
3171         addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3172         eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3173 
3174         save_szcvec = szcvec;
3175         szcvec >>= (szc + 1);
3176         nszc = szc;
3177         while (szcvec) {
3178                 if ((szcvec & 0x1) == 0) {
3179                         nszc++;
3180                         szcvec >>= 1;
3181                         continue;
3182                 }
3183                 nszc++;
3184                 pgsz = page_get_pagesize(nszc);
3185                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3186                 if (a != addr) {
3187                         ASSERT(szc > 0);
3188                         ASSERT(a < eaddr);
3189                         segsize = a - addr;
3190                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3191                             save_szcvec);
3192                         if (error) {
3193                                 return (error);
3194                         }
3195                         addr = a;
3196                 }
3197                 szc = nszc;
3198                 szcvec >>= 1;
3199         }
3200 
3201         ASSERT(addr < eaddr);
3202         szcvec = save_szcvec;
3203         while (szcvec) {
3204                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3205                 ASSERT(a >= addr);
3206                 if (a != addr) {
3207                         ASSERT(szc > 0);
3208                         segsize = a - addr;
3209                         error = as_iset1_default_lpsize(as, addr, segsize, szc,
3210                             save_szcvec);
3211                         if (error) {
3212                                 return (error);
3213                         }
3214                         addr = a;
3215                 }
3216                 szcvec &= ~(1 << szc);
3217                 if (szcvec) {
3218                         szc = highbit(szcvec) - 1;
3219                         pgsz = page_get_pagesize(szc);
3220                 }
3221         }
3222         ASSERT(addr == eaddr);
3223 
3224         return (0);
3225 }
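
/*
 * For illustration, a worked example of the carving above, under the
 * hypothetical assumption that size code 1 is 64K, size code 2 is 512K,
 * and szcvec allows both (0x06). For addr 0x90000 and eaddr 0x310000 the
 * range is split into:
 *
 *	[0x090000, 0x100000)	szc 1 (64K), leading run up to 512K alignment
 *	[0x100000, 0x300000)	szc 2 (512K), the aligned middle
 *	[0x300000, 0x310000)	szc 1 (64K), trailing remainder
 *
 * and each chunk is handed to as_iset1_default_lpsize() with that size
 * code.
 */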
3226 
/*
 * Set the default large page size for the range. Called via memcntl with
 * page size set to 0. as_set_default_lpsize breaks the range down into
 * chunks of the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
3233 int
3234 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3235 {
3236         struct seg *seg;
3237         caddr_t raddr;
3238         size_t rsize;
3239         size_t ssize;
3240         int rtype, rflags;
3241         int stype, sflags;
3242         int error;
3243         caddr_t setaddr;
3244         size_t setsize;
3245         int segvn;
3246 
3247         if (size == 0)
3248                 return (0);
3249 
3250         AS_LOCK_ENTER(as, RW_WRITER);
3251 again:
3252         error = 0;
3253 
3254         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3255         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3256             (size_t)raddr;
3257 
3258         if (raddr + rsize < raddr) {         /* check for wraparound */
3259                 AS_LOCK_EXIT(as);
3260                 return (ENOMEM);
3261         }
3262         as_clearwatchprot(as, raddr, rsize);
3263         seg = as_segat(as, raddr);
3264         if (seg == NULL) {
3265                 as_setwatch(as);
3266                 AS_LOCK_EXIT(as);
3267                 return (ENOMEM);
3268         }
3269         if (seg->s_ops == &segvn_ops) {
3270                 rtype = SEGOP_GETTYPE(seg, addr);
3271                 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3272                 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3273                 segvn = 1;
3274         } else {
3275                 segvn = 0;
3276         }
3277         setaddr = raddr;
3278         setsize = 0;
3279 
3280         for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3281                 if (raddr >= (seg->s_base + seg->s_size)) {
3282                         seg = AS_SEGNEXT(as, seg);
3283                         if (seg == NULL || raddr != seg->s_base) {
3284                                 error = ENOMEM;
3285                                 break;
3286                         }
3287                         if (seg->s_ops == &segvn_ops) {
3288                                 stype = SEGOP_GETTYPE(seg, raddr);
3289                                 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3290                                 stype &= (MAP_SHARED | MAP_PRIVATE);
3291                                 if (segvn && (rflags != sflags ||
3292                                     rtype != stype)) {
3293                                         /*
3294                                          * The next segment is also segvn but
3295                                          * has different flags and/or type.
3296                                          */
3297                                         ASSERT(setsize != 0);
3298                                         error = as_iset_default_lpsize(as,
3299                                             setaddr, setsize, rflags, rtype);
3300                                         if (error) {
3301                                                 break;
3302                                         }
3303                                         rflags = sflags;
3304                                         rtype = stype;
3305                                         setaddr = raddr;
3306                                         setsize = 0;
3307                                 } else if (!segvn) {
3308                                         rflags = sflags;
3309                                         rtype = stype;
3310                                         setaddr = raddr;
3311                                         setsize = 0;
3312                                         segvn = 1;
3313                                 }
3314                         } else if (segvn) {
3315                                 /* The next segment is not segvn. */
3316                                 ASSERT(setsize != 0);
3317                                 error = as_iset_default_lpsize(as,
3318                                     setaddr, setsize, rflags, rtype);
3319                                 if (error) {
3320                                         break;
3321                                 }
3322                                 segvn = 0;
3323                         }
3324                 }
3325                 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3326                         ssize = seg->s_base + seg->s_size - raddr;
3327                 } else {
3328                         ssize = rsize;
3329                 }
3330         }
3331         if (error == 0 && segvn) {
3332                 /* The last chunk when rsize == 0. */
3333                 ASSERT(setsize != 0);
3334                 error = as_iset_default_lpsize(as, setaddr, setsize,
3335                     rflags, rtype);
3336         }
3337 
3338         if (error == IE_RETRY) {
3339                 goto again;
3340         } else if (error == IE_NOMEM) {
3341                 error = EAGAIN;
3342         } else if (error == ENOTSUP) {
3343                 error = EINVAL;
3344         } else if (error == EAGAIN) {
3345                 mutex_enter(&as->a_contents);
3346                 if (!AS_ISNOUNMAPWAIT(as)) {
3347                         if (AS_ISUNMAPWAIT(as) == 0) {
3348                                 cv_broadcast(&as->a_cv);
3349                         }
3350                         AS_SETUNMAPWAIT(as);
3351                         AS_LOCK_EXIT(as);
3352                         while (AS_ISUNMAPWAIT(as)) {
3353                                 cv_wait(&as->a_cv, &as->a_contents);
3354                         }
3355                         mutex_exit(&as->a_contents);
3356                         AS_LOCK_ENTER(as, RW_WRITER);
3357                 } else {
3358                         /*
3359                          * We may have raced with
3360                          * segvn_reclaim()/segspt_reclaim(). In this case
3361                          * clean nounmapwait flag and retry since softlockcnt
3362                          * in this segment may be already 0.  We don't drop as
3363                          * writer lock so our number of retries without
3364                          * sleeping should be very small. See segvn_reclaim()
3365                          * for more comments.
3366                          */
3367                         AS_CLRNOUNMAPWAIT(as);
3368                         mutex_exit(&as->a_contents);
3369                 }
3370                 goto again;
3371         }
3372 
3373         as_setwatch(as);
3374         AS_LOCK_EXIT(as);
3375         return (error);
3376 }
3377 
/*
 * Set up all of the uninitialized watched pages that we can.
 */
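/*
 * For example (derived from the protection stripping below): a write
 * watchpoint keeps the page's read and exec permissions but removes
 * PROT_WRITE, so stores trap while loads proceed; read and exec
 * watchpoints remove read, write and exec, so any reference traps.
 */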
3381 void
3382 as_setwatch(struct as *as)
3383 {
3384         struct watched_page *pwp;
3385         struct seg *seg;
3386         caddr_t vaddr;
3387         uint_t prot;
3388         int  err, retrycnt;
3389 
3390         if (avl_numnodes(&as->a_wpage) == 0)
3391                 return;
3392 
3393         ASSERT(AS_WRITE_HELD(as));
3394 
3395         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3396             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3397                 retrycnt = 0;
3398         retry:
3399                 vaddr = pwp->wp_vaddr;
3400                 if (pwp->wp_oprot != 0 ||    /* already set up */
3401                     (seg = as_segat(as, vaddr)) == NULL ||
3402                     SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3403                         continue;
3404 
3405                 pwp->wp_oprot = prot;
3406                 if (pwp->wp_read)
3407                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3408                 if (pwp->wp_write)
3409                         prot &= ~PROT_WRITE;
3410                 if (pwp->wp_exec)
3411                         prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3412                 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3413                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3414                         if (err == IE_RETRY) {
3415                                 pwp->wp_oprot = 0;
3416                                 ASSERT(retrycnt == 0);
3417                                 retrycnt++;
3418                                 goto retry;
3419                         }
3420                 }
3421                 pwp->wp_prot = prot;
3422         }
3423 }
3424 
3425 /*
3426  * Clear all of the watched pages in the address space.
3427  */
3428 void
3429 as_clearwatch(struct as *as)
3430 {
3431         struct watched_page *pwp;
3432         struct seg *seg;
3433         caddr_t vaddr;
3434         uint_t prot;
3435         int err, retrycnt;
3436 
3437         if (avl_numnodes(&as->a_wpage) == 0)
3438                 return;
3439 
3440         ASSERT(AS_WRITE_HELD(as));
3441 
3442         for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3443             pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3444                 retrycnt = 0;
3445         retry:
3446                 vaddr = pwp->wp_vaddr;
3447                 if (pwp->wp_oprot == 0 ||    /* not set up */
3448                     (seg = as_segat(as, vaddr)) == NULL)
3449                         continue;
3450 
3451                 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3452                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3453                         if (err == IE_RETRY) {
3454                                 ASSERT(retrycnt == 0);
3455                                 retrycnt++;
3456                                 goto retry;
3457                         }
3458                 }
3459                 pwp->wp_oprot = 0;
3460                 pwp->wp_prot = 0;
3461         }
3462 }
3463 
3464 /*
3465  * Force a new setup for all the watched pages in the range.
3466  */
3467 static void
3468 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3469 {
3470         struct watched_page *pwp;
3471         struct watched_page tpw;
3472         caddr_t eaddr = addr + size;
3473         caddr_t vaddr;
3474         struct seg *seg;
3475         int err, retrycnt;
3476         uint_t  wprot;
3477         avl_index_t where;
3478 
3479         if (avl_numnodes(&as->a_wpage) == 0)
3480                 return;
3481 
3482         ASSERT(AS_WRITE_HELD(as));
3483 
3484         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3485         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3486                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3487 
3488         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3489                 retrycnt = 0;
3490                 vaddr = pwp->wp_vaddr;
3491 
3492                 wprot = prot;
3493                 if (pwp->wp_read)
3494                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3495                 if (pwp->wp_write)
3496                         wprot &= ~PROT_WRITE;
3497                 if (pwp->wp_exec)
3498                         wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3499                 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3500                 retry:
3501                         seg = as_segat(as, vaddr);
3502                         if (seg == NULL) {
3503                                 panic("as_setwatchprot: no seg");
3504                                 /*NOTREACHED*/
3505                         }
3506                         err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3507                         if (err == IE_RETRY) {
3508                                 ASSERT(retrycnt == 0);
3509                                 retrycnt++;
3510                                 goto retry;
3511                         }
3512                 }
3513                 pwp->wp_oprot = prot;
3514                 pwp->wp_prot = wprot;
3515 
3516                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3517         }
3518 }
3519 
3520 /*
3521  * Clear all of the watched pages in the range.
3522  */
3523 static void
3524 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3525 {
3526         caddr_t eaddr = addr + size;
3527         struct watched_page *pwp;
3528         struct watched_page tpw;
3529         uint_t prot;
3530         struct seg *seg;
3531         int err, retrycnt;
3532         avl_index_t where;
3533 
3534         if (avl_numnodes(&as->a_wpage) == 0)
3535                 return;
3536 
3537         tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3538         if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3539                 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3540 
3541         ASSERT(AS_WRITE_HELD(as));
3542 
3543         while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3544 
3545                 if ((prot = pwp->wp_oprot) != 0) {
3546                         retrycnt = 0;
3547 
3548                         if (prot != pwp->wp_prot) {
3549                         retry:
3550                                 seg = as_segat(as, pwp->wp_vaddr);
3551                                 if (seg == NULL)
3552                                         continue;
3553                                 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3554                                     PAGESIZE, prot);
3555                                 if (err == IE_RETRY) {
3556                                         ASSERT(retrycnt == 0);
3557                                         retrycnt++;
3558                                         goto retry;
3559 
3560                                 }
3561                         }
3562                         pwp->wp_oprot = 0;
3563                         pwp->wp_prot = 0;
3564                 }
3565 
3566                 pwp = AVL_NEXT(&as->a_wpage, pwp);
3567         }
3568 }
3569 
3570 void
3571 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3572 {
3573         struct proc *p;
3574 
3575         mutex_enter(&pidlock);
3576         for (p = practive; p; p = p->p_next) {
3577                 if (p->p_as == as) {
3578                         mutex_enter(&p->p_lock);
3579                         if (p->p_as == as)
3580                                 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3581                         mutex_exit(&p->p_lock);
3582                 }
3583         }
3584         mutex_exit(&pidlock);
3585 }
3586 
3587 /*
3588  * return memory object ID
3589  */
3590 int
3591 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3592 {
3593         struct seg      *seg;
3594         int             sts;
3595 
3596         AS_LOCK_ENTER(as, RW_READER);
3597         seg = as_segat(as, addr);
3598         if (seg == NULL) {
3599                 AS_LOCK_EXIT(as);
3600                 return (EFAULT);
3601         }
3602         /*
3603          * catch old drivers which may not support getmemid
3604          */
3605         if (seg->s_ops->getmemid == NULL) {
3606                 AS_LOCK_EXIT(as);
3607                 return (ENODEV);
3608         }
3609 
3610         sts = SEGOP_GETMEMID(seg, addr, memidp);
3611 
3612         AS_LOCK_EXIT(as);
3613         return (sts);
3614 }