/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2016 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - address spaces.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/sysinfo.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/vtrace.h>
#include <sys/ddi.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_map.h>
#include <vm/seg_spt.h>
#include <vm/page.h>

clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */

ulong_t as_user_seg_limit = 0xffff; /* max segments in a (non-kas) AS */

static struct kmem_cache *as_cache;

static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
static void as_clearwatchprot(struct as *, caddr_t, size_t);
int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * Verifying the segment lists is very time-consuming; it may not always
 * be desirable to define VERIFY_SEGLIST when DEBUG is set.
 */
#ifdef DEBUG
#define VERIFY_SEGLIST
int do_as_verify = 0;
#endif

/*
 * Allocate a new callback data structure entry and fill in the events of
 * interest, the address range of interest, and the callback argument.
 * Link the entry on the as->a_callbacks list. A callback entry for the
 * entire address space may be specified with vaddr = 0 and size = -1.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
int
as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
    caddr_t vaddr, size_t size, int sleepflag)
{
        struct as_callback      *current_head, *cb;
        caddr_t                 saddr;
        size_t                  rsize;

        /* callback function and an event are mandatory */
        if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
                return (EINVAL);

        /* Adding a callback after as_free has been called is not allowed */
        if (as == &kas)
                return (ENOMEM);

        /*
         * vaddr = 0 and size = -1 indicate that the callback range is the
         * entire address space, so no rounding is done in that case.
         */
        if (size != -1) {
                saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
                rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
                    (size_t)saddr;
                /* check for wraparound */
                if (saddr + rsize < saddr)
                        return (ENOMEM);
        } else {
                if (vaddr != 0)
                        return (EINVAL);
                saddr = vaddr;
                rsize = size;
        }

        /* Allocate and initialize a callback entry */
        cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
        if (cb == NULL)
                return (EAGAIN);

        cb->ascb_func = cb_func;
        cb->ascb_arg = arg;
        cb->ascb_events = events;
        cb->ascb_saddr = saddr;
        cb->ascb_len = rsize;

        /* Add the entry to the list */
        mutex_enter(&as->a_contents);
        current_head = as->a_callbacks;
        as->a_callbacks = cb;
        cb->ascb_next = current_head;

        /*
         * A call to this function may lose a race with a pertinent event -
         * e.g. a thread does long-term memory locking, but before the
         * callback is added another thread executes as_unmap.  A broadcast
         * here resolves that.
         */
        if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
                AS_CLRUNMAPWAIT(as);
                cv_broadcast(&as->a_cv);
        }

        mutex_exit(&as->a_contents);
        return (0);
}
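
/*
 * Illustrative sketch (not part of the original source): a driver that
 * keeps pages locked long-term might register for unmap events roughly
 * like this, where my_unlock_func and my_state are hypothetical names:
 *
 *      error = as_add_callback(as, my_unlock_func, my_state,
 *          AS_UNMAP_EVENT | AS_UNMAPWAIT_EVENT, vaddr, size, KM_SLEEP);
 *
 * Passing vaddr = 0 and size = (size_t)-1 instead registers the callback
 * for the entire address space.
 */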

/*
 * Search the callback list for an entry which pertains to arg.
 *
 * This is called from within the client upon completion of the callback.
 * RETURN VALUES:
 *      AS_CALLBACK_DELETED  (callback entry found and deleted)
 *      AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
 *      AS_CALLBACK_DELETE_DEFERRED (callback is in progress; this entry
 *                      will be deleted in as_do_callbacks)
 *
 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
 * set, it indicates that as_do_callbacks is processing this entry.  The
 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
 * to unblock as_do_callbacks, in case it is blocked.
 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g. pages being locked within the as
 * will guarantee persistence).
 */
uint_t
as_delete_callback(struct as *as, void *arg)
{
        struct as_callback **prevcb = &as->a_callbacks;
        struct as_callback *cb;
        uint_t rc = AS_CALLBACK_NOTFOUND;

        mutex_enter(&as->a_contents);
        for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
                if (cb->ascb_arg != arg)
                        continue;

                /*
                 * If the events indicate AS_CALLBACK_CALLED, just clear
                 * AS_ALL_EVENT in the events field and wakeup the thread
                 * that may be waiting in as_do_callbacks.  as_do_callbacks
                 * will take care of removing this entry from the list.  In
                 * that case, return AS_CALLBACK_DELETE_DEFERRED.  Otherwise
                 * (AS_CALLBACK_CALLED not set), just remove it from the
                 * list, return the memory and return AS_CALLBACK_DELETED.
                 */
                if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
                        /* leave AS_CALLBACK_CALLED */
                        cb->ascb_events &= ~AS_ALL_EVENT;
                        rc = AS_CALLBACK_DELETE_DEFERRED;
                        cv_broadcast(&as->a_cv);
                } else {
                        *prevcb = cb->ascb_next;
                        kmem_free(cb, sizeof (struct as_callback));
                        rc = AS_CALLBACK_DELETED;
                }
                break;
        }
        mutex_exit(&as->a_contents);
        return (rc);
}
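
/*
 * Illustrative sketch (not part of the original source): because
 * AS_CALLBACK_DELETE_DEFERRED means as_do_callbacks is still processing
 * the entry, a client tearing down its state might poll until the
 * deferred deletion completes, e.g.:
 *
 *      while (as_delete_callback(as, my_state) ==
 *          AS_CALLBACK_DELETE_DEFERRED)
 *              delay(1);
 *
 * where my_state is the same 'arg' that was passed to as_add_callback.
 */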

/*
 * Searches the as callback list for a matching entry.
 * Returns a pointer to the first matching callback, or NULL if
 * nothing is found.
 * This function never sleeps, so it is ok to call it with locks held
 * beyond the (required) a_contents mutex.
 *
 * See also the comment on as_do_callbacks below.
 */
static struct as_callback *
as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
        struct as_callback      *cb;

        ASSERT(MUTEX_HELD(&as->a_contents));
        for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
                /*
                 * If the callback has not already been called, then
                 * check if events or address range pertains.  An event_len
                 * of zero means do an unconditional callback.
                 */
                if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
                    ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
                    (event_addr + event_len < cb->ascb_saddr) ||
                    (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
                        continue;
                }
                break;
        }
        return (cb);
}

/*
 * Executes a given callback and removes it from the callback list for
 * this address space.
 * This function may sleep, so the caller must drop all locks except
 * a_contents before calling this func.
 *
 * See also the comments on as_do_callbacks below.
 */
static void
as_execute_callback(struct as *as, struct as_callback *cb,
    uint_t events)
{
        struct as_callback **prevcb;
        void    *cb_arg;

        ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
        cb->ascb_events |= AS_CALLBACK_CALLED;
        mutex_exit(&as->a_contents);
        (*cb->ascb_func)(as, cb->ascb_arg, events);
        mutex_enter(&as->a_contents);
        /*
         * The callback function is required to delete the callback
         * when it determines that it is OK for this thread to continue.
         * as_delete_callback clears AS_ALL_EVENT in the events field
         * when the entry is deleted.  If the callback function already
         * called as_delete_callback, events will already be cleared
         * and there will be no blocking.
         */
        while ((cb->ascb_events & events) != 0) {
                cv_wait(&as->a_cv, &as->a_contents);
        }
        /*
         * This entry needs to be taken off the list. Normally, the
         * callback func itself does that, but unfortunately the list
         * may have changed while the callback was running because the
         * a_contents mutex was dropped and someone else other than the
         * callback func itself could have called as_delete_callback,
         * so we have to search to find this entry again.  The entry
         * must have AS_CALLBACK_CALLED, and have the same 'arg'.
         */
        cb_arg = cb->ascb_arg;
        prevcb = &as->a_callbacks;
        for (cb = as->a_callbacks; cb != NULL;
            prevcb = &cb->ascb_next, cb = *prevcb) {
                if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
                    (cb_arg != cb->ascb_arg)) {
                        continue;
                }
                *prevcb = cb->ascb_next;
                kmem_free(cb, sizeof (struct as_callback));
                break;
        }
}

/*
 * Check the callback list for a matching event and intersection of
 * address range. If there is a match, invoke the callback.  Skip an entry if:
 *    - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 *    - no event of interest
 *    - no address range of interest
 *
 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event), so only AS_CALLBACK_CALLED is checked.  The
 * a_contents lock must be dropped before a callback, so only one callback
 * can be done before returning. Return -1 (true) if a callback was
 * executed and removed from the list, else return 0 (false).
 *
 * The logically separate parts, i.e. finding a matching callback and
 * executing a given callback, have been separated into two functions
 * so that they can be called with different sets of locks held beyond
 * the always-required a_contents. as_find_callback does not sleep, so
 * it is ok to call it if more locks than a_contents (i.e. the a_lock
 * rwlock) are held. as_execute_callback on the other hand may sleep,
 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end up comatose.
 */
static int
as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
    size_t event_len)
{
        struct as_callback *cb;

        if ((cb = as_find_callback(as, events, event_addr, event_len))) {
                as_execute_callback(as, cb, events);
                return (-1);
        }
        return (0);
}
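
/*
 * Usage sketch of the above locking discipline: a caller holding only
 * a_contents can drive every registered callback to completion with
 *
 *      mutex_enter(&as->a_contents);
 *      while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
 *              ;
 *      mutex_exit(&as->a_contents);
 *
 * which is exactly the pattern as_free() uses below.
 */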

/*
 * Search for the segment containing addr. If a segment containing addr
 * exists, that segment is returned.  If no such segment exists, and
 * the list spans addresses greater than addr, then the first segment
 * whose base is greater than addr is returned; otherwise, NULL is
 * returned unless tail is true, in which case the last element of the
 * list is returned.
 *
 * a_seglast is used to cache the last found segment for repeated
 * searches to the same addr (which happens frequently).
 */
struct seg *
as_findseg(struct as *as, caddr_t addr, int tail)
{
        struct seg *seg = as->a_seglast;
        avl_index_t where;

        ASSERT(AS_LOCK_HELD(as));

        if (seg != NULL &&
            seg->s_base <= addr &&
            addr < seg->s_base + seg->s_size)
                return (seg);

        seg = avl_find(&as->a_segtree, &addr, &where);
        if (seg != NULL)
                return (as->a_seglast = seg);

        seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
        if (seg == NULL && tail)
                seg = avl_last(&as->a_segtree);
        return (as->a_seglast = seg);
}
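
/*
 * Worked example (illustrative): with segments [0x10000, 0x20000) and
 * [0x30000, 0x40000), as_findseg(as, (caddr_t)0x15000, 0) returns the
 * first segment, as_findseg(as, (caddr_t)0x25000, 0) returns the second
 * (the first base greater than the address), and
 * as_findseg(as, (caddr_t)0x50000, 0) returns NULL; with tail != 0 that
 * last call would instead return the second (last) segment.
 */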

#ifdef VERIFY_SEGLIST
/*
 * verify that the segment list is coherent
 */
static void
as_verify(struct as *as)
{
        struct seg *seg, *seglast, *p, *n;
        uint_t nsegs = 0;

        if (do_as_verify == 0)
                return;

        seglast = as->a_seglast;

        for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
                ASSERT(seg->s_as == as);
                p = AS_SEGPREV(as, seg);
                n = AS_SEGNEXT(as, seg);
                ASSERT(p == NULL || p->s_as == as);
                ASSERT(p == NULL || p->s_base < seg->s_base);
                ASSERT(n == NULL || n->s_base > seg->s_base);
                ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
                if (seg == seglast)
                        seglast = NULL;
                nsegs++;
        }
        ASSERT(seglast == NULL);
        ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
}
#endif /* VERIFY_SEGLIST */

/*
 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use last segment accessed
 * in as_gap() as an insertion point.
 */
int
as_addseg(struct as *as, struct seg *newseg)
{
        struct seg *seg;
        caddr_t addr;
        caddr_t eaddr;
        avl_index_t where;

        ASSERT(AS_WRITE_HELD(as));

        as->a_updatedir = 1; /* inform /proc */
        gethrestime(&as->a_updatetime);

        if (as->a_lastgaphl != NULL) {
                struct seg *hseg = NULL;
                struct seg *lseg = NULL;

                if (as->a_lastgaphl->s_base > newseg->s_base) {
                        hseg = as->a_lastgaphl;
                        lseg = AVL_PREV(&as->a_segtree, hseg);
                } else {
                        lseg = as->a_lastgaphl;
                        hseg = AVL_NEXT(&as->a_segtree, lseg);
                }

                if (hseg && lseg && lseg->s_base < newseg->s_base &&
                    hseg->s_base > newseg->s_base) {
                        avl_insert_here(&as->a_segtree, newseg, lseg,
                            AVL_AFTER);
                        as->a_lastgaphl = NULL;
                        as->a_seglast = newseg;
                        return (0);
                }
                as->a_lastgaphl = NULL;
        }

        addr = newseg->s_base;
        eaddr = addr + newseg->s_size;
again:

        seg = avl_find(&as->a_segtree, &addr, &where);

        if (seg == NULL)
                seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);

        if (seg == NULL)
                seg = avl_last(&as->a_segtree);

        if (seg != NULL) {
                caddr_t base = seg->s_base;

                /*
                 * If top of seg is below the requested address, then
                 * the insertion point is at the end of the linked list,
                 * and seg points to the tail of the list.  Otherwise,
                 * the insertion point is immediately before seg.
                 */
                if (base + seg->s_size > addr) {
                        if (addr >= base || eaddr > base) {
#ifdef __sparc
                                extern struct seg_ops segnf_ops;

                                /*
                                 * no-fault segs must disappear if overlaid.
                                 * XXX need new segment type so
                                 * we don't have to check s_ops
                                 */
                                if (seg->s_ops == &segnf_ops) {
                                        seg_unmap(seg);
                                        goto again;
                                }
#endif
                                return (-1);    /* overlapping segment */
                        }
                }
        }
        as->a_seglast = newseg;
        avl_insert(&as->a_segtree, newseg, where);

#ifdef VERIFY_SEGLIST
        as_verify(as);
#endif
        return (0);
}

struct seg *
as_removeseg(struct as *as, struct seg *seg)
{
        avl_tree_t *t;

        ASSERT(AS_WRITE_HELD(as));

        as->a_updatedir = 1; /* inform /proc */
        gethrestime(&as->a_updatetime);

        if (seg == NULL)
                return (NULL);

        t = &as->a_segtree;
        if (as->a_seglast == seg)
                as->a_seglast = NULL;
        as->a_lastgaphl = NULL;

        /*
         * if this segment is at an address higher than
         * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
         */
        if (as->a_lastgap &&
            (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
                as->a_lastgap = AVL_NEXT(t, seg);

        /*
         * remove the segment from the seg tree
         */
        avl_remove(t, seg);

#ifdef VERIFY_SEGLIST
        as_verify(as);
#endif
        return (seg);
}

/*
 * Find a segment containing addr.
 */
struct seg *
as_segat(struct as *as, caddr_t addr)
{
        struct seg *seg = as->a_seglast;

        ASSERT(AS_LOCK_HELD(as));

        if (seg != NULL && seg->s_base <= addr &&
            addr < seg->s_base + seg->s_size)
                return (seg);

        seg = avl_find(&as->a_segtree, &addr, NULL);
        return (seg);
}

/*
 * Serialize all searches for holes in an address space to
 * prevent two or more threads from allocating the same virtual
 * address range.  The address space must not be "read/write"
 * locked by the caller since we may block.
 */
void
as_rangelock(struct as *as)
{
        mutex_enter(&as->a_contents);
        while (AS_ISCLAIMGAP(as))
                cv_wait(&as->a_cv, &as->a_contents);
        AS_SETCLAIMGAP(as);
        mutex_exit(&as->a_contents);
}

/*
 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
 */
void
as_rangeunlock(struct as *as)
{
        mutex_enter(&as->a_contents);
        AS_CLRCLAIMGAP(as);
        cv_signal(&as->a_cv);
        mutex_exit(&as->a_contents);
}
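
/*
 * Illustrative sketch (not from this file): mapping code typically
 * brackets its hole search and segment creation with this pair so that
 * concurrent mappers cannot claim the same range, roughly:
 *
 *      as_rangelock(as);
 *      map_addr(&addr, len, off, vacalign, flags);
 *      if (addr != NULL)
 *              error = as_map(as, addr, len, crfp, crargs);
 *      as_rangeunlock(as);
 *
 * map_addr(), as_map(), crfp and crargs belong to the surrounding VM
 * code and are shown only to sketch the pattern.
 */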

/*
 * compare segments (or a bare address) by segment address range
 */
static int
as_segcompar(const void *x, const void *y)
{
        struct seg *a = (struct seg *)x;
        struct seg *b = (struct seg *)y;

        if (a->s_base < b->s_base)
                return (-1);
        if (a->s_base >= b->s_base + b->s_size)
                return (1);
        return (0);
}
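
/*
 * Because s_base is the first member of struct seg, the comparator above
 * also works when a pointer to a bare caddr_t is handed to avl_find();
 * the address is treated as a zero-length segment.  as_findseg() and
 * as_segat() rely on this, e.g.:
 *
 *      seg = avl_find(&as->a_segtree, &addr, &where);
 *
 * An address anywhere in a segment's [s_base, s_base + s_size) range
 * compares equal to that segment.
 */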

void
as_avlinit(struct as *as)
{
        avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
            offsetof(struct seg, s_tree));
        avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
            offsetof(struct watched_page, wp_link));
}

/*ARGSUSED*/
static int
as_constructor(void *buf, void *cdrarg, int kmflags)
{
        struct as *as = buf;

        mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
        rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
        as_avlinit(as);
        return (0);
}

/*ARGSUSED1*/
static void
as_destructor(void *buf, void *cdrarg)
{
        struct as *as = buf;

        avl_destroy(&as->a_segtree);
        mutex_destroy(&as->a_contents);
        cv_destroy(&as->a_cv);
        rw_destroy(&as->a_lock);
}

void
as_init(void)
{
        as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
            as_constructor, as_destructor, NULL, NULL, NULL, 0);
}

/*
 * Allocate and initialize an address space data structure.
 * We call hat_alloc to allow any machine dependent
 * information in the hat structure to be initialized.
 */
struct as *
as_alloc(void)
{
        struct as *as;

        as = kmem_cache_alloc(as_cache, KM_SLEEP);

        as->a_flags      = 0;
        as->a_vbits      = 0;
        as->a_hrm        = NULL;
        as->a_seglast    = NULL;
        as->a_size       = 0;
        as->a_resvsize   = 0;
        as->a_updatedir  = 0;
        gethrestime(&as->a_updatetime);
        as->a_objectdir  = NULL;
        as->a_sizedir    = 0;
        as->a_userlimit  = (caddr_t)USERLIMIT;
        as->a_lastgap    = NULL;
        as->a_lastgaphl  = NULL;
        as->a_callbacks  = NULL;
        as->a_proc       = NULL;

        AS_LOCK_ENTER(as, RW_WRITER);
        as->a_hat = hat_alloc(as);      /* create hat for default system mmu */
        AS_LOCK_EXIT(as);

        return (as);
}
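
/*
 * Illustrative sketch (not from this file): creating a fresh address
 * space for a process amounts to something like
 *
 *      p->p_as = as_alloc();
 *      p->p_as->a_proc = p;
 *
 * with the hat already created by as_alloc() itself; callers then
 * populate the space via as_map() and friends.
 */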

/*
 * Free an address space data structure.
 * Need to free the hat first and then
 * all the segments on this as and finally
 * the space for the as struct itself.
 */
void
as_free(struct as *as)
{
        struct hat *hat = as->a_hat;
        struct seg *seg, *next;
        boolean_t free_started = B_FALSE;

top:
        /*
         * Invoke ALL callbacks. as_do_callbacks will do one callback
         * per call, and not return (-1) until the callback has completed.
         * When as_do_callbacks returns zero, all callbacks have completed.
         */
        mutex_enter(&as->a_contents);
        while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
                ;

        mutex_exit(&as->a_contents);
        AS_LOCK_ENTER(as, RW_WRITER);

        if (!free_started) {
                free_started = B_TRUE;
                hat_free_start(hat);
        }
        for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
                int err;

                next = AS_SEGNEXT(as, seg);
retry:
                err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
                if (err == EAGAIN) {
                        mutex_enter(&as->a_contents);
                        if (as->a_callbacks) {
                                AS_LOCK_EXIT(as);
                        } else if (!AS_ISNOUNMAPWAIT(as)) {
                                /*
                                 * Memory is currently locked. Wait for a
                                 * cv_signal that it has been unlocked, then
                                 * try the operation again.
                                 */
                                if (AS_ISUNMAPWAIT(as) == 0)
                                        cv_broadcast(&as->a_cv);
                                AS_SETUNMAPWAIT(as);
                                AS_LOCK_EXIT(as);
                                while (AS_ISUNMAPWAIT(as))
                                        cv_wait(&as->a_cv, &as->a_contents);
                        } else {
                                /*
                                 * We may have raced with
                                 * segvn_reclaim()/segspt_reclaim(). In this
                                 * case clean nounmapwait flag and retry since
                                 * softlockcnt in this segment may be already
                                 * 0.  We don't drop as writer lock so our
                                 * number of retries without sleeping should
                                 * be very small. See segvn_reclaim() for
                                 * more comments.
                                 */
                                AS_CLRNOUNMAPWAIT(as);
                                mutex_exit(&as->a_contents);
                                goto retry;
                        }
                        mutex_exit(&as->a_contents);
                        goto top;
                } else {
                        /*
                         * We do not expect any other error return at this
                         * time. This is similar to an ASSERT in seg_unmap().
                         */
                        ASSERT(err == 0);
                }
        }
        hat_free_end(hat);
        AS_LOCK_EXIT(as);

        /* /proc stuff */
        ASSERT(avl_numnodes(&as->a_wpage) == 0);
        if (as->a_objectdir) {
                kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
                as->a_objectdir = NULL;
                as->a_sizedir = 0;
        }

        /*
         * Free the struct as back to kmem.  Assert it has no segments.
         */
        ASSERT(avl_numnodes(&as->a_segtree) == 0);
        kmem_cache_free(as_cache, as);
}

int
as_dup(struct as *as, struct proc *forkedproc)
{
        struct as *newas;
        struct seg *seg, *newseg;
        size_t  purgesize = 0;
        int error;

        AS_LOCK_ENTER(as, RW_WRITER);
        as_clearwatch(as);
        newas = as_alloc();
        newas->a_userlimit = as->a_userlimit;
        newas->a_proc = forkedproc;

        AS_LOCK_ENTER(newas, RW_WRITER);

        (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);

        for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {

                if (seg->s_flags & S_PURGE) {
                        purgesize += seg->s_size;
                        continue;
                }

                newseg = seg_alloc(newas, seg->s_base, seg->s_size);
                if (newseg == NULL) {
                        AS_LOCK_EXIT(newas);
                        as_setwatch(as);
                        AS_LOCK_EXIT(as);
                        as_free(newas);
                        return (-1);
                }
                if ((error = SEGOP_DUP(seg, newseg)) != 0) {
                        /*
                         * We call seg_free() on the new seg
                         * because the segment is not set up
                         * completely; i.e. it has no ops.
                         */
                        as_setwatch(as);
                        AS_LOCK_EXIT(as);
                        seg_free(newseg);
                        AS_LOCK_EXIT(newas);
                        as_free(newas);
                        return (error);
                }
                newas->a_size += seg->s_size;
        }
        newas->a_resvsize = as->a_resvsize - purgesize;

        error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);

        AS_LOCK_EXIT(newas);

        as_setwatch(as);
        AS_LOCK_EXIT(as);
        if (error != 0) {
                as_free(newas);
                return (error);
        }
        forkedproc->p_as = newas;
        return (0);
}
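
/*
 * Illustrative sketch (not from this file): fork uses as_dup() to give
 * the child a copy of the parent's address space, roughly
 *
 *      if (as_dup(p->p_as, cp) != 0) {
 *              (handle the failed fork)
 *      }
 *
 * On success, as_dup() has already set cp->p_as for the child process.
 */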

/*
 * Handle a ``fault'' at addr for size bytes.
 */
faultcode_t
as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
    enum fault_type type, enum seg_rw rw)
{
        struct seg *seg;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        size_t ssize;
        faultcode_t res = 0;
        caddr_t addrsav;
        struct seg *segsav;
        int as_lock_held;
        klwp_t *lwp = ttolwp(curthread);
        zone_t *zonep = curzone;

retry:
        /*
         * Indicate that the lwp is not to be stopped while waiting for a
         * pagefault.  This is to avoid deadlock while debugging a process
         * via /proc over NFS (in particular).
         */
        if (lwp != NULL)
                lwp->lwp_nostop++;

        /*
         * The same length must be used when we softlock and softunlock.  We
         * don't support softunlocking lengths less than the original length
         * when there is largepage support.  See seg_dev.c for more
         * comments.
         */
        switch (type) {

        case F_SOFTLOCK:
                CPU_STATS_ADD_K(vm, softlock, 1);
                break;

        case F_SOFTUNLOCK:
                break;

        case F_PROT:
                CPU_STATS_ADD_K(vm, prot_fault, 1);
                break;

        case F_INVAL:
                CPU_STATS_ENTER_K();
                CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
                if (as == &kas)
                        CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
                CPU_STATS_EXIT_K();
                if (zonep->zone_pg_flt_delay != 0) {
                        /*
                         * The zone in which this process is running is
                         * currently over its physical memory cap. Throttle
                         * page faults to help the user-land memory capper
                         * catch up. Note that drv_usectohz() rounds up.
                         */
                        atomic_add_64(&zonep->zone_pf_throttle, 1);
                        atomic_add_64(&zonep->zone_pf_throttle_usec,
                            zonep->zone_pg_flt_delay);
                        if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) {
                                drv_usecwait(zonep->zone_pg_flt_delay);
                        } else {
                                delay(drv_usectohz(zonep->zone_pg_flt_delay));
                        }
                }
                break;
        }

        /* Kernel probe */
        TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
            tnf_opaque, address,        addr,
            tnf_fault_type,     fault_type,     type,
            tnf_seg_access,     access,         rw);

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        /*
         * XXX -- Don't grab the as lock for segkmap. We should grab it for
         * correctness, but then we could be stuck holding this lock for
         * a LONG time if the fault needs to be resolved on a slow
         * filesystem, and then no-one will be able to exec new commands,
         * as exec'ing requires the write lock on the as.
         */
        if (as == &kas && segkmap && segkmap->s_base <= raddr &&
            raddr + size < segkmap->s_base + segkmap->s_size) {
                seg = segkmap;
                as_lock_held = 0;
        } else {
                AS_LOCK_ENTER(as, RW_READER);

                seg = as_segat(as, raddr);
                if (seg == NULL) {
                        AS_LOCK_EXIT(as);
                        if (lwp != NULL)
                                lwp->lwp_nostop--;
                        return (FC_NOMAP);
                }

                as_lock_held = 1;
        }

        addrsav = raddr;
        segsav = seg;

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                res = FC_NOMAP;
                                break;
                        }
                }
                if (raddr + rsize > seg->s_base + seg->s_size)
                        ssize = seg->s_base + seg->s_size - raddr;
                else
                        ssize = rsize;

                res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
                if (res != 0)
                        break;
        }

        /*
         * If we were SOFTLOCKing and encountered a failure,
         * we must SOFTUNLOCK the range we already did. (Maybe we
         * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
         * right here...)
         */
        if (res != 0 && type == F_SOFTLOCK) {
                for (seg = segsav; addrsav < raddr; addrsav += ssize) {
                        if (addrsav >= seg->s_base + seg->s_size)
                                seg = AS_SEGNEXT(as, seg);
                        ASSERT(seg != NULL);
                        /*
                         * Now call the fault routine again to perform the
                         * unlock using S_OTHER instead of the rw variable
                         * since we never got a chance to touch the pages.
                         */
                        if (raddr > seg->s_base + seg->s_size)
                                ssize = seg->s_base + seg->s_size - addrsav;
                        else
                                ssize = raddr - addrsav;
                        (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
                            F_SOFTUNLOCK, S_OTHER);
                }
        }
        if (as_lock_held)
                AS_LOCK_EXIT(as);
        if (lwp != NULL)
                lwp->lwp_nostop--;

        /*
         * If the lower levels returned EDEADLK for a fault,
         * it means that we should retry the fault.  Let's also wait
         * a bit to let the deadlock-causing condition clear.
         * This is part of a gross hack to work around a design flaw
         * in the ufs/sds logging code and should go away when the
         * logging code is re-designed to fix the problem. See bug
         * 4125102 for details of the problem.
         */
        if (FC_ERRNO(res) == EDEADLK) {
                delay(deadlk_wait);
                res = 0;
                goto retry;
        }
        return (res);
}
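
/*
 * Worked example of the page-rounding arithmetic used above (and in
 * several routines below), assuming a 4K page (PAGESIZE == 0x1000):
 * for addr == 0x12345 and size == 0x100,
 *
 *      raddr = 0x12345 & PAGEMASK                            = 0x12000
 *      rsize = ((0x12445 + PAGEOFFSET) & PAGEMASK) - 0x12000 = 0x1000
 *
 * so the fault covers the single page [0x12000, 0x13000) containing
 * the requested range.
 */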
/*
 * Asynchronous ``fault'' at addr for size bytes.
 */
faultcode_t
as_faulta(struct as *as, caddr_t addr, size_t size)
{
        struct seg *seg;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        faultcode_t res = 0;
        klwp_t *lwp = ttolwp(curthread);

retry:
        /*
         * Indicate that the lwp is not to be stopped while waiting
         * for a pagefault.  This is to avoid deadlock while debugging
         * a process via /proc over NFS (in particular).
         */
        if (lwp != NULL)
                lwp->lwp_nostop++;

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                AS_LOCK_EXIT(as);
                if (lwp != NULL)
                        lwp->lwp_nostop--;
                return (FC_NOMAP);
        }

        for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                res = FC_NOMAP;
                                break;
                        }
                }
                res = SEGOP_FAULTA(seg, raddr);
                if (res != 0)
                        break;
        }
        AS_LOCK_EXIT(as);
        if (lwp != NULL)
                lwp->lwp_nostop--;
        /*
         * If the lower levels returned EDEADLK for a fault,
         * it means that we should retry the fault.  Let's also wait
         * a bit to let the deadlock-causing condition clear.
         * This is part of a gross hack to work around a design flaw
         * in the ufs/sds logging code and should go away when the
         * logging code is re-designed to fix the problem. See bug
         * 4125102 for details of the problem.
         */
        if (FC_ERRNO(res) == EDEADLK) {
                delay(deadlk_wait);
                res = 0;
                goto retry;
        }
        return (res);
}

/*
 * Set the virtual mapping for the interval from [addr : addr + size)
 * in address space `as' to have the specified protection.
 * It is ok for the range to cross over several segments,
 * as long as they are contiguous.
 */
int
as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
        struct seg *seg;
        struct as_callback *cb;
        size_t ssize;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        int error = 0, writer = 0;
        caddr_t saveraddr;
        size_t saversize;

setprot_top:
        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        if (raddr + rsize < raddr)              /* check for wraparound */
                return (ENOMEM);

        saveraddr = raddr;
        saversize = rsize;

        /*
         * Normally we only lock the as as a reader. But if, due to setprot,
         * the segment driver needs to split a segment, it will return
         * IE_RETRY. Therefore we re-acquire the as lock as a writer so the
         * segment driver can change the seg list. The segment driver will
         * also return IE_RETRY after it has changed the segment list, so we
         * keep locking as a writer thereafter. Since these operations should
         * be rare, we want to lock as a writer only when necessary.
         */
        if (writer || avl_numnodes(&as->a_wpage) != 0) {
                AS_LOCK_ENTER(as, RW_WRITER);
        } else {
                AS_LOCK_ENTER(as, RW_READER);
        }

        as_clearwatchprot(as, raddr, rsize);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                as_setwatch(as);
                AS_LOCK_EXIT(as);
                return (ENOMEM);
        }

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                error = ENOMEM;
                                break;
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size))
                        ssize = seg->s_base + seg->s_size - raddr;
                else
                        ssize = rsize;
retry:
                error = SEGOP_SETPROT(seg, raddr, ssize, prot);

                if (error == IE_NOMEM) {
                        error = EAGAIN;
                        break;
                }

                if (error == IE_RETRY) {
                        AS_LOCK_EXIT(as);
                        writer = 1;
                        goto setprot_top;
                }

                if (error == EAGAIN) {
                        /*
                         * Make sure we have a_lock as writer.
                         */
                        if (writer == 0) {
                                AS_LOCK_EXIT(as);
                                writer = 1;
                                goto setprot_top;
                        }

                        /*
                         * Memory is currently locked.  It must be unlocked
                         * before this operation can succeed through a retry.
                         * The possible reasons for locked memory and
                         * corresponding strategies for unlocking are:
                         * (1) Normal I/O
                         *      wait for a signal that the I/O operation
                         *      has completed and the memory is unlocked.
                         * (2) Asynchronous I/O
                         *      The aio subsystem does not unlock pages when
                         *      the I/O is completed. Those pages are unlocked
                         *      when the application calls aiowait/aioerror.
                         *      So, to prevent blocking forever, cv_broadcast()
                         *      is done to wake up aio_cleanup_thread.
                         *      Subsequently, segvn_reclaim will be called, and
                         *      that will do AS_CLRUNMAPWAIT() and wake us up.
                         * (3) Long term page locking:
                         *      Drivers intending to have pages locked for a
                         *      period considerably longer than for normal I/O
                         *      (essentially forever) may have registered for a
                         *      callback so they may unlock these pages on
                         *      request. This is needed to allow this operation
                         *      to succeed. Each entry on the callback list is
                         *      examined. If the event or address range pertains
                         *      the callback is invoked (unless it already is in
                         *      progress). The a_contents lock must be dropped
                         *      before the callback, so only one callback can
                         *      be done at a time. Go to the top and do more
                         *      until zero is returned. If zero is returned,
                         *      either there were no callbacks for this event
                         *      or they were already in progress.
                         */
                        mutex_enter(&as->a_contents);
                        if (as->a_callbacks &&
                            (cb = as_find_callback(as, AS_SETPROT_EVENT,
                            seg->s_base, seg->s_size))) {
                                AS_LOCK_EXIT(as);
                                as_execute_callback(as, cb, AS_SETPROT_EVENT);
                        } else if (!AS_ISNOUNMAPWAIT(as)) {
                                if (AS_ISUNMAPWAIT(as) == 0)
                                        cv_broadcast(&as->a_cv);
                                AS_SETUNMAPWAIT(as);
                                AS_LOCK_EXIT(as);
                                while (AS_ISUNMAPWAIT(as))
                                        cv_wait(&as->a_cv, &as->a_contents);
                        } else {
                                /*
                                 * We may have raced with
                                 * segvn_reclaim()/segspt_reclaim(). In this
                                 * case clean nounmapwait flag and retry since
                                 * softlockcnt in this segment may be already
                                 * 0.  We don't drop as writer lock so our
                                 * number of retries without sleeping should
                                 * be very small. See segvn_reclaim() for
                                 * more comments.
                                 */
                                AS_CLRNOUNMAPWAIT(as);
                                mutex_exit(&as->a_contents);
                                goto retry;
                        }
                        mutex_exit(&as->a_contents);
                        goto setprot_top;
                } else if (error != 0)
                        break;
        }
        if (error != 0) {
                as_setwatch(as);
        } else {
                as_setwatchprot(as, saveraddr, saversize, prot);
        }
        AS_LOCK_EXIT(as);
        return (error);
}
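
/*
 * Illustrative sketch (not from this file): as_setprot() is the engine
 * behind mprotect(2)-style requests; a caller looks roughly like
 *
 *      error = as_setprot(p->p_as, (caddr_t)addr, len, uprot | PROT_USER);
 *
 * where uprot is the user-supplied protection and PROT_USER marks the
 * mapping as user-accessible.
 */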

/*
 * Check to make sure that the interval [addr, addr + size)
 * in address space `as' has at least the specified protection.
 * It is ok for the range to cross over several segments, as long
 * as they are contiguous.
 */
int
as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
        struct seg *seg;
        size_t ssize;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        int error = 0;

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        if (raddr + rsize < raddr)              /* check for wraparound */
                return (ENOMEM);

        /*
         * This is ugly as sin...
         * Normally, we only acquire the address space readers lock.
         * However, if the address space has watchpoints present,
         * we must acquire the writer lock on the address space for
         * the benefit of as_clearwatchprot() and as_setwatchprot().
         */
        if (avl_numnodes(&as->a_wpage) != 0)
                AS_LOCK_ENTER(as, RW_WRITER);
        else
                AS_LOCK_ENTER(as, RW_READER);
        as_clearwatchprot(as, raddr, rsize);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                as_setwatch(as);
                AS_LOCK_EXIT(as);
                return (ENOMEM);
        }

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                error = ENOMEM;
                                break;
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size))
                        ssize = seg->s_base + seg->s_size - raddr;
                else
                        ssize = rsize;

                error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
                if (error != 0)
                        break;
        }
        as_setwatch(as);
        AS_LOCK_EXIT(as);
        return (error);
}
1306 
1307 int
1308 as_unmap(struct as *as, caddr_t addr, size_t size)
1309 {
1310         struct seg *seg, *seg_next;
1311         struct as_callback *cb;
1312         caddr_t raddr, eaddr;
1313         size_t ssize, rsize = 0;
1314         int err;
1315 
1316 top:
1317         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1318         eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1319             (uintptr_t)PAGEMASK);
1320 
1321         AS_LOCK_ENTER(as, RW_WRITER);
1322 
1323         as->a_updatedir = 1; /* inform /proc */
1324         gethrestime(&as->a_updatetime);
1325 
1326         /*
1327          * Use as_findseg to find the first segment in the range, then
1328          * step through the segments in order, following s_next.
1329          */
1330         as_clearwatchprot(as, raddr, eaddr - raddr);
1331 
1332         for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1333                 if (eaddr <= seg->s_base)
1334                         break;          /* eaddr was in a gap; all done */
1335 
1336                 /* this is implied by the test above */
1337                 ASSERT(raddr < eaddr);
1338 
1339                 if (raddr < seg->s_base)
1340                         raddr = seg->s_base;         /* raddr was in a gap */
1341 
1342                 if (eaddr > (seg->s_base + seg->s_size))
1343                         ssize = seg->s_base + seg->s_size - raddr;
1344                 else
1345                         ssize = eaddr - raddr;
1346 
1347                 /*
1348                  * Save next segment pointer since seg can be
1349                  * destroyed during the segment unmap operation.
1350                  */
1351                 seg_next = AS_SEGNEXT(as, seg);
1352 
1353                 /*
1354                  * We didn't count /dev/null mappings, so ignore them here.
1355                  * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1356                  * we have to do this check here while we have seg.)
1357                  */
1358                 rsize = 0;
1359                 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1360                     !SEG_IS_PARTIAL_RESV(seg))
1361                         rsize = ssize;
1362 
1363 retry:
1364                 err = SEGOP_UNMAP(seg, raddr, ssize);
1365                 if (err == EAGAIN) {
1366                         /*
1367                          * Memory is currently locked.  It must be unlocked
1368                          * before this operation can succeed through a retry.
1369                          * The possible reasons for locked memory and
1370                          * corresponding strategies for unlocking are:
1371                          * (1) Normal I/O
1372                          *      wait for a signal that the I/O operation
1373                          *      has completed and the memory is unlocked.
1374                          * (2) Asynchronous I/O
1375                          *      The aio subsystem does not unlock pages when
1376                          *      the I/O is completed. Those pages are unlocked
1377                          *      when the application calls aiowait/aioerror.
1378                          *      So, to prevent blocking forever, cv_broadcast()
1379                          *      is done to wake up aio_cleanup_thread.
1380                          *      Subsequently, segvn_reclaim will be called, and
1381                          *      that will do AS_CLRUNMAPWAIT() and wake us up.
1382                          * (3) Long term page locking:
1383                          *      Drivers intending to have pages locked for a
1384                          *      period considerably longer than for normal I/O
1385                          *      (essentially forever) may have registered for a
1386                          *      callback so they may unlock these pages on
1387                          *      request. This is needed to allow this operation
1388                          *      to succeed. Each entry on the callback list is
                         * examined. If the event or address range pertains,
                         * the callback is invoked (unless it is already in
                         * progress). The a_contents lock must be dropped
                         * before the callback, so only one callback can
                         * be done at a time. Go to the top and do more
                         * until as_find_callback() returns NULL. At that
                         * point, either there were no callbacks for this
                         * event or they were already in progress.
1397                          */
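                        /*
                         * Illustration only (hypothetical driver names): a
                         * driver participating in (3) above would have
                         * registered its unlock callback with something like
                         *
                         *      (void) as_add_callback(as, xx_unlock_cb,
                         *          xx_arg, AS_UNMAP_EVENT, vaddr, size,
                         *          KM_SLEEP);
                         *
                         * so that the as_find_callback()/as_execute_callback()
                         * pair below can ask it to drop its long-term locks.
                         */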
1398                         mutex_enter(&as->a_contents);
1399                         if (as->a_callbacks &&
1400                             (cb = as_find_callback(as, AS_UNMAP_EVENT,
1401                             seg->s_base, seg->s_size))) {
1402                                 AS_LOCK_EXIT(as);
1403                                 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1404                         } else if (!AS_ISNOUNMAPWAIT(as)) {
1405                                 if (AS_ISUNMAPWAIT(as) == 0)
1406                                         cv_broadcast(&as->a_cv);
1407                                 AS_SETUNMAPWAIT(as);
1408                                 AS_LOCK_EXIT(as);
1409                                 while (AS_ISUNMAPWAIT(as))
1410                                         cv_wait(&as->a_cv, &as->a_contents);
1411                         } else {
1412                                 /*
1413                                  * We may have raced with
1414                                  * segvn_reclaim()/segspt_reclaim(). In this
1415                                  * case clean nounmapwait flag and retry since
1416                                  * softlockcnt in this segment may be already
1417                                  * 0.  We don't drop as writer lock so our
1418                                  * number of retries without sleeping should
1419                                  * be very small. See segvn_reclaim() for
1420                                  * more comments.
1421                                  */
1422                                 AS_CLRNOUNMAPWAIT(as);
1423                                 mutex_exit(&as->a_contents);
1424                                 goto retry;
1425                         }
1426                         mutex_exit(&as->a_contents);
1427                         goto top;
1428                 } else if (err == IE_RETRY) {
1429                         AS_LOCK_EXIT(as);
1430                         goto top;
1431                 } else if (err) {
1432                         as_setwatch(as);
1433                         AS_LOCK_EXIT(as);
1434                         return (-1);
1435                 }
1436 
1437                 as->a_size -= ssize;
1438                 if (rsize)
1439                         as->a_resvsize -= rsize;
1440                 raddr += ssize;
1441         }
1442         AS_LOCK_EXIT(as);
1443         return (0);
1444 }
1445 
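/*
 * Carve [addr, addr + size) into segvn segments so that each chunk can be
 * mapped with the largest page size its alignment allows.  szcvec is a bit
 * vector of usable page size codes: bit i set means page_get_pagesize(i)
 * may be used.  Worked example (illustrative; assumes sun4u-style sizes
 * where szc 0/3 are 8K/4M):
 *
 *	szcvec = 0x9, addr = 0x30000, size = 0x800000 (eaddr = 0x830000)
 *
 * The first loop below creates a szc 0 segment [0x30000, 0x400000) to reach
 * the first 4M boundary; the second loop creates a szc 3 segment
 * [0x400000, 0x800000) and finishes with a szc 0 segment
 * [0x800000, 0x830000) for the tail below eaddr.
 */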
1446 static int
1447 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1448     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1449 {
1450         uint_t szc;
1451         uint_t nszc;
1452         int error;
1453         caddr_t a;
1454         caddr_t eaddr;
1455         size_t segsize;
1456         struct seg *seg;
1457         size_t pgsz;
1458         int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1459         uint_t save_szcvec;
1460 
1461         ASSERT(AS_WRITE_HELD(as));
1462         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1463         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1464         ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1465         if (!do_off) {
1466                 vn_a->offset = 0;
1467         }
1468 
1469         if (szcvec <= 1) {
1470                 seg = seg_alloc(as, addr, size);
1471                 if (seg == NULL) {
1472                         return (ENOMEM);
1473                 }
1474                 vn_a->szc = 0;
1475                 error = (*crfp)(seg, vn_a);
1476                 if (error != 0) {
1477                         seg_free(seg);
1478                 } else {
1479                         as->a_size += size;
1480                         as->a_resvsize += size;
1481                 }
1482                 return (error);
1483         }
1484 
1485         eaddr = addr + size;
1486         save_szcvec = szcvec;
1487         szcvec >>= 1;
1488         szc = 0;
1489         nszc = 0;
1490         while (szcvec) {
1491                 if ((szcvec & 0x1) == 0) {
1492                         nszc++;
1493                         szcvec >>= 1;
1494                         continue;
1495                 }
1496                 nszc++;
1497                 pgsz = page_get_pagesize(nszc);
1498                 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1499                 if (a != addr) {
1500                         ASSERT(a < eaddr);
1501                         segsize = a - addr;
1502                         seg = seg_alloc(as, addr, segsize);
1503                         if (seg == NULL) {
1504                                 return (ENOMEM);
1505                         }
1506                         vn_a->szc = szc;
1507                         error = (*crfp)(seg, vn_a);
1508                         if (error != 0) {
1509                                 seg_free(seg);
1510                                 return (error);
1511                         }
1512                         as->a_size += segsize;
1513                         as->a_resvsize += segsize;
1514                         *segcreated = 1;
1515                         if (do_off) {
1516                                 vn_a->offset += segsize;
1517                         }
1518                         addr = a;
1519                 }
1520                 szc = nszc;
1521                 szcvec >>= 1;
1522         }
1523 
1524         ASSERT(addr < eaddr);
1525         szcvec = save_szcvec | 1; /* add 8K pages */
1526         while (szcvec) {
1527                 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1528                 ASSERT(a >= addr);
1529                 if (a != addr) {
1530                         segsize = a - addr;
1531                         seg = seg_alloc(as, addr, segsize);
1532                         if (seg == NULL) {
1533                                 return (ENOMEM);
1534                         }
1535                         vn_a->szc = szc;
1536                         error = (*crfp)(seg, vn_a);
1537                         if (error != 0) {
1538                                 seg_free(seg);
1539                                 return (error);
1540                         }
1541                         as->a_size += segsize;
1542                         as->a_resvsize += segsize;
1543                         *segcreated = 1;
1544                         if (do_off) {
1545                                 vn_a->offset += segsize;
1546                         }
1547                         addr = a;
1548                 }
1549                 szcvec &= ~(1 << szc);
1550                 if (szcvec) {
1551                         szc = highbit(szcvec) - 1;
1552                         pgsz = page_get_pagesize(szc);
1553                 }
1554         }
1555         ASSERT(addr == eaddr);
1556 
1557         return (0);
1558 }
1559 
1560 static int
1561 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1562     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1563 {
1564         uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1565         int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1566         uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1567             type, 0);
1568         int error;
1569         struct seg *seg;
1570         struct vattr va;
1571         u_offset_t eoff;
1572         size_t save_size = 0;
1573         extern size_t textrepl_size_thresh;
1574 
1575         ASSERT(AS_WRITE_HELD(as));
1576         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1577         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1578         ASSERT(vn_a->vp != NULL);
1579         ASSERT(vn_a->amp == NULL);
1580 
1581 again:
1582         if (szcvec <= 1) {
1583                 seg = seg_alloc(as, addr, size);
1584                 if (seg == NULL) {
1585                         return (ENOMEM);
1586                 }
1587                 vn_a->szc = 0;
1588                 error = (*crfp)(seg, vn_a);
1589                 if (error != 0) {
1590                         seg_free(seg);
1591                 } else {
1592                         as->a_size += size;
1593                         as->a_resvsize += size;
1594                 }
1595                 return (error);
1596         }
1597 
1598         va.va_mask = AT_SIZE;
1599         if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1600                 szcvec = 0;
1601                 goto again;
1602         }
1603         eoff = vn_a->offset & PAGEMASK;
1604         if (eoff >= va.va_size) {
1605                 szcvec = 0;
1606                 goto again;
1607         }
1608         eoff += size;
1609         if (btopr(va.va_size) < btopr(eoff)) {
1610                 save_size = size;
1611                 size = va.va_size - (vn_a->offset & PAGEMASK);
1612                 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1613                 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1614                     type, 0);
1615                 if (szcvec <= 1) {
1616                         size = save_size;
1617                         goto again;
1618                 }
1619         }
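        /*
         * Illustration (assuming 8K base pages): mapping 4M of a 100K
         * file from offset 0 clips size here to 104K (the page-rounded
         * va_size), which is then mapped with whatever large pages still
         * fit (or the whole request falls back to base pages if none do);
         * the save_size path at the bottom then maps the remainder beyond
         * EOF with szcvec = 0, i.e. base pages only.
         */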
1620 
1621         if (size > textrepl_size_thresh) {
1622                 vn_a->flags |= _MAP_TEXTREPL;
1623         }
1624         error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1625             segcreated);
1626         if (error != 0) {
1627                 return (error);
1628         }
1629         if (save_size) {
1630                 addr += size;
1631                 size = save_size - size;
1632                 szcvec = 0;
1633                 goto again;
1634         }
1635         return (0);
1636 }
1637 
1638 /*
1639  * as_map_ansegs: shared or private anonymous memory.  Note that the flags
 * passed to map_pgszcvec() cannot be MAP_INITDATA, for anon.
1641  */
1642 static int
1643 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1644     int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1645 {
1646         uint_t szcvec;
1647         uchar_t type;
1648 
1649         ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1650         if (vn_a->type == MAP_SHARED) {
1651                 type = MAPPGSZC_SHM;
1652         } else if (vn_a->type == MAP_PRIVATE) {
1653                 if (vn_a->szc == AS_MAP_HEAP) {
1654                         type = MAPPGSZC_HEAP;
1655                 } else if (vn_a->szc == AS_MAP_STACK) {
1656                         type = MAPPGSZC_STACK;
1657                 } else {
1658                         type = MAPPGSZC_PRIVM;
1659                 }
1660         }
1661         szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1662             (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1663             (vn_a->flags & MAP_TEXT), type, 0);
1664         ASSERT(AS_WRITE_HELD(as));
1665         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1666         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1667         ASSERT(vn_a->vp == NULL);
1668 
1669         return (as_map_segvn_segs(as, addr, size, szcvec,
1670             crfp, vn_a, segcreated));
1671 }
1672 
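/*
 * Map a segment into the address space.  The address space lock is
 * acquired here and dropped by as_map_locked() on every return path.
 */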
1673 int
1674 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1675 {
1676         AS_LOCK_ENTER(as, RW_WRITER);
1677         return (as_map_locked(as, addr, size, crfp, argsp));
1678 }
1679 
1680 int
1681 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1682     void *argsp)
1683 {
1684         struct seg *seg = NULL;
1685         caddr_t raddr;                  /* rounded down addr */
1686         size_t rsize;                   /* rounded up size */
1687         int error;
1688         int unmap = 0;
1689         /*
1690          * The use of a_proc is preferred to handle the case where curproc is
1691          * a door_call server and is allocating memory in the client's (a_proc)
1692          * address space.
         * When creating a shared memory segment, a_proc will be NULL, so we
         * fall back to curproc in that case.
1695          */
1696         struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1697         struct segvn_crargs crargs;
1698 
1699         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1700         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1701             (size_t)raddr;
1702 
1703         /*
1704          * check for wrap around
1705          */
1706         if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1707                 AS_LOCK_EXIT(as);
1708                 return (ENOMEM);
1709         }
1710 
1711         as->a_updatedir = 1; /* inform /proc */
1712         gethrestime(&as->a_updatetime);
1713 
1714         if (as != &kas) {
1715                 if (as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1716                         AS_LOCK_EXIT(as);
1717 
1718                         (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
1719                             p->p_rctls, p, RCA_UNSAFE_ALL);
1720                         return (ENOMEM);
1721                 }
1722 
1723                 /*
1724                  * Keep the number of segments in a userspace AS constrained to
1725                  * a reasonable limit.  Linux enforces a value slightly less
1726                  * than 64k in order to avoid ELF limits if/when a process
1727                  * dumps core.  While SunOS avoids that specific problem with
1728                  * other tricks, the limit is still valuable to keep kernel
1729                  * memory consumption in check.
1730                  */
1731                 if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) {
1732                         AS_LOCK_EXIT(as);
1733                         atomic_inc_32(&p->p_zone->zone_mfseglim);
1734                         return (ENOMEM);
1735                 }
1736         }
1737 
1738         if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1739                 crargs = *(struct segvn_crargs *)argsp;
1740                 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1741                 if (error != 0) {
1742                         AS_LOCK_EXIT(as);
1743                         if (unmap) {
1744                                 (void) as_unmap(as, addr, size);
1745                         }
1746                         return (error);
1747                 }
1748         } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1749                 crargs = *(struct segvn_crargs *)argsp;
1750                 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1751                 if (error != 0) {
1752                         AS_LOCK_EXIT(as);
1753                         if (unmap) {
1754                                 (void) as_unmap(as, addr, size);
1755                         }
1756                         return (error);
1757                 }
1758         } else {
1759                 seg = seg_alloc(as, addr, size);
1760                 if (seg == NULL) {
1761                         AS_LOCK_EXIT(as);
1762                         return (ENOMEM);
1763                 }
1764 
1765                 error = (*crfp)(seg, argsp);
1766                 if (error != 0) {
1767                         seg_free(seg);
1768                         AS_LOCK_EXIT(as);
1769                         return (error);
1770                 }
1771                 /*
1772                  * Add size now so as_unmap will work if as_ctl fails.
1773                  */
1774                 as->a_size += rsize;
1775                 as->a_resvsize += rsize;
1776         }
1777 
1778         as_setwatch(as);
1779 
1780         /*
1781          * If the address space is locked,
1782          * establish memory locks for the new segment.
1783          */
1784         mutex_enter(&as->a_contents);
1785         if (AS_ISPGLCK(as)) {
1786                 mutex_exit(&as->a_contents);
1787                 AS_LOCK_EXIT(as);
1788                 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1789                 if (error != 0)
1790                         (void) as_unmap(as, addr, size);
1791         } else {
1792                 mutex_exit(&as->a_contents);
1793                 AS_LOCK_EXIT(as);
1794         }
1795         return (error);
1796 }
1797 
1798 
1799 /*
1800  * Delete all segments in the address space marked with S_PURGE.
1801  * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1802  * These segments are deleted as a first step before calls to as_gap(), so
1803  * that they don't affect mmap() or shmat().
1804  */
1805 void
1806 as_purge(struct as *as)
1807 {
1808         struct seg *seg;
1809         struct seg *next_seg;
1810 
1811         /*
         * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
         * there is no need to grab the a_contents mutex for this check.
1814          */
1815         if ((as->a_flags & AS_NEEDSPURGE) == 0)
1816                 return;
1817 
1818         AS_LOCK_ENTER(as, RW_WRITER);
1819         next_seg = NULL;
1820         seg = AS_SEGFIRST(as);
1821         while (seg != NULL) {
1822                 next_seg = AS_SEGNEXT(as, seg);
1823                 if (seg->s_flags & S_PURGE)
1824                         SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1825                 seg = next_seg;
1826         }
1827         AS_LOCK_EXIT(as);
1828 
1829         mutex_enter(&as->a_contents);
1830         as->a_flags &= ~AS_NEEDSPURGE;
1831         mutex_exit(&as->a_contents);
1832 }
1833 
1834 /*
1835  * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1836  * range of addresses at least "minlen" long, where the base of the range is
1837  * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range.  Thus,
1839  * if align was 4M and off was 16k, the user wants a hole which will start
1840  * 16k into a 4M page.
1841  *
1842  * If flags specifies AH_HI, the hole will have the highest possible address
1843  * in the range.  We use the as->a_lastgap field to figure out where to
1844  * start looking for a gap.
1845  *
1846  * Otherwise, the gap will have the lowest possible address.
1847  *
1848  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1849  *
1850  * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1851  * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1852  *
1853  * NOTE: This routine is not correct when base+len overflows caddr_t.
1854  */
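/*
 * For example (illustrative only), a caller that wants a hole of at least
 * 1M whose base lies 16K into a 4M boundary, searching downward, might use:
 *
 *	if (as_gap_aligned(as, 0x100000, &base, &len, AH_HI, NULL,
 *	    0x400000, 0, 0x4000) == 0) {
 *		... [base, base + len) describes a suitable hole ...
 *	}
 */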
1855 int
1856 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1857     uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1858 {
1859         caddr_t lobound = *basep;
1860         caddr_t hibound = lobound + *lenp;
1861         struct seg *lseg, *hseg;
1862         caddr_t lo, hi;
1863         int forward;
1864         caddr_t save_base;
1865         size_t save_len;
1866         size_t save_minlen;
1867         size_t save_redzone;
1868         int fast_path = 1;
1869 
1870         save_base = *basep;
1871         save_len = *lenp;
1872         save_minlen = minlen;
1873         save_redzone = redzone;
1874 
1875         /*
1876          * For the first pass/fast_path, just add align and redzone into
1877          * minlen since if we get an allocation, we can guarantee that it
1878          * will fit the alignment and redzone requested.
1879          * This increases the chance that hibound will be adjusted to
1880          * a_lastgap->s_base which will likely allow us to find an
1881          * acceptable hole in the address space quicker.
1882          * If we can't find a hole with this fast_path, then we look for
1883          * smaller holes in which the alignment and offset may allow
1884          * the allocation to fit.
1885          */
1886         minlen += align;
1887         minlen += 2 * redzone;
1888         redzone = 0;
1889 
1890         AS_LOCK_ENTER(as, RW_READER);
1891         if (AS_SEGFIRST(as) == NULL) {
1892                 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1893                     align, redzone, off)) {
1894                         AS_LOCK_EXIT(as);
1895                         return (0);
1896                 } else {
1897                         AS_LOCK_EXIT(as);
1898                         *basep = save_base;
1899                         *lenp = save_len;
1900                         return (-1);
1901                 }
1902         }
1903 
1904 retry:
1905         /*
1906          * Set up to iterate over all the inter-segment holes in the given
1907          * direction.  lseg is NULL for the lowest-addressed hole and hseg is
1908          * NULL for the highest-addressed hole.  If moving backwards, we reset
         * hseg to denote the highest-addressed segment.
1910          */
1911         forward = (flags & AH_DIR) == AH_LO;
1912         if (forward) {
1913                 hseg = as_findseg(as, lobound, 1);
1914                 lseg = AS_SEGPREV(as, hseg);
1915         } else {
1916 
1917                 /*
1918                  * If allocating at least as much as the last allocation,
1919                  * use a_lastgap's base as a better estimate of hibound.
1920                  */
1921                 if (as->a_lastgap &&
1922                     minlen >= as->a_lastgap->s_size &&
1923                     hibound >= as->a_lastgap->s_base)
1924                         hibound = as->a_lastgap->s_base;
1925 
1926                 hseg = as_findseg(as, hibound, 1);
1927                 if (hseg->s_base + hseg->s_size < hibound) {
1928                         lseg = hseg;
1929                         hseg = NULL;
1930                 } else {
1931                         lseg = AS_SEGPREV(as, hseg);
1932                 }
1933         }
1934 
1935         for (;;) {
1936                 /*
1937                  * Set lo and hi to the hole's boundaries.  (We should really
1938                  * use MAXADDR in place of hibound in the expression below,
1939                  * but can't express it easily; using hibound in its place is
1940                  * harmless.)
1941                  */
1942                 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1943                 hi = (hseg == NULL) ? hibound : hseg->s_base;
1944                 /*
1945                  * If the iteration has moved past the interval from lobound
1946                  * to hibound it's pointless to continue.
1947                  */
1948                 if ((forward && lo > hibound) || (!forward && hi < lobound))
1949                         break;
1950                 else if (lo > hibound || hi < lobound)
1951                         goto cont;
1952                 /*
1953                  * Candidate hole lies at least partially within the allowable
1954                  * range.  Restrict it to fall completely within that range,
1955                  * i.e., to [max(lo, lobound), min(hi, hibound)].
1956                  */
1957                 if (lo < lobound)
1958                         lo = lobound;
1959                 if (hi > hibound)
1960                         hi = hibound;
1961                 /*
1962                  * Verify that the candidate hole is big enough and meets
1963                  * hardware constraints.  If the hole is too small, no need
1964                  * to do the further checks since they will fail.
1965                  */
1966                 *basep = lo;
1967                 *lenp = hi - lo;
1968                 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1969                     minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1970                     ((flags & AH_CONTAIN) == 0 ||
1971                     (*basep <= addr && *basep + *lenp > addr))) {
1972                         if (!forward)
1973                                 as->a_lastgap = hseg;
1974                         if (hseg != NULL)
1975                                 as->a_lastgaphl = hseg;
1976                         else
1977                                 as->a_lastgaphl = lseg;
1978                         AS_LOCK_EXIT(as);
1979                         return (0);
1980                 }
1981         cont:
1982                 /*
1983                  * Move to the next hole.
1984                  */
1985                 if (forward) {
1986                         lseg = hseg;
1987                         if (lseg == NULL)
1988                                 break;
1989                         hseg = AS_SEGNEXT(as, hseg);
1990                 } else {
1991                         hseg = lseg;
1992                         if (hseg == NULL)
1993                                 break;
1994                         lseg = AS_SEGPREV(as, lseg);
1995                 }
1996         }
1997         if (fast_path && (align != 0 || save_redzone != 0)) {
1998                 fast_path = 0;
1999                 minlen = save_minlen;
2000                 redzone = save_redzone;
2001                 goto retry;
2002         }
2003         *basep = save_base;
2004         *lenp = save_len;
2005         AS_LOCK_EXIT(as);
2006         return (-1);
2007 }
2008 
2009 /*
2010  * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2011  *
2012  * If flags specifies AH_HI, the hole will have the highest possible address
2013  * in the range.  We use the as->a_lastgap field to figure out where to
2014  * start looking for a gap.
2015  *
2016  * Otherwise, the gap will have the lowest possible address.
2017  *
2018  * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2019  *
2020  * If an adequate hole is found, base and len are set to reflect the part of
2021  * the hole that is within range, and 0 is returned, otherwise,
2022  * -1 is returned.
2023  *
2024  * NOTE: This routine is not correct when base+len overflows caddr_t.
2025  */
2026 int
2027 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2028     caddr_t addr)
2029 {
2030 
2031         return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2032 }
2033 
2034 /*
2035  * Return the next range within [base, base + len) that is backed
2036  * with "real memory".  Skip holes and non-seg_vn segments.
2037  * We're lazy and only return one segment at a time.
2038  */
2039 int
2040 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2041 {
2042         extern struct seg_ops segspt_shmops;    /* needs a header file */
2043         struct seg *seg;
2044         caddr_t addr, eaddr;
2045         caddr_t segend;
2046 
2047         AS_LOCK_ENTER(as, RW_READER);
2048 
2049         addr = *basep;
2050         eaddr = addr + *lenp;
2051 
2052         seg = as_findseg(as, addr, 0);
2053         if (seg != NULL)
2054                 addr = MAX(seg->s_base, addr);
2055 
2056         for (;;) {
2057                 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2058                         AS_LOCK_EXIT(as);
2059                         return (EINVAL);
2060                 }
2061 
2062                 if (seg->s_ops == &segvn_ops) {
2063                         segend = seg->s_base + seg->s_size;
2064                         break;
2065                 }
2066 
2067                 /*
2068                  * We do ISM by looking into the private data
2069                  * to determine the real size of the segment.
2070                  */
2071                 if (seg->s_ops == &segspt_shmops) {
2072                         segend = seg->s_base + spt_realsize(seg);
2073                         if (addr < segend)
2074                                 break;
2075                 }
2076 
2077                 seg = AS_SEGNEXT(as, seg);
2078 
2079                 if (seg != NULL)
2080                         addr = seg->s_base;
2081         }
2082 
2083         *basep = addr;
2084 
2085         if (segend > eaddr)
2086                 *lenp = eaddr - addr;
2087         else
2088                 *lenp = segend - addr;
2089 
2090         AS_LOCK_EXIT(as);
2091         return (0);
2092 }
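
/*
 * Illustrative usage sketch (not taken from a particular caller): walking
 * every "real memory" range within [base, base + len):
 *
 *	caddr_t a = base;
 *	size_t resid = len;
 *	while (as_memory(as, &a, &resid) == 0) {
 *		caddr_t next = a + resid;
 *		... operate on [a, a + resid) ...
 *		resid = (base + len) - next;
 *		a = next;
 *	}
 */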
2093 
2094 /*
2095  * Swap the pages associated with the address space as out to
2096  * secondary storage, returning the number of bytes actually
2097  * swapped.
2098  *
2099  * The value returned is intended to correlate well with the process's
2100  * memory requirements.  Its usefulness for this purpose depends on
2101  * how well the segment-level routines do at returning accurate
2102  * information.
2103  */
2104 size_t
2105 as_swapout(struct as *as)
2106 {
2107         struct seg *seg;
2108         size_t swpcnt = 0;
2109 
2110         /*
2111          * Kernel-only processes have given up their address
2112          * spaces.  Of course, we shouldn't be attempting to
2113          * swap out such processes in the first place...
2114          */
2115         if (as == NULL)
2116                 return (0);
2117 
2118         AS_LOCK_ENTER(as, RW_READER);
2119 
2120         /*
2121          * Free all mapping resources associated with the address
2122          * space.  The segment-level swapout routines capitalize
         * on this unmapping by scavenging pages that have become
2124          * unmapped here.
2125          */
2126         hat_swapout(as->a_hat);
2127 
2128         /*
2129          * Call the swapout routines of all segments in the address
2130          * space to do the actual work, accumulating the amount of
2131          * space reclaimed.
2132          */
2133         for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2134                 struct seg_ops *ov = seg->s_ops;
2135 
2136                 /*
2137                  * We have to check to see if the seg has
2138                  * an ops vector because the seg may have
2139                  * been in the middle of being set up when
2140                  * the process was picked for swapout.
2141                  */
2142                 if ((ov != NULL) && (ov->swapout != NULL))
2143                         swpcnt += SEGOP_SWAPOUT(seg);
2144         }
2145         AS_LOCK_EXIT(as);
2146         return (swpcnt);
2147 }
2148 
2149 /*
2150  * Determine whether data from the mappings in interval [addr, addr + size)
2151  * are in the primary memory (core) cache.
2152  */
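/*
 * The caller supplies "vec" with room for one char per page of the
 * page-rounded range; each SEGOP_INCORE() call below fills in its
 * portion, and *sizep accumulates the number of bytes found in core.
 */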
2153 int
2154 as_incore(struct as *as, caddr_t addr,
2155     size_t size, char *vec, size_t *sizep)
2156 {
2157         struct seg *seg;
2158         size_t ssize;
2159         caddr_t raddr;          /* rounded down addr */
2160         size_t rsize;           /* rounded up size */
2161         size_t isize;                   /* iteration size */
2162         int error = 0;          /* result, assume success */
2163 
2164         *sizep = 0;
2165         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2166         rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2167             (size_t)raddr;
2168 
2169         if (raddr + rsize < raddr)           /* check for wraparound */
2170                 return (ENOMEM);
2171 
2172         AS_LOCK_ENTER(as, RW_READER);
2173         seg = as_segat(as, raddr);
2174         if (seg == NULL) {
2175                 AS_LOCK_EXIT(as);
2176                 return (-1);
2177         }
2178 
2179         for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2180                 if (raddr >= seg->s_base + seg->s_size) {
2181                         seg = AS_SEGNEXT(as, seg);
2182                         if (seg == NULL || raddr != seg->s_base) {
2183                                 error = -1;
2184                                 break;
2185                         }
2186                 }
2187                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2188                         ssize = seg->s_base + seg->s_size - raddr;
2189                 else
2190                         ssize = rsize;
2191                 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2192                 if (isize != ssize) {
2193                         error = -1;
2194                         break;
2195                 }
2196                 vec += btopr(ssize);
2197         }
2198         AS_LOCK_EXIT(as);
2199         return (error);
2200 }
2201 
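/*
 * Unlock those pages of [addr, addr + ptob(npages)) whose bits are set
 * in "bitmap" (one bit per page, starting at bit "position").  For
 * illustration, with npages = 8 and only bits 2..5 of the bitmap set,
 * bt_range() reports the single run [2, 6), so pages 2 through 5 are
 * unlocked with one SEGOP_LOCKOP(MC_UNLOCK) call.
 */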
2202 static void
2203 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2204     ulong_t *bitmap, size_t position, size_t npages)
2205 {
2206         caddr_t range_start;
2207         size_t  pos1 = position;
2208         size_t  pos2;
2209         size_t  size;
2210         size_t  end_pos = npages + position;
2211 
2212         while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2213                 size = ptob((pos2 - pos1));
2214                 range_start = (caddr_t)((uintptr_t)addr +
2215                     ptob(pos1 - position));
2216 
2217                 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2218                     (ulong_t *)NULL, (size_t)NULL);
2219                 pos1 = pos2;
2220         }
2221 }
2222 
2223 static void
2224 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2225     caddr_t raddr, size_t rsize)
2226 {
2227         struct seg *seg = as_segat(as, raddr);
2228         size_t ssize;
2229 
2230         while (rsize != 0) {
2231                 if (raddr >= seg->s_base + seg->s_size)
2232                         seg = AS_SEGNEXT(as, seg);
2233 
2234                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2235                         ssize = seg->s_base + seg->s_size - raddr;
2236                 else
2237                         ssize = rsize;
2238 
2239                 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2240 
2241                 rsize -= ssize;
2242                 raddr += ssize;
2243         }
2244 }
2245 
2246 /*
2247  * Cache control operations over the interval [addr, addr + size) in
2248  * address space "as".
2249  */
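/*
 * For example, as_map_locked() locks down a freshly created mapping in a
 * page-locked address space with:
 *
 *	error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
 */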
2250 /*ARGSUSED*/
2251 int
2252 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2253     uintptr_t arg, ulong_t *lock_map, size_t pos)
2254 {
2255         struct seg *seg;        /* working segment */
2256         caddr_t raddr;          /* rounded down addr */
2257         caddr_t initraddr;      /* saved initial rounded down addr */
2258         size_t rsize;           /* rounded up size */
2259         size_t initrsize;       /* saved initial rounded up size */
2260         size_t ssize;           /* size of seg */
2261         int error = 0;                  /* result */
2262         size_t mlock_size;      /* size of bitmap */
2263         ulong_t *mlock_map;     /* pointer to bitmap used */
2264                                 /* to represent the locked */
2265                                 /* pages. */
2266 retry:
2267         if (error == IE_RETRY)
2268                 AS_LOCK_ENTER(as, RW_WRITER);
2269         else
2270                 AS_LOCK_ENTER(as, RW_READER);
2271 
2272         /*
2273          * If these are address space lock/unlock operations, loop over
2274          * all segments in the address space, as appropriate.
2275          */
2276         if (func == MC_LOCKAS) {
2277                 size_t npages, idx;
2278                 size_t rlen = 0;        /* rounded as length */
2279 
2280                 idx = pos;
2281 
2282                 if (arg & MCL_FUTURE) {
2283                         mutex_enter(&as->a_contents);
2284                         AS_SETPGLCK(as);
2285                         mutex_exit(&as->a_contents);
2286                 }
2287                 if ((arg & MCL_CURRENT) == 0) {
2288                         AS_LOCK_EXIT(as);
2289                         return (0);
2290                 }
2291 
2292                 seg = AS_SEGFIRST(as);
2293                 if (seg == NULL) {
2294                         AS_LOCK_EXIT(as);
2295                         return (0);
2296                 }
2297 
2298                 do {
2299                         raddr = (caddr_t)((uintptr_t)seg->s_base &
2300                             (uintptr_t)PAGEMASK);
2301                         rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2302                             PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2303                 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2304 
2305                 mlock_size = BT_BITOUL(btopr(rlen));
2306                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2307                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
                        AS_LOCK_EXIT(as);
                        return (EAGAIN);
2310                 }
2311 
2312                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2313                         error = SEGOP_LOCKOP(seg, seg->s_base,
2314                             seg->s_size, attr, MC_LOCK, mlock_map, pos);
2315                         if (error != 0)
2316                                 break;
2317                         pos += seg_pages(seg);
2318                 }
2319 
2320                 if (error) {
2321                         for (seg = AS_SEGFIRST(as); seg != NULL;
2322                             seg = AS_SEGNEXT(as, seg)) {
2323 
2324                                 raddr = (caddr_t)((uintptr_t)seg->s_base &
2325                                     (uintptr_t)PAGEMASK);
2326                                 npages = seg_pages(seg);
2327                                 as_segunlock(seg, raddr, attr, mlock_map,
2328                                     idx, npages);
2329                                 idx += npages;
2330                         }
2331                 }
2332 
2333                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2334                 AS_LOCK_EXIT(as);
2335                 goto lockerr;
2336         } else if (func == MC_UNLOCKAS) {
2337                 mutex_enter(&as->a_contents);
2338                 AS_CLRPGLCK(as);
2339                 mutex_exit(&as->a_contents);
2340 
2341                 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2342                         error = SEGOP_LOCKOP(seg, seg->s_base,
2343                             seg->s_size, attr, MC_UNLOCK, NULL, 0);
2344                         if (error != 0)
2345                                 break;
2346                 }
2347 
2348                 AS_LOCK_EXIT(as);
2349                 goto lockerr;
2350         }
2351 
2352         /*
2353          * Normalize addresses and sizes.
2354          */
2355         initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2356         initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2357             (size_t)raddr;
2358 
2359         if (raddr + rsize < raddr) {         /* check for wraparound */
2360                 AS_LOCK_EXIT(as);
2361                 return (ENOMEM);
2362         }
2363 
2364         /*
2365          * Get initial segment.
2366          */
2367         if ((seg = as_segat(as, raddr)) == NULL) {
2368                 AS_LOCK_EXIT(as);
2369                 return (ENOMEM);
2370         }
2371 
2372         if (func == MC_LOCK) {
2373                 mlock_size = BT_BITOUL(btopr(rsize));
2374                 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2375                     sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
                        AS_LOCK_EXIT(as);
                        return (EAGAIN);
2378                 }
2379         }
2380 
2381         /*
2382          * Loop over all segments.  If a hole in the address range is
2383          * discovered, then fail.  For each segment, perform the appropriate
2384          * control operation.
2385          */
2386         while (rsize != 0) {
2387 
2388                 /*
2389                  * Make sure there's no hole, calculate the portion
2390                  * of the next segment to be operated over.
2391                  */
2392                 if (raddr >= seg->s_base + seg->s_size) {
2393                         seg = AS_SEGNEXT(as, seg);
2394                         if (seg == NULL || raddr != seg->s_base) {
2395                                 if (func == MC_LOCK) {
2396                                         as_unlockerr(as, attr, mlock_map,
2397                                             initraddr, initrsize - rsize);
2398                                         kmem_free(mlock_map,
2399                                             mlock_size * sizeof (ulong_t));
2400                                 }
2401                                 AS_LOCK_EXIT(as);
2402                                 return (ENOMEM);
2403                         }
2404                 }
2405                 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2406                         ssize = seg->s_base + seg->s_size - raddr;
2407                 else
2408                         ssize = rsize;
2409 
2410                 /*
2411                  * Dispatch on specific function.
2412                  */
2413                 switch (func) {
2414 
2415                 /*
2416                  * Synchronize cached data from mappings with backing
2417                  * objects.
2418                  */
2419                 case MC_SYNC:
                        if ((error = SEGOP_SYNC(seg, raddr, ssize,
                            attr, (uint_t)arg)) != 0) {
2422                                 AS_LOCK_EXIT(as);
2423                                 return (error);
2424                         }
2425                         break;
2426 
2427                 /*
2428                  * Lock pages in memory.
2429                  */
2430                 case MC_LOCK:
                        if ((error = SEGOP_LOCKOP(seg, raddr, ssize,
                            attr, func, mlock_map, pos)) != 0) {
2433                                 as_unlockerr(as, attr, mlock_map, initraddr,
2434                                     initrsize - rsize + ssize);
2435                                 kmem_free(mlock_map, mlock_size *
2436                                     sizeof (ulong_t));
2437                                 AS_LOCK_EXIT(as);
2438                                 goto lockerr;
2439                         }
2440                         break;
2441 
2442                 /*
2443                  * Unlock mapped pages.
2444                  */
2445                 case MC_UNLOCK:
2446                         (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2447                             (ulong_t *)NULL, (size_t)NULL);
2448                         break;
2449 
2450                 /*
2451                  * Store VM advise for mapped pages in segment layer.
2452                  */
2453                 case MC_ADVISE:
2454                         error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2455 
2456                         /*
2457                          * Check for regular errors and special retry error
2458                          */
2459                         if (error) {
2460                                 if (error == IE_RETRY) {
2461                                         /*
2462                                          * Need to acquire writers lock, so
2463                                          * have to drop readers lock and start
2464                                          * all over again
2465                                          */
2466                                         AS_LOCK_EXIT(as);
2467                                         goto retry;
2468                                 } else if (error == IE_REATTACH) {
2469                                         /*
2470                                          * Find segment for current address
2471                                          * because current segment just got
2472                                          * split or concatenated
2473                                          */
2474                                         seg = as_segat(as, raddr);
2475                                         if (seg == NULL) {
2476                                                 AS_LOCK_EXIT(as);
2477                                                 return (ENOMEM);
2478                                         }
2479                                 } else {
2480                                         /*
2481                                          * Regular error
2482                                          */
2483                                         AS_LOCK_EXIT(as);
2484                                         return (error);
2485                                 }
2486                         }
2487                         break;
2488 
2489                 case MC_INHERIT_ZERO:
2490                         if (seg->s_ops->inherit == NULL) {
2491                                 error = ENOTSUP;
2492                         } else {
2493                                 error = SEGOP_INHERIT(seg, raddr, ssize,
2494                                     SEGP_INH_ZERO);
2495                         }
2496                         if (error != 0) {
2497                                 AS_LOCK_EXIT(as);
2498                                 return (error);
2499                         }
2500                         break;
2501 
2502                 /*
2503                  * Can't happen.
2504                  */
2505                 default:
2506                         panic("as_ctl: bad operation %d", func);
2507                         /*NOTREACHED*/
2508                 }
2509 
2510                 rsize -= ssize;
2511                 raddr += ssize;
2512         }
2513 
2514         if (func == MC_LOCK)
2515                 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2516         AS_LOCK_EXIT(as);
2517         return (0);
2518 lockerr:
2519 
2520         /*
2521          * If the lower levels returned EDEADLK for a segment lockop,
2522          * it means that we should retry the operation.  Let's wait
2523          * a bit also to let the deadlock causing condition clear.
2524          * This is part of a gross hack to work around a design flaw
2525          * in the ufs/sds logging code and should go away when the
2526          * logging code is re-designed to fix the problem. See bug
2527          * 4125102 for details of the problem.
2528          */
2529         if (error == EDEADLK) {
2530                 delay(deadlk_wait);
2531                 error = 0;
2532                 goto retry;
2533         }
2534         return (error);
2535 }
2536 
2537 int
2538 fc_decode(faultcode_t fault_err)
2539 {
2540         int error = 0;
2541 
2542         switch (FC_CODE(fault_err)) {
2543         case FC_OBJERR:
2544                 error = FC_ERRNO(fault_err);
2545                 break;
2546         case FC_PROT:
2547                 error = EACCES;
2548                 break;
2549         default:
2550                 error = EFAULT;
2551                 break;
2552         }
2553         return (error);
2554 }
2555 
2556 /*
2557  * Pagelock pages from a range that spans more than 1 segment.  Obtain shadow
2558  * lists from each segment and copy them to one contiguous shadow list (plist)
2559  * as expected by the caller.  Save pointers to per segment shadow lists at
2560  * the tail of plist so that they can be used during as_pageunlock().
2561  */
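/*
 * Layout of the shadow list built below (for npages pages spanning
 * segcnt segments):
 *
 *	plist[0 .. npages - 1]			the flat page_t * list
 *						returned to the caller
 *	plist[npages .. npages + segcnt - 1]	per-segment shadow list
 *						pointers, consumed later
 *						by as_pageunlock_segs()
 */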
2562 static int
2563 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2564     caddr_t addr, size_t size, enum seg_rw rw)
2565 {
2566         caddr_t sv_addr = addr;
2567         size_t sv_size = size;
2568         struct seg *sv_seg = seg;
2569         ulong_t segcnt = 1;
2570         ulong_t cnt;
2571         size_t ssize;
2572         pgcnt_t npages = btop(size);
2573         page_t **plist;
2574         page_t **pl;
2575         int error;
2576         caddr_t eaddr;
2577         faultcode_t fault_err = 0;
2578         pgcnt_t pl_off;
2579         extern struct seg_ops segspt_shmops;
2580 
2581         ASSERT(AS_LOCK_HELD(as));
2582         ASSERT(seg != NULL);
2583         ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2584         ASSERT(addr + size > seg->s_base + seg->s_size);
2585         ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2586         ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2587 
2588         /*
2589          * Count the number of segments covered by the range we are about to
2590          * lock. The segment count is used to size the shadow list we return
2591          * back to the caller.
2592          */
2593         for (; size != 0; size -= ssize, addr += ssize) {
2594                 if (addr >= seg->s_base + seg->s_size) {
2595 
2596                         seg = AS_SEGNEXT(as, seg);
2597                         if (seg == NULL || addr != seg->s_base) {
2598                                 AS_LOCK_EXIT(as);
2599                                 return (EFAULT);
2600                         }
2601                         /*
2602                          * Do a quick check if subsequent segments
2603                          * will most likely support pagelock.
2604                          */
2605                         if (seg->s_ops == &segvn_ops) {
2606                                 vnode_t *vp;
2607 
2608                                 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2609                                     vp != NULL) {
2610                                         AS_LOCK_EXIT(as);
2611                                         goto slow;
2612                                 }
2613                         } else if (seg->s_ops != &segspt_shmops) {
2614                                 AS_LOCK_EXIT(as);
2615                                 goto slow;
2616                         }
2617                         segcnt++;
2618                 }
2619                 if (addr + size > seg->s_base + seg->s_size) {
2620                         ssize = seg->s_base + seg->s_size - addr;
2621                 } else {
2622                         ssize = size;
2623                 }
2624         }
2625         ASSERT(segcnt > 1);
2626 
2627         plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2628 
2629         addr = sv_addr;
2630         size = sv_size;
2631         seg = sv_seg;
2632 
2633         for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2634                 if (addr >= seg->s_base + seg->s_size) {
2635                         seg = AS_SEGNEXT(as, seg);
2636                         ASSERT(seg != NULL && addr == seg->s_base);
2637                         cnt++;
2638                         ASSERT(cnt < segcnt);
2639                 }
2640                 if (addr + size > seg->s_base + seg->s_size) {
2641                         ssize = seg->s_base + seg->s_size - addr;
2642                 } else {
2643                         ssize = size;
2644                 }
2645                 pl = &plist[npages + cnt];
2646                 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2647                     L_PAGELOCK, rw);
2648                 if (error) {
2649                         break;
2650                 }
2651                 ASSERT(plist[npages + cnt] != NULL);
2652                 ASSERT(pl_off + btop(ssize) <= npages);
2653                 bcopy(plist[npages + cnt], &plist[pl_off],
2654                     btop(ssize) * sizeof (page_t *));
2655                 pl_off += btop(ssize);
2656         }
2657 
2658         if (size == 0) {
2659                 AS_LOCK_EXIT(as);
2660                 ASSERT(cnt == segcnt - 1);
2661                 *ppp = plist;
2662                 return (0);
2663         }
2664 
2665         /*
         * One of the pagelock calls failed; the error code is in "error".
         * Unlock what we've locked so far and retry with F_SOFTLOCK if the
         * error is either EFAULT or ENOTSUP. Otherwise just return the
         * error back to the caller.
2670          */
2671 
2672         eaddr = addr;
2673         seg = sv_seg;
2674 
2675         for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2676                 if (addr >= seg->s_base + seg->s_size) {
2677                         seg = AS_SEGNEXT(as, seg);
2678                         ASSERT(seg != NULL && addr == seg->s_base);
2679                         cnt++;
2680                         ASSERT(cnt < segcnt);
2681                 }
2682                 if (eaddr > seg->s_base + seg->s_size) {
2683                         ssize = seg->s_base + seg->s_size - addr;
2684                 } else {
2685                         ssize = eaddr - addr;
2686                 }
2687                 pl = &plist[npages + cnt];
2688                 ASSERT(*pl != NULL);
2689                 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2690                     L_PAGEUNLOCK, rw);
2691         }
2692 
2693         AS_LOCK_EXIT(as);
2694 
2695         kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2696 
2697         if (error != ENOTSUP && error != EFAULT) {
2698                 return (error);
2699         }
2700 
2701 slow:
2702         /*
         * If we are here because pagelock failed due to the need to cow-fault
         * in the pages we want to lock, F_SOFTLOCK will do that job, and the
         * next as_pagelock() call for this address range will hopefully
         * succeed.
2707          */
2708         fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2709         if (fault_err != 0) {
2710                 return (fc_decode(fault_err));
2711         }
2712         *ppp = NULL;
2713 
2714         return (0);
2715 }
2716 
2717 /*
2718  * lock pages in a given address space. Return shadow list. If
2719  * the list is NULL, the MMU mapping is also locked.
2720  */
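/*
 * Typical caller pattern (illustrative sketch only):
 *
 *	page_t **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) == 0) {
 *		... do the I/O; pplist is NULL if the F_SOFTLOCK
 *		... fallback was used, and as_pageunlock() copes ...
 *		as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *	}
 */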
2721 int
2722 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2723     size_t size, enum seg_rw rw)
2724 {
2725         size_t rsize;
2726         caddr_t raddr;
2727         faultcode_t fault_err;
2728         struct seg *seg;
2729         int err;
2730 
2731         TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2732             "as_pagelock_start: addr %p size %ld", addr, size);
2733 
2734         raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2735         rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2736             (size_t)raddr;
2737 
2738         /*
         * If the request crosses segment boundaries,
         * let as_pagelock_segs() handle it.
2741          */
        AS_LOCK_ENTER(as, RW_READER);

        seg = as_segat(as, raddr);
        if (seg == NULL) {
                AS_LOCK_EXIT(as);
                return (EFAULT);
        }
        ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
        if (raddr + rsize > seg->s_base + seg->s_size) {
                return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
        }
        if (raddr + rsize <= raddr) {
                AS_LOCK_EXIT(as);
                return (EFAULT);
        }

        TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
            "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);

        /*
         * try to lock pages and pass back shadow list
         */
        err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);

        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");

        AS_LOCK_EXIT(as);

        if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
                return (err);
        }

        /*
         * Use F_SOFTLOCK to lock the pages because pagelock failed either
         * due to no pagelock support for this segment or because pages need
         * to be copy-on-write faulted in.  If a fault is needed, F_SOFTLOCK
         * will do that job for this as_pagelock() call, and the next
         * as_pagelock() call for the same address range should then succeed.
         */
        fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
        if (fault_err != 0) {
                return (fc_decode(fault_err));
        }
        *ppp = NULL;

        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
        return (0);
}

/*
 * unlock pages locked by as_pagelock_segs().  Retrieve per segment shadow
 * lists from the end of plist and call pageunlock interface for each segment.
 * Drop as lock and free plist.
 */
static void
as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
    struct page **plist, enum seg_rw rw)
{
        ulong_t cnt;
        caddr_t eaddr = addr + size;
        pgcnt_t npages = btop(size);
        size_t ssize;
        page_t **pl;

        ASSERT(AS_LOCK_HELD(as));
        ASSERT(seg != NULL);
        ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
        ASSERT(addr + size > seg->s_base + seg->s_size);
        ASSERT(IS_P2ALIGNED(size, PAGESIZE));
        ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
        ASSERT(plist != NULL);

        for (cnt = 0; addr < eaddr; addr += ssize) {
                if (addr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        ASSERT(seg != NULL && addr == seg->s_base);
                        cnt++;
                }
                if (eaddr > seg->s_base + seg->s_size) {
                        ssize = seg->s_base + seg->s_size - addr;
                } else {
                        ssize = eaddr - addr;
                }
                pl = &plist[npages + cnt];
                ASSERT(*pl != NULL);
                (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
                    L_PAGEUNLOCK, rw);
        }
        ASSERT(cnt > 0);
        AS_LOCK_EXIT(as);

        cnt++;
        kmem_free(plist, (npages + cnt) * sizeof (page_t *));
}

/*
 * unlock pages in a given address range
 */
void
as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
    enum seg_rw rw)
{
        struct seg *seg;
        size_t rsize;
        caddr_t raddr;

        TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
            "as_pageunlock_start: addr %p size %ld", addr, size);

        /*
         * If the shadow list is NULL, as_pagelock() fell back to
         * as_fault(); undo the F_SOFTLOCK the same way.
         */
        if (pp == NULL) {
                (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
                return;
        }

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, raddr);
        ASSERT(seg != NULL);

        TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
            "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);

        ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
        if (raddr + rsize <= seg->s_base + seg->s_size) {
                SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
        } else {
                as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
                return;
        }
        AS_LOCK_EXIT(as);
        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
}
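
/*
 * Illustrative sketch only (not part of the build): the usual physio-style
 * pairing of as_pagelock()/as_pageunlock().  The caller name, buffer address
 * and length below are hypothetical.  A NULL shadow list from as_pagelock()
 * is legal; it means the F_SOFTLOCK fallback path was taken, and
 * as_pageunlock() handles that case transparently.
 */
#ifdef notdef
static int
example_lock_for_io(struct as *as, caddr_t uaddr, size_t len)
{
        struct page **pplist;
        int error;

        /* lock the user pages and get the shadow list (may come back NULL) */
        error = as_pagelock(as, &pplist, uaddr, len, S_WRITE);
        if (error != 0)
                return (error);

        /* ... perform the I/O against the locked pages here ... */

        /* release the lock; a NULL pplist is handled by the fallback path */
        as_pageunlock(as, pplist, uaddr, len, S_WRITE);
        return (0);
}
#endif /* notdef */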

int
as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    boolean_t wait)
{
        struct seg *seg;
        size_t ssize;
        caddr_t raddr;                  /* rounded down addr */
        size_t rsize;                   /* rounded up size */
        int error = 0;
        size_t pgsz = page_get_pagesize(szc);

setpgsz_top:
        if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
                return (EINVAL);
        }

        raddr = addr;
        rsize = size;

        if (raddr + rsize < raddr)              /* check for wraparound */
                return (ENOMEM);

        AS_LOCK_ENTER(as, RW_WRITER);
        as_clearwatchprot(as, raddr, rsize);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                as_setwatch(as);
                AS_LOCK_EXIT(as);
                return (ENOMEM);
        }

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                error = ENOMEM;
                                break;
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
                        ssize = seg->s_base + seg->s_size - raddr;
                } else {
                        ssize = rsize;
                }

retry:
                error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);

                if (error == IE_NOMEM) {
                        error = EAGAIN;
                        break;
                }

                if (error == IE_RETRY) {
                        AS_LOCK_EXIT(as);
                        goto setpgsz_top;
                }

                if (error == ENOTSUP) {
                        error = EINVAL;
                        break;
                }

                if (wait && (error == EAGAIN)) {
                        /*
                         * Memory is currently locked.  It must be unlocked
                         * before this operation can succeed through a retry.
                         * The possible reasons for locked memory and
                         * corresponding strategies for unlocking are:
                         * (1) Normal I/O
                         *      Wait for a signal that the I/O operation
                         *      has completed and the memory is unlocked.
                         * (2) Asynchronous I/O
                         *      The aio subsystem does not unlock pages when
                         *      the I/O is completed. Those pages are unlocked
                         *      when the application calls aiowait/aioerror.
                         *      So, to prevent blocking forever, cv_broadcast()
                         *      is done to wake up aio_cleanup_thread.
                         *      Subsequently, segvn_reclaim will be called, and
                         *      that will do AS_CLRUNMAPWAIT() and wake us up.
                         * (3) Long term page locking:
                         *      This is not relevant for as_setpagesize()
                         *      because we cannot change the page size for
                         *      driver memory. The attempt to do so will
                         *      fail with a different error than EAGAIN so
                         *      there's no need to trigger as callbacks like
                         *      as_unmap, as_setprot or as_free would do.
                         */
                        mutex_enter(&as->a_contents);
                        if (!AS_ISNOUNMAPWAIT(as)) {
                                if (AS_ISUNMAPWAIT(as) == 0) {
                                        cv_broadcast(&as->a_cv);
                                }
                                AS_SETUNMAPWAIT(as);
                                AS_LOCK_EXIT(as);
                                while (AS_ISUNMAPWAIT(as)) {
                                        cv_wait(&as->a_cv, &as->a_contents);
                                }
                        } else {
                                /*
                                 * We may have raced with
                                 * segvn_reclaim()/segspt_reclaim(). In this
                                 * case clean nounmapwait flag and retry since
                                 * softlockcnt in this segment may be already
                                 * 0.  We don't drop as writer lock so our
                                 * number of retries without sleeping should
                                 * be very small. See segvn_reclaim() for
                                 * more comments.
                                 */
                                AS_CLRNOUNMAPWAIT(as);
                                mutex_exit(&as->a_contents);
                                goto retry;
                        }
                        mutex_exit(&as->a_contents);
                        goto setpgsz_top;
                } else if (error != 0) {
                        break;
                }
        }
        as_setwatch(as);
        AS_LOCK_EXIT(as);
        return (error);
}
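
/*
 * Illustration only (hypothetical values): a caller requesting 4M pages on
 * a platform where szc 3 corresponds to 4M must pass a 4M-aligned address
 * and a 4M-multiple size, e.g.
 *
 *      error = as_setpagesize(as, (caddr_t)0x10400000UL,
 *          8 * 0x400000, 3, B_TRUE);
 *
 * An unaligned address or size fails the IS_P2ALIGNED() checks above and
 * returns EINVAL before any segment is touched.
 */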

/*
 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
 * in its chunk where s_szc is less than the szc we want to set.
 */
static int
as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    int *retry)
{
        struct seg *seg;
        size_t ssize;
        int error;

        ASSERT(AS_WRITE_HELD(as));

        seg = as_segat(as, raddr);
        if (seg == NULL) {
                panic("as_iset3_default_lpsize: no seg");
        }

        for (; rsize != 0; rsize -= ssize, raddr += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                panic("as_iset3_default_lpsize: as changed");
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
                        ssize = seg->s_base + seg->s_size - raddr;
                } else {
                        ssize = rsize;
                }

                if (szc > seg->s_szc) {
                        error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
                        /* Only retry on EINVAL segments that have no vnode. */
                        if (error == EINVAL) {
                                vnode_t *vp = NULL;
                                if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
                                    (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
                                    vp == NULL)) {
                                        *retry = 1;
                                } else {
                                        *retry = 0;
                                }
                        }
                        if (error) {
                                return (error);
                        }
                }
        }
        return (0);
}

/*
 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
 * pagesize on each segment in its range.  If any segment fails with EINVAL,
 * it reduces the pagesize to the next smaller size in the bitmap and retries
 * as_iset3_default_lpsize().  The code retries smaller allowed sizes on
 * EINVAL because (a) the anon offset may not match the bigger sizes, and
 * (b) it's hard to get this offset (to begin with) to pass to
 * map_pgszcvec().
 */
static int
as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
    uint_t szcvec)
{
        int error;
        int retry;

        ASSERT(AS_WRITE_HELD(as));

        for (;;) {
                error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
                if (error == EINVAL && retry) {
                        szcvec &= ~(1 << szc);
                        if (szcvec <= 1) {
                                return (EINVAL);
                        }
                        szc = highbit(szcvec) - 1;
                } else {
                        return (error);
                }
        }
}
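
/*
 * Illustration only (made-up values, not part of the build): one step of
 * the EINVAL retry loop above.  Suppose szcvec = 0x16 (size codes 1, 2 and
 * 4 allowed) and the current szc of 4 fails with a retryable EINVAL:
 */
#ifdef notdef
        uint_t szcvec = 0x16;           /* size codes 1, 2, 4 */
        uint_t szc = 4;                 /* the size code that failed */

        szcvec &= ~(1 << szc);          /* drop code 4: 0x16 -> 0x06 */
        szc = highbit(szcvec) - 1;      /* next candidate is code 2 */
#endif /* notdef */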

/*
 * as_iset1_default_lpsize() breaks its chunk into areas where existing
 * segments have a smaller szc than we want to set. For each such area,
 * it calls as_iset2_default_lpsize().
 */
static int
as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
    uint_t szcvec)
{
        struct seg *seg;
        size_t ssize;
        caddr_t setaddr = raddr;
        size_t setsize = 0;
        int set;
        int error;

        ASSERT(AS_WRITE_HELD(as));

        seg = as_segat(as, raddr);
        if (seg == NULL) {
                panic("as_iset1_default_lpsize: no seg");
        }
        if (seg->s_szc < szc) {
                set = 1;
        } else {
                set = 0;
        }

        for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
                if (raddr >= seg->s_base + seg->s_size) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                panic("as_iset1_default_lpsize: as changed");
                        }
                        if (seg->s_szc >= szc && set) {
                                ASSERT(setsize != 0);
                                error = as_iset2_default_lpsize(as,
                                    setaddr, setsize, szc, szcvec);
                                if (error) {
                                        return (error);
                                }
                                set = 0;
                        } else if (seg->s_szc < szc && !set) {
                                setaddr = raddr;
                                setsize = 0;
                                set = 1;
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
                        ssize = seg->s_base + seg->s_size - raddr;
                } else {
                        ssize = rsize;
                }
        }
        error = 0;
        if (set) {
                ASSERT(setsize != 0);
                error = as_iset2_default_lpsize(as, setaddr, setsize,
                    szc, szcvec);
        }
        return (error);
}

/*
 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
 * chunk to as_iset1_default_lpsize().
 */
static int
as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
    int type)
{
        int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
        uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
            flags, rtype, 1);
        uint_t szc;
        uint_t nszc;
        int error;
        caddr_t a;
        caddr_t eaddr;
        size_t segsize;
        size_t pgsz;
        uint_t save_szcvec;

        ASSERT(AS_WRITE_HELD(as));
        ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
        ASSERT(IS_P2ALIGNED(size, PAGESIZE));

        szcvec &= ~1;
        if (szcvec <= 1) {      /* skip if base page size */
                return (0);
        }

        /* Get the pagesize of the first larger page size. */
        szc = lowbit(szcvec) - 1;
        pgsz = page_get_pagesize(szc);
        eaddr = addr + size;
        addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
        eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);

        save_szcvec = szcvec;
        szcvec >>= (szc + 1);
        nszc = szc;
        while (szcvec) {
                if ((szcvec & 0x1) == 0) {
                        nszc++;
                        szcvec >>= 1;
                        continue;
                }
                nszc++;
                pgsz = page_get_pagesize(nszc);
                a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
                if (a != addr) {
                        ASSERT(szc > 0);
                        ASSERT(a < eaddr);
                        segsize = a - addr;
                        error = as_iset1_default_lpsize(as, addr, segsize, szc,
                            save_szcvec);
                        if (error) {
                                return (error);
                        }
                        addr = a;
                }
                szc = nszc;
                szcvec >>= 1;
        }

        ASSERT(addr < eaddr);
        szcvec = save_szcvec;
        while (szcvec) {
                a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
                ASSERT(a >= addr);
                if (a != addr) {
                        ASSERT(szc > 0);
                        segsize = a - addr;
                        error = as_iset1_default_lpsize(as, addr, segsize, szc,
                            save_szcvec);
                        if (error) {
                                return (error);
                        }
                        addr = a;
                }
                szcvec &= ~(1 << szc);
                if (szcvec) {
                        szc = highbit(szcvec) - 1;
                        pgsz = page_get_pagesize(szc);
                }
        }
        ASSERT(addr == eaddr);

        return (0);
}
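
/*
 * Illustration only (made-up values, sparc-style sizes where szc 1 is 64K
 * and szc 3 is 4M): given szcvec = 0xa and a 64K-aligned range
 * [0x10010000, 0x10810000), as_iset_default_lpsize() carves it into
 *
 *      [0x10010000, 0x10400000)  szc 1 (leading chunk up to 4M alignment)
 *      [0x10400000, 0x10800000)  szc 3 (the 4M-aligned middle)
 *      [0x10800000, 0x10810000)  szc 1 (trailing remainder)
 *
 * and hands each piece to as_iset1_default_lpsize().
 */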

/*
 * Set the default large page size for the range. Called via memcntl with
 * page size set to 0. as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
 * each chunk to as_iset_default_lpsize().
 */
int
as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
{
        struct seg *seg;
        caddr_t raddr;
        size_t rsize;
        size_t ssize;
        int rtype, rflags;
        int stype, sflags;
        int error;
        caddr_t setaddr;
        size_t setsize;
        int segvn;

        if (size == 0)
                return (0);

        AS_LOCK_ENTER(as, RW_WRITER);
again:
        error = 0;

        raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
            (size_t)raddr;

        if (raddr + rsize < raddr) {            /* check for wraparound */
                AS_LOCK_EXIT(as);
                return (ENOMEM);
        }
        as_clearwatchprot(as, raddr, rsize);
        seg = as_segat(as, raddr);
        if (seg == NULL) {
                as_setwatch(as);
                AS_LOCK_EXIT(as);
                return (ENOMEM);
        }
        if (seg->s_ops == &segvn_ops) {
                rtype = SEGOP_GETTYPE(seg, addr);
                rflags = rtype & (MAP_TEXT | MAP_INITDATA);
                rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
                segvn = 1;
        } else {
                segvn = 0;
        }
        setaddr = raddr;
        setsize = 0;

        for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
                if (raddr >= (seg->s_base + seg->s_size)) {
                        seg = AS_SEGNEXT(as, seg);
                        if (seg == NULL || raddr != seg->s_base) {
                                error = ENOMEM;
                                break;
                        }
                        if (seg->s_ops == &segvn_ops) {
                                stype = SEGOP_GETTYPE(seg, raddr);
                                sflags = stype & (MAP_TEXT | MAP_INITDATA);
                                stype &= (MAP_SHARED | MAP_PRIVATE);
                                if (segvn && (rflags != sflags ||
                                    rtype != stype)) {
                                        /*
                                         * The next segment is also segvn but
                                         * has different flags and/or type.
                                         */
                                        ASSERT(setsize != 0);
                                        error = as_iset_default_lpsize(as,
                                            setaddr, setsize, rflags, rtype);
                                        if (error) {
                                                break;
                                        }
                                        rflags = sflags;
                                        rtype = stype;
                                        setaddr = raddr;
                                        setsize = 0;
                                } else if (!segvn) {
                                        rflags = sflags;
                                        rtype = stype;
                                        setaddr = raddr;
                                        setsize = 0;
                                        segvn = 1;
                                }
                        } else if (segvn) {
                                /* The next segment is not segvn. */
                                ASSERT(setsize != 0);
                                error = as_iset_default_lpsize(as,
                                    setaddr, setsize, rflags, rtype);
                                if (error) {
                                        break;
                                }
                                segvn = 0;
                        }
                }
                if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
                        ssize = seg->s_base + seg->s_size - raddr;
                } else {
                        ssize = rsize;
                }
        }
        if (error == 0 && segvn) {
                /* The last chunk when rsize == 0. */
                ASSERT(setsize != 0);
                error = as_iset_default_lpsize(as, setaddr, setsize,
                    rflags, rtype);
        }

        if (error == IE_RETRY) {
                goto again;
        } else if (error == IE_NOMEM) {
                error = EAGAIN;
        } else if (error == ENOTSUP) {
                error = EINVAL;
        } else if (error == EAGAIN) {
                mutex_enter(&as->a_contents);
                if (!AS_ISNOUNMAPWAIT(as)) {
                        if (AS_ISUNMAPWAIT(as) == 0) {
                                cv_broadcast(&as->a_cv);
                        }
                        AS_SETUNMAPWAIT(as);
                        AS_LOCK_EXIT(as);
                        while (AS_ISUNMAPWAIT(as)) {
                                cv_wait(&as->a_cv, &as->a_contents);
                        }
                        mutex_exit(&as->a_contents);
                        AS_LOCK_ENTER(as, RW_WRITER);
                } else {
                        /*
                         * We may have raced with
                         * segvn_reclaim()/segspt_reclaim(). In this case
                         * clean nounmapwait flag and retry since softlockcnt
                         * in this segment may be already 0.  We don't drop as
                         * writer lock so our number of retries without
                         * sleeping should be very small. See segvn_reclaim()
                         * for more comments.
                         */
                        AS_CLRNOUNMAPWAIT(as);
                        mutex_exit(&as->a_contents);
                }
                goto again;
        }

        as_setwatch(as);
        AS_LOCK_EXIT(as);
        return (error);
}
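
/*
 * Sketch only (not part of the build) of the userland request that reaches
 * as_set_default_lpsize(): memcntl(2) with MC_HAT_ADVISE and a page size of
 * zero asks the kernel to choose the default large page size for the range.
 * The addr/len values below stand for whatever range the application wants
 * advised.
 */
#ifdef notdef
        struct memcntl_mha mha;

        mha.mha_cmd = MHA_MAPSIZE_VA;
        mha.mha_flags = 0;
        mha.mha_pagesize = 0;           /* 0 == pick the default size */
        (void) memcntl(addr, len, MC_HAT_ADVISE, (caddr_t)&mha, 0, 0);
#endif /* notdef */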

/*
 * Setup all of the uninitialized watched pages that we can.
 */
void
as_setwatch(struct as *as)
{
        struct watched_page *pwp;
        struct seg *seg;
        caddr_t vaddr;
        uint_t prot;
        int err, retrycnt;

        if (avl_numnodes(&as->a_wpage) == 0)
                return;

        ASSERT(AS_WRITE_HELD(as));

        for (pwp = avl_first(&as->a_wpage); pwp != NULL;
            pwp = AVL_NEXT(&as->a_wpage, pwp)) {
                retrycnt = 0;
        retry:
                vaddr = pwp->wp_vaddr;
                if (pwp->wp_oprot != 0 ||       /* already set up */
                    (seg = as_segat(as, vaddr)) == NULL ||
                    SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
                        continue;

                pwp->wp_oprot = prot;
                if (pwp->wp_read)
                        prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
                if (pwp->wp_write)
                        prot &= ~PROT_WRITE;
                if (pwp->wp_exec)
                        prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
                if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
                        err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
                        if (err == IE_RETRY) {
                                pwp->wp_oprot = 0;
                                ASSERT(retrycnt == 0);
                                retrycnt++;
                                goto retry;
                        }
                }
                pwp->wp_prot = prot;
        }
}
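
/*
 * Illustration only: how the protection math above implements a watchpoint.
 * For a write watchpoint (wp_write set) on a page originally mapped
 * PROT_READ|PROT_WRITE, wp_oprot saves the original protections and the
 * page is remapped with just PROT_READ.  Stores to the page then fault,
 * the trap code recognizes the watched page, and the saved wp_oprot lets
 * the original protections be restored afterwards.
 */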

/*
 * Clear all of the watched pages in the address space.
 */
void
as_clearwatch(struct as *as)
{
        struct watched_page *pwp;
        struct seg *seg;
        caddr_t vaddr;
        uint_t prot;
        int err, retrycnt;

        if (avl_numnodes(&as->a_wpage) == 0)
                return;

        ASSERT(AS_WRITE_HELD(as));

        for (pwp = avl_first(&as->a_wpage); pwp != NULL;
            pwp = AVL_NEXT(&as->a_wpage, pwp)) {
                retrycnt = 0;
        retry:
                vaddr = pwp->wp_vaddr;
                if (pwp->wp_oprot == 0 ||       /* not set up */
                    (seg = as_segat(as, vaddr)) == NULL)
                        continue;

                if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
                        err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
                        if (err == IE_RETRY) {
                                ASSERT(retrycnt == 0);
                                retrycnt++;
                                goto retry;
                        }
                }
                pwp->wp_oprot = 0;
                pwp->wp_prot = 0;
        }
}

/*
 * Force a new setup for all the watched pages in the range.
 */
static void
as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
{
        struct watched_page *pwp;
        struct watched_page tpw;
        caddr_t eaddr = addr + size;
        caddr_t vaddr;
        struct seg *seg;
        int err, retrycnt;
        uint_t wprot;
        avl_index_t where;

        if (avl_numnodes(&as->a_wpage) == 0)
                return;

        ASSERT(AS_WRITE_HELD(as));

        tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
                pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

        while (pwp != NULL && pwp->wp_vaddr < eaddr) {
                retrycnt = 0;
                vaddr = pwp->wp_vaddr;

                wprot = prot;
                if (pwp->wp_read)
                        wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
                if (pwp->wp_write)
                        wprot &= ~PROT_WRITE;
                if (pwp->wp_exec)
                        wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
                if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
                retry:
                        seg = as_segat(as, vaddr);
                        if (seg == NULL) {
                                panic("as_setwatchprot: no seg");
                                /*NOTREACHED*/
                        }
                        err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
                        if (err == IE_RETRY) {
                                ASSERT(retrycnt == 0);
                                retrycnt++;
                                goto retry;
                        }
                }
                pwp->wp_oprot = prot;
                pwp->wp_prot = wprot;

                pwp = AVL_NEXT(&as->a_wpage, pwp);
        }
}

/*
 * Clear all of the watched pages in the range.
 */
static void
as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
{
        caddr_t eaddr = addr + size;
        struct watched_page *pwp;
        struct watched_page tpw;
        uint_t prot;
        struct seg *seg;
        int err, retrycnt;
        avl_index_t where;

        if (avl_numnodes(&as->a_wpage) == 0)
                return;

        tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
        if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
                pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);

        ASSERT(AS_WRITE_HELD(as));

        while (pwp != NULL && pwp->wp_vaddr < eaddr) {

                if ((prot = pwp->wp_oprot) != 0) {
                        retrycnt = 0;

                        if (prot != pwp->wp_prot) {
                        retry:
                                seg = as_segat(as, pwp->wp_vaddr);
                                if (seg == NULL) {
                                        /*
                                         * The segment is gone; advance to
                                         * the next watched page rather than
                                         * retesting this entry forever.
                                         */
                                        pwp = AVL_NEXT(&as->a_wpage, pwp);
                                        continue;
                                }
                                err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
                                    PAGESIZE, prot);
                                if (err == IE_RETRY) {
                                        ASSERT(retrycnt == 0);
                                        retrycnt++;
                                        goto retry;
                                }
                        }
                        pwp->wp_oprot = 0;
                        pwp->wp_prot = 0;
                }

                pwp = AVL_NEXT(&as->a_wpage, pwp);
        }
}

void
as_signal_proc(struct as *as, k_siginfo_t *siginfo)
{
        struct proc *p;

        mutex_enter(&pidlock);
        for (p = practive; p; p = p->p_next) {
                if (p->p_as == as) {
                        mutex_enter(&p->p_lock);
                        if (p->p_as == as)
                                sigaddq(p, NULL, siginfo, KM_NOSLEEP);
                        mutex_exit(&p->p_lock);
                }
        }
        mutex_exit(&pidlock);
}

/*
 * return memory object ID
 */
int
as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
{
        struct seg      *seg;
        int             sts;

        AS_LOCK_ENTER(as, RW_READER);
        seg = as_segat(as, addr);
        if (seg == NULL) {
                AS_LOCK_EXIT(as);
                return (EFAULT);
        }
        /*
         * catch old drivers which may not support getmemid
         */
        if (seg->s_ops->getmemid == NULL) {
                AS_LOCK_EXIT(as);
                return (ENODEV);
        }

        sts = SEGOP_GETMEMID(seg, addr, memidp);

        AS_LOCK_EXIT(as);
        return (sts);
}