1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 */
31
32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
33 /* All Rights Reserved */
34
35 #include <sys/types.h>
36 #include <sys/inttypes.h>
37 #include <sys/param.h>
38 #include <sys/sysmacros.h>
39 #include <sys/systm.h>
40 #include <sys/signal.h>
41 #include <sys/user.h>
42 #include <sys/errno.h>
43 #include <sys/var.h>
44 #include <sys/proc.h>
45 #include <sys/tuneable.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/cred.h>
49 #include <sys/vnode.h>
50 #include <sys/vfs.h>
51 #include <sys/vm.h>
52 #include <sys/file.h>
53 #include <sys/mman.h>
54 #include <sys/vmparam.h>
55 #include <sys/fcntl.h>
56 #include <sys/lwpchan_impl.h>
57 #include <sys/nbmlock.h>
58 #include <sys/brand.h>
59
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_dev.h>
64 #include <vm/seg_vn.h>
65
66 int use_brk_lpg = 1;
67 int use_stk_lpg = 1;
68
69 static int brk_lpg(caddr_t nva);
70 static int grow_lpg(caddr_t sp);
71
72 int
73 brk(caddr_t nva)
74 {
75 int error;
76 proc_t *p = curproc;
77
78 /*
79 * Serialize brk operations on an address space.
80 * This also serves as the lock protecting p_brksize
81 * and p_brkpageszc.
82 */
83 as_rangelock(p->p_as);
84 if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
85 error = brk_lpg(nva);
86 } else {
87 error = brk_internal(nva, p->p_brkpageszc);
88 }
89 as_rangeunlock(p->p_as);
90 return ((error != 0 ? set_errno(error) : 0));
91 }
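/*
 * Illustrative sketch (not part of this file): a user-level sbrk() is
 * conventionally layered on top of the brk(2) syscall by caching the
 * current break. "brkval" and "my_sbrk" below are hypothetical names
 * for illustration only, not identifiers used by this code or by libc.
 *
 *	static char *brkval;		 hypothetical cached break
 *
 *	void *
 *	my_sbrk(intptr_t incr)
 *	{
 *		char *old = brkval;
 *		if (brk(old + incr) != 0)
 *			return ((void *)-1);	 errno set by brk(2)
 *		brkval = old + incr;
 *		return (old);
 *	}
 */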
92
93 /*
94 * Algorithm: call arch-specific map_pgsz to get best page size to use,
95 * then call brk_internal().
96 * Returns 0 on success.
97 */
98 static int
99 brk_lpg(caddr_t nva)
100 {
101 struct proc *p = curproc;
102 size_t pgsz, len;
103 caddr_t addr, brkend;
104 caddr_t bssbase = p->p_bssbase;
105 caddr_t brkbase = p->p_brkbase;
106 int oszc, szc;
107 int err;
108
109 oszc = p->p_brkpageszc;
110
111 /*
112 * If p_brkbase has not yet been set, the first call
113 * to brk_internal() will initialize it.
114 */
115 if (brkbase == 0) {
116 return (brk_internal(nva, oszc));
117 }
118
119 len = nva - bssbase;
120
121 pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
122 szc = page_szc(pgsz);
123
124 /*
125 * Covers two cases:
126 * 1. page_szc() returns -1 for invalid page size, so we want to
127 * ignore it in that case.
128 * 2. By design we never decrease page size, as it is more stable.
129 */
130 if (szc <= oszc) {
131 err = brk_internal(nva, oszc);
132 /* If failed, back off to base page size. */
133 if (err != 0 && oszc != 0) {
134 err = brk_internal(nva, 0);
135 }
136 return (err);
137 }
138
139 err = brk_internal(nva, szc);
140 /* If using szc failed, map with base page size and return. */
141 if (err != 0) {
142 if (szc != 0) {
143 err = brk_internal(nva, 0);
144 }
145 return (err);
146 }
147
148 /*
149 * Round up brk base to a large page boundary and remap
150 * anything in the segment already faulted in beyond that
151 * point.
152 */
153 addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
154 brkend = brkbase + p->p_brksize;
155 len = brkend - addr;
156 /* Check that len is not negative. Update page size code for heap. */
157 if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
158 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
159 p->p_brkpageszc = szc;
160 }
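/*
 * Worked example (hypothetical numbers): with a 4M large page and
 * p_bssbase == 0x8060000, addr is rounded up to 0x8400000. If the heap
 * now ends at brkend == 0x8c00000, then len == 0x800000, which is
 * 4M-aligned, so the two large pages in [0x8400000, 0x8c00000) are
 * remapped at the new size and p_brkpageszc is updated.
 */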
161
162 ASSERT(err == 0);
163 return (err); /* should always be 0 */
164 }
165
166 /*
167 * Returns 0 on success.
168 */
169 int
170 brk_internal(caddr_t nva, uint_t brkszc)
171 {
172 caddr_t ova; /* current break address */
173 size_t size;
174 int error;
175 struct proc *p = curproc;
176 struct as *as = p->p_as;
177 size_t pgsz;
178 uint_t szc;
179 rctl_qty_t as_rctl;
180
181 /*
182 * extend heap to brkszc alignment but use current p->p_brkpageszc
183 * for the newly created segment. This allows the new extension
184 * segment to be concatenated successfully with the existing brk
185 * segment.
186 */
187 if ((szc = brkszc) != 0) {
188 pgsz = page_get_pagesize(szc);
189 ASSERT(pgsz > PAGESIZE);
190 } else {
191 pgsz = PAGESIZE;
192 }
193
194 mutex_enter(&p->p_lock);
195 as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
196 p->p_rctls, p);
197 mutex_exit(&p->p_lock);
198
199 /*
200 * If p_brkbase has not yet been set, the first call
201 * to brk() will initialize it.
202 */
203 if (p->p_brkbase == 0)
204 p->p_brkbase = nva;
205
206 /*
207 * Before multiple page size support existed, p_brksize held the exact
208 * user-requested heap size, not rounded to the page size. If pgsz is
209 * greater than PAGESIZE, compute the real new heap size by rounding
210 * the request up to pgsz. This is useful because we may want to know
211 * where the heap ends without knowing the heap page size (e.g. some
212 * old code), and because if the heap page size changes we can update
213 * p_brkpageszc but delay adding the new mapping while still knowing
214 * from p_brksize where the heap really ends. The user-requested heap
215 * end is stored in a libc variable.
216 */
217 if (pgsz > PAGESIZE) {
218 caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
219 size = tnva - p->p_brkbase;
220 if (tnva < p->p_brkbase || (size > p->p_brksize &&
221 size > (size_t)as_rctl)) {
222 szc = 0;
223 pgsz = PAGESIZE;
224 size = nva - p->p_brkbase;
225 }
226 } else {
227 size = nva - p->p_brkbase;
228 }
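/*
 * Worked example (hypothetical numbers): with pgsz == 2M,
 * p_brkbase == 0x8050000 and a request of nva == 0x8100000, tnva is
 * rounded up to 0x8200000 and size becomes 0x1b0000. If that rounded
 * size both grows the heap and exceeds the RLIMIT_DATA rctl (or tnva
 * wrapped below p_brkbase), we fall back to szc 0 and the exact
 * PAGESIZE-based size instead.
 */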
229
230 /*
231 * use PAGESIZE to roundup ova because we want to know the real value
232 * of the current heap end in case p_brkpageszc changes since the last
233 * p_brksize was computed.
234 */
235 nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
236 ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
237 PAGESIZE);
238
239 if ((nva < p->p_brkbase) || (size > p->p_brksize &&
240 size > as_rctl)) {
241 mutex_enter(&p->p_lock);
242 (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
243 RCA_SAFE);
244 mutex_exit(&p->p_lock);
245 return (ENOMEM);
246 }
247
248 if (nva > ova) {
249 struct segvn_crargs crargs =
250 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
251
252 if (!(p->p_datprot & PROT_EXEC)) {
253 crargs.prot &= ~PROT_EXEC;
254 }
255
256 /*
257 * Add new zfod mapping to extend UNIX data segment
258 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
259 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
260 * page sizes if ova is not aligned to szc's pgsz.
261 */
262 if (szc > 0) {
263 caddr_t rbss;
264
265 rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
266 pgsz);
267 if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
268 crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
269 AS_MAP_NO_LPOOB;
270 } else if (ova == rbss) {
271 crargs.szc = szc;
272 } else {
273 crargs.szc = AS_MAP_HEAP;
274 }
275 } else {
276 crargs.szc = AS_MAP_NO_LPOOB;
277 }
278 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
279 error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
280 &crargs);
281 if (error) {
282 return (error);
283 }
284
285 } else if (nva < ova) {
286 /*
287 * Release mapping to shrink UNIX data segment.
288 */
289 (void) as_unmap(as, nva, (size_t)(ova - nva));
290 }
291 p->p_brksize = size;
292 return (0);
293 }
294
295 /*
296 * Grow the stack to include sp. Return 1 if successful, 0 otherwise.
297 * This routine assumes that the stack grows downward.
298 */
299 int
300 grow(caddr_t sp)
301 {
302 struct proc *p = curproc;
303 struct as *as = p->p_as;
304 size_t oldsize = p->p_stksize;
305 size_t newsize;
306 int err;
307
308 /*
309 * Serialize grow operations on an address space.
310 * This also serves as the lock protecting p_stksize
311 * and p_stkpageszc.
312 */
313 as_rangelock(as);
314 if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
315 err = grow_lpg(sp);
316 } else {
317 err = grow_internal(sp, p->p_stkpageszc);
318 }
319 as_rangeunlock(as);
320
321 if (err == 0 && (newsize = p->p_stksize) > oldsize) {
322 ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
323 ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
324 /*
325 * Set up translations so the process doesn't have to fault in
326 * the stack pages we just gave it.
327 */
328 (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
329 newsize - oldsize, F_INVAL, S_WRITE);
330 }
331 return ((err == 0 ? 1 : 0));
332 }
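/*
 * Context note (an observation, not something this file defines): grow()
 * is typically reached from the kernel's page-fault handling path when a
 * thread touches an address below the currently mapped stack, which is
 * why it reports success as 1/0 instead of returning an errno to a
 * system call.
 */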
333
334 /*
335 * Algorithm: call arch-specific map_pgsz to get best page size to use,
336 * then call grow_internal().
337 * Returns 0 on success.
338 */
339 static int
340 grow_lpg(caddr_t sp)
341 {
342 struct proc *p = curproc;
343 size_t pgsz;
344 size_t len, newsize;
345 caddr_t addr, saddr;
346 caddr_t growend;
347 int oszc, szc;
348 int err;
349
350 newsize = p->p_usrstack - sp;
351
352 oszc = p->p_stkpageszc;
353 pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
354 szc = page_szc(pgsz);
355
356 /*
357 * Covers two cases:
358 * 1. page_szc() returns -1 for invalid page size, so we want to
359 * ignore it in that case.
360 * 2. By design we never decrease page size, as it is more stable.
361 * This shouldn't happen as the stack never shrinks.
362 */
363 if (szc <= oszc) {
364 err = grow_internal(sp, oszc);
365 /* failed, fall back to base page size */
366 if (err != 0 && oszc != 0) {
367 err = grow_internal(sp, 0);
368 }
369 return (err);
370 }
371
372 /*
373 * We've grown sufficiently to switch to a new page size.
374 * So we are going to remap the whole segment with the new page size.
375 */
376 err = grow_internal(sp, szc);
377 /* The grow with szc failed, so fall back to base page size. */
378 if (err != 0) {
379 if (szc != 0) {
380 err = grow_internal(sp, 0);
381 }
382 return (err);
383 }
384
385 /*
386 * Round up stack pointer to a large page boundary and remap
387 * any pgsz pages in the segment already faulted in beyond that
388 * point.
389 */
390 saddr = p->p_usrstack - p->p_stksize;
391 addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
392 growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
393 len = growend - addr;
394 /* Check that len is not negative. Update page size code for stack. */
395 if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
396 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
397 p->p_stkpageszc = szc;
398 }
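/*
 * Worked example (hypothetical numbers): with a 4M large page,
 * p_usrstack == 0xffffc000 and p_stksize == 0x1000000, saddr is
 * 0xefffc000 and rounds up to addr == 0xf0000000; growend is
 * p_usrstack rounded down to 0xffc00000. len == 0xfc00000 is
 * 4M-aligned, so the already-grown part of the stack between those
 * boundaries is remapped with the larger page size.
 */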
399
400 ASSERT(err == 0);
401 return (err); /* should always be 0 */
402 }
403
404 /*
405 * This routine assumes that the stack grows downward.
406 * Returns 0 on success, errno on failure.
407 */
408 int
409 grow_internal(caddr_t sp, uint_t growszc)
410 {
411 struct proc *p = curproc;
412 size_t newsize;
413 size_t oldsize;
414 int error;
415 size_t pgsz;
416 uint_t szc;
417 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
418
419 ASSERT(sp < p->p_usrstack);
420 sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
421
422 /*
423 * grow to growszc alignment but use current p->p_stkpageszc for
424 * the segvn_crargs szc passed to segvn_create. For memcntl to
425 * increase the szc, this allows the new extension segment to be
426 * concatenated successfully with the existing stack segment.
427 */
428 if ((szc = growszc) != 0) {
429 pgsz = page_get_pagesize(szc);
430 ASSERT(pgsz > PAGESIZE);
431 newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
432 if (newsize > (size_t)p->p_stk_ctl) {
433 szc = 0;
434 pgsz = PAGESIZE;
435 newsize = p->p_usrstack - sp;
436 }
437 } else {
438 pgsz = PAGESIZE;
439 newsize = p->p_usrstack - sp;
440 }
441
442 if (newsize > (size_t)p->p_stk_ctl) {
443 (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
444 RCA_UNSAFE_ALL);
445
446 return (ENOMEM);
447 }
448
449 oldsize = p->p_stksize;
450 ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
451
452 if (newsize <= oldsize) { /* prevent the stack from shrinking */
453 return (0);
454 }
455
456 if (!(p->p_stkprot & PROT_EXEC)) {
457 crargs.prot &= ~PROT_EXEC;
458 }
459 /*
460 * extend stack with the proposed new growszc, which is different
461 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
462 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
463 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
464 * if not aligned to szc's pgsz.
465 */
466 if (szc > 0) {
467 caddr_t oldsp = p->p_usrstack - oldsize;
468 caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
469 pgsz);
470
471 if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
472 crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
473 AS_MAP_NO_LPOOB;
474 } else if (oldsp == austk) {
475 crargs.szc = szc;
476 } else {
477 crargs.szc = AS_MAP_STACK;
478 }
479 } else {
480 crargs.szc = AS_MAP_NO_LPOOB;
481 }
482 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
483
484 if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
485 segvn_create, &crargs)) != 0) {
486 if (error == EAGAIN) {
487 cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
488 "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
489 }
490 return (error);
491 }
492 p->p_stksize = newsize;
493 return (0);
494 }
495
496 /*
497 * Find address for user to map.
498 * If MAP_FIXED is not specified, we can pick any address we want, but we will
499 * first try the value in *addrp if it is non-NULL. This implements
500 * a way to try to get a preferred address.
501 */
502 int
503 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
504 int vacalign, uint_t flags)
505 {
506 #if defined(__amd64)
507 proc_t *p = curproc;
508 #endif
509 caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
510 size_t lenp;
511
512 ASSERT(AS_ISCLAIMGAP(as)); /* searches should be serialized */
513
514 /*
515 * If we have been provided a hint, we should still expand the lenp
516 * to be the rest of the address space. This will allow us to
517 * treat the hint as a strong desire to be "nearby" the provided
518 * address. If we can't satisfy the hint, as_gap() will walk forward.
519 */
520 if (flags & _MAP_LOW32)
521 lenp = (caddr_t)USERLIMIT32 - basep;
522 #if defined(__amd64)
523 else if (p->p_model == DATAMODEL_NATIVE)
524 lenp = p->p_usrstack - basep -
525 ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
526 #endif
527 else
528 lenp = as->a_userlimit - basep;
529
530 if (flags & MAP_FIXED) {
531 (void) as_unmap(as, *addrp, len);
532 return (0);
533 } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
534 !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
535 /* User supplied address was available */
536 *addrp = basep;
537 } else {
538 /*
539 * No user supplied address or the address supplied was not
540 * available.
541 */
542 map_addr(addrp, len, off, vacalign, flags);
543 }
544 if (*addrp == NULL)
545 return (ENOMEM);
546 return (0);
547 }
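/*
 * Summary of the cases above: MAP_FIXED trusts the caller's address and
 * simply clears any existing mappings there; a non-NULL hint without
 * MAP_ALIGN is honored when as_gap() finds room at (or, per the comment
 * above, forward of) the hint; otherwise map_addr() picks a fresh
 * address, and a NULL result means the address space is exhausted
 * (ENOMEM).
 */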
548
549 caddr_t
550 map_userlimit(proc_t *pp, struct as *as, int flags)
551 {
552 if (flags & _MAP_LOW32) {
553 if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
554 return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
555 } else {
556 return ((caddr_t)_userlimit32);
557 }
558 }
559
560 return (as->a_userlimit);
561 }
562
563
564 /*
565 * Used for MAP_ANON - fast way to get anonymous pages
566 */
567 static int
568 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
569 offset_t pos)
570 {
571 struct segvn_crargs vn_a;
572 int error;
573
574 if (((PROT_ALL & uprot) != uprot))
575 return (EACCES);
576
577 if ((flags & MAP_FIXED) != 0) {
578 /*
579 * Use the user address. First verify that
580 * the address to be used is page aligned.
581 * Then make some simple bounds checks.
582 */
583 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
584 return (EINVAL);
585
586 switch (valid_usr_range(*addrp, len, uprot, as,
587 map_userlimit(as->a_proc, as, flags))) {
588 case RANGE_OKAY:
589 break;
590 case RANGE_BADPROT:
591 return (ENOTSUP);
592 case RANGE_BADADDR:
593 default:
594 return (ENOMEM);
595 }
596 }
597 /*
598 * No need to worry about vac alignment for anonymous
599 * pages since this is a "clone" object that doesn't
600 * yet exist.
601 */
602 error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
603 if (error != 0) {
604 return (error);
605 }
606
607 /*
608 * Use the seg_vn segment driver; passing in the NULL amp
609 * gives the desired "cloning" effect.
610 */
611 vn_a.vp = NULL;
612 vn_a.offset = 0;
613 vn_a.type = flags & MAP_TYPE;
614 vn_a.prot = uprot;
615 vn_a.maxprot = PROT_ALL;
616 vn_a.flags = flags & ~MAP_TYPE;
617 vn_a.cred = CRED();
618 vn_a.amp = NULL;
619 vn_a.szc = 0;
620 vn_a.lgrp_mem_policy_flags = 0;
621
622 return (as_map(as, *addrp, len, segvn_create, &vn_a));
623 }
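/*
 * Illustrative sketch (not part of this file): the typical userland
 * request that ends up in zmap() is an anonymous private mapping, e.g.
 *
 *	void *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 * smmap_common() sees fp == NULL (fd -1 with MAP_ANON) and calls zmap()
 * directly rather than going through VOP_MAP().
 */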
624
625 static int
626 smmap_common(caddr_t *addrp, size_t len,
627 int prot, int flags, struct file *fp, offset_t pos)
628 {
629 struct vnode *vp;
630 struct as *as = curproc->p_as;
631 uint_t uprot, maxprot, type;
632 int error;
633 int in_crit = 0;
634
635 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
636 _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
637 MAP_TEXT | MAP_INITDATA)) != 0) {
638 /* | MAP_RENAME */ /* not implemented, let user know */
639 return (EINVAL);
640 }
641
642 if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
643 return (EINVAL);
644 }
645
646 if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
647 return (EINVAL);
648 }
649
650 #if defined(__sparc)
651 /*
652 * See if this is an "old mmap call". If so, remember this
653 * fact and convert the flags value given to mmap to indicate
654 * the specified address in the system call must be used.
655 * _MAP_NEW is set by all new uses of mmap.
656 */
657 if ((flags & _MAP_NEW) == 0)
658 flags |= MAP_FIXED;
659 #endif
660 flags &= ~_MAP_NEW;
661
662 type = flags & MAP_TYPE;
663 if (type != MAP_PRIVATE && type != MAP_SHARED)
664 return (EINVAL);
665
666
667 if (flags & MAP_ALIGN) {
668
669 if (flags & MAP_FIXED)
670 return (EINVAL);
671
672 /* alignment needs to be a power of 2 >= page size */
673 if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
674 !ISP2((uintptr_t)*addrp))
675 return (EINVAL);
676 }
677 /*
678 * Check for bad lengths and file position.
679 * We let the VOP_MAP routine check for negative lengths
680 * since on some vnode types this might be appropriate.
681 */
682 if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
683 return (EINVAL);
684
685 maxprot = PROT_ALL; /* start out allowing all accesses */
686 uprot = prot | PROT_USER;
687
688 if (fp == NULL) {
689 ASSERT(flags & MAP_ANON);
690 /* discard lwpchan mappings, like munmap() */
691 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
692 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
693 as_rangelock(as);
694 error = zmap(as, addrp, len, uprot, flags, pos);
695 as_rangeunlock(as);
696 /*
697 * Tell machine specific code that lwp has mapped shared memory
698 */
699 if (error == 0 && (flags & MAP_SHARED)) {
700 /* EMPTY */
701 LWP_MMODEL_SHARED_AS(*addrp, len);
702 }
703 return (error);
704 } else if ((flags & MAP_ANON) != 0)
705 return (EINVAL);
706
707 vp = fp->f_vnode;
708
709 /* Can't execute code from "noexec" mounted filesystem. */
710 if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
711 maxprot &= ~PROT_EXEC;
712
713 /*
714 * These checks were added as part of large file support.
715 *
716 * Return ENXIO if the initial position is negative; return EOVERFLOW
717 * if (offset + len) would overflow the maximum allowed offset for the
718 * type of file descriptor being used.
719 */
720 if (vp->v_type == VREG) {
721 if (pos < 0)
722 return (ENXIO);
723 if ((offset_t)len > (OFFSET_MAX(fp) - pos))
724 return (EOVERFLOW);
725 }
726
727 if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
728 /* no write access allowed */
729 maxprot &= ~PROT_WRITE;
730 }
731
732 /*
733 * XXX - Do we also adjust maxprot based on protections
734 * of the vnode? E.g. if no execute permission is given
735 * on the vnode for the current user, maxprot probably
736 * should disallow PROT_EXEC also? This is different
737 * from the write access as this would be a per vnode
738 * test as opposed to a per fd test for writability.
739 */
740
741 /*
742 * Verify that the specified protections are not greater than
743 * the maximum allowable protections. Also test to make sure
744 * that the file descriptor allows read access, since "write
745 * only" mappings are hard to do: normally we do the read from
746 * the file before the page can be written.
747 */
748 if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
749 return (EACCES);
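/*
 * For example, asking for PROT_WRITE on a MAP_SHARED mapping of a
 * descriptor opened O_RDONLY fails the check above with EACCES: FWRITE
 * was not set, so maxprot lost PROT_WRITE earlier. A descriptor opened
 * O_WRONLY fails as well, because FREAD is required.
 */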
750
751 /*
752 * If the user specified an address, do some simple checks here
753 */
754 if ((flags & MAP_FIXED) != 0) {
755 /*
756 * Use the user address. First verify that
757 * the address to be used is page aligned.
758 * Then make some simple bounds checks.
759 */
760 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
761 return (EINVAL);
762 switch (valid_usr_range(*addrp, len, uprot, as,
763 map_userlimit(curproc, as, flags))) {
764 case RANGE_OKAY:
765 break;
766 case RANGE_BADPROT:
767 return (ENOTSUP);
768 case RANGE_BADADDR:
769 default:
770 return (ENOMEM);
771 }
772 }
773
774 if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
775 nbl_need_check(vp)) {
776 int svmand;
777 nbl_op_t nop;
778
779 nbl_start_crit(vp, RW_READER);
780 in_crit = 1;
781 error = nbl_svmand(vp, fp->f_cred, &svmand);
782 if (error != 0)
783 goto done;
784 if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
785 if (prot & (PROT_READ | PROT_EXEC)) {
786 nop = NBL_READWRITE;
787 } else {
788 nop = NBL_WRITE;
789 }
790 } else {
791 nop = NBL_READ;
792 }
793 if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
794 error = EACCES;
795 goto done;
796 }
797 }
798
799 /* discard lwpchan mappings, like munmap() */
800 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
801 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
802
803 /*
804 * Ok, now let the vnode map routine do its thing to set things up.
805 */
806 error = VOP_MAP(vp, pos, as,
807 addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
808
809 if (error == 0) {
810 /*
811 * Tell machine specific code that lwp has mapped shared memory
812 */
813 if (flags & MAP_SHARED) {
814 /* EMPTY */
815 LWP_MMODEL_SHARED_AS(*addrp, len);
816 }
817 if (vp->v_type == VREG &&
818 (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
819 /*
820 * Mark this as an executable vnode
821 */
822 mutex_enter(&vp->v_lock);
823 vp->v_flag |= VVMEXEC;
824 mutex_exit(&vp->v_lock);
825 }
826 }
827
828 done:
829 if (in_crit)
830 nbl_end_crit(vp);
831 return (error);
832 }
833
834 #ifdef _LP64
835 /*
836 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
837 *
838 * The "large file" mmap routine mmap64(2) is also mapped to this routine
839 * by the 64-bit version of libc.
840 *
841 * Eventually, this should be the only version, and have smmap_common()
842 * folded back into it again. Some day.
843 */
844 caddr_t
845 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
846 {
847 struct file *fp;
848 int error;
849
850 if (fd == -1 && (flags & MAP_ANON) != 0)
851 error = smmap_common(&addr, len, prot, flags,
852 NULL, (offset_t)pos);
853 else if ((fp = getf(fd)) != NULL) {
854 error = smmap_common(&addr, len, prot, flags,
855 fp, (offset_t)pos);
856 releasef(fd);
857 } else
858 error = EBADF;
859
860 return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
861 }
862 #endif /* _LP64 */
863
864 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
865
866 /*
867 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
868 */
869 caddr_t
870 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
871 {
872 struct file *fp;
873 int error;
874 caddr_t a = (caddr_t)(uintptr_t)addr;
875
876 if (flags & _MAP_LOW32)
877 error = EINVAL;
878 else if (fd == -1 && (flags & MAP_ANON) != 0)
879 error = smmap_common(&a, (size_t)len, prot,
880 flags | _MAP_LOW32, NULL, (offset_t)pos);
881 else if ((fp = getf(fd)) != NULL) {
882 error = smmap_common(&a, (size_t)len, prot,
883 flags | _MAP_LOW32, fp, (offset_t)pos);
884 releasef(fd);
885 } else
886 error = EBADF;
887
888 ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
889
890 return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
891 }
892
893 /*
894 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
895 *
896 * Now things really get ugly because we can't use the C-style
897 * calling convention for more than 6 args, and 64-bit parameter
898 * passing on 32-bit systems is less than clean.
899 */
900
901 struct mmaplf32a {
902 caddr_t addr;
903 size_t len;
904 #ifdef _LP64
905 /*
906 * 32-bit contents, 64-bit cells
907 */
908 uint64_t prot;
909 uint64_t flags;
910 uint64_t fd;
911 uint64_t offhi;
912 uint64_t offlo;
913 #else
914 /*
915 * 32-bit contents, 32-bit cells
916 */
917 uint32_t prot;
918 uint32_t flags;
919 uint32_t fd;
920 uint32_t offhi;
921 uint32_t offlo;
922 #endif
923 };
924
925 int
926 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
927 {
928 struct file *fp;
929 int error;
930 caddr_t a = uap->addr;
931 int flags = (int)uap->flags;
932 int fd = (int)uap->fd;
933 #ifdef _BIG_ENDIAN
934 offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
935 #else
936 offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
937 #endif
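/*
 * The 64-bit file offset arrives as two 32-bit argument cells. The field
 * names reflect big-endian (SPARC) argument order; on little-endian
 * systems the halves arrive swapped, so offlo actually carries the high
 * word there. E.g. on a big-endian kernel, offhi == 0x1 and
 * offlo == 0x80000000 reassemble to off == 0x180000000.
 */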
938
939 if (flags & _MAP_LOW32)
940 error = EINVAL;
941 else if (fd == -1 && (flags & MAP_ANON) != 0)
942 error = smmap_common(&a, uap->len, (int)uap->prot,
943 flags | _MAP_LOW32, NULL, off);
944 else if ((fp = getf(fd)) != NULL) {
945 error = smmap_common(&a, uap->len, (int)uap->prot,
946 flags | _MAP_LOW32, fp, off);
947 releasef(fd);
948 } else
949 error = EBADF;
950
951 if (error == 0)
952 rvp->r_val1 = (uintptr_t)a;
953 return (error);
954 }
955
956 #endif /* _SYSCALL32_IMPL || _ILP32 */
957
958 int
959 munmap(caddr_t addr, size_t len)
960 {
961 struct proc *p = curproc;
962 struct as *as = p->p_as;
963
964 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
965 return (set_errno(EINVAL));
966
967 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
968 return (set_errno(EINVAL));
969
970 /*
971 * Discard lwpchan mappings.
972 */
973 if (p->p_lcp != NULL)
974 lwpchan_delete_mapping(p, addr, addr + len);
975 if (as_unmap(as, addr, len) != 0)
976 return (set_errno(EINVAL));
977
978 return (0);
979 }
980
981 int
982 mprotect(caddr_t addr, size_t len, int prot)
983 {
984 struct as *as = curproc->p_as;
985 uint_t uprot = prot | PROT_USER;
986 int error;
987
988 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
989 return (set_errno(EINVAL));
990
991 switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
992 case RANGE_OKAY:
993 break;
994 case RANGE_BADPROT:
995 return (set_errno(ENOTSUP));
996 case RANGE_BADADDR:
997 default:
998 return (set_errno(ENOMEM));
999 }
1000
1001 error = as_setprot(as, addr, len, uprot);
1002 if (error)
1003 return (set_errno(error));
1004 return (0);
1005 }
1006
1007 #define MC_CACHE 128 /* internal result buffer */
1008 #define MC_QUANTUM (MC_CACHE * PAGESIZE) /* addresses covered in loop */
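/*
 * Example (assuming a 4K PAGESIZE): MC_QUANTUM is 512K, so a 2M
 * mincore() request is handled as four passes of the loop in mincore()
 * below, each filling the 128-byte on-stack vec[] (one byte per page)
 * before copying it out to the caller.
 */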
1009
1010 int
1011 mincore(caddr_t addr, size_t len, char *vecp)
1012 {
1013 struct as *as = curproc->p_as;
1014 caddr_t ea; /* end address of loop */
1015 size_t rl; /* inner result length */
1016 char vec[MC_CACHE]; /* local vector cache */
1017 int error;
1018 model_t model;
1019 long llen;
1020
1021 model = get_udatamodel();
1022 /*
1023 * Validate form of address parameters.
1024 */
1025 if (model == DATAMODEL_NATIVE) {
1026 llen = (long)len;
1027 } else {
1028 llen = (int32_t)(size32_t)len;
1029 }
1030 if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1031 return (set_errno(EINVAL));
1032
1033 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1034 return (set_errno(ENOMEM));
1035
1036 /*
1037 * Loop over subranges of interval [addr : addr + len), recovering
1038 * results internally and then copying them out to caller. Subrange
1039 * is based on the size of MC_CACHE, defined above.
1040 */
1041 for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1042 error = as_incore(as, addr,
1043 (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1044 if (rl != 0) {
1045 rl = (rl + PAGESIZE - 1) / PAGESIZE;
1046 if (copyout(vec, vecp, rl) != 0)
1047 return (set_errno(EFAULT));
1048 vecp += rl;
1049 }
1050 if (error != 0)
1051 return (set_errno(ENOMEM));
1052 }
1053 return (0);
1054 }