--- old/usr/src/uts/common/os/grow.c
+++ new/usr/src/uts/common/os/grow.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
  24   24   * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  /*
  28   28   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  29   29   * Use is subject to license terms.
  30   30   */
  31   31  
  32   32  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  33   33  /*        All Rights Reserved   */
  34   34  
  35   35  #include <sys/types.h>
  36   36  #include <sys/inttypes.h>
  37   37  #include <sys/param.h>
  38   38  #include <sys/sysmacros.h>
  39   39  #include <sys/systm.h>
  40   40  #include <sys/signal.h>
  41   41  #include <sys/user.h>
  42   42  #include <sys/errno.h>
  43   43  #include <sys/var.h>
  44   44  #include <sys/proc.h>
  45   45  #include <sys/tuneable.h>
  46   46  #include <sys/debug.h>
  47   47  #include <sys/cmn_err.h>
  48   48  #include <sys/cred.h>
  49   49  #include <sys/vnode.h>
  50   50  #include <sys/vfs.h>
  51   51  #include <sys/vm.h>
  52   52  #include <sys/file.h>
  53   53  #include <sys/mman.h>
  54   54  #include <sys/vmparam.h>
  55   55  #include <sys/fcntl.h>
  56   56  #include <sys/lwpchan_impl.h>
  57   57  #include <sys/nbmlock.h>
  58   58  #include <sys/brand.h>
  59   59  
  60   60  #include <vm/hat.h>
  61   61  #include <vm/as.h>
  62   62  #include <vm/seg.h>
  63   63  #include <vm/seg_dev.h>
  64   64  #include <vm/seg_vn.h>
  65   65  
  66   66  int use_brk_lpg = 1;
  67   67  int use_stk_lpg = 1;
  68   68  
  69   69  static int brk_lpg(caddr_t nva);
  70   70  static int grow_lpg(caddr_t sp);
  71   71  
  72   72  int
  73   73  brk(caddr_t nva)
  74   74  {
  75   75          int error;
  76   76          proc_t *p = curproc;
  77   77  
  78   78          /*
  79   79           * Serialize brk operations on an address space.
  80   80           * This also serves as the lock protecting p_brksize
  81   81           * and p_brkpageszc.
  82   82           */
  83   83          as_rangelock(p->p_as);
  84   84          if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
  85   85                  error = brk_lpg(nva);
  86   86          } else {
  87   87                  error = brk_internal(nva, p->p_brkpageszc);
  88   88          }
  89   89          as_rangeunlock(p->p_as);
  90   90          return ((error != 0 ? set_errno(error) : 0));
  91   91  }
  92   92  
  93   93  /*
  94   94   * Algorithm: call arch-specific map_pgsz to get best page size to use,
  95   95   * then call brk_internal().
  96   96   * Returns 0 on success.
  97   97   */
  98   98  static int
  99   99  brk_lpg(caddr_t nva)
 100  100  {
 101  101          struct proc *p = curproc;
 102  102          size_t pgsz, len;
 103  103          caddr_t addr, brkend;
 104  104          caddr_t bssbase = p->p_bssbase;
 105  105          caddr_t brkbase = p->p_brkbase;
 106  106          int oszc, szc;
 107  107          int err;
 108  108  
 109  109          oszc = p->p_brkpageszc;
 110  110  
 111  111          /*
 112  112           * If p_brkbase has not yet been set, the first call
 113  113           * to brk_internal() will initialize it.
 114  114           */
 115  115          if (brkbase == 0) {
 116  116                  return (brk_internal(nva, oszc));
 117  117          }
 118  118  
 119  119          len = nva - bssbase;
 120  120  
 121  121          pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
 122  122          szc = page_szc(pgsz);
 123  123  
 124  124          /*
 125  125           * Covers two cases:
 126  126           * 1. page_szc() returns -1 for invalid page size, so we want to
 127  127           * ignore it in that case.
 128  128           * 2. By design we never decrease page size, as it is more stable.
 129  129           */
 130  130          if (szc <= oszc) {
 131  131                  err = brk_internal(nva, oszc);
 132  132                  /* If failed, back off to base page size. */
 133  133                  if (err != 0 && oszc != 0) {
 134  134                          err = brk_internal(nva, 0);
 135  135                  }
 136  136                  return (err);
 137  137          }
 138  138  
 139  139          err = brk_internal(nva, szc);
 140  140          /* If using szc failed, map with base page size and return. */
 141  141          if (err != 0) {
 142  142                  if (szc != 0) {
 143  143                          err = brk_internal(nva, 0);
 144  144                  }
 145  145                  return (err);
 146  146          }
 147  147  
 148  148          /*
 149  149           * Round up brk base to a large page boundary and remap
 150  150           * anything in the segment already faulted in beyond that
 151  151           * point.
 152  152           */
 153  153          addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
 154  154          brkend = brkbase + p->p_brksize;
 155  155          len = brkend - addr;
 156  156          /* Check that len is not negative. Update page size code for heap. */
 157  157          if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
 158  158                  (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
 159  159                  p->p_brkpageszc = szc;
 160  160          }
 161  161  
 162  162          ASSERT(err == 0);
 163  163          return (err);           /* should always be 0 */
 164  164  }
 165  165  
 166  166  /*
 167  167   * Returns 0 on success.
 168  168   */
 169  169  int
 170  170  brk_internal(caddr_t nva, uint_t brkszc)
 171  171  {
 172  172          caddr_t ova;                    /* current break address */
 173  173          size_t size;
 174  174          int     error;
 175  175          struct proc *p = curproc;
 176  176          struct as *as = p->p_as;
 177  177          size_t pgsz;
 178  178          uint_t szc;
 179  179          rctl_qty_t as_rctl;
 180  180  
 181  181          /*
 182  182           * extend heap to brkszc alignment but use current p->p_brkpageszc
 183  183           * for the newly created segment. This allows the new extension
 184  184           * segment to be concatenated successfully with the existing brk
 185  185           * segment.
 186  186           */
 187  187          if ((szc = brkszc) != 0) {
 188  188                  pgsz = page_get_pagesize(szc);
 189  189                  ASSERT(pgsz > PAGESIZE);
 190  190          } else {
 191  191                  pgsz = PAGESIZE;
 192  192          }
 193  193  
 194  194          mutex_enter(&p->p_lock);
 195  195          as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
 196  196              p->p_rctls, p);
 197  197          mutex_exit(&p->p_lock);
 198  198  
 199  199          /*
 200  200           * If p_brkbase has not yet been set, the first call
 201  201           * to brk() will initialize it.
 202  202           */
 203  203          if (p->p_brkbase == 0)
 204  204                  p->p_brkbase = nva;
 205  205  
 206  206          /*
 207  207           * Before multiple page size support existed p_brksize was the value
 208  208           * not rounded to the pagesize (i.e. it stored the exact user request
 209  209           * for heap size). If pgsz is greater than PAGESIZE calculate the
 210  210           * heap size as the real new heap size by rounding it up to pgsz.
 211  211           * This is useful since we may want to know where the heap ends
 212  212           * without knowing heap pagesize (e.g. some old code) and also if
 213  213           * heap pagesize changes we can update p_brkpageszc but delay adding
 214  214           * new mapping yet still know from p_brksize where the heap really
 215  215           * ends. The user requested heap end is stored in libc variable.
 216  216           */
 217  217          if (pgsz > PAGESIZE) {
 218  218                  caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
 219  219                  size = tnva - p->p_brkbase;
 220  220                  if (tnva < p->p_brkbase || (size > p->p_brksize &&
 221  221                      size > (size_t)as_rctl)) {
 222  222                          szc = 0;
 223  223                          pgsz = PAGESIZE;
 224  224                          size = nva - p->p_brkbase;
 225  225                  }
 226  226          } else {
 227  227                  size = nva - p->p_brkbase;
 228  228          }
 229  229  
 230  230          /*
 231  231           * use PAGESIZE to roundup ova because we want to know the real value
 232  232           * of the current heap end in case p_brkpageszc changes since the last
 233  233           * p_brksize was computed.
 234  234           */
 235  235          nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
 236  236          ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
 237  237              PAGESIZE);
 238  238  
 239  239          if ((nva < p->p_brkbase) || (size > p->p_brksize &&
 240  240              size > as_rctl)) {
 241  241                  mutex_enter(&p->p_lock);
 242  242                  (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
 243  243                      RCA_SAFE);
 244  244                  mutex_exit(&p->p_lock);
 245  245                  return (ENOMEM);
 246  246          }
 247  247  
 248  248          if (nva > ova) {
 249  249                  struct segvn_crargs crargs =
 250  250                      SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 251  251  
 252  252                  if (!(p->p_datprot & PROT_EXEC)) {
 253  253                          crargs.prot &= ~PROT_EXEC;
 254  254                  }
 255  255  
 256  256                  /*
 257  257                   * Add new zfod mapping to extend UNIX data segment
 258  258                   * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
 259  259                   * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
 260  260                   * page sizes if ova is not aligned to szc's pgsz.
 261  261                   */
 262  262                  if (szc > 0) {
 263  263                          caddr_t rbss;
 264  264  
 265  265                          rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
 266  266                              pgsz);
 267  267                          if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
 268  268                                  crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
 269  269                                      AS_MAP_NO_LPOOB;
 270  270                          } else if (ova == rbss) {
 271  271                                  crargs.szc = szc;
 272  272                          } else {
 273  273                                  crargs.szc = AS_MAP_HEAP;
 274  274                          }
 275  275                  } else {
 276  276                          crargs.szc = AS_MAP_NO_LPOOB;
 277  277                  }
 278  278                  crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
 279  279                  error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
 280  280                      &crargs);
 281  281                  if (error) {
 282  282                          return (error);
 283  283                  }
 284  284  
 285  285          } else if (nva < ova) {
 286  286                  /*
 287  287                   * Release mapping to shrink UNIX data segment.
 288  288                   */
 289  289                  (void) as_unmap(as, nva, (size_t)(ova - nva));
 290  290          }
 291  291          p->p_brksize = size;
 292  292          return (0);
 293  293  }
 294  294  
 295  295  /*
 296  296   * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 297  297   * This routine assumes that the stack grows downward.
 298  298   */
 299  299  int
 300  300  grow(caddr_t sp)
 301  301  {
 302  302          struct proc *p = curproc;
 303  303          struct as *as = p->p_as;
 304  304          size_t oldsize = p->p_stksize;
 305  305          size_t newsize;
 306  306          int err;
 307  307  
 308  308          /*
 309  309           * Serialize grow operations on an address space.
 310  310           * This also serves as the lock protecting p_stksize
 311  311           * and p_stkpageszc.
 312  312           */
 313  313          as_rangelock(as);
 314  314          if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
 315  315                  err = grow_lpg(sp);
 316  316          } else {
 317  317                  err = grow_internal(sp, p->p_stkpageszc);
 318  318          }
 319  319          as_rangeunlock(as);
 320  320  
 321  321          if (err == 0 && (newsize = p->p_stksize) > oldsize) {
 322  322                  ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
 323  323                  ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
 324  324                  /*
 325  325                   * Set up translations so the process doesn't have to fault in
 326  326                   * the stack pages we just gave it.
 327  327                   */
 328  328                  (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
 329  329                      newsize - oldsize, F_INVAL, S_WRITE);
 330  330          }
 331  331          return ((err == 0 ? 1 : 0));
 332  332  }
 333  333  
 334  334  /*
 335  335   * Algorithm: call arch-specific map_pgsz to get best page size to use,
 336  336   * then call grow_internal().
 337  337   * Returns 0 on success.
 338  338   */
 339  339  static int
 340  340  grow_lpg(caddr_t sp)
 341  341  {
 342  342          struct proc *p = curproc;
 343  343          size_t pgsz;
 344  344          size_t len, newsize;
 345  345          caddr_t addr, saddr;
 346  346          caddr_t growend;
 347  347          int oszc, szc;
 348  348          int err;
 349  349  
 350  350          newsize = p->p_usrstack - sp;
 351  351  
 352  352          oszc = p->p_stkpageszc;
 353  353          pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
 354  354          szc = page_szc(pgsz);
 355  355  
 356  356          /*
 357  357           * Covers two cases:
 358  358           * 1. page_szc() returns -1 for invalid page size, so we want to
 359  359           * ignore it in that case.
 360  360           * 2. By design we never decrease page size, as it is more stable.
 361  361           * This shouldn't happen as the stack never shrinks.
 362  362           */
 363  363          if (szc <= oszc) {
 364  364                  err = grow_internal(sp, oszc);
 365  365                  /* failed, fall back to base page size */
 366  366                  if (err != 0 && oszc != 0) {
 367  367                          err = grow_internal(sp, 0);
 368  368                  }
 369  369                  return (err);
 370  370          }
 371  371  
 372  372          /*
 373  373           * We've grown sufficiently to switch to a new page size.
 374  374           * So we are going to remap the whole segment with the new page size.
 375  375           */
 376  376          err = grow_internal(sp, szc);
 377  377          /* The grow with szc failed, so fall back to base page size. */
 378  378          if (err != 0) {
 379  379                  if (szc != 0) {
 380  380                          err = grow_internal(sp, 0);
 381  381                  }
 382  382                  return (err);
 383  383          }
 384  384  
 385  385          /*
 386  386           * Round up stack pointer to a large page boundary and remap
 387  387           * any pgsz pages in the segment already faulted in beyond that
 388  388           * point.
 389  389           */
 390  390          saddr = p->p_usrstack - p->p_stksize;
 391  391          addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
 392  392          growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
 393  393          len = growend - addr;
 394  394          /* Check that len is not negative. Update page size code for stack. */
 395  395          if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
 396  396                  (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
 397  397                  p->p_stkpageszc = szc;
 398  398          }
 399  399  
 400  400          ASSERT(err == 0);
 401  401          return (err);           /* should always be 0 */
 402  402  }
 403  403  
 404  404  /*
 405  405   * This routine assumes that the stack grows downward.
 406  406   * Returns 0 on success, errno on failure.
 407  407   */
 408  408  int
 409  409  grow_internal(caddr_t sp, uint_t growszc)
 410  410  {
 411  411          struct proc *p = curproc;
 412  412          size_t newsize;
 413  413          size_t oldsize;
 414  414          int    error;
 415  415          size_t pgsz;
 416  416          uint_t szc;
 417  417          struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 418  418  
 419  419          ASSERT(sp < p->p_usrstack);
 420  420          sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
 421  421  
 422  422          /*
 423  423           * grow to growszc alignment but use current p->p_stkpageszc for
 424  424           * the segvn_crargs szc passed to segvn_create. For memcntl to
 425  425           * increase the szc, this allows the new extension segment to be
 426  426           * concatenated successfully with the existing stack segment.
 427  427           */
 428  428          if ((szc = growszc) != 0) {
 429  429                  pgsz = page_get_pagesize(szc);
 430  430                  ASSERT(pgsz > PAGESIZE);
 431  431                  newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
 432  432                  if (newsize > (size_t)p->p_stk_ctl) {
 433  433                          szc = 0;
 434  434                          pgsz = PAGESIZE;
 435  435                          newsize = p->p_usrstack - sp;
 436  436                  }
 437  437          } else {
 438  438                  pgsz = PAGESIZE;
 439  439                  newsize = p->p_usrstack - sp;
 440  440          }
 441  441  
 442  442          if (newsize > (size_t)p->p_stk_ctl) {
 443  443                  (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
 444  444                      RCA_UNSAFE_ALL);
 445  445  
 446  446                  return (ENOMEM);
 447  447          }
 448  448  
 449  449          oldsize = p->p_stksize;
 450  450          ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
 451  451  
 452  452          if (newsize <= oldsize) {       /* prevent the stack from shrinking */
 453  453                  return (0);
 454  454          }
 455  455  
 456  456          if (!(p->p_stkprot & PROT_EXEC)) {
 457  457                  crargs.prot &= ~PROT_EXEC;
 458  458          }
 459  459          /*
 460  460           * extend stack with the proposed new growszc, which is different
 461  461           * than p_stkpageszc only on a memcntl to increase the stack pagesize.
 462  462           * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
 463  463           * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
 464  464           * if not aligned to szc's pgsz.
 465  465           */
 466  466          if (szc > 0) {
 467  467                  caddr_t oldsp = p->p_usrstack - oldsize;
 468  468                  caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
 469  469                      pgsz);
 470  470  
 471  471                  if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
 472  472                          crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
 473  473                              AS_MAP_NO_LPOOB;
 474  474                  } else if (oldsp == austk) {
 475  475                          crargs.szc = szc;
 476  476                  } else {
 477  477                          crargs.szc = AS_MAP_STACK;
 478  478                  }
 479  479          } else {
 480  480                  crargs.szc = AS_MAP_NO_LPOOB;
 481  481          }
 482  482          crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
 483  483  
 484  484          if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
 485  485              segvn_create, &crargs)) != 0) {
 486  486                  if (error == EAGAIN) {
 487  487                          cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
 488  488                              "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
 489  489                  }
 490  490                  return (error);
 491  491          }
 492  492          p->p_stksize = newsize;
 493  493          return (0);
 494  494  }
 495  495  
  
 496  496  /*
 497  497   * Find address for user to map.
 498  498   * If MAP_FIXED is not specified, we can pick any address we want, but we will
 499  499   * first try the value in *addrp if it is non-NULL.  Thus this is implementing
 500  500   * a way to try and get a preferred address.
 501  501   */
 502  502  int
 503  503  choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
 504  504      int vacalign, uint_t flags)
 505  505  {
      506 +#if defined(__amd64)
      507 +        proc_t *p = curproc;
      508 +#endif
 506  509          caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
 507      -        size_t lenp = len;
      510 +        size_t lenp;
 508  511  
 509  512          ASSERT(AS_ISCLAIMGAP(as));      /* searches should be serialized */
      513 +
      514 +        /*
      515 +         * If we have been provided a hint, we should still expand the lenp
      516 +         * to be the rest of the address space.  This will allow us to
      517 +         * treat the hint as a strong desire to be "nearby" the provided
      518 +         * address.  If we can't satisfy the hint, as_gap() will walk forward.
      519 +         */
      520 +        if (flags & _MAP_LOW32)
      521 +                lenp = (caddr_t)USERLIMIT32 - basep;
      522 +#if defined(__amd64)
      523 +        else if (p->p_model == DATAMODEL_NATIVE)
      524 +                lenp = p->p_usrstack - basep -
      525 +                    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
      526 +#endif
      527 +        else
      528 +                lenp = as->a_userlimit - basep;
      529 +
 510  530          if (flags & MAP_FIXED) {
 511  531                  (void) as_unmap(as, *addrp, len);
 512  532                  return (0);
 513  533          } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
 514  534              !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
 515  535                  /* User supplied address was available */
 516  536                  *addrp = basep;
 517  537          } else {
 518  538                  /*
 519  539                   * No user supplied address or the address supplied was not
 520  540                   * available.
 521  541                   */
 522  542                  map_addr(addrp, len, off, vacalign, flags);
 523  543          }
 524  544          if (*addrp == NULL)
 525  545                  return (ENOMEM);
 526  546          return (0);
 527  547  }
 528  548  
 529  549  caddr_t
 530  550  map_userlimit(proc_t *pp, struct as *as, int flags)
 531  551  {
 532  552          if (flags & _MAP_LOW32) {
 533  553                  if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
 534  554                          return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
 535  555                  } else {
 536  556                          return ((caddr_t)_userlimit32);
 537  557                  }
 538  558          }
 539  559  
 540  560          return (as->a_userlimit);
 541  561  }
 542  562  
 543  563  
 544  564  /*
 545  565   * Used for MAP_ANON - fast way to get anonymous pages
 546  566   */
 547  567  static int
 548  568  zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
 549  569      offset_t pos)
 550  570  {
 551  571          struct segvn_crargs vn_a;
 552  572          int error;
 553  573  
 554  574          if (((PROT_ALL & uprot) != uprot))
 555  575                  return (EACCES);
 556  576  
 557  577          if ((flags & MAP_FIXED) != 0) {
 558  578                  /*
 559  579                   * Use the user address.  First verify that
 560  580                   * the address to be used is page aligned.
 561  581                   * Then make some simple bounds checks.
 562  582                   */
 563  583                  if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 564  584                          return (EINVAL);
 565  585  
 566  586                  switch (valid_usr_range(*addrp, len, uprot, as,
 567  587                      map_userlimit(as->a_proc, as, flags))) {
 568  588                  case RANGE_OKAY:
 569  589                          break;
 570  590                  case RANGE_BADPROT:
 571  591                          return (ENOTSUP);
 572  592                  case RANGE_BADADDR:
 573  593                  default:
 574  594                          return (ENOMEM);
 575  595                  }
 576  596          }
 577  597          /*
 578  598           * No need to worry about vac alignment for anonymous
 579  599           * pages since this is a "clone" object that doesn't
 580  600           * yet exist.
 581  601           */
 582  602          error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
 583  603          if (error != 0) {
 584  604                  return (error);
 585  605          }
 586  606  
 587  607          /*
 588  608           * Use the seg_vn segment driver; passing in the NULL amp
 589  609           * gives the desired "cloning" effect.
 590  610           */
 591  611          vn_a.vp = NULL;
 592  612          vn_a.offset = 0;
 593  613          vn_a.type = flags & MAP_TYPE;
 594  614          vn_a.prot = uprot;
 595  615          vn_a.maxprot = PROT_ALL;
 596  616          vn_a.flags = flags & ~MAP_TYPE;
 597  617          vn_a.cred = CRED();
 598  618          vn_a.amp = NULL;
 599  619          vn_a.szc = 0;
 600  620          vn_a.lgrp_mem_policy_flags = 0;
 601  621  
 602  622          return (as_map(as, *addrp, len, segvn_create, &vn_a));
 603  623  }
 604  624  
 605  625  static int
 606  626  smmap_common(caddr_t *addrp, size_t len,
 607  627      int prot, int flags, struct file *fp, offset_t pos)
 608  628  {
 609  629          struct vnode *vp;
 610  630          struct as *as = curproc->p_as;
 611  631          uint_t uprot, maxprot, type;
 612  632          int error;
 613  633          int in_crit = 0;
 614  634  
 615  635          if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
 616  636              _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
 617  637              MAP_TEXT | MAP_INITDATA)) != 0) {
 618  638                  /* | MAP_RENAME */      /* not implemented, let user know */
 619  639                  return (EINVAL);
 620  640          }
 621  641  
 622  642          if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
 623  643                  return (EINVAL);
 624  644          }
 625  645  
 626  646          if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
 627  647                  return (EINVAL);
 628  648          }
 629  649  
 630  650  #if defined(__sparc)
 631  651          /*
 632  652           * See if this is an "old mmap call".  If so, remember this
 633  653           * fact and convert the flags value given to mmap to indicate
 634  654           * the specified address in the system call must be used.
  635  655           * _MAP_NEW is set by all new uses of mmap.
 636  656           */
 637  657          if ((flags & _MAP_NEW) == 0)
 638  658                  flags |= MAP_FIXED;
 639  659  #endif
 640  660          flags &= ~_MAP_NEW;
 641  661  
 642  662          type = flags & MAP_TYPE;
 643  663          if (type != MAP_PRIVATE && type != MAP_SHARED)
 644  664                  return (EINVAL);
 645  665  
 646  666  
 647  667          if (flags & MAP_ALIGN) {
 648  668  
 649  669                  if (flags & MAP_FIXED)
 650  670                          return (EINVAL);
 651  671  
 652  672                  /* alignment needs to be a power of 2 >= page size */
 653  673                  if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
 654  674                      !ISP2((uintptr_t)*addrp))
 655  675                          return (EINVAL);
 656  676          }
 657  677          /*
 658  678           * Check for bad lengths and file position.
 659  679           * We let the VOP_MAP routine check for negative lengths
 660  680           * since on some vnode types this might be appropriate.
 661  681           */
 662  682          if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
 663  683                  return (EINVAL);
 664  684  
 665  685          maxprot = PROT_ALL;             /* start out allowing all accesses */
 666  686          uprot = prot | PROT_USER;
 667  687  
 668  688          if (fp == NULL) {
 669  689                  ASSERT(flags & MAP_ANON);
 670  690                  /* discard lwpchan mappings, like munmap() */
 671  691                  if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
 672  692                          lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
 673  693                  as_rangelock(as);
 674  694                  error = zmap(as, addrp, len, uprot, flags, pos);
 675  695                  as_rangeunlock(as);
 676  696                  /*
 677  697                   * Tell machine specific code that lwp has mapped shared memory
 678  698                   */
 679  699                  if (error == 0 && (flags & MAP_SHARED)) {
 680  700                          /* EMPTY */
 681  701                          LWP_MMODEL_SHARED_AS(*addrp, len);
 682  702                  }
 683  703                  return (error);
 684  704          } else if ((flags & MAP_ANON) != 0)
 685  705                  return (EINVAL);
 686  706  
 687  707          vp = fp->f_vnode;
 688  708  
 689  709          /* Can't execute code from "noexec" mounted filesystem. */
 690  710          if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
 691  711                  maxprot &= ~PROT_EXEC;
 692  712  
 693  713          /*
 694  714           * These checks were added as part of large files.
 695  715           *
 696  716           * Return ENXIO if the initial position is negative; return EOVERFLOW
 697  717           * if (offset + len) would overflow the maximum allowed offset for the
 698  718           * type of file descriptor being used.
 699  719           */
 700  720          if (vp->v_type == VREG) {
 701  721                  if (pos < 0)
 702  722                          return (ENXIO);
 703  723                  if ((offset_t)len > (OFFSET_MAX(fp) - pos))
 704  724                          return (EOVERFLOW);
 705  725          }
 706  726  
 707  727          if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
 708  728                  /* no write access allowed */
 709  729                  maxprot &= ~PROT_WRITE;
 710  730          }
 711  731  
 712  732          /*
 713  733           * XXX - Do we also adjust maxprot based on protections
 714  734           * of the vnode?  E.g. if no execute permission is given
 715  735           * on the vnode for the current user, maxprot probably
 716  736           * should disallow PROT_EXEC also?  This is different
 717  737           * from the write access as this would be a per vnode
 718  738           * test as opposed to a per fd test for writability.
 719  739           */
 720  740  
 721  741          /*
 722  742           * Verify that the specified protections are not greater than
 723  743           * the maximum allowable protections.  Also test to make sure
  724  744           * that the file descriptor allows read access since
 725  745           * "write only" mappings are hard to do since normally we do
 726  746           * the read from the file before the page can be written.
 727  747           */
 728  748          if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
 729  749                  return (EACCES);
 730  750  
 731  751          /*
 732  752           * If the user specified an address, do some simple checks here
 733  753           */
 734  754          if ((flags & MAP_FIXED) != 0) {
 735  755                  /*
 736  756                   * Use the user address.  First verify that
 737  757                   * the address to be used is page aligned.
 738  758                   * Then make some simple bounds checks.
 739  759                   */
 740  760                  if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 741  761                          return (EINVAL);
 742  762                  switch (valid_usr_range(*addrp, len, uprot, as,
 743  763                      map_userlimit(curproc, as, flags))) {
 744  764                  case RANGE_OKAY:
 745  765                          break;
 746  766                  case RANGE_BADPROT:
 747  767                          return (ENOTSUP);
 748  768                  case RANGE_BADADDR:
 749  769                  default:
 750  770                          return (ENOMEM);
 751  771                  }
 752  772          }
 753  773  
 754  774          if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
 755  775              nbl_need_check(vp)) {
 756  776                  int svmand;
 757  777                  nbl_op_t nop;
 758  778  
 759  779                  nbl_start_crit(vp, RW_READER);
 760  780                  in_crit = 1;
 761  781                  error = nbl_svmand(vp, fp->f_cred, &svmand);
 762  782                  if (error != 0)
 763  783                          goto done;
 764  784                  if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
 765  785                          if (prot & (PROT_READ | PROT_EXEC)) {
 766  786                                  nop = NBL_READWRITE;
 767  787                          } else {
 768  788                                  nop = NBL_WRITE;
 769  789                          }
 770  790                  } else {
 771  791                          nop = NBL_READ;
 772  792                  }
 773  793                  if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
 774  794                          error = EACCES;
 775  795                          goto done;
 776  796                  }
 777  797          }
 778  798  
 779  799          /* discard lwpchan mappings, like munmap() */
 780  800          if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
 781  801                  lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
 782  802  
 783  803          /*
 784  804           * Ok, now let the vnode map routine do its thing to set things up.
 785  805           */
 786  806          error = VOP_MAP(vp, pos, as,
 787  807              addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
 788  808  
 789  809          if (error == 0) {
 790  810                  /*
 791  811                   * Tell machine specific code that lwp has mapped shared memory
 792  812                   */
 793  813                  if (flags & MAP_SHARED) {
 794  814                          /* EMPTY */
 795  815                          LWP_MMODEL_SHARED_AS(*addrp, len);
 796  816                  }
 797  817                  if (vp->v_type == VREG &&
 798  818                      (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
 799  819                          /*
 800  820                           * Mark this as an executable vnode
 801  821                           */
 802  822                          mutex_enter(&vp->v_lock);
 803  823                          vp->v_flag |= VVMEXEC;
 804  824                          mutex_exit(&vp->v_lock);
 805  825                  }
 806  826          }
 807  827  
 808  828  done:
 809  829          if (in_crit)
 810  830                  nbl_end_crit(vp);
 811  831          return (error);
 812  832  }
 813  833  
 814  834  #ifdef _LP64
 815  835  /*
 816  836   * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 817  837   *
 818  838   * The "large file" mmap routine mmap64(2) is also mapped to this routine
 819  839   * by the 64-bit version of libc.
 820  840   *
 821  841   * Eventually, this should be the only version, and have smmap_common()
 822  842   * folded back into it again.  Some day.
 823  843   */
 824  844  caddr_t
 825  845  smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
 826  846  {
 827  847          struct file *fp;
 828  848          int error;
 829  849  
 830  850          if (fd == -1 && (flags & MAP_ANON) != 0)
 831  851                  error = smmap_common(&addr, len, prot, flags,
 832  852                      NULL, (offset_t)pos);
 833  853          else if ((fp = getf(fd)) != NULL) {
 834  854                  error = smmap_common(&addr, len, prot, flags,
 835  855                      fp, (offset_t)pos);
 836  856                  releasef(fd);
 837  857          } else
 838  858                  error = EBADF;
 839  859  
 840  860          return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
 841  861  }
 842  862  #endif  /* _LP64 */
 843  863  
 844  864  #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
 845  865  
 846  866  /*
 847  867   * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 848  868   */
 849  869  caddr_t
 850  870  smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
 851  871  {
 852  872          struct file *fp;
 853  873          int error;
 854  874          caddr_t a = (caddr_t)(uintptr_t)addr;
 855  875  
 856  876          if (flags & _MAP_LOW32)
 857  877                  error = EINVAL;
 858  878          else if (fd == -1 && (flags & MAP_ANON) != 0)
 859  879                  error = smmap_common(&a, (size_t)len, prot,
 860  880                      flags | _MAP_LOW32, NULL, (offset_t)pos);
 861  881          else if ((fp = getf(fd)) != NULL) {
 862  882                  error = smmap_common(&a, (size_t)len, prot,
 863  883                      flags | _MAP_LOW32, fp, (offset_t)pos);
 864  884                  releasef(fd);
 865  885          } else
 866  886                  error = EBADF;
 867  887  
 868  888          ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
 869  889  
 870  890          return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
 871  891  }
 872  892  
 873  893  /*
 874  894   * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 875  895   *
 876  896   * Now things really get ugly because we can't use the C-style
 877  897   * calling convention for more than 6 args, and 64-bit parameter
 878  898   * passing on 32-bit systems is less than clean.
 879  899   */
 880  900  
 881  901  struct mmaplf32a {
 882  902          caddr_t addr;
 883  903          size_t len;
 884  904  #ifdef _LP64
 885  905          /*
 886  906           * 32-bit contents, 64-bit cells
 887  907           */
 888  908          uint64_t prot;
 889  909          uint64_t flags;
 890  910          uint64_t fd;
 891  911          uint64_t offhi;
 892  912          uint64_t offlo;
 893  913  #else
 894  914          /*
 895  915           * 32-bit contents, 32-bit cells
 896  916           */
 897  917          uint32_t prot;
 898  918          uint32_t flags;
 899  919          uint32_t fd;
 900  920          uint32_t offhi;
 901  921          uint32_t offlo;
 902  922  #endif
 903  923  };
 904  924  
 905  925  int
 906  926  smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
 907  927  {
 908  928          struct file *fp;
 909  929          int error;
 910  930          caddr_t a = uap->addr;
 911  931          int flags = (int)uap->flags;
 912  932          int fd = (int)uap->fd;
 913  933  #ifdef _BIG_ENDIAN
 914  934          offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
 915  935  #else
 916  936          offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
 917  937  #endif
 918  938  
 919  939          if (flags & _MAP_LOW32)
 920  940                  error = EINVAL;
 921  941          else if (fd == -1 && (flags & MAP_ANON) != 0)
 922  942                  error = smmap_common(&a, uap->len, (int)uap->prot,
 923  943                      flags | _MAP_LOW32, NULL, off);
 924  944          else if ((fp = getf(fd)) != NULL) {
 925  945                  error = smmap_common(&a, uap->len, (int)uap->prot,
 926  946                      flags | _MAP_LOW32, fp, off);
 927  947                  releasef(fd);
 928  948          } else
 929  949                  error = EBADF;
 930  950  
 931  951          if (error == 0)
 932  952                  rvp->r_val1 = (uintptr_t)a;
 933  953          return (error);
 934  954  }
 935  955  
 936  956  #endif  /* _SYSCALL32_IMPL || _ILP32 */
 937  957  
 938  958  int
 939  959  munmap(caddr_t addr, size_t len)
 940  960  {
 941  961          struct proc *p = curproc;
 942  962          struct as *as = p->p_as;
 943  963  
 944  964          if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
 945  965                  return (set_errno(EINVAL));
 946  966  
 947  967          if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
 948  968                  return (set_errno(EINVAL));
 949  969  
 950  970          /*
 951  971           * Discard lwpchan mappings.
 952  972           */
 953  973          if (p->p_lcp != NULL)
 954  974                  lwpchan_delete_mapping(p, addr, addr + len);
 955  975          if (as_unmap(as, addr, len) != 0)
 956  976                  return (set_errno(EINVAL));
 957  977  
 958  978          return (0);
 959  979  }
 960  980  
 961  981  int
 962  982  mprotect(caddr_t addr, size_t len, int prot)
 963  983  {
 964  984          struct as *as = curproc->p_as;
 965  985          uint_t uprot = prot | PROT_USER;
 966  986          int error;
 967  987  
 968  988          if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
 969  989                  return (set_errno(EINVAL));
 970  990  
 971  991          switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
 972  992          case RANGE_OKAY:
 973  993                  break;
 974  994          case RANGE_BADPROT:
 975  995                  return (set_errno(ENOTSUP));
 976  996          case RANGE_BADADDR:
 977  997          default:
 978  998                  return (set_errno(ENOMEM));
 979  999          }
 980 1000  
 981 1001          error = as_setprot(as, addr, len, uprot);
 982 1002          if (error)
 983 1003                  return (set_errno(error));
 984 1004          return (0);
 985 1005  }
 986 1006  
 987 1007  #define MC_CACHE        128                     /* internal result buffer */
 988 1008  #define MC_QUANTUM      (MC_CACHE * PAGESIZE)   /* addresses covered in loop */
 989 1009  
 990 1010  int
 991 1011  mincore(caddr_t addr, size_t len, char *vecp)
 992 1012  {
 993 1013          struct as *as = curproc->p_as;
 994 1014          caddr_t ea;                     /* end address of loop */
 995 1015          size_t rl;                      /* inner result length */
 996 1016          char vec[MC_CACHE];             /* local vector cache */
 997 1017          int error;
 998 1018          model_t model;
 999 1019          long    llen;
1000 1020  
1001 1021          model = get_udatamodel();
1002 1022          /*
1003 1023           * Validate form of address parameters.
1004 1024           */
1005 1025          if (model == DATAMODEL_NATIVE) {
1006 1026                  llen = (long)len;
1007 1027          } else {
1008 1028                  llen = (int32_t)(size32_t)len;
1009 1029          }
1010 1030          if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1011 1031                  return (set_errno(EINVAL));
1012 1032  
1013 1033          if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1014 1034                  return (set_errno(ENOMEM));
1015 1035  
1016 1036          /*
1017 1037           * Loop over subranges of interval [addr : addr + len), recovering
1018 1038           * results internally and then copying them out to caller.  Subrange
1019 1039           * is based on the size of MC_CACHE, defined above.
1020 1040           */
1021 1041          for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1022 1042                  error = as_incore(as, addr,
1023 1043                      (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1024 1044                  if (rl != 0) {
1025 1045                          rl = (rl + PAGESIZE - 1) / PAGESIZE;
1026 1046                          if (copyout(vec, vecp, rl) != 0)
1027 1047                                  return (set_errno(EFAULT));
1028 1048                          vecp += rl;
1029 1049                  }
1030 1050                  if (error != 0)
1031 1051                          return (set_errno(ENOMEM));
1032 1052          }
1033 1053          return (0);
1034 1054  }
  