OS-3602 lxbrand LTP recv* tests failing on MSG_ERRQUEUE flag
OS-3600 lxbrand 32bit cannot boot with OS-3594 fix
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3594 lx brand: need support for MAP_32BIT
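
For context, OS-3594 teaches the native mmap path to honor a brand-supplied 32-bit
address limit, so the lx brand can emulate Linux's MAP_32BIT flag (which, on Linux,
confines a mapping to the first 2 GB of a 64-bit process's address space). A minimal
sketch of the Linux-side usage being emulated follows; it is illustrative only and
builds against glibc with _GNU_SOURCE:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    int
    main(void)
    {
            /* Ask for an anonymous page placed in the low 32-bit range. */
            void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return (1);
            }
            (void) printf("mapped at %p\n", p);
            return (munmap(p, 4096));
    }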
    
      
    
    
          --- old/usr/src/uts/common/os/grow.c
          +++ new/usr/src/uts/common/os/grow.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  
  
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22      -/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */
       22 +/*
       23 + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
       24 + * Copyright (c) 2014, Joyent, Inc. All rights reserved.
       25 + */
  23   26  
  24   27  /*
  25   28   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  26   29   * Use is subject to license terms.
  27   30   */
  28   31  
  29   32  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  30   33  /*        All Rights Reserved   */
  31   34  
  32   35  #include <sys/types.h>
  33   36  #include <sys/inttypes.h>
  34   37  #include <sys/param.h>
  35   38  #include <sys/sysmacros.h>
  36   39  #include <sys/systm.h>
  37   40  #include <sys/signal.h>
  38   41  #include <sys/user.h>
  39   42  #include <sys/errno.h>
  40   43  #include <sys/var.h>
  41   44  #include <sys/proc.h>
  42   45  #include <sys/tuneable.h>
  43   46  #include <sys/debug.h>
  44   47  #include <sys/cmn_err.h>
  
  
  45   48  #include <sys/cred.h>
  46   49  #include <sys/vnode.h>
  47   50  #include <sys/vfs.h>
  48   51  #include <sys/vm.h>
  49   52  #include <sys/file.h>
  50   53  #include <sys/mman.h>
  51   54  #include <sys/vmparam.h>
  52   55  #include <sys/fcntl.h>
  53   56  #include <sys/lwpchan_impl.h>
  54   57  #include <sys/nbmlock.h>
       58 +#include <sys/brand.h>
  55   59  
  56   60  #include <vm/hat.h>
  57   61  #include <vm/as.h>
  58   62  #include <vm/seg.h>
  59   63  #include <vm/seg_dev.h>
  60   64  #include <vm/seg_vn.h>
  61   65  
  62   66  int use_brk_lpg = 1;
  63   67  int use_stk_lpg = 1;
  64   68  
  65   69  static int brk_lpg(caddr_t nva);
  66   70  static int grow_lpg(caddr_t sp);
  67   71  
  68   72  int
  69   73  brk(caddr_t nva)
  70   74  {
  71   75          int error;
  72   76          proc_t *p = curproc;
  73   77  
  74   78          /*
  75   79           * Serialize brk operations on an address space.
  76   80           * This also serves as the lock protecting p_brksize
  77   81           * and p_brkpageszc.
  78   82           */
  79   83          as_rangelock(p->p_as);
  80   84          if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
  81   85                  error = brk_lpg(nva);
  82   86          } else {
  83   87                  error = brk_internal(nva, p->p_brkpageszc);
  84   88          }
  85   89          as_rangeunlock(p->p_as);
  86   90          return ((error != 0 ? set_errno(error) : 0));
  87   91  }
  88   92  
  89   93  /*
  90   94   * Algorithm: call arch-specific map_pgsz to get best page size to use,
  91   95   * then call brk_internal().
  92   96   * Returns 0 on success.
  93   97   */
  94   98  static int
  95   99  brk_lpg(caddr_t nva)
  96  100  {
  97  101          struct proc *p = curproc;
  98  102          size_t pgsz, len;
  99  103          caddr_t addr, brkend;
 100  104          caddr_t bssbase = p->p_bssbase;
 101  105          caddr_t brkbase = p->p_brkbase;
 102  106          int oszc, szc;
 103  107          int err;
 104  108  
 105  109          oszc = p->p_brkpageszc;
 106  110  
 107  111          /*
 108  112           * If p_brkbase has not yet been set, the first call
 109  113           * to brk_internal() will initialize it.
 110  114           */
 111  115          if (brkbase == 0) {
 112  116                  return (brk_internal(nva, oszc));
 113  117          }
 114  118  
 115  119          len = nva - bssbase;
 116  120  
 117  121          pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
 118  122          szc = page_szc(pgsz);
 119  123  
 120  124          /*
 121  125           * Covers two cases:
 122  126           * 1. page_szc() returns -1 for invalid page size, so we want to
 123  127           * ignore it in that case.
 124  128           * 2. By design we never decrease page size, as it is more stable.
 125  129           */
 126  130          if (szc <= oszc) {
 127  131                  err = brk_internal(nva, oszc);
 128  132                  /* If failed, back off to base page size. */
 129  133                  if (err != 0 && oszc != 0) {
 130  134                          err = brk_internal(nva, 0);
 131  135                  }
 132  136                  return (err);
 133  137          }
 134  138  
 135  139          err = brk_internal(nva, szc);
 136  140          /* If using szc failed, map with base page size and return. */
 137  141          if (err != 0) {
 138  142                  if (szc != 0) {
 139  143                          err = brk_internal(nva, 0);
 140  144                  }
 141  145                  return (err);
 142  146          }
 143  147  
 144  148          /*
 145  149           * Round up brk base to a large page boundary and remap
 146  150           * anything in the segment already faulted in beyond that
 147  151           * point.
 148  152           */
 149  153          addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
 150  154          brkend = brkbase + p->p_brksize;
 151  155          len = brkend - addr;
 152  156          /* Check that len is not negative. Update page size code for heap. */
 153  157          if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
 154  158                  (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
 155  159                  p->p_brkpageszc = szc;
 156  160          }
 157  161  
 158  162          ASSERT(err == 0);
 159  163          return (err);           /* should always be 0 */
 160  164  }
 161  165  
 162  166  /*
 163  167   * Returns 0 on success.
 164  168   */
 165  169  int
 166  170  brk_internal(caddr_t nva, uint_t brkszc)
 167  171  {
 168  172          caddr_t ova;                    /* current break address */
 169  173          size_t size;
 170  174          int     error;
 171  175          struct proc *p = curproc;
 172  176          struct as *as = p->p_as;
 173  177          size_t pgsz;
 174  178          uint_t szc;
 175  179          rctl_qty_t as_rctl;
 176  180  
 177  181          /*
 178  182           * extend heap to brkszc alignment but use current p->p_brkpageszc
 179  183           * for the newly created segment. This allows the new extension
 180  184           * segment to be concatenated successfully with the existing brk
 181  185           * segment.
 182  186           */
 183  187          if ((szc = brkszc) != 0) {
 184  188                  pgsz = page_get_pagesize(szc);
 185  189                  ASSERT(pgsz > PAGESIZE);
 186  190          } else {
 187  191                  pgsz = PAGESIZE;
 188  192          }
 189  193  
 190  194          mutex_enter(&p->p_lock);
 191  195          as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
 192  196              p->p_rctls, p);
 193  197          mutex_exit(&p->p_lock);
 194  198  
 195  199          /*
 196  200           * If p_brkbase has not yet been set, the first call
 197  201           * to brk() will initialize it.
 198  202           */
 199  203          if (p->p_brkbase == 0)
 200  204                  p->p_brkbase = nva;
 201  205  
 202  206          /*
 203  207           * Before multiple page size support existed p_brksize was the value
 204  208           * not rounded to the pagesize (i.e. it stored the exact user request
 205  209           * for heap size). If pgsz is greater than PAGESIZE calculate the
 206  210           * heap size as the real new heap size by rounding it up to pgsz.
 207  211           * This is useful since we may want to know where the heap ends
 208  212           * without knowing heap pagesize (e.g. some old code) and also if
 209  213           * heap pagesize changes we can update p_brkpageszc but delay adding
 210  214           * new mapping yet still know from p_brksize where the heap really
  211  215           * ends. The user-requested heap end is stored in a libc variable.
 212  216           */
 213  217          if (pgsz > PAGESIZE) {
 214  218                  caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
 215  219                  size = tnva - p->p_brkbase;
 216  220                  if (tnva < p->p_brkbase || (size > p->p_brksize &&
 217  221                      size > (size_t)as_rctl)) {
 218  222                          szc = 0;
 219  223                          pgsz = PAGESIZE;
 220  224                          size = nva - p->p_brkbase;
 221  225                  }
 222  226          } else {
 223  227                  size = nva - p->p_brkbase;
 224  228          }
 225  229  
 226  230          /*
  227  231           * use PAGESIZE to round up ova because we want to know the real value
 228  232           * of the current heap end in case p_brkpageszc changes since the last
 229  233           * p_brksize was computed.
 230  234           */
 231  235          nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
 232  236          ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
 233  237              PAGESIZE);
 234  238  
 235  239          if ((nva < p->p_brkbase) || (size > p->p_brksize &&
 236  240              size > as_rctl)) {
 237  241                  mutex_enter(&p->p_lock);
 238  242                  (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
 239  243                      RCA_SAFE);
 240  244                  mutex_exit(&p->p_lock);
 241  245                  return (ENOMEM);
 242  246          }
 243  247  
 244  248          if (nva > ova) {
 245  249                  struct segvn_crargs crargs =
 246  250                      SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 247  251  
 248  252                  if (!(p->p_datprot & PROT_EXEC)) {
 249  253                          crargs.prot &= ~PROT_EXEC;
 250  254                  }
 251  255  
 252  256                  /*
 253  257                   * Add new zfod mapping to extend UNIX data segment
 254  258                   * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
 255  259                   * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
 256  260                   * page sizes if ova is not aligned to szc's pgsz.
 257  261                   */
 258  262                  if (szc > 0) {
 259  263                          caddr_t rbss;
 260  264  
 261  265                          rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
 262  266                              pgsz);
 263  267                          if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
 264  268                                  crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
 265  269                                      AS_MAP_NO_LPOOB;
 266  270                          } else if (ova == rbss) {
 267  271                                  crargs.szc = szc;
 268  272                          } else {
 269  273                                  crargs.szc = AS_MAP_HEAP;
 270  274                          }
 271  275                  } else {
 272  276                          crargs.szc = AS_MAP_NO_LPOOB;
 273  277                  }
 274  278                  crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
 275  279                  error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
 276  280                      &crargs);
 277  281                  if (error) {
 278  282                          return (error);
 279  283                  }
 280  284  
 281  285          } else if (nva < ova) {
 282  286                  /*
 283  287                   * Release mapping to shrink UNIX data segment.
 284  288                   */
 285  289                  (void) as_unmap(as, nva, (size_t)(ova - nva));
 286  290          }
 287  291          p->p_brksize = size;
 288  292          return (0);
 289  293  }
 290  294  
 291  295  /*
 292  296   * Grow the stack to include sp.  Return 1 if successful, 0 otherwise.
 293  297   * This routine assumes that the stack grows downward.
 294  298   */
 295  299  int
 296  300  grow(caddr_t sp)
 297  301  {
 298  302          struct proc *p = curproc;
 299  303          struct as *as = p->p_as;
 300  304          size_t oldsize = p->p_stksize;
 301  305          size_t newsize;
 302  306          int err;
 303  307  
 304  308          /*
 305  309           * Serialize grow operations on an address space.
 306  310           * This also serves as the lock protecting p_stksize
 307  311           * and p_stkpageszc.
 308  312           */
 309  313          as_rangelock(as);
 310  314          if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
 311  315                  err = grow_lpg(sp);
 312  316          } else {
 313  317                  err = grow_internal(sp, p->p_stkpageszc);
 314  318          }
 315  319          as_rangeunlock(as);
 316  320  
 317  321          if (err == 0 && (newsize = p->p_stksize) > oldsize) {
 318  322                  ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
 319  323                  ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
 320  324                  /*
 321  325                   * Set up translations so the process doesn't have to fault in
 322  326                   * the stack pages we just gave it.
 323  327                   */
 324  328                  (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
 325  329                      newsize - oldsize, F_INVAL, S_WRITE);
 326  330          }
 327  331          return ((err == 0 ? 1 : 0));
 328  332  }
 329  333  
 330  334  /*
 331  335   * Algorithm: call arch-specific map_pgsz to get best page size to use,
 332  336   * then call grow_internal().
 333  337   * Returns 0 on success.
 334  338   */
 335  339  static int
 336  340  grow_lpg(caddr_t sp)
 337  341  {
 338  342          struct proc *p = curproc;
 339  343          size_t pgsz;
 340  344          size_t len, newsize;
 341  345          caddr_t addr, saddr;
 342  346          caddr_t growend;
 343  347          int oszc, szc;
 344  348          int err;
 345  349  
 346  350          newsize = p->p_usrstack - sp;
 347  351  
 348  352          oszc = p->p_stkpageszc;
 349  353          pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
 350  354          szc = page_szc(pgsz);
 351  355  
 352  356          /*
 353  357           * Covers two cases:
 354  358           * 1. page_szc() returns -1 for invalid page size, so we want to
 355  359           * ignore it in that case.
 356  360           * 2. By design we never decrease page size, as it is more stable.
 357  361           * This shouldn't happen as the stack never shrinks.
 358  362           */
 359  363          if (szc <= oszc) {
 360  364                  err = grow_internal(sp, oszc);
 361  365                  /* failed, fall back to base page size */
 362  366                  if (err != 0 && oszc != 0) {
 363  367                          err = grow_internal(sp, 0);
 364  368                  }
 365  369                  return (err);
 366  370          }
 367  371  
 368  372          /*
 369  373           * We've grown sufficiently to switch to a new page size.
 370  374           * So we are going to remap the whole segment with the new page size.
 371  375           */
 372  376          err = grow_internal(sp, szc);
 373  377          /* The grow with szc failed, so fall back to base page size. */
 374  378          if (err != 0) {
 375  379                  if (szc != 0) {
 376  380                          err = grow_internal(sp, 0);
 377  381                  }
 378  382                  return (err);
 379  383          }
 380  384  
 381  385          /*
 382  386           * Round up stack pointer to a large page boundary and remap
 383  387           * any pgsz pages in the segment already faulted in beyond that
 384  388           * point.
 385  389           */
 386  390          saddr = p->p_usrstack - p->p_stksize;
 387  391          addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
 388  392          growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
 389  393          len = growend - addr;
 390  394          /* Check that len is not negative. Update page size code for stack. */
 391  395          if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
 392  396                  (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
 393  397                  p->p_stkpageszc = szc;
 394  398          }
 395  399  
 396  400          ASSERT(err == 0);
 397  401          return (err);           /* should always be 0 */
 398  402  }
 399  403  
 400  404  /*
 401  405   * This routine assumes that the stack grows downward.
 402  406   * Returns 0 on success, errno on failure.
 403  407   */
 404  408  int
 405  409  grow_internal(caddr_t sp, uint_t growszc)
 406  410  {
 407  411          struct proc *p = curproc;
 408  412          size_t newsize;
 409  413          size_t oldsize;
 410  414          int    error;
 411  415          size_t pgsz;
 412  416          uint_t szc;
 413  417          struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
 414  418  
 415  419          ASSERT(sp < p->p_usrstack);
 416  420          sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
 417  421  
 418  422          /*
 419  423           * grow to growszc alignment but use current p->p_stkpageszc for
 420  424           * the segvn_crargs szc passed to segvn_create. For memcntl to
 421  425           * increase the szc, this allows the new extension segment to be
 422  426           * concatenated successfully with the existing stack segment.
 423  427           */
 424  428          if ((szc = growszc) != 0) {
 425  429                  pgsz = page_get_pagesize(szc);
 426  430                  ASSERT(pgsz > PAGESIZE);
 427  431                  newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
 428  432                  if (newsize > (size_t)p->p_stk_ctl) {
 429  433                          szc = 0;
 430  434                          pgsz = PAGESIZE;
 431  435                          newsize = p->p_usrstack - sp;
 432  436                  }
 433  437          } else {
 434  438                  pgsz = PAGESIZE;
 435  439                  newsize = p->p_usrstack - sp;
 436  440          }
 437  441  
 438  442          if (newsize > (size_t)p->p_stk_ctl) {
 439  443                  (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
 440  444                      RCA_UNSAFE_ALL);
 441  445  
 442  446                  return (ENOMEM);
 443  447          }
 444  448  
 445  449          oldsize = p->p_stksize;
 446  450          ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
 447  451  
 448  452          if (newsize <= oldsize) {       /* prevent the stack from shrinking */
 449  453                  return (0);
 450  454          }
 451  455  
 452  456          if (!(p->p_stkprot & PROT_EXEC)) {
 453  457                  crargs.prot &= ~PROT_EXEC;
 454  458          }
 455  459          /*
 456  460           * extend stack with the proposed new growszc, which is different
 457  461           * than p_stkpageszc only on a memcntl to increase the stack pagesize.
 458  462           * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
 459  463           * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
 460  464           * if not aligned to szc's pgsz.
 461  465           */
 462  466          if (szc > 0) {
 463  467                  caddr_t oldsp = p->p_usrstack - oldsize;
 464  468                  caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
 465  469                      pgsz);
 466  470  
 467  471                  if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
 468  472                          crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
 469  473                              AS_MAP_NO_LPOOB;
 470  474                  } else if (oldsp == austk) {
 471  475                          crargs.szc = szc;
 472  476                  } else {
 473  477                          crargs.szc = AS_MAP_STACK;
 474  478                  }
 475  479          } else {
 476  480                  crargs.szc = AS_MAP_NO_LPOOB;
 477  481          }
 478  482          crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
 479  483  
 480  484          if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
 481  485              segvn_create, &crargs)) != 0) {
 482  486                  if (error == EAGAIN) {
 483  487                          cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
 484  488                              "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
 485  489                  }
 486  490                  return (error);
 487  491          }
 488  492          p->p_stksize = newsize;
 489  493          return (0);
 490  494  }
 491  495  
 492  496  /*
 493  497   * Find address for user to map.
 494  498   * If MAP_FIXED is not specified, we can pick any address we want, but we will
 495  499   * first try the value in *addrp if it is non-NULL.  Thus this is implementing
 496  500   * a way to try and get a preferred address.
 497  501   */
 498  502  int
 499  503  choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
 500  504      int vacalign, uint_t flags)
 501  505  {
 502  506  #if defined(__amd64)
 503  507          proc_t *p = curproc;
 504  508  #endif
 505  509          caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
 506  510          size_t lenp;
 507  511  
 508  512          ASSERT(AS_ISCLAIMGAP(as));      /* searches should be serialized */
 509  513  
 510  514          /*
 511  515           * If we have been provided a hint, we should still expand the lenp
 512  516           * to be the rest of the address space.  This will allow us to
 513  517           * treat the hint as a strong desire to be "nearby" the provided
 514  518           * address.  If we can't satisfy the hint, as_gap() will walk forward.
 515  519           */
 516  520          if (flags & _MAP_LOW32)
 517  521                  lenp = (caddr_t)USERLIMIT32 - basep;
 518  522  #if defined(__amd64)
 519  523          else if (p->p_model == DATAMODEL_NATIVE)
 520  524                  lenp = p->p_usrstack - basep -
 521  525                      ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
 522  526  #endif
 523  527          else
 524  528                  lenp = as->a_userlimit - basep;
 525  529  
 526  530          if (flags & MAP_FIXED) {
 527  531                  (void) as_unmap(as, *addrp, len);
 528  532                  return (0);
 529  533          } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
 530  534              !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
 531  535                  /* User supplied address was available */
 532  536                  *addrp = basep;
 533  537          } else {
 534  538                  /*
  
  
 535  539                   * No user supplied address or the address supplied was not
 536  540                   * available.
 537  541                   */
 538  542                  map_addr(addrp, len, off, vacalign, flags);
 539  543          }
 540  544          if (*addrp == NULL)
 541  545                  return (ENOMEM);
 542  546          return (0);
 543  547  }
 544  548  
      549 +caddr_t
      550 +map_userlimit(proc_t *pp, struct as *as, int flags)
      551 +{
      552 +        if (flags & _MAP_LOW32) {
      553 +                if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
      554 +                        return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
      555 +                } else {
      556 +                        return ((caddr_t)_userlimit32);
      557 +                }
      558 +        }
 545  559  
      560 +        return (as->a_userlimit);
      561 +}
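
The map_userlimit() helper added above is the heart of the change: when _MAP_LOW32 is
set and the process is branded with a b_map32limit hook, that hook supplies the upper
bound for the address search; otherwise the historical _userlimit32 is used. A minimal
sketch of such a hook, assuming it returns the limit as a 32-bit value (hypothetical
name and value; the real lx brand hook may differ):

    /*
     * Hypothetical brand hook (illustrative only): a Linux-emulating brand
     * could cap _MAP_LOW32 placement at 2 GB to mirror Linux MAP_32BIT
     * semantics instead of the default _userlimit32.
     */
    static uint32_t
    mybrand_map32limit(proc_t *p)
    {
            return (0x80000000U);           /* 2 GB */
    }

Such a hook would presumably be published through the brand's ops vector so that
BROP(pp)->b_map32limit resolves to it for branded processes; the corresponding
sys/brand.h change is not part of this file's diff.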
      562 +
      563 +
 546  564  /*
 547  565   * Used for MAP_ANON - fast way to get anonymous pages
 548  566   */
 549  567  static int
 550  568  zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
 551  569      offset_t pos)
 552  570  {
 553  571          struct segvn_crargs vn_a;
 554  572          int error;
 555  573  
 556  574          if (((PROT_ALL & uprot) != uprot))
 557  575                  return (EACCES);
 558  576  
 559  577          if ((flags & MAP_FIXED) != 0) {
 560      -                caddr_t userlimit;
 561      -
 562  578                  /*
 563  579                   * Use the user address.  First verify that
 564  580                   * the address to be used is page aligned.
 565  581                   * Then make some simple bounds checks.
 566  582                   */
 567  583                  if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 568  584                          return (EINVAL);
 569  585  
 570      -                userlimit = flags & _MAP_LOW32 ?
 571      -                    (caddr_t)USERLIMIT32 : as->a_userlimit;
 572      -                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
      586 +                switch (valid_usr_range(*addrp, len, uprot, as,
      587 +                    map_userlimit(as->a_proc, as, flags))) {
 573  588                  case RANGE_OKAY:
 574  589                          break;
 575  590                  case RANGE_BADPROT:
 576  591                          return (ENOTSUP);
 577  592                  case RANGE_BADADDR:
 578  593                  default:
 579  594                          return (ENOMEM);
 580  595                  }
 581  596          }
 582  597          /*
 583  598           * No need to worry about vac alignment for anonymous
 584  599           * pages since this is a "clone" object that doesn't
 585  600           * yet exist.
 586  601           */
 587  602          error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
 588  603          if (error != 0) {
 589  604                  return (error);
 590  605          }
 591  606  
 592  607          /*
 593  608           * Use the seg_vn segment driver; passing in the NULL amp
 594  609           * gives the desired "cloning" effect.
 595  610           */
 596  611          vn_a.vp = NULL;
 597  612          vn_a.offset = 0;
 598  613          vn_a.type = flags & MAP_TYPE;
 599  614          vn_a.prot = uprot;
 600  615          vn_a.maxprot = PROT_ALL;
 601  616          vn_a.flags = flags & ~MAP_TYPE;
 602  617          vn_a.cred = CRED();
 603  618          vn_a.amp = NULL;
 604  619          vn_a.szc = 0;
 605  620          vn_a.lgrp_mem_policy_flags = 0;
 606  621  
 607  622          return (as_map(as, *addrp, len, segvn_create, &vn_a));
 608  623  }
 609  624  
 610  625  static int
 611  626  smmap_common(caddr_t *addrp, size_t len,
 612  627      int prot, int flags, struct file *fp, offset_t pos)
 613  628  {
 614  629          struct vnode *vp;
 615  630          struct as *as = curproc->p_as;
 616  631          uint_t uprot, maxprot, type;
 617  632          int error;
 618  633          int in_crit = 0;
 619  634  
 620  635          if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
 621  636              _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
 622  637              MAP_TEXT | MAP_INITDATA)) != 0) {
 623  638                  /* | MAP_RENAME */      /* not implemented, let user know */
 624  639                  return (EINVAL);
 625  640          }
 626  641  
 627  642          if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
 628  643                  return (EINVAL);
 629  644          }
 630  645  
 631  646          if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
 632  647                  return (EINVAL);
 633  648          }
 634  649  
 635  650  #if defined(__sparc)
 636  651          /*
 637  652           * See if this is an "old mmap call".  If so, remember this
 638  653           * fact and convert the flags value given to mmap to indicate
 639  654           * the specified address in the system call must be used.
  640  655           * _MAP_NEW is set by all new uses of mmap.
 641  656           */
 642  657          if ((flags & _MAP_NEW) == 0)
 643  658                  flags |= MAP_FIXED;
 644  659  #endif
 645  660          flags &= ~_MAP_NEW;
 646  661  
 647  662          type = flags & MAP_TYPE;
 648  663          if (type != MAP_PRIVATE && type != MAP_SHARED)
 649  664                  return (EINVAL);
 650  665  
 651  666  
 652  667          if (flags & MAP_ALIGN) {
 653  668  
 654  669                  if (flags & MAP_FIXED)
 655  670                          return (EINVAL);
 656  671  
 657  672                  /* alignment needs to be a power of 2 >= page size */
 658  673                  if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
 659  674                      !ISP2((uintptr_t)*addrp))
 660  675                          return (EINVAL);
 661  676          }
 662  677          /*
 663  678           * Check for bad lengths and file position.
 664  679           * We let the VOP_MAP routine check for negative lengths
 665  680           * since on some vnode types this might be appropriate.
 666  681           */
 667  682          if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
 668  683                  return (EINVAL);
 669  684  
 670  685          maxprot = PROT_ALL;             /* start out allowing all accesses */
 671  686          uprot = prot | PROT_USER;
 672  687  
 673  688          if (fp == NULL) {
 674  689                  ASSERT(flags & MAP_ANON);
 675  690                  /* discard lwpchan mappings, like munmap() */
 676  691                  if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
 677  692                          lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
 678  693                  as_rangelock(as);
 679  694                  error = zmap(as, addrp, len, uprot, flags, pos);
 680  695                  as_rangeunlock(as);
 681  696                  /*
 682  697                   * Tell machine specific code that lwp has mapped shared memory
 683  698                   */
 684  699                  if (error == 0 && (flags & MAP_SHARED)) {
 685  700                          /* EMPTY */
 686  701                          LWP_MMODEL_SHARED_AS(*addrp, len);
 687  702                  }
 688  703                  return (error);
 689  704          } else if ((flags & MAP_ANON) != 0)
 690  705                  return (EINVAL);
 691  706  
 692  707          vp = fp->f_vnode;
 693  708  
 694  709          /* Can't execute code from "noexec" mounted filesystem. */
 695  710          if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
 696  711                  maxprot &= ~PROT_EXEC;
 697  712  
 698  713          /*
 699  714           * These checks were added as part of large files.
 700  715           *
 701  716           * Return ENXIO if the initial position is negative; return EOVERFLOW
 702  717           * if (offset + len) would overflow the maximum allowed offset for the
 703  718           * type of file descriptor being used.
 704  719           */
 705  720          if (vp->v_type == VREG) {
 706  721                  if (pos < 0)
 707  722                          return (ENXIO);
 708  723                  if ((offset_t)len > (OFFSET_MAX(fp) - pos))
 709  724                          return (EOVERFLOW);
 710  725          }
 711  726  
 712  727          if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
 713  728                  /* no write access allowed */
 714  729                  maxprot &= ~PROT_WRITE;
 715  730          }
 716  731  
 717  732          /*
 718  733           * XXX - Do we also adjust maxprot based on protections
 719  734           * of the vnode?  E.g. if no execute permission is given
 720  735           * on the vnode for the current user, maxprot probably
 721  736           * should disallow PROT_EXEC also?  This is different
 722  737           * from the write access as this would be a per vnode
 723  738           * test as opposed to a per fd test for writability.
 724  739           */
 725  740  
 726  741          /*
 727  742           * Verify that the specified protections are not greater than
 728  743           * the maximum allowable protections.  Also test to make sure
  729  744           * that the file descriptor does allow for read access since
  
    | 
      ↓ open down ↓ | 
    147 lines elided | 
    
      ↑ open up ↑ | 
  
 730  745           * "write only" mappings are hard to do since normally we do
 731  746           * the read from the file before the page can be written.
 732  747           */
 733  748          if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
 734  749                  return (EACCES);
 735  750  
 736  751          /*
 737  752           * If the user specified an address, do some simple checks here
 738  753           */
 739  754          if ((flags & MAP_FIXED) != 0) {
 740      -                caddr_t userlimit;
 741      -
 742  755                  /*
 743  756                   * Use the user address.  First verify that
 744  757                   * the address to be used is page aligned.
 745  758                   * Then make some simple bounds checks.
 746  759                   */
 747  760                  if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
 748  761                          return (EINVAL);
 749      -
 750      -                userlimit = flags & _MAP_LOW32 ?
 751      -                    (caddr_t)USERLIMIT32 : as->a_userlimit;
 752      -                switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) {
      762 +                switch (valid_usr_range(*addrp, len, uprot, as,
      763 +                    map_userlimit(curproc, as, flags))) {
 753  764                  case RANGE_OKAY:
 754  765                          break;
 755  766                  case RANGE_BADPROT:
 756  767                          return (ENOTSUP);
 757  768                  case RANGE_BADADDR:
 758  769                  default:
 759  770                          return (ENOMEM);
 760  771                  }
 761  772          }
 762  773  
 763  774          if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
 764  775              nbl_need_check(vp)) {
 765  776                  int svmand;
 766  777                  nbl_op_t nop;
 767  778  
 768  779                  nbl_start_crit(vp, RW_READER);
 769  780                  in_crit = 1;
 770  781                  error = nbl_svmand(vp, fp->f_cred, &svmand);
 771  782                  if (error != 0)
 772  783                          goto done;
 773  784                  if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
 774  785                          if (prot & (PROT_READ | PROT_EXEC)) {
 775  786                                  nop = NBL_READWRITE;
 776  787                          } else {
 777  788                                  nop = NBL_WRITE;
 778  789                          }
 779  790                  } else {
 780  791                          nop = NBL_READ;
 781  792                  }
 782  793                  if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
 783  794                          error = EACCES;
 784  795                          goto done;
 785  796                  }
 786  797          }
 787  798  
 788  799          /* discard lwpchan mappings, like munmap() */
 789  800          if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
 790  801                  lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
 791  802  
 792  803          /*
 793  804           * Ok, now let the vnode map routine do its thing to set things up.
 794  805           */
 795  806          error = VOP_MAP(vp, pos, as,
 796  807              addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
 797  808  
 798  809          if (error == 0) {
 799  810                  /*
 800  811                   * Tell machine specific code that lwp has mapped shared memory
 801  812                   */
 802  813                  if (flags & MAP_SHARED) {
 803  814                          /* EMPTY */
 804  815                          LWP_MMODEL_SHARED_AS(*addrp, len);
 805  816                  }
 806  817                  if (vp->v_type == VREG &&
 807  818                      (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
 808  819                          /*
 809  820                           * Mark this as an executable vnode
 810  821                           */
 811  822                          mutex_enter(&vp->v_lock);
 812  823                          vp->v_flag |= VVMEXEC;
 813  824                          mutex_exit(&vp->v_lock);
 814  825                  }
 815  826          }
 816  827  
 817  828  done:
 818  829          if (in_crit)
 819  830                  nbl_end_crit(vp);
 820  831          return (error);
 821  832  }
 822  833  
 823  834  #ifdef _LP64
 824  835  /*
 825  836   * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
 826  837   *
 827  838   * The "large file" mmap routine mmap64(2) is also mapped to this routine
 828  839   * by the 64-bit version of libc.
 829  840   *
 830  841   * Eventually, this should be the only version, and have smmap_common()
 831  842   * folded back into it again.  Some day.
 832  843   */
 833  844  caddr_t
 834  845  smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
 835  846  {
 836  847          struct file *fp;
 837  848          int error;
 838  849  
 839  850          if (fd == -1 && (flags & MAP_ANON) != 0)
 840  851                  error = smmap_common(&addr, len, prot, flags,
 841  852                      NULL, (offset_t)pos);
 842  853          else if ((fp = getf(fd)) != NULL) {
 843  854                  error = smmap_common(&addr, len, prot, flags,
 844  855                      fp, (offset_t)pos);
 845  856                  releasef(fd);
 846  857          } else
 847  858                  error = EBADF;
 848  859  
 849  860          return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
 850  861  }
 851  862  #endif  /* _LP64 */
 852  863  
 853  864  #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
 854  865  
 855  866  /*
 856  867   * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
 857  868   */
 858  869  caddr_t
 859  870  smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
 860  871  {
 861  872          struct file *fp;
 862  873          int error;
 863  874          caddr_t a = (caddr_t)(uintptr_t)addr;
 864  875  
 865  876          if (flags & _MAP_LOW32)
 866  877                  error = EINVAL;
 867  878          else if (fd == -1 && (flags & MAP_ANON) != 0)
 868  879                  error = smmap_common(&a, (size_t)len, prot,
 869  880                      flags | _MAP_LOW32, NULL, (offset_t)pos);
 870  881          else if ((fp = getf(fd)) != NULL) {
 871  882                  error = smmap_common(&a, (size_t)len, prot,
 872  883                      flags | _MAP_LOW32, fp, (offset_t)pos);
 873  884                  releasef(fd);
 874  885          } else
 875  886                  error = EBADF;
 876  887  
 877  888          ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
 878  889  
 879  890          return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
 880  891  }
 881  892  
 882  893  /*
 883  894   * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
 884  895   *
 885  896   * Now things really get ugly because we can't use the C-style
 886  897   * calling convention for more than 6 args, and 64-bit parameter
 887  898   * passing on 32-bit systems is less than clean.
 888  899   */
 889  900  
 890  901  struct mmaplf32a {
 891  902          caddr_t addr;
 892  903          size_t len;
 893  904  #ifdef _LP64
 894  905          /*
 895  906           * 32-bit contents, 64-bit cells
 896  907           */
 897  908          uint64_t prot;
 898  909          uint64_t flags;
 899  910          uint64_t fd;
 900  911          uint64_t offhi;
 901  912          uint64_t offlo;
 902  913  #else
 903  914          /*
 904  915           * 32-bit contents, 32-bit cells
 905  916           */
 906  917          uint32_t prot;
 907  918          uint32_t flags;
 908  919          uint32_t fd;
 909  920          uint32_t offhi;
 910  921          uint32_t offlo;
 911  922  #endif
 912  923  };
 913  924  
 914  925  int
 915  926  smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
 916  927  {
 917  928          struct file *fp;
 918  929          int error;
 919  930          caddr_t a = uap->addr;
 920  931          int flags = (int)uap->flags;
 921  932          int fd = (int)uap->fd;
 922  933  #ifdef _BIG_ENDIAN
 923  934          offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
 924  935  #else
 925  936          offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
 926  937  #endif
 927  938  
 928  939          if (flags & _MAP_LOW32)
 929  940                  error = EINVAL;
 930  941          else if (fd == -1 && (flags & MAP_ANON) != 0)
 931  942                  error = smmap_common(&a, uap->len, (int)uap->prot,
 932  943                      flags | _MAP_LOW32, NULL, off);
 933  944          else if ((fp = getf(fd)) != NULL) {
 934  945                  error = smmap_common(&a, uap->len, (int)uap->prot,
 935  946                      flags | _MAP_LOW32, fp, off);
 936  947                  releasef(fd);
 937  948          } else
 938  949                  error = EBADF;
 939  950  
 940  951          if (error == 0)
 941  952                  rvp->r_val1 = (uintptr_t)a;
 942  953          return (error);
 943  954  }
 944  955  
 945  956  #endif  /* _SYSCALL32_IMPL || _ILP32 */
 946  957  
 947  958  int
 948  959  munmap(caddr_t addr, size_t len)
 949  960  {
 950  961          struct proc *p = curproc;
 951  962          struct as *as = p->p_as;
 952  963  
 953  964          if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
 954  965                  return (set_errno(EINVAL));
 955  966  
 956  967          if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
 957  968                  return (set_errno(EINVAL));
 958  969  
 959  970          /*
 960  971           * Discard lwpchan mappings.
 961  972           */
 962  973          if (p->p_lcp != NULL)
 963  974                  lwpchan_delete_mapping(p, addr, addr + len);
 964  975          if (as_unmap(as, addr, len) != 0)
 965  976                  return (set_errno(EINVAL));
 966  977  
 967  978          return (0);
 968  979  }
 969  980  
 970  981  int
 971  982  mprotect(caddr_t addr, size_t len, int prot)
 972  983  {
 973  984          struct as *as = curproc->p_as;
 974  985          uint_t uprot = prot | PROT_USER;
 975  986          int error;
 976  987  
 977  988          if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
 978  989                  return (set_errno(EINVAL));
 979  990  
 980  991          switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
 981  992          case RANGE_OKAY:
 982  993                  break;
 983  994          case RANGE_BADPROT:
 984  995                  return (set_errno(ENOTSUP));
 985  996          case RANGE_BADADDR:
 986  997          default:
 987  998                  return (set_errno(ENOMEM));
 988  999          }
 989 1000  
 990 1001          error = as_setprot(as, addr, len, uprot);
 991 1002          if (error)
 992 1003                  return (set_errno(error));
 993 1004          return (0);
 994 1005  }
 995 1006  
 996 1007  #define MC_CACHE        128                     /* internal result buffer */
 997 1008  #define MC_QUANTUM      (MC_CACHE * PAGESIZE)   /* addresses covered in loop */
 998 1009  
 999 1010  int
1000 1011  mincore(caddr_t addr, size_t len, char *vecp)
1001 1012  {
1002 1013          struct as *as = curproc->p_as;
1003 1014          caddr_t ea;                     /* end address of loop */
1004 1015          size_t rl;                      /* inner result length */
1005 1016          char vec[MC_CACHE];             /* local vector cache */
1006 1017          int error;
1007 1018          model_t model;
1008 1019          long    llen;
1009 1020  
1010 1021          model = get_udatamodel();
1011 1022          /*
1012 1023           * Validate form of address parameters.
1013 1024           */
1014 1025          if (model == DATAMODEL_NATIVE) {
1015 1026                  llen = (long)len;
1016 1027          } else {
1017 1028                  llen = (int32_t)(size32_t)len;
1018 1029          }
1019 1030          if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1020 1031                  return (set_errno(EINVAL));
1021 1032  
1022 1033          if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1023 1034                  return (set_errno(ENOMEM));
1024 1035  
1025 1036          /*
1026 1037           * Loop over subranges of interval [addr : addr + len), recovering
1027 1038           * results internally and then copying them out to caller.  Subrange
1028 1039           * is based on the size of MC_CACHE, defined above.
1029 1040           */
1030 1041          for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1031 1042                  error = as_incore(as, addr,
1032 1043                      (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1033 1044                  if (rl != 0) {
1034 1045                          rl = (rl + PAGESIZE - 1) / PAGESIZE;
1035 1046                          if (copyout(vec, vecp, rl) != 0)
1036 1047                                  return (set_errno(EFAULT));
1037 1048                          vecp += rl;
1038 1049                  }
1039 1050                  if (error != 0)
1040 1051                          return (set_errno(ENOMEM));
1041 1052          }
1042 1053          return (0);
1043 1054  }
  