OS-881 To work around OS-580, add support to invalidate mappings from only a single process
    
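The hunk in the MC_SYNC case below adds a new flag, MS_INVALCURPROC, which is accepted alongside MS_SYNC/MS_ASYNC but rejected if combined with MS_INVALIDATE, and is otherwise passed straight through to as_ctl(). As a minimal usage sketch (not part of the webrev, and assuming MS_INVALCURPROC is exposed to userland next to MS_INVALIDATE in <sys/mman.h>), a process could ask for its own cached mappings of a file to be invalidated via memcntl(2):

	/*
	 * Hypothetical illustration only: invalidate the calling process's
	 * mappings of a file, per the OS-881 synopsis, without affecting
	 * other processes that map the same object.
	 */
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <sys/mman.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(int argc, char *argv[])
	{
		int fd;
		struct stat st;
		caddr_t addr;

		if (argc < 2) {
			(void) fprintf(stderr, "usage: %s file\n", argv[0]);
			return (1);
		}
		if ((fd = open(argv[1], O_RDWR)) < 0 || fstat(fd, &st) != 0) {
			perror(argv[1]);
			return (1);
		}
		addr = mmap(NULL, (size_t)st.st_size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			return (1);
		}

		/* ... read and modify the mapping ... */

		/*
		 * MC_SYNC with MS_ASYNC | MS_INVALCURPROC passes the new
		 * flag check in the diff below (MS_INVALIDATE and
		 * MS_INVALCURPROC are mutually exclusive) and reaches
		 * as_ctl() unchanged.
		 */
		if (memcntl(addr, (size_t)st.st_size, MC_SYNC,
		    (caddr_t)(MS_ASYNC | MS_INVALCURPROC), 0, 0) != 0) {
			perror("memcntl(MC_SYNC, MS_INVALCURPROC)");
			return (1);
		}

		(void) munmap(addr, (size_t)st.st_size);
		(void) close(fd);
		return (0);
	}
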
    
          --- old/usr/src/uts/common/syscall/memcntl.c
          +++ new/usr/src/uts/common/syscall/memcntl.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   * Copyright (c) 2015 Joyent, Inc.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  
  31   31  #include <sys/types.h>
  32   32  #include <sys/bitmap.h>
  33   33  #include <sys/sysmacros.h>
  34   34  #include <sys/kmem.h>
  35   35  #include <sys/param.h>
  36   36  #include <sys/systm.h>
  37   37  #include <sys/user.h>
  38   38  #include <sys/unistd.h>
  39   39  #include <sys/errno.h>
  40   40  #include <sys/proc.h>
  41   41  #include <sys/mman.h>
  42   42  #include <sys/tuneable.h>
  43   43  #include <sys/cmn_err.h>
  44   44  #include <sys/cred.h>
  45   45  #include <sys/vmsystm.h>
  46   46  #include <sys/debug.h>
  47   47  #include <sys/policy.h>
  48   48  
  49   49  #include <vm/as.h>
  50   50  #include <vm/seg.h>
  51   51  
  52   52  static uint_t mem_getpgszc(size_t);
  53   53  
  54   54  /*
  55   55   * Memory control operations
  56   56   */
  57   57  int
  58   58  memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
  59   59  {
  60   60          struct as *as = ttoproc(curthread)->p_as;
  61   61          struct proc *p = ttoproc(curthread);
  62   62          size_t pgsz;
  63   63          uint_t szc, oszc, pgcmd;
  64   64          int error = 0;
  65   65          faultcode_t fc;
  66   66          uintptr_t iarg;
  67   67          STRUCT_DECL(memcntl_mha, mha);
  68   68  
  69   69          if (mask)
  70   70                  return (set_errno(EINVAL));
  71   71          if ((cmd == MC_LOCKAS) || (cmd == MC_UNLOCKAS)) {
  72   72                  if ((addr != 0) || (len != 0)) {
  73   73                          return (set_errno(EINVAL));
  74   74                  }
  75   75          } else if (cmd != MC_HAT_ADVISE) {
  76   76                  if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0) {
  77   77                          return (set_errno(EINVAL));
  78   78                  }
  79   79                  /*
  80   80                   * We're only concerned with the address range
  81   81                   * here, not the protections.  The protections
  82   82                   * are only used as a "filter" in this code,
  83   83                   * they aren't set or modified here.
  84   84                   */
  85   85                  if (valid_usr_range(addr, len, 0, as,
  86   86                      as->a_userlimit) != RANGE_OKAY) {
  87   87                          return (set_errno(ENOMEM));
  88   88                  }
  89   89          }
  90   90  
  91   91          if (cmd == MC_HAT_ADVISE) {
  92   92                  if (attr != 0 || mask != 0) {
  93   93                          return (set_errno(EINVAL));
  94   94                  }
  95   95  
  96   96          } else {
  97   97                  if ((VALID_ATTR & attr) != attr) {
  98   98                          return (set_errno(EINVAL));
  99   99                  }
 100  100                  if ((attr & SHARED) && (attr & PRIVATE)) {
 101  101                          return (set_errno(EINVAL));
 102  102                  }
 103  103                  if (((cmd == MC_LOCKAS) || (cmd == MC_LOCK) ||
 104  104                      (cmd == MC_UNLOCKAS) || (cmd == MC_UNLOCK)) &&
 105  105                      (error = secpolicy_lock_memory(CRED())) != 0)
 106  106                          return (set_errno(error));
 107  107          }
 108  108          if (attr) {
 109  109                  attr |= PROT_USER;
 110  110          }
 111  111  
 112  112          switch (cmd) {
 113  113          case MC_SYNC:
 114  114                  /*
 115  115                   * MS_SYNC used to be defined to be zero but is now non-zero.
 116  116                   * For binary compatibility we still accept zero
 117  117                   * (the absence of MS_ASYNC) to mean the same thing.
      118 +                 * Binary compatibility is not an issue for MS_INVALCURPROC.
 118  119                   */
 119  120                  iarg = (uintptr_t)arg;
 120  121                  if ((iarg & ~MS_INVALIDATE) == 0)
 121  122                          iarg |= MS_SYNC;
 122  123  
 123      -                if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
 124      -                    ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
      124 +                if (((iarg &
      125 +                    ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) ||
      126 +                    ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) ||
      127 +                    ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) ==
      128 +                    (MS_INVALIDATE|MS_INVALCURPROC))) {
 125  129                          error = set_errno(EINVAL);
 126  130                  } else {
 127  131                          error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
 128  132                          if (error) {
 129  133                                  (void) set_errno(error);
 130  134                          }
 131  135                  }
 132  136                  return (error);
 133  137          case MC_LOCKAS:
 134  138                  if ((uintptr_t)arg & ~(MCL_FUTURE|MCL_CURRENT) ||
 135  139                      (uintptr_t)arg == 0) {
 136  140                          return (set_errno(EINVAL));
 137  141                  }
 138  142                  break;
 139  143          case MC_LOCK:
 140  144          case MC_UNLOCKAS:
 141  145          case MC_UNLOCK:
 142  146                  break;
 143  147          case MC_HAT_ADVISE:
 144  148                  /*
 145  149                   * Set prefered page size.
 146  150                   */
 147  151                  STRUCT_INIT(mha, get_udatamodel());
 148  152                  if (copyin(arg, STRUCT_BUF(mha), STRUCT_SIZE(mha))) {
 149  153                          return (set_errno(EFAULT));
 150  154                  }
 151  155  
 152  156                  pgcmd = STRUCT_FGET(mha, mha_cmd);
 153  157  
 154  158                  /*
 155  159                   * Currently only MHA_MAPSIZE_VA, MHA_MAPSIZE_STACK
 156  160                   * and MHA_MAPSIZE_BSSBRK are supported. Only one
 157  161                   * command may be specified at a time.
 158  162                   */
 159  163                  if ((~(MHA_MAPSIZE_VA|MHA_MAPSIZE_STACK|MHA_MAPSIZE_BSSBRK) &
 160  164                      pgcmd) || pgcmd == 0 || !ISP2(pgcmd) ||
 161  165                      STRUCT_FGET(mha, mha_flags))
 162  166                          return (set_errno(EINVAL));
 163  167  
 164  168                  pgsz = STRUCT_FGET(mha, mha_pagesize);
 165  169  
 166  170                  /*
 167  171                   * call platform specific map_pgsz() routine to get the
 168  172                   * optimal pgsz if pgsz is 0.
 169  173                   *
 170  174                   * For stack and heap operations addr and len must be zero.
 171  175                   */
 172  176                  if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
 173  177                          if (addr != NULL || len != 0) {
 174  178                                  return (set_errno(EINVAL));
 175  179                          }
 176  180  
 177  181                          /*
 178  182                           * Disable autompss for this process unless pgsz == 0,
 179  183                           * which means the system should pick.  In the
 180  184                           * pgsz == 0 case, leave the SAUTOLPG setting alone, as
 181  185                           * we don't want to enable it when someone has
 182  186                           * disabled automatic large page selection for the
 183  187                           * whole system.
 184  188                           */
 185  189                          mutex_enter(&p->p_lock);
 186  190                          if (pgsz != 0) {
 187  191                                  p->p_flag &= ~SAUTOLPG;
 188  192                          }
 189  193                          mutex_exit(&p->p_lock);
 190  194  
 191  195                          as_rangelock(as);
 192  196  
 193  197                          if (pgsz == 0) {
 194  198                                  int     type;
 195  199  
 196  200                                  if (pgcmd == MHA_MAPSIZE_BSSBRK)
 197  201                                          type = MAPPGSZ_HEAP;
 198  202                                  else
 199  203                                          type = MAPPGSZ_STK;
 200  204  
 201  205                                  pgsz = map_pgsz(type, p, 0, 0, 1);
 202  206                          }
 203  207                  } else {
 204  208                          /*
 205  209                           * addr and len must be valid for range specified.
 206  210                           */
 207  211                          if (valid_usr_range(addr, len, 0, as,
 208  212                              as->a_userlimit) != RANGE_OKAY) {
 209  213                                  return (set_errno(ENOMEM));
 210  214                          }
 211  215                          /*
 212  216                           * Note that we don't disable automatic large page
 213  217                           * selection for anon segments based on use of
 214  218                           * memcntl().
 215  219                           */
 216  220                          if (pgsz == 0) {
 217  221                                  error = as_set_default_lpsize(as, addr, len);
 218  222                                  if (error) {
 219  223                                          (void) set_errno(error);
 220  224                                  }
 221  225                                  return (error);
 222  226                          }
 223  227  
 224  228                          /*
 225  229                           * addr and len must be prefered page size aligned
 226  230                           */
 227  231                          if (!IS_P2ALIGNED(addr, pgsz) ||
 228  232                              !IS_P2ALIGNED(len, pgsz)) {
 229  233                                  return (set_errno(EINVAL));
 230  234                          }
 231  235                  }
 232  236  
 233  237                  szc = mem_getpgszc(pgsz);
 234  238                  if (szc == (uint_t)-1) {
 235  239                          if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK))
 236  240                              != 0) {
 237  241                                  as_rangeunlock(as);
 238  242                          }
 239  243                          return (set_errno(EINVAL));
 240  244                  }
 241  245  
 242  246                  /*
 243  247                   * For stack and heap operations we first need to pad
 244  248                   * out existing range (create new mappings) to the new
 245  249                   * prefered page size boundary. Also the start of the
 246  250                   * .bss for the heap or user's stack base may not be on
 247  251                   * the new prefered page size boundary. For these cases
 248  252                   * we align the base of the request on the new prefered
 249  253                   * page size.
 250  254                   */
 251  255                  if (pgcmd & MHA_MAPSIZE_BSSBRK) {
 252  256                          if (szc == p->p_brkpageszc) {
 253  257                                  as_rangeunlock(as);
 254  258                                  return (0);
 255  259                          }
 256  260                          if (szc > p->p_brkpageszc) {
 257  261                                  error = brk_internal(p->p_brkbase
 258  262                                      + p->p_brksize, szc);
 259  263                                  if (error) {
 260  264                                          as_rangeunlock(as);
 261  265                                          return (set_errno(error));
 262  266                                  }
 263  267                          }
 264  268                          /*
 265  269                           * It is possible for brk_internal to silently fail to
 266  270                           * promote the heap size, so don't panic or ASSERT.
 267  271                           */
 268  272                          if (!IS_P2ALIGNED(p->p_brkbase + p->p_brksize, pgsz)) {
 269  273                                  as_rangeunlock(as);
 270  274                                  return (set_errno(ENOMEM));
 271  275                          }
 272  276                          oszc = p->p_brkpageszc;
 273  277                          p->p_brkpageszc = szc;
 274  278  
 275  279                          addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
 276  280                              pgsz);
 277  281                          len = (p->p_brkbase + p->p_brksize) - addr;
 278  282                          ASSERT(IS_P2ALIGNED(len, pgsz));
 279  283                          /*
 280  284                           * Perhaps no existing pages to promote.
 281  285                           */
 282  286                          if (len == 0) {
 283  287                                  as_rangeunlock(as);
 284  288                                  return (0);
 285  289                          }
 286  290                  }
 287  291                  /*
 288  292                   * The code below, as does grow.c, assumes stacks always grow
 289  293                   * downward.
 290  294                   */
 291  295                  if (pgcmd & MHA_MAPSIZE_STACK) {
 292  296                          if (szc == p->p_stkpageszc) {
 293  297                                  as_rangeunlock(as);
 294  298                                  return (0);
 295  299                          }
 296  300  
 297  301                          if (szc > p->p_stkpageszc) {
 298  302                                  error = grow_internal(p->p_usrstack -
 299  303                                      p->p_stksize, szc);
 300  304                                  if (error) {
 301  305                                          as_rangeunlock(as);
 302  306                                          return (set_errno(error));
 303  307                                  }
 304  308                          }
 305  309                          /*
 306  310                           * It is possible for grow_internal to silently fail to
 307  311                           * promote the stack size, so don't panic or ASSERT.
 308  312                           */
 309  313                          if (!IS_P2ALIGNED(p->p_usrstack - p->p_stksize, pgsz)) {
 310  314                                  as_rangeunlock(as);
 311  315                                  return (set_errno(ENOMEM));
 312  316                          }
 313  317                          oszc = p->p_stkpageszc;
 314  318                          p->p_stkpageszc = szc;
 315  319  
 316  320                          addr = p->p_usrstack - p->p_stksize;
 317  321                          len = P2ALIGN(p->p_stksize, pgsz);
 318  322  
 319  323                          /*
 320  324                           * Perhaps nothing to promote.
 321  325                           */
 322  326                          if (len == 0 || addr >= p->p_usrstack ||
 323  327                              (addr + len) < addr) {
 324  328                                  as_rangeunlock(as);
 325  329                                  return (0);
 326  330                          }
 327  331                  }
 328  332                  ASSERT(IS_P2ALIGNED(addr, pgsz));
 329  333                  ASSERT(IS_P2ALIGNED(len, pgsz));
 330  334                  error = as_setpagesize(as, addr, len, szc, B_TRUE);
 331  335  
 332  336                  /*
 333  337                   * On stack or heap failures restore original
 334  338                   * pg size code.
 335  339                   */
 336  340                  if (error) {
 337  341                          if ((pgcmd & MHA_MAPSIZE_BSSBRK) != 0) {
 338  342                                  p->p_brkpageszc = oszc;
 339  343                          }
 340  344                          if ((pgcmd & MHA_MAPSIZE_STACK) != 0) {
 341  345                                  p->p_stkpageszc = oszc;
 342  346                          }
 343  347                          (void) set_errno(error);
 344  348                  }
 345  349                  if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
 346  350                          as_rangeunlock(as);
 347  351                  }
 348  352                  return (error);
 349  353          case MC_ADVISE:
 350  354                  if ((uintptr_t)arg == MADV_FREE ||
 351  355                      (uintptr_t)arg == MADV_PURGE) {
 352  356                          len &= PAGEMASK;
 353  357                  }
 354  358                  switch ((uintptr_t)arg) {
 355  359                  case MADV_WILLNEED:
 356  360                          fc = as_faulta(as, addr, len);
 357  361                          if (fc) {
 358  362                                  if (FC_CODE(fc) == FC_OBJERR)
 359  363                                          error = set_errno(FC_ERRNO(fc));
 360  364                                  else if (FC_CODE(fc) == FC_NOMAP)
 361  365                                          error = set_errno(ENOMEM);
 362  366                                  else
 363  367                                          error = set_errno(EINVAL);
 364  368                                  return (error);
 365  369                          }
 366  370                          break;
 367  371  
 368  372                  case MADV_DONTNEED:
 369  373                          /*
 370  374                           * For now, don't need is turned into an as_ctl(MC_SYNC)
 371  375                           * operation flagged for async invalidate.
 372  376                           */
 373  377                          error = as_ctl(as, addr, len, MC_SYNC, attr,
 374  378                              MS_ASYNC | MS_INVALIDATE, NULL, 0);
 375  379                          if (error)
 376  380                                  (void) set_errno(error);
 377  381                          return (error);
 378  382  
 379  383                  default:
 380  384                          error = as_ctl(as, addr, len, cmd, attr,
 381  385                              (uintptr_t)arg, NULL, 0);
 382  386                          if (error)
 383  387                                  (void) set_errno(error);
 384  388                          return (error);
 385  389                  }
 386  390                  break;
 387  391          case MC_INHERIT_ZERO:
 388  392                  if (arg != 0 || attr != 0 || mask != 0)
 389  393                          return (set_errno(EINVAL));
 390  394                  break;
 391  395          default:
 392  396                  return (set_errno(EINVAL));
 393  397          }
 394  398  
 395  399          error = as_ctl(as, addr, len, cmd, attr, (uintptr_t)arg, NULL, 0);
 396  400  
 397  401          if (error)
 398  402                  (void) set_errno(error);
 399  403          return (error);
 400  404  }
 401  405  
 402  406  /*
 403  407   * Return page size code for page size passed in. If
 404  408   * matching page size not found or supported, return -1.
 405  409   */
 406  410  static uint_t
 407  411  mem_getpgszc(size_t pgsz) {
 408  412          return ((uint_t)page_szc_user_filtered(pgsz));
 409  413  }
          [ 275 lines elided ]