1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright (c) 2015 Joyent, Inc.
  25  */
  26 
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  28 /*        All Rights Reserved   */
  29 
  30 
  31 #include <sys/types.h>
  32 #include <sys/bitmap.h>
  33 #include <sys/sysmacros.h>
  34 #include <sys/kmem.h>
  35 #include <sys/param.h>
  36 #include <sys/systm.h>
  37 #include <sys/user.h>
  38 #include <sys/unistd.h>
  39 #include <sys/errno.h>
  40 #include <sys/proc.h>
  41 #include <sys/mman.h>
  42 #include <sys/tuneable.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/cred.h>
  45 #include <sys/vmsystm.h>
  46 #include <sys/debug.h>
  47 #include <sys/policy.h>
  48 
  49 #include <vm/as.h>
  50 #include <vm/seg.h>
  51 
  52 static uint_t mem_getpgszc(size_t);
  53 
  54 /*
  55  * Memory control operations
  56  */
  57 int
  58 memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
  59 {
  60         struct as *as = ttoproc(curthread)->p_as;
  61         struct proc *p = ttoproc(curthread);
  62         size_t pgsz;
  63         uint_t szc, oszc, pgcmd;
  64         int error = 0;
  65         faultcode_t fc;
  66         uintptr_t iarg;
  67         STRUCT_DECL(memcntl_mha, mha);
  68 
  69         if (mask)
  70                 return (set_errno(EINVAL));
  71         if ((cmd == MC_LOCKAS) || (cmd == MC_UNLOCKAS)) {
  72                 if ((addr != 0) || (len != 0)) {
  73                         return (set_errno(EINVAL));
  74                 }
  75         } else if (cmd != MC_HAT_ADVISE) {
  76                 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0) {
  77                         return (set_errno(EINVAL));
  78                 }
  79                 /*
  80                  * We're only concerned with the address range
  81                  * here, not the protections.  The protections
  82                  * are only used as a "filter" in this code,
  83                  * they aren't set or modified here.
  84                  */
  85                 if (valid_usr_range(addr, len, 0, as,
  86                     as->a_userlimit) != RANGE_OKAY) {
  87                         return (set_errno(ENOMEM));
  88                 }
  89         }
  90 
  91         if (cmd == MC_HAT_ADVISE) {
  92                 if (attr != 0 || mask != 0) {
  93                         return (set_errno(EINVAL));
  94                 }
  95 
  96         } else {
  97                 if ((VALID_ATTR & attr) != attr) {
  98                         return (set_errno(EINVAL));
  99                 }
 100                 if ((attr & SHARED) && (attr & PRIVATE)) {
 101                         return (set_errno(EINVAL));
 102                 }
 103                 if (((cmd == MC_LOCKAS) || (cmd == MC_LOCK) ||
 104                     (cmd == MC_UNLOCKAS) || (cmd == MC_UNLOCK)) &&
 105                     (error = secpolicy_lock_memory(CRED())) != 0)
 106                         return (set_errno(error));
 107         }
 108         if (attr) {
 109                 attr |= PROT_USER;
 110         }
 111 
 112         switch (cmd) {
 113         case MC_SYNC:
 114                 /*
 115                  * MS_SYNC used to be defined to be zero but is now non-zero.
 116                  * For binary compatibility we still accept zero
 117                  * (the absence of MS_ASYNC) to mean the same thing.
 118                  * Binary compatibility is not an issue for MS_INVALCURPROC.
 119                  */
 120                 iarg = (uintptr_t)arg;
 121                 if ((iarg & ~MS_INVALIDATE) == 0)
 122                         iarg |= MS_SYNC;
 123 
 124                 if (((iarg &
 125                     ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) ||
 126                     ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) ||
 127                     ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) ==
 128                     (MS_INVALIDATE|MS_INVALCURPROC))) {
 129                         error = set_errno(EINVAL);
 130                 } else {
 131                         error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
 132                         if (error) {
 133                                 (void) set_errno(error);
 134                         }
 135                 }
 136                 return (error);
 137         case MC_LOCKAS:
 138                 if ((uintptr_t)arg & ~(MCL_FUTURE|MCL_CURRENT) ||
 139                     (uintptr_t)arg == 0) {
 140                         return (set_errno(EINVAL));
 141                 }
 142                 break;
 143         case MC_LOCK:
 144         case MC_UNLOCKAS:
 145         case MC_UNLOCK:
 146                 break;
 147         case MC_HAT_ADVISE:
 148                 /*
 149                  * Set prefered page size.
 150                  */
 151                 STRUCT_INIT(mha, get_udatamodel());
 152                 if (copyin(arg, STRUCT_BUF(mha), STRUCT_SIZE(mha))) {
 153                         return (set_errno(EFAULT));
 154                 }
 155 
 156                 pgcmd = STRUCT_FGET(mha, mha_cmd);
 157 
 158                 /*
 159                  * Currently only MHA_MAPSIZE_VA, MHA_MAPSIZE_STACK
 160                  * and MHA_MAPSIZE_BSSBRK are supported. Only one
 161                  * command may be specified at a time.
 162                  */
 163                 if ((~(MHA_MAPSIZE_VA|MHA_MAPSIZE_STACK|MHA_MAPSIZE_BSSBRK) &
 164                     pgcmd) || pgcmd == 0 || !ISP2(pgcmd) ||
 165                     STRUCT_FGET(mha, mha_flags))
 166                         return (set_errno(EINVAL));
 167 
 168                 pgsz = STRUCT_FGET(mha, mha_pagesize);
 169 
 170                 /*
 171                  * call platform specific map_pgsz() routine to get the
 172                  * optimal pgsz if pgsz is 0.
 173                  *
 174                  * For stack and heap operations addr and len must be zero.
 175                  */
 176                 if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
 177                         if (addr != NULL || len != 0) {
 178                                 return (set_errno(EINVAL));
 179                         }
 180 
 181                         /*
 182                          * Disable autompss for this process unless pgsz == 0,
 183                          * which means the system should pick.  In the
 184                          * pgsz == 0 case, leave the SAUTOLPG setting alone, as
 185                          * we don't want to enable it when someone has
 186                          * disabled automatic large page selection for the
 187                          * whole system.
 188                          */
 189                         mutex_enter(&p->p_lock);
 190                         if (pgsz != 0) {
 191                                 p->p_flag &= ~SAUTOLPG;
 192                         }
 193                         mutex_exit(&p->p_lock);
 194 
 195                         as_rangelock(as);
 196 
 197                         if (pgsz == 0) {
 198                                 int     type;
 199 
 200                                 if (pgcmd == MHA_MAPSIZE_BSSBRK)
 201                                         type = MAPPGSZ_HEAP;
 202                                 else
 203                                         type = MAPPGSZ_STK;
 204 
 205                                 pgsz = map_pgsz(type, p, 0, 0, 1);
 206                         }
 207                 } else {
 208                         /*
 209                          * addr and len must be valid for range specified.
 210                          */
 211                         if (valid_usr_range(addr, len, 0, as,
 212                             as->a_userlimit) != RANGE_OKAY) {
 213                                 return (set_errno(ENOMEM));
 214                         }
 215                         /*
 216                          * Note that we don't disable automatic large page
 217                          * selection for anon segments based on use of
 218                          * memcntl().
 219                          */
 220                         if (pgsz == 0) {
 221                                 error = as_set_default_lpsize(as, addr, len);
 222                                 if (error) {
 223                                         (void) set_errno(error);
 224                                 }
 225                                 return (error);
 226                         }
 227 
 228                         /*
 229                          * addr and len must be prefered page size aligned
 230                          */
 231                         if (!IS_P2ALIGNED(addr, pgsz) ||
 232                             !IS_P2ALIGNED(len, pgsz)) {
 233                                 return (set_errno(EINVAL));
 234                         }
 235                 }
 236 
 237                 szc = mem_getpgszc(pgsz);
 238                 if (szc == (uint_t)-1) {
 239                         if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK))
 240                             != 0) {
 241                                 as_rangeunlock(as);
 242                         }
 243                         return (set_errno(EINVAL));
 244                 }
 245 
 246                 /*
 247                  * For stack and heap operations we first need to pad
 248                  * out existing range (create new mappings) to the new
 249                  * prefered page size boundary. Also the start of the
 250                  * .bss for the heap or user's stack base may not be on
 251                  * the new prefered page size boundary. For these cases
 252                  * we align the base of the request on the new prefered
 253                  * page size.
 254                  */
 255                 if (pgcmd & MHA_MAPSIZE_BSSBRK) {
 256                         if (szc == p->p_brkpageszc) {
 257                                 as_rangeunlock(as);
 258                                 return (0);
 259                         }
 260                         if (szc > p->p_brkpageszc) {
 261                                 error = brk_internal(p->p_brkbase
 262                                     + p->p_brksize, szc);
 263                                 if (error) {
 264                                         as_rangeunlock(as);
 265                                         return (set_errno(error));
 266                                 }
 267                         }
 268                         /*
 269                          * It is possible for brk_internal to silently fail to
 270                          * promote the heap size, so don't panic or ASSERT.
 271                          */
 272                         if (!IS_P2ALIGNED(p->p_brkbase + p->p_brksize, pgsz)) {
 273                                 as_rangeunlock(as);
 274                                 return (set_errno(ENOMEM));
 275                         }
 276                         oszc = p->p_brkpageszc;
 277                         p->p_brkpageszc = szc;
 278 
 279                         addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
 280                             pgsz);
 281                         len = (p->p_brkbase + p->p_brksize) - addr;
 282                         ASSERT(IS_P2ALIGNED(len, pgsz));
 283                         /*
 284                          * Perhaps no existing pages to promote.
 285                          */
 286                         if (len == 0) {
 287                                 as_rangeunlock(as);
 288                                 return (0);
 289                         }
 290                 }
 291                 /*
 292                  * The code below, as does grow.c, assumes stacks always grow
 293                  * downward.
 294                  */
 295                 if (pgcmd & MHA_MAPSIZE_STACK) {
 296                         if (szc == p->p_stkpageszc) {
 297                                 as_rangeunlock(as);
 298                                 return (0);
 299                         }
 300 
 301                         if (szc > p->p_stkpageszc) {
 302                                 error = grow_internal(p->p_usrstack -
 303                                     p->p_stksize, szc);
 304                                 if (error) {
 305                                         as_rangeunlock(as);
 306                                         return (set_errno(error));
 307                                 }
 308                         }
 309                         /*
 310                          * It is possible for grow_internal to silently fail to
 311                          * promote the stack size, so don't panic or ASSERT.
 312                          */
 313                         if (!IS_P2ALIGNED(p->p_usrstack - p->p_stksize, pgsz)) {
 314                                 as_rangeunlock(as);
 315                                 return (set_errno(ENOMEM));
 316                         }
 317                         oszc = p->p_stkpageszc;
 318                         p->p_stkpageszc = szc;
 319 
 320                         addr = p->p_usrstack - p->p_stksize;
 321                         len = P2ALIGN(p->p_stksize, pgsz);
 322 
 323                         /*
 324                          * Perhaps nothing to promote.
 325                          */
 326                         if (len == 0 || addr >= p->p_usrstack ||
 327                             (addr + len) < addr) {
 328                                 as_rangeunlock(as);
 329                                 return (0);
 330                         }
 331                 }
 332                 ASSERT(IS_P2ALIGNED(addr, pgsz));
 333                 ASSERT(IS_P2ALIGNED(len, pgsz));
 334                 error = as_setpagesize(as, addr, len, szc, B_TRUE);
 335 
 336                 /*
 337                  * On stack or heap failures restore original
 338                  * pg size code.
 339                  */
 340                 if (error) {
 341                         if ((pgcmd & MHA_MAPSIZE_BSSBRK) != 0) {
 342                                 p->p_brkpageszc = oszc;
 343                         }
 344                         if ((pgcmd & MHA_MAPSIZE_STACK) != 0) {
 345                                 p->p_stkpageszc = oszc;
 346                         }
 347                         (void) set_errno(error);
 348                 }
 349                 if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
 350                         as_rangeunlock(as);
 351                 }
 352                 return (error);
 353         case MC_ADVISE:
 354                 if ((uintptr_t)arg == MADV_FREE ||
 355                     (uintptr_t)arg == MADV_PURGE) {
 356                         len &= PAGEMASK;
 357                 }
 358                 switch ((uintptr_t)arg) {
 359                 case MADV_WILLNEED:
 360                         fc = as_faulta(as, addr, len);
 361                         if (fc) {
 362                                 if (FC_CODE(fc) == FC_OBJERR)
 363                                         error = set_errno(FC_ERRNO(fc));
 364                                 else if (FC_CODE(fc) == FC_NOMAP)
 365                                         error = set_errno(ENOMEM);
 366                                 else
 367                                         error = set_errno(EINVAL);
 368                                 return (error);
 369                         }
 370                         break;
 371 
 372                 case MADV_DONTNEED:
 373                         /*
 374                          * For now, don't need is turned into an as_ctl(MC_SYNC)
 375                          * operation flagged for async invalidate.
 376                          */
 377                         error = as_ctl(as, addr, len, MC_SYNC, attr,
 378                             MS_ASYNC | MS_INVALIDATE, NULL, 0);
 379                         if (error)
 380                                 (void) set_errno(error);
 381                         return (error);
 382 
 383                 default:
 384                         error = as_ctl(as, addr, len, cmd, attr,
 385                             (uintptr_t)arg, NULL, 0);
 386                         if (error)
 387                                 (void) set_errno(error);
 388                         return (error);
 389                 }
 390                 break;
 391         case MC_INHERIT_ZERO:
 392                 if (arg != 0 || attr != 0 || mask != 0)
 393                         return (set_errno(EINVAL));
 394                 break;
 395         default:
 396                 return (set_errno(EINVAL));
 397         }
 398 
 399         error = as_ctl(as, addr, len, cmd, attr, (uintptr_t)arg, NULL, 0);
 400 
 401         if (error)
 402                 (void) set_errno(error);
 403         return (error);
 404 }
 405 
 406 /*
 407  * Return page size code for page size passed in. If
 408  * matching page size not found or supported, return -1.
 409  */
 410 static uint_t
 411 mem_getpgszc(size_t pgsz) {
 412         return ((uint_t)page_szc_user_filtered(pgsz));
 413 }