1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 */
31
32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
33 /* All Rights Reserved */
34
35 #include <sys/types.h>
36 #include <sys/inttypes.h>
37 #include <sys/param.h>
38 #include <sys/sysmacros.h>
39 #include <sys/systm.h>
40 #include <sys/signal.h>
41 #include <sys/user.h>
42 #include <sys/errno.h>
43 #include <sys/var.h>
44 #include <sys/proc.h>
45 #include <sys/tuneable.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/cred.h>
49 #include <sys/vnode.h>
50 #include <sys/vfs.h>
51 #include <sys/vm.h>
52 #include <sys/file.h>
53 #include <sys/mman.h>
54 #include <sys/vmparam.h>
55 #include <sys/fcntl.h>
56 #include <sys/lwpchan_impl.h>
57 #include <sys/nbmlock.h>
58 #include <sys/brand.h>
59
60 #include <vm/hat.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_dev.h>
64 #include <vm/seg_vn.h>
65
66 int use_brk_lpg = 1;
67 int use_stk_lpg = 1;
68
69 static int brk_lpg(caddr_t nva);
70 static int grow_lpg(caddr_t sp);
71
72 int
73 brk(caddr_t nva)
74 {
75 int error;
76 proc_t *p = curproc;
77
78 /*
79 * Serialize brk operations on an address space.
80 * This also serves as the lock protecting p_brksize
81 * and p_brkpageszc.
82 */
83 as_rangelock(p->p_as);
84 if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
85 error = brk_lpg(nva);
86 } else {
87 error = brk_internal(nva, p->p_brkpageszc);
88 }
89 as_rangeunlock(p->p_as);
90 return ((error != 0 ? set_errno(error) : 0));
91 }
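/*
 * Illustrative sketch (not part of this file): a user-level sbrk() is
 * conventionally layered on top of the brk(2) syscall by caching the
 * current break. "brkval" and "my_sbrk" below are hypothetical names
 * for illustration only, not identifiers used by this code or by libc.
 *
 *	static char *brkval;		 hypothetical cached break
 *
 *	void *
 *	my_sbrk(intptr_t incr)
 *	{
 *		char *old = brkval;
 *		if (brk(old + incr) != 0)
 *			return ((void *)-1);	 errno set by brk(2)
 *		brkval = old + incr;
 *		return (old);
 *	}
 */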
92
93 /*
94 * Algorithm: call arch-specific map_pgsz to get best page size to use,
95 * then call brk_internal().
96 * Returns 0 on success.
97 */
98 static int
99 brk_lpg(caddr_t nva)
100 {
101 struct proc *p = curproc;
102 size_t pgsz, len;
103 caddr_t addr, brkend;
104 caddr_t bssbase = p->p_bssbase;
105 caddr_t brkbase = p->p_brkbase;
106 int oszc, szc;
107 int err;
108
109 oszc = p->p_brkpageszc;
110
111 /*
112 * If p_brkbase has not yet been set, the first call
113 * to brk_internal() will initialize it.
114 */
115 if (brkbase == 0) {
116 return (brk_internal(nva, oszc));
117 }
118
119 len = nva - bssbase;
120
121 pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
122 szc = page_szc(pgsz);
123
124 /*
125 * Covers two cases:
126 * 1. page_szc() returns -1 for invalid page size, so we want to
127 * ignore it in that case.
128 * 2. By design we never decrease page size, as it is more stable.
129 */
130 if (szc <= oszc) {
131 err = brk_internal(nva, oszc);
132 /* If failed, back off to base page size. */
133 if (err != 0 && oszc != 0) {
134 err = brk_internal(nva, 0);
135 }
136 return (err);
137 }
138
139 err = brk_internal(nva, szc);
140 /* If using szc failed, map with base page size and return. */
141 if (err != 0) {
142 if (szc != 0) {
143 err = brk_internal(nva, 0);
144 }
145 return (err);
146 }
147
148 /*
149 * Round up brk base to a large page boundary and remap
150 * anything in the segment already faulted in beyond that
151 * point.
152 */
153 addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
154 brkend = brkbase + p->p_brksize;
155 len = brkend - addr;
156 /* Check that len is not negative. Update page size code for heap. */
157 if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
158 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
159 p->p_brkpageszc = szc;
160 }
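/*
 * Worked example (hypothetical numbers): with a 4M large page and
 * p_bssbase == 0x8060000, addr is rounded up to 0x8400000. If the heap
 * now ends at brkend == 0x8c00000, then len == 0x800000, which is
 * 4M-aligned, so the two large pages in [0x8400000, 0x8c00000) are
 * remapped at the new size and p_brkpageszc is updated.
 */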
161
162 ASSERT(err == 0);
163 return (err); /* should always be 0 */
164 }
165
166 /*
167 * Returns 0 on success.
168 */
169 int
170 brk_internal(caddr_t nva, uint_t brkszc)
171 {
172 caddr_t ova; /* current break address */
173 size_t size;
174 int error;
175 struct proc *p = curproc;
176 struct as *as = p->p_as;
177 size_t pgsz;
178 uint_t szc;
179 rctl_qty_t as_rctl;
180
181 /*
182 * extend heap to brkszc alignment but use current p->p_brkpageszc
183 * for the newly created segment. This allows the new extension
184 * segment to be concatenated successfully with the existing brk
185 * segment.
186 */
187 if ((szc = brkszc) != 0) {
188 pgsz = page_get_pagesize(szc);
189 ASSERT(pgsz > PAGESIZE);
190 } else {
191 pgsz = PAGESIZE;
192 }
193
194 mutex_enter(&p->p_lock);
195 as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
196 p->p_rctls, p);
197 mutex_exit(&p->p_lock);
198
199 /*
200 * If p_brkbase has not yet been set, the first call
201 * to brk() will initialize it.
202 */
203 if (p->p_brkbase == 0)
204 p->p_brkbase = nva;
205
206 /*
207 * Before multiple page size support existed, p_brksize held the exact
208 * user-requested heap size, not rounded to the page size. If pgsz is
209 * greater than PAGESIZE, compute the real new heap size by rounding
210 * the request up to pgsz. This is useful because we may want to know
211 * where the heap ends without knowing the heap page size (e.g. some
212 * old code), and because if the heap page size changes we can update
213 * p_brkpageszc but delay adding the new mapping while still knowing
214 * from p_brksize where the heap really ends. The user-requested heap
215 * end is stored in a libc variable.
216 */
217 if (pgsz > PAGESIZE) {
218 caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
219 size = tnva - p->p_brkbase;
220 if (tnva < p->p_brkbase || (size > p->p_brksize &&
221 size > (size_t)as_rctl)) {
222 szc = 0;
223 pgsz = PAGESIZE;
224 size = nva - p->p_brkbase;
225 }
226 } else {
227 size = nva - p->p_brkbase;
228 }
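/*
 * Worked example (hypothetical numbers): with pgsz == 2M,
 * p_brkbase == 0x8050000 and a request of nva == 0x8100000, tnva is
 * rounded up to 0x8200000 and size becomes 0x1b0000. If that rounded
 * size both grows the heap and exceeds the RLIMIT_DATA rctl (or tnva
 * wrapped below p_brkbase), we fall back to szc 0 and the exact
 * PAGESIZE-based size instead.
 */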
229
230 /*
231 * use PAGESIZE to roundup ova because we want to know the real value
232 * of the current heap end in case p_brkpageszc changes since the last
233 * p_brksize was computed.
234 */
235 nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
236 ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
237 PAGESIZE);
238
239 if ((nva < p->p_brkbase) || (size > p->p_brksize &&
240 size > as_rctl)) {
241 mutex_enter(&p->p_lock);
242 (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
243 RCA_SAFE);
244 mutex_exit(&p->p_lock);
245 return (ENOMEM);
246 }
247
248 if (nva > ova) {
249 struct segvn_crargs crargs =
250 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
251
252 if (!(p->p_datprot & PROT_EXEC)) {
253 crargs.prot &= ~PROT_EXEC;
254 }
255
256 /*
257 * Add new zfod mapping to extend UNIX data segment
258 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
259 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
260 * page sizes if ova is not aligned to szc's pgsz.
261 */
262 if (szc > 0) {
263 caddr_t rbss;
264
265 rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
266 pgsz);
267 if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
268 crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
269 AS_MAP_NO_LPOOB;
270 } else if (ova == rbss) {
271 crargs.szc = szc;
272 } else {
273 crargs.szc = AS_MAP_HEAP;
274 }
275 } else {
276 crargs.szc = AS_MAP_NO_LPOOB;
277 }
278 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
279 error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
280 &crargs);
281 if (error) {
282 return (error);
283 }
284
285 } else if (nva < ova) {
286 /*
287 * Release mapping to shrink UNIX data segment.
288 */
289 (void) as_unmap(as, nva, (size_t)(ova - nva));
290 }
291 p->p_brksize = size;
292 return (0);
293 }
294
295 /*
296 * Grow the stack to include sp. Return 1 if successful, 0 otherwise.
297 * This routine assumes that the stack grows downward.
298 */
299 int
300 grow(caddr_t sp)
301 {
302 struct proc *p = curproc;
303 struct as *as = p->p_as;
304 size_t oldsize = p->p_stksize;
305 size_t newsize;
306 int err;
307
308 /*
309 * Serialize grow operations on an address space.
310 * This also serves as the lock protecting p_stksize
311 * and p_stkpageszc.
312 */
313 as_rangelock(as);
314 if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
315 err = grow_lpg(sp);
316 } else {
317 err = grow_internal(sp, p->p_stkpageszc);
318 }
319 as_rangeunlock(as);
320
321 if (err == 0 && (newsize = p->p_stksize) > oldsize) {
322 ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
323 ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
324 /*
325 * Set up translations so the process doesn't have to fault in
326 * the stack pages we just gave it.
327 */
328 (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
329 newsize - oldsize, F_INVAL, S_WRITE);
330 }
331 return ((err == 0 ? 1 : 0));
332 }
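/*
 * Context note (an observation, not something this file defines): grow()
 * is typically reached from the kernel's page-fault handling path when a
 * thread touches an address below the currently mapped stack, which is
 * why it reports success as 1/0 instead of returning an errno to a
 * system call.
 */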
333
334 /*
335 * Algorithm: call arch-specific map_pgsz to get best page size to use,
336 * then call grow_internal().
337 * Returns 0 on success.
338 */
339 static int
340 grow_lpg(caddr_t sp)
341 {
342 struct proc *p = curproc;
343 size_t pgsz;
344 size_t len, newsize;
345 caddr_t addr, saddr;
346 caddr_t growend;
347 int oszc, szc;
348 int err;
349
350 newsize = p->p_usrstack - sp;
351
352 oszc = p->p_stkpageszc;
353 pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
354 szc = page_szc(pgsz);
355
356 /*
357 * Covers two cases:
358 * 1. page_szc() returns -1 for invalid page size, so we want to
359 * ignore it in that case.
360 * 2. By design we never decrease page size, as it is more stable.
361 * This shouldn't happen as the stack never shrinks.
362 */
363 if (szc <= oszc) {
364 err = grow_internal(sp, oszc);
365 /* failed, fall back to base page size */
366 if (err != 0 && oszc != 0) {
367 err = grow_internal(sp, 0);
368 }
369 return (err);
370 }
371
372 /*
373 * We've grown sufficiently to switch to a new page size.
374 * So we are going to remap the whole segment with the new page size.
375 */
376 err = grow_internal(sp, szc);
377 /* The grow with szc failed, so fall back to base page size. */
378 if (err != 0) {
379 if (szc != 0) {
380 err = grow_internal(sp, 0);
381 }
382 return (err);
383 }
384
385 /*
386 * Round up stack pointer to a large page boundary and remap
387 * any pgsz pages in the segment already faulted in beyond that
388 * point.
389 */
390 saddr = p->p_usrstack - p->p_stksize;
391 addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
392 growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
393 len = growend - addr;
394 /* Check that len is not negative. Update page size code for stack. */
395 if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
396 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
397 p->p_stkpageszc = szc;
398 }
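/*
 * Worked example (hypothetical numbers): with a 4M large page,
 * p_usrstack == 0xffffc000 and p_stksize == 0x1000000, saddr is
 * 0xefffc000 and rounds up to addr == 0xf0000000; growend is
 * p_usrstack rounded down to 0xffc00000. len == 0xfc00000 is
 * 4M-aligned, so the already-grown part of the stack between those
 * boundaries is remapped with the larger page size.
 */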
399
400 ASSERT(err == 0);
401 return (err); /* should always be 0 */
402 }
403
404 /*
405 * This routine assumes that the stack grows downward.
406 * Returns 0 on success, errno on failure.
407 */
408 int
409 grow_internal(caddr_t sp, uint_t growszc)
410 {
411 struct proc *p = curproc;
412 size_t newsize;
413 size_t oldsize;
414 int error;
415 size_t pgsz;
416 uint_t szc;
417 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
418
419 ASSERT(sp < p->p_usrstack);
420 sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
421
422 /*
423 * grow to growszc alignment but use current p->p_stkpageszc for
424 * the segvn_crargs szc passed to segvn_create. For memcntl to
425 * increase the szc, this allows the new extension segment to be
426 * concatenated successfully with the existing stack segment.
427 */
428 if ((szc = growszc) != 0) {
429 pgsz = page_get_pagesize(szc);
430 ASSERT(pgsz > PAGESIZE);
431 newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
432 if (newsize > (size_t)p->p_stk_ctl) {
433 szc = 0;
434 pgsz = PAGESIZE;
435 newsize = p->p_usrstack - sp;
436 }
437 } else {
438 pgsz = PAGESIZE;
439 newsize = p->p_usrstack - sp;
440 }
441
442 if (newsize > (size_t)p->p_stk_ctl) {
443 (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
444 RCA_UNSAFE_ALL);
445
446 return (ENOMEM);
447 }
448
449 oldsize = p->p_stksize;
450 ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
451
452 if (newsize <= oldsize) { /* prevent the stack from shrinking */
453 return (0);
454 }
455
456 if (!(p->p_stkprot & PROT_EXEC)) {
457 crargs.prot &= ~PROT_EXEC;
458 }
459 /*
460 * extend stack with the proposed new growszc, which is different
461 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
462 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
463 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
464 * if not aligned to szc's pgsz.
465 */
466 if (szc > 0) {
467 caddr_t oldsp = p->p_usrstack - oldsize;
468 caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
469 pgsz);
470
471 if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
472 crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
473 AS_MAP_NO_LPOOB;
474 } else if (oldsp == austk) {
475 crargs.szc = szc;
476 } else {
477 crargs.szc = AS_MAP_STACK;
478 }
479 } else {
480 crargs.szc = AS_MAP_NO_LPOOB;
481 }
482 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
483
484 if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
485 segvn_create, &crargs)) != 0) {
486 if (error == EAGAIN) {
487 cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
488 "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
489 }
490 return (error);
491 }
492 p->p_stksize = newsize;
493 return (0);
494 }
495
496 /*
497 * Find address for user to map.
498 * If MAP_FIXED is not specified, we can pick any address we want, but we will
499 * first try the value in *addrp if it is non-NULL. This implements
500 * a way to try to get a preferred address.
501 */
502 int
503 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
504 int vacalign, uint_t flags)
505 {
506 #if defined(__amd64)
507 proc_t *p = curproc;
508 #endif
509 caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
510 size_t lenp;
511
512 ASSERT(AS_ISCLAIMGAP(as)); /* searches should be serialized */
513
514 /*
515 * If we have been provided a hint, we should still expand the lenp
516 * to be the rest of the address space. This will allow us to
517 * treat the hint as a strong desire to be "nearby" the provided
518 * address. If we can't satisfy the hint, as_gap() will walk forward.
519 */
520 if (flags & _MAP_LOW32)
521 lenp = (caddr_t)USERLIMIT32 - basep;
522 #if defined(__amd64)
523 else if (p->p_model == DATAMODEL_NATIVE)
524 lenp = p->p_usrstack - basep -
525 ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
526 #endif
527 else
528 lenp = as->a_userlimit - basep;
529
530 if (flags & MAP_FIXED) {
531 (void) as_unmap(as, *addrp, len);
532 return (0);
533 } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
534 !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
535 /* User supplied address was available */
536 *addrp = basep;
537 } else {
538 /*
539 * No user supplied address or the address supplied was not
540 * available.
541 */
542 map_addr(addrp, len, off, vacalign, flags);
543 }
544 if (*addrp == NULL)
545 return (ENOMEM);
546 return (0);
547 }
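/*
 * Summary of the cases above: MAP_FIXED trusts the caller's address and
 * simply clears any existing mappings there; a non-NULL hint without
 * MAP_ALIGN is honored when as_gap() finds room at (or, per the comment
 * above, forward of) the hint; otherwise map_addr() picks a fresh
 * address, and a NULL result means the address space is exhausted
 * (ENOMEM).
 */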
548
549 caddr_t
550 map_userlimit(proc_t *pp, struct as *as, int flags)
551 {
552 if (flags & _MAP_LOW32) {
553 if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
554 return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
555 } else {
556 return ((caddr_t)_userlimit32);
557 }
558 }
559
560 return (as->a_userlimit);
561 }
562
563
564 /*
565 * Used for MAP_ANON - fast way to get anonymous pages
566 */
567 static int
568 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
569 offset_t pos)
570 {
571 struct segvn_crargs vn_a;
572 int error;
573
574 if (((PROT_ALL & uprot) != uprot))
575 return (EACCES);
576
577 if ((flags & MAP_FIXED) != 0) {
578 /*
579 * Use the user address. First verify that
580 * the address to be used is page aligned.
581 * Then make some simple bounds checks.
582 */
583 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
584 return (EINVAL);
585
586 switch (valid_usr_range(*addrp, len, uprot, as,
587 map_userlimit(as->a_proc, as, flags))) {
588 case RANGE_OKAY:
589 break;
590 case RANGE_BADPROT:
591 return (ENOTSUP);
592 case RANGE_BADADDR:
593 default:
594 return (ENOMEM);
595 }
596 }
597 /*
598 * No need to worry about vac alignment for anonymous
599 * pages since this is a "clone" object that doesn't
600 * yet exist.
601 */
602 error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
603 if (error != 0) {
604 return (error);
605 }
606
607 /*
608 * Use the seg_vn segment driver; passing in the NULL amp
609 * gives the desired "cloning" effect.
610 */
611 vn_a.vp = NULL;
612 vn_a.offset = 0;
613 vn_a.type = flags & MAP_TYPE;
614 vn_a.prot = uprot;
615 vn_a.maxprot = PROT_ALL;
616 vn_a.flags = flags & ~MAP_TYPE;
617 vn_a.cred = CRED();
618 vn_a.amp = NULL;
619 vn_a.szc = 0;
620 vn_a.lgrp_mem_policy_flags = 0;
621
622 return (as_map(as, *addrp, len, segvn_create, &vn_a));
623 }
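/*
 * Illustrative sketch (not part of this file): the typical userland
 * request that ends up in zmap() is an anonymous private mapping, e.g.
 *
 *	void *p = mmap(NULL, 8192, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 * smmap_common() sees fp == NULL (fd -1 with MAP_ANON) and calls zmap()
 * directly rather than going through VOP_MAP().
 */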
624
625 static int
626 smmap_common(caddr_t *addrp, size_t len,
627 int prot, int flags, struct file *fp, offset_t pos)
628 {
629 struct vnode *vp;
630 struct as *as = curproc->p_as;
631 uint_t uprot, maxprot, type;
632 int error;
633 int in_crit = 0;
634
635 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
636 _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
637 MAP_TEXT | MAP_INITDATA)) != 0) {
638 /* | MAP_RENAME */ /* not implemented, let user know */
639 return (EINVAL);
640 }
641
642 if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
643 return (EINVAL);
644 }
645
646 if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
647 return (EINVAL);
648 }
649
650 #if defined(__sparc)
651 /*
652 * See if this is an "old mmap call". If so, remember this
653 * fact and convert the flags value given to mmap to indicate
654 * the specified address in the system call must be used.
655 * _MAP_NEW is set by all new uses of mmap.
656 */
657 if ((flags & _MAP_NEW) == 0)
658 flags |= MAP_FIXED;
659 #endif
660 flags &= ~_MAP_NEW;
661
662 type = flags & MAP_TYPE;
663 if (type != MAP_PRIVATE && type != MAP_SHARED)
664 return (EINVAL);
665
666
667 if (flags & MAP_ALIGN) {
668
669 if (flags & MAP_FIXED)
670 return (EINVAL);
671
672 /* alignment needs to be a power of 2 >= page size */
673 if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
674 !ISP2((uintptr_t)*addrp))
675 return (EINVAL);
676 }
677 /*
678 * Check for bad lengths and file position.
679 * We let the VOP_MAP routine check for negative lengths
680 * since on some vnode types this might be appropriate.
681 */
682 if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
683 return (EINVAL);
684
685 maxprot = PROT_ALL; /* start out allowing all accesses */
686 uprot = prot | PROT_USER;
687
688 if (fp == NULL) {
689 ASSERT(flags & MAP_ANON);
690 /* discard lwpchan mappings, like munmap() */
691 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
692 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
693 as_rangelock(as);
694 error = zmap(as, addrp, len, uprot, flags, pos);
695 as_rangeunlock(as);
696 /*
697 * Tell machine specific code that lwp has mapped shared memory
698 */
699 if (error == 0 && (flags & MAP_SHARED)) {
700 /* EMPTY */
701 LWP_MMODEL_SHARED_AS(*addrp, len);
702 }
703 return (error);
704 } else if ((flags & MAP_ANON) != 0)
705 return (EINVAL);
706
707 vp = fp->f_vnode;
708
709 /* Can't execute code from "noexec" mounted filesystem. */
710 if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
711 maxprot &= ~PROT_EXEC;
712
713 /*
714 * These checks were added as part of large file support.
715 *
716 * Return ENXIO if the initial position is negative; return EOVERFLOW
717 * if (offset + len) would overflow the maximum allowed offset for the
718 * type of file descriptor being used.
719 */
720 if (vp->v_type == VREG) {
721 if (pos < 0)
722 return (ENXIO);
723 if ((offset_t)len > (OFFSET_MAX(fp) - pos))
724 return (EOVERFLOW);
725 }
726
727 if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
728 /* no write access allowed */
729 maxprot &= ~PROT_WRITE;
730 }
731
732 /*
733 * XXX - Do we also adjust maxprot based on protections
734 * of the vnode? E.g. if no execute permission is given
735 * on the vnode for the current user, maxprot probably
736 * should disallow PROT_EXEC also? This is different
737 * from the write access as this would be a per vnode
738 * test as opposed to a per fd test for writability.
739 */
740
741 /*
742 * Verify that the specified protections are not greater than
743 * the maximum allowable protections. Also test to make sure
744 * that the file descriptor allows read access, since "write
745 * only" mappings are hard to do: normally we do the read from
746 * the file before the page can be written.
747 */
748 if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
749 return (EACCES);
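/*
 * For example, asking for PROT_WRITE on a MAP_SHARED mapping of a
 * descriptor opened O_RDONLY fails the check above with EACCES: FWRITE
 * was not set, so maxprot lost PROT_WRITE earlier. A descriptor opened
 * O_WRONLY fails as well, because FREAD is required.
 */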
750
751 /*
752 * If the user specified an address, do some simple checks here
753 */
754 if ((flags & MAP_FIXED) != 0) {
755 /*
756 * Use the user address. First verify that
757 * the address to be used is page aligned.
758 * Then make some simple bounds checks.
759 */
760 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
761 return (EINVAL);
762 switch (valid_usr_range(*addrp, len, uprot, as,
763 map_userlimit(curproc, as, flags))) {
764 case RANGE_OKAY:
765 break;
766 case RANGE_BADPROT:
767 return (ENOTSUP);
768 case RANGE_BADADDR:
769 default:
770 return (ENOMEM);
771 }
772 }
773
774 if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
775 nbl_need_check(vp)) {
776 int svmand;
777 nbl_op_t nop;
778
779 nbl_start_crit(vp, RW_READER);
780 in_crit = 1;
781 error = nbl_svmand(vp, fp->f_cred, &svmand);
782 if (error != 0)
783 goto done;
784 if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
785 if (prot & (PROT_READ | PROT_EXEC)) {
786 nop = NBL_READWRITE;
787 } else {
788 nop = NBL_WRITE;
789 }
790 } else {
791 nop = NBL_READ;
792 }
793 if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
794 error = EACCES;
795 goto done;
796 }
797 }
798
799 /* discard lwpchan mappings, like munmap() */
800 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
801 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
802
803 /*
804 * Ok, now let the vnode map routine do its thing to set things up.
805 */
806 error = VOP_MAP(vp, pos, as,
807 addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
808
809 if (error == 0) {
810 /*
811 * Tell machine specific code that lwp has mapped shared memory
812 */
813 if (flags & MAP_SHARED) {
814 /* EMPTY */
815 LWP_MMODEL_SHARED_AS(*addrp, len);
816 }
817 if (vp->v_type == VREG &&
818 (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
819 /*
820 * Mark this as an executable vnode
821 */
822 mutex_enter(&vp->v_lock);
823 vp->v_flag |= VVMEXEC;
824 mutex_exit(&vp->v_lock);
825 }
826 }
827
828 done:
829 if (in_crit)
830 nbl_end_crit(vp);
831 return (error);
832 }
833
834 #ifdef _LP64
835 /*
836 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
837 *
838 * The "large file" mmap routine mmap64(2) is also mapped to this routine
839 * by the 64-bit version of libc.
840 *
841 * Eventually, this should be the only version, and have smmap_common()
842 * folded back into it again. Some day.
843 */
844 caddr_t
845 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
846 {
847 struct file *fp;
848 int error;
849
850 if (fd == -1 && (flags & MAP_ANON) != 0)
851 error = smmap_common(&addr, len, prot, flags,
852 NULL, (offset_t)pos);
853 else if ((fp = getf(fd)) != NULL) {
854 error = smmap_common(&addr, len, prot, flags,
855 fp, (offset_t)pos);
856 releasef(fd);
857 } else
858 error = EBADF;
859
860 return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
861 }
862 #endif /* _LP64 */
863
864 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
865
866 /*
867 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
868 */
869 caddr_t
870 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
871 {
872 struct file *fp;
873 int error;
874 caddr_t a = (caddr_t)(uintptr_t)addr;
875
876 if (flags & _MAP_LOW32)
877 error = EINVAL;
878 else if (fd == -1 && (flags & MAP_ANON) != 0)
879 error = smmap_common(&a, (size_t)len, prot,
880 flags | _MAP_LOW32, NULL, (offset_t)pos);
881 else if ((fp = getf(fd)) != NULL) {
882 error = smmap_common(&a, (size_t)len, prot,
883 flags | _MAP_LOW32, fp, (offset_t)pos);
884 releasef(fd);
885 } else
886 error = EBADF;
887
888 ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
889
890 return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
891 }
892
893 /*
894 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
895 *
896 * Now things really get ugly because we can't use the C-style
897 * calling convention for more than 6 args, and 64-bit parameter
898 * passing on 32-bit systems is less than clean.
899 */
900
901 struct mmaplf32a {
902 caddr_t addr;
903 size_t len;
904 #ifdef _LP64
905 /*
906 * 32-bit contents, 64-bit cells
907 */
908 uint64_t prot;
909 uint64_t flags;
910 uint64_t fd;
911 uint64_t offhi;
912 uint64_t offlo;
913 #else
914 /*
915 * 32-bit contents, 32-bit cells
916 */
917 uint32_t prot;
918 uint32_t flags;
919 uint32_t fd;
920 uint32_t offhi;
921 uint32_t offlo;
922 #endif
923 };
924
925 int
926 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
927 {
928 struct file *fp;
929 int error;
930 caddr_t a = uap->addr;
931 int flags = (int)uap->flags;
932 int fd = (int)uap->fd;
933 #ifdef _BIG_ENDIAN
934 offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
935 #else
936 offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
937 #endif
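/*
 * The 64-bit file offset arrives as two 32-bit argument cells. The field
 * names reflect big-endian (SPARC) argument order; on little-endian
 * systems the halves arrive swapped, so offlo actually carries the high
 * word there. E.g. on a big-endian kernel, offhi == 0x1 and
 * offlo == 0x80000000 reassemble to off == 0x180000000.
 */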
938
939 if (flags & _MAP_LOW32)
940 error = EINVAL;
941 else if (fd == -1 && (flags & MAP_ANON) != 0)
942 error = smmap_common(&a, uap->len, (int)uap->prot,
943 flags | _MAP_LOW32, NULL, off);
944 else if ((fp = getf(fd)) != NULL) {
945 error = smmap_common(&a, uap->len, (int)uap->prot,
946 flags | _MAP_LOW32, fp, off);
947 releasef(fd);
948 } else
949 error = EBADF;
950
951 if (error == 0)
952 rvp->r_val1 = (uintptr_t)a;
953 return (error);
954 }
955
956 #endif /* _SYSCALL32_IMPL || _ILP32 */
957
958 int
959 munmap(caddr_t addr, size_t len)
960 {
961 struct proc *p = curproc;
962 struct as *as = p->p_as;
963
964 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
965 return (set_errno(EINVAL));
966
967 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
968 return (set_errno(EINVAL));
969
970 /*
971 * Discard lwpchan mappings.
972 */
973 if (p->p_lcp != NULL)
974 lwpchan_delete_mapping(p, addr, addr + len);
975 if (as_unmap(as, addr, len) != 0)
976 return (set_errno(EINVAL));
977
978 return (0);
979 }
980
981 int
982 mprotect(caddr_t addr, size_t len, int prot)
983 {
984 struct as *as = curproc->p_as;
985 uint_t uprot = prot | PROT_USER;
986 int error;
987
988 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
989 return (set_errno(EINVAL));
990
991 switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
992 case RANGE_OKAY:
993 break;
994 case RANGE_BADPROT:
995 return (set_errno(ENOTSUP));
996 case RANGE_BADADDR:
997 default:
998 return (set_errno(ENOMEM));
999 }
1000
1001 error = as_setprot(as, addr, len, uprot);
1002 if (error)
1003 return (set_errno(error));
1004 return (0);
1005 }
1006
1007 #define MC_CACHE 128 /* internal result buffer */
1008 #define MC_QUANTUM (MC_CACHE * PAGESIZE) /* addresses covered in loop */
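/*
 * Example (assuming a 4K PAGESIZE): MC_QUANTUM is 512K, so a 2M
 * mincore() request is handled as four passes of the loop in mincore()
 * below, each filling the 128-byte on-stack vec[] (one byte per page)
 * before copying it out to the caller.
 */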
1009
1010 int
1011 mincore(caddr_t addr, size_t len, char *vecp)
1012 {
1013 struct as *as = curproc->p_as;
1014 caddr_t ea; /* end address of loop */
1015 size_t rl; /* inner result length */
1016 char vec[MC_CACHE]; /* local vector cache */
1017 int error;
1018 model_t model;
1019 long llen;
1020
1021 model = get_udatamodel();
1022 /*
1023 * Validate form of address parameters.
1024 */
1025 if (model == DATAMODEL_NATIVE) {
1026 llen = (long)len;
1027 } else {
1028 llen = (int32_t)(size32_t)len;
1029 }
1030 if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1031 return (set_errno(EINVAL));
1032
1033 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1034 return (set_errno(ENOMEM));
1035
1036 /*
1037 * Loop over subranges of interval [addr : addr + len), recovering
1038 * results internally and then copying them out to caller. Subrange
1039 * is based on the size of MC_CACHE, defined above.
1040 */
1041 for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1042 error = as_incore(as, addr,
1043 (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1044 if (rl != 0) {
1045 rl = (rl + PAGESIZE - 1) / PAGESIZE;
1046 if (copyout(vec, vecp, rl) != 0)
1047 return (set_errno(EFAULT));
1048 vecp += rl;
1049 }
1050 if (error != 0)
1051 return (set_errno(ENOMEM));
1052 }
1053 return (0);
1054 }