--- old/usr/src/uts/common/os/grow.c
+++ new/usr/src/uts/common/os/grow.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved.
24 24 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
25 25 */
26 26
27 27 /*
28 28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 29 * Use is subject to license terms.
30 30 */
31 31
32 32 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
33 33 /* All Rights Reserved */
34 34
35 35 #include <sys/types.h>
36 36 #include <sys/inttypes.h>
37 37 #include <sys/param.h>
38 38 #include <sys/sysmacros.h>
39 39 #include <sys/systm.h>
40 40 #include <sys/signal.h>
41 41 #include <sys/user.h>
42 42 #include <sys/errno.h>
43 43 #include <sys/var.h>
44 44 #include <sys/proc.h>
45 45 #include <sys/tuneable.h>
46 46 #include <sys/debug.h>
47 47 #include <sys/cmn_err.h>
48 48 #include <sys/cred.h>
49 49 #include <sys/vnode.h>
50 50 #include <sys/vfs.h>
51 51 #include <sys/vm.h>
52 52 #include <sys/file.h>
53 53 #include <sys/mman.h>
54 54 #include <sys/vmparam.h>
55 55 #include <sys/fcntl.h>
56 56 #include <sys/lwpchan_impl.h>
57 57 #include <sys/nbmlock.h>
58 58 #include <sys/brand.h>
59 59
60 60 #include <vm/hat.h>
61 61 #include <vm/as.h>
62 62 #include <vm/seg.h>
63 63 #include <vm/seg_dev.h>
64 64 #include <vm/seg_vn.h>
65 65
66 66 int use_brk_lpg = 1;
67 67 int use_stk_lpg = 1;
68 68
69 69 static int brk_lpg(caddr_t nva);
70 70 static int grow_lpg(caddr_t sp);
71 71
72 72 int
73 73 brk(caddr_t nva)
74 74 {
75 75 int error;
76 76 proc_t *p = curproc;
77 77
78 78 /*
79 79 * Serialize brk operations on an address space.
80 80 * This also serves as the lock protecting p_brksize
81 81 * and p_brkpageszc.
82 82 */
83 83 as_rangelock(p->p_as);
84 84 if (use_brk_lpg && (p->p_flag & SAUTOLPG) != 0) {
85 85 error = brk_lpg(nva);
86 86 } else {
87 87 error = brk_internal(nva, p->p_brkpageszc);
88 88 }
89 89 as_rangeunlock(p->p_as);
90 90 return ((error != 0 ? set_errno(error) : 0));
91 91 }
92 92
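Editor's note (not part of the webrev): a minimal user-space sketch of how a failure in this path surfaces. brk(2) returns -1 with errno set via set_errno(); most programs reach it through sbrk(3C), which reports the same error. Assumes a platform that still provides sbrk().

    #include <unistd.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* Ask the kernel to extend the heap by 64 KB. */
        void *old = sbrk(64 * 1024);

        if (old == (void *)-1) {
            /* e.g. ENOMEM when the RLIMIT_DATA check above fails */
            perror("sbrk");
            return (1);
        }
        (void) printf("old break was %p\n", old);
        return (0);
    }
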
93 93 /*
94 94 * Algorithm: call arch-specific map_pgsz to get best page size to use,
95 95 * then call brk_internal().
96 96 * Returns 0 on success.
97 97 */
98 98 static int
99 99 brk_lpg(caddr_t nva)
100 100 {
101 101 struct proc *p = curproc;
102 102 size_t pgsz, len;
103 103 caddr_t addr, brkend;
104 104 caddr_t bssbase = p->p_bssbase;
105 105 caddr_t brkbase = p->p_brkbase;
106 106 int oszc, szc;
107 107 int err;
108 108
109 109 oszc = p->p_brkpageszc;
110 110
111 111 /*
112 112 * If p_brkbase has not yet been set, the first call
113 113 * to brk_internal() will initialize it.
114 114 */
115 115 if (brkbase == 0) {
116 116 return (brk_internal(nva, oszc));
117 117 }
118 118
119 119 len = nva - bssbase;
120 120
121 121 pgsz = map_pgsz(MAPPGSZ_HEAP, p, bssbase, len, 0);
122 122 szc = page_szc(pgsz);
123 123
124 124 /*
125 125 * Covers two cases:
126 126 * 1. page_szc() returns -1 for invalid page size, so we want to
127 127 * ignore it in that case.
128 128 * 2. By design we never decrease page size, as it is more stable.
129 129 */
130 130 if (szc <= oszc) {
131 131 err = brk_internal(nva, oszc);
132 132 /* If failed, back off to base page size. */
133 133 if (err != 0 && oszc != 0) {
134 134 err = brk_internal(nva, 0);
135 135 }
136 136 return (err);
137 137 }
138 138
139 139 err = brk_internal(nva, szc);
140 140 /* If using szc failed, map with base page size and return. */
141 141 if (err != 0) {
142 142 if (szc != 0) {
143 143 err = brk_internal(nva, 0);
144 144 }
145 145 return (err);
146 146 }
147 147
148 148 /*
149 149 * Round up brk base to a large page boundary and remap
150 150 * anything in the segment already faulted in beyond that
151 151 * point.
152 152 */
153 153 addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase, pgsz);
154 154 brkend = brkbase + p->p_brksize;
155 155 len = brkend - addr;
156 156 /* Check that len is not negative. Update page size code for heap. */
157 157 if (addr >= p->p_bssbase && brkend > addr && IS_P2ALIGNED(len, pgsz)) {
158 158 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
159 159 p->p_brkpageszc = szc;
160 160 }
161 161
162 162 ASSERT(err == 0);
163 163 return (err); /* should always be 0 */
164 164 }
165 165
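Editor's note (not part of the webrev): the remap step above is pure boundary arithmetic. A standalone sketch with made-up heap addresses; the two macros are re-declared locally (slightly simplified) for illustration, the kernel's versions live in <sys/sysmacros.h>.

    #include <stdio.h>
    #include <stdint.h>

    #define P2ROUNDUP(x, align)  (-(-(uintptr_t)(x) & -(uintptr_t)(align)))
    #define IS_P2ALIGNED(v, a)   ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)

    int
    main(void)
    {
        uintptr_t bssbase = 0x8062000;  /* hypothetical p_bssbase */
        uintptr_t brkbase = 0x8062000;  /* hypothetical p_brkbase */
        uintptr_t brksize = 0x79e000;   /* hypothetical p_brksize */
        uintptr_t pgsz = 0x400000;      /* 4 MB large page */

        uintptr_t addr = P2ROUNDUP(bssbase, pgsz);   /* 0x8400000 */
        uintptr_t brkend = brkbase + brksize;        /* 0x8800000 */
        uintptr_t len = brkend - addr;               /* 0x400000 */

        /* Only a pgsz-aligned tail past the rounded-up base is promoted. */
        (void) printf("remap [%#lx, %#lx), len %#lx, aligned %d\n",
            (unsigned long)addr, (unsigned long)brkend,
            (unsigned long)len, IS_P2ALIGNED(len, pgsz));
        return (0);
    }
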
166 166 /*
167 167 * Returns 0 on success.
168 168 */
169 169 int
170 170 brk_internal(caddr_t nva, uint_t brkszc)
171 171 {
172 172 caddr_t ova; /* current break address */
173 173 size_t size;
174 174 int error;
175 175 struct proc *p = curproc;
176 176 struct as *as = p->p_as;
177 177 size_t pgsz;
178 178 uint_t szc;
179 179 rctl_qty_t as_rctl;
180 180
181 181 /*
182 182 * extend heap to brkszc alignment but use current p->p_brkpageszc
183 183 * for the newly created segment. This allows the new extension
184 184 * segment to be concatenated successfully with the existing brk
185 185 * segment.
186 186 */
187 187 if ((szc = brkszc) != 0) {
188 188 pgsz = page_get_pagesize(szc);
189 189 ASSERT(pgsz > PAGESIZE);
190 190 } else {
191 191 pgsz = PAGESIZE;
192 192 }
193 193
194 194 mutex_enter(&p->p_lock);
195 195 as_rctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
196 196 p->p_rctls, p);
197 197 mutex_exit(&p->p_lock);
198 198
199 199 /*
200 200 * If p_brkbase has not yet been set, the first call
201 201 * to brk() will initialize it.
202 202 */
203 203 if (p->p_brkbase == 0)
204 204 p->p_brkbase = nva;
205 205
206 206 /*
207 207 * Before multiple page size support existed p_brksize was the value
208 208 * not rounded to the pagesize (i.e. it stored the exact user request
209 209 * for heap size). If pgsz is greater than PAGESIZE calculate the
210 210 * heap size as the real new heap size by rounding it up to pgsz.
211 211 * This is useful since we may want to know where the heap ends
212 212 * without knowing heap pagesize (e.g. some old code) and also if
213 213 * heap pagesize changes we can update p_brkpageszc but delay adding
214 214 * new mapping yet still know from p_brksize where the heap really
 215  215 	 * ends. The user-requested heap end is stored in a libc variable.
216 216 */
217 217 if (pgsz > PAGESIZE) {
218 218 caddr_t tnva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
219 219 size = tnva - p->p_brkbase;
220 220 if (tnva < p->p_brkbase || (size > p->p_brksize &&
221 221 size > (size_t)as_rctl)) {
222 222 szc = 0;
223 223 pgsz = PAGESIZE;
224 224 size = nva - p->p_brkbase;
225 225 }
226 226 } else {
227 227 size = nva - p->p_brkbase;
228 228 }
229 229
230 230 /*
231 231 * use PAGESIZE to roundup ova because we want to know the real value
232 232 * of the current heap end in case p_brkpageszc changes since the last
233 233 * p_brksize was computed.
234 234 */
235 235 nva = (caddr_t)P2ROUNDUP((uintptr_t)nva, pgsz);
236 236 ova = (caddr_t)P2ROUNDUP((uintptr_t)(p->p_brkbase + p->p_brksize),
237 237 PAGESIZE);
238 238
239 239 if ((nva < p->p_brkbase) || (size > p->p_brksize &&
240 240 size > as_rctl)) {
241 241 mutex_enter(&p->p_lock);
242 242 (void) rctl_action(rctlproc_legacy[RLIMIT_DATA], p->p_rctls, p,
243 243 RCA_SAFE);
244 244 mutex_exit(&p->p_lock);
245 245 return (ENOMEM);
246 246 }
247 247
248 248 if (nva > ova) {
249 249 struct segvn_crargs crargs =
250 250 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
251 251
252 252 if (!(p->p_datprot & PROT_EXEC)) {
253 253 crargs.prot &= ~PROT_EXEC;
254 254 }
255 255
256 256 /*
257 257 * Add new zfod mapping to extend UNIX data segment
258 258 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies
259 259 * via map_pgszcvec(). Use AS_MAP_HEAP to get intermediate
260 260 * page sizes if ova is not aligned to szc's pgsz.
261 261 */
262 262 if (szc > 0) {
263 263 caddr_t rbss;
264 264
265 265 rbss = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
266 266 pgsz);
267 267 if (IS_P2ALIGNED(p->p_bssbase, pgsz) || ova > rbss) {
268 268 crargs.szc = p->p_brkpageszc ? p->p_brkpageszc :
269 269 AS_MAP_NO_LPOOB;
270 270 } else if (ova == rbss) {
271 271 crargs.szc = szc;
272 272 } else {
273 273 crargs.szc = AS_MAP_HEAP;
274 274 }
275 275 } else {
276 276 crargs.szc = AS_MAP_NO_LPOOB;
277 277 }
278 278 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_UP;
279 279 error = as_map(as, ova, (size_t)(nva - ova), segvn_create,
280 280 &crargs);
281 281 if (error) {
282 282 return (error);
283 283 }
284 284
285 285 } else if (nva < ova) {
286 286 /*
287 287 * Release mapping to shrink UNIX data segment.
288 288 */
289 289 (void) as_unmap(as, nva, (size_t)(ova - nva));
290 290 }
291 291 p->p_brksize = size;
292 292 return (0);
293 293 }
294 294
295 295 /*
296 296 * Grow the stack to include sp. Return 1 if successful, 0 otherwise.
297 297 * This routine assumes that the stack grows downward.
298 298 */
299 299 int
300 300 grow(caddr_t sp)
301 301 {
302 302 struct proc *p = curproc;
303 303 struct as *as = p->p_as;
304 304 size_t oldsize = p->p_stksize;
305 305 size_t newsize;
306 306 int err;
307 307
308 308 /*
309 309 * Serialize grow operations on an address space.
310 310 * This also serves as the lock protecting p_stksize
311 311 * and p_stkpageszc.
312 312 */
313 313 as_rangelock(as);
314 314 if (use_stk_lpg && (p->p_flag & SAUTOLPG) != 0) {
315 315 err = grow_lpg(sp);
316 316 } else {
317 317 err = grow_internal(sp, p->p_stkpageszc);
318 318 }
319 319 as_rangeunlock(as);
320 320
321 321 if (err == 0 && (newsize = p->p_stksize) > oldsize) {
322 322 ASSERT(IS_P2ALIGNED(oldsize, PAGESIZE));
323 323 ASSERT(IS_P2ALIGNED(newsize, PAGESIZE));
324 324 /*
325 325 * Set up translations so the process doesn't have to fault in
326 326 * the stack pages we just gave it.
327 327 */
328 328 (void) as_fault(as->a_hat, as, p->p_usrstack - newsize,
329 329 newsize - oldsize, F_INVAL, S_WRITE);
330 330 }
331 331 return ((err == 0 ? 1 : 0));
332 332 }
333 333
334 334 /*
335 335 * Algorithm: call arch-specific map_pgsz to get best page size to use,
336 336 * then call grow_internal().
337 337 * Returns 0 on success.
338 338 */
339 339 static int
340 340 grow_lpg(caddr_t sp)
341 341 {
342 342 struct proc *p = curproc;
343 343 size_t pgsz;
344 344 size_t len, newsize;
345 345 caddr_t addr, saddr;
346 346 caddr_t growend;
347 347 int oszc, szc;
348 348 int err;
349 349
350 350 newsize = p->p_usrstack - sp;
351 351
352 352 oszc = p->p_stkpageszc;
353 353 pgsz = map_pgsz(MAPPGSZ_STK, p, sp, newsize, 0);
354 354 szc = page_szc(pgsz);
355 355
356 356 /*
357 357 * Covers two cases:
358 358 * 1. page_szc() returns -1 for invalid page size, so we want to
359 359 * ignore it in that case.
360 360 * 2. By design we never decrease page size, as it is more stable.
361 361 * This shouldn't happen as the stack never shrinks.
362 362 */
363 363 if (szc <= oszc) {
364 364 err = grow_internal(sp, oszc);
365 365 /* failed, fall back to base page size */
366 366 if (err != 0 && oszc != 0) {
367 367 err = grow_internal(sp, 0);
368 368 }
369 369 return (err);
370 370 }
371 371
372 372 /*
373 373 * We've grown sufficiently to switch to a new page size.
374 374 * So we are going to remap the whole segment with the new page size.
375 375 */
376 376 err = grow_internal(sp, szc);
377 377 /* The grow with szc failed, so fall back to base page size. */
378 378 if (err != 0) {
379 379 if (szc != 0) {
380 380 err = grow_internal(sp, 0);
381 381 }
382 382 return (err);
383 383 }
384 384
385 385 /*
386 386 * Round up stack pointer to a large page boundary and remap
387 387 * any pgsz pages in the segment already faulted in beyond that
388 388 * point.
389 389 */
390 390 saddr = p->p_usrstack - p->p_stksize;
391 391 addr = (caddr_t)P2ROUNDUP((uintptr_t)saddr, pgsz);
392 392 growend = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack, pgsz);
393 393 len = growend - addr;
394 394 /* Check that len is not negative. Update page size code for stack. */
395 395 if (addr >= saddr && growend > addr && IS_P2ALIGNED(len, pgsz)) {
396 396 (void) as_setpagesize(p->p_as, addr, len, szc, B_FALSE);
397 397 p->p_stkpageszc = szc;
398 398 }
399 399
400 400 ASSERT(err == 0);
401 401 return (err); /* should always be 0 */
402 402 }
403 403
404 404 /*
405 405 * This routine assumes that the stack grows downward.
406 406 * Returns 0 on success, errno on failure.
407 407 */
408 408 int
409 409 grow_internal(caddr_t sp, uint_t growszc)
410 410 {
411 411 struct proc *p = curproc;
412 412 size_t newsize;
413 413 size_t oldsize;
414 414 int error;
415 415 size_t pgsz;
416 416 uint_t szc;
417 417 struct segvn_crargs crargs = SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
418 418
419 419 ASSERT(sp < p->p_usrstack);
420 420 sp = (caddr_t)P2ALIGN((uintptr_t)sp, PAGESIZE);
421 421
422 422 /*
423 423 * grow to growszc alignment but use current p->p_stkpageszc for
424 424 * the segvn_crargs szc passed to segvn_create. For memcntl to
425 425 * increase the szc, this allows the new extension segment to be
426 426 * concatenated successfully with the existing stack segment.
427 427 */
428 428 if ((szc = growszc) != 0) {
429 429 pgsz = page_get_pagesize(szc);
430 430 ASSERT(pgsz > PAGESIZE);
431 431 newsize = p->p_usrstack - (caddr_t)P2ALIGN((uintptr_t)sp, pgsz);
432 432 if (newsize > (size_t)p->p_stk_ctl) {
433 433 szc = 0;
434 434 pgsz = PAGESIZE;
435 435 newsize = p->p_usrstack - sp;
436 436 }
437 437 } else {
438 438 pgsz = PAGESIZE;
439 439 newsize = p->p_usrstack - sp;
440 440 }
441 441
442 442 if (newsize > (size_t)p->p_stk_ctl) {
443 443 (void) rctl_action(rctlproc_legacy[RLIMIT_STACK], p->p_rctls, p,
444 444 RCA_UNSAFE_ALL);
445 445
446 446 return (ENOMEM);
447 447 }
448 448
449 449 oldsize = p->p_stksize;
450 450 ASSERT(P2PHASE(oldsize, PAGESIZE) == 0);
451 451
452 452 if (newsize <= oldsize) { /* prevent the stack from shrinking */
453 453 return (0);
454 454 }
455 455
456 456 if (!(p->p_stkprot & PROT_EXEC)) {
457 457 crargs.prot &= ~PROT_EXEC;
458 458 }
459 459 /*
460 460 * extend stack with the proposed new growszc, which is different
461 461 * than p_stkpageszc only on a memcntl to increase the stack pagesize.
462 462 * AS_MAP_NO_LPOOB means use 0, and don't reapply OOB policies via
463 463 * map_pgszcvec(). Use AS_MAP_STACK to get intermediate page sizes
464 464 * if not aligned to szc's pgsz.
465 465 */
466 466 if (szc > 0) {
467 467 caddr_t oldsp = p->p_usrstack - oldsize;
468 468 caddr_t austk = (caddr_t)P2ALIGN((uintptr_t)p->p_usrstack,
469 469 pgsz);
470 470
471 471 if (IS_P2ALIGNED(p->p_usrstack, pgsz) || oldsp < austk) {
472 472 crargs.szc = p->p_stkpageszc ? p->p_stkpageszc :
473 473 AS_MAP_NO_LPOOB;
474 474 } else if (oldsp == austk) {
475 475 crargs.szc = szc;
476 476 } else {
477 477 crargs.szc = AS_MAP_STACK;
478 478 }
479 479 } else {
480 480 crargs.szc = AS_MAP_NO_LPOOB;
481 481 }
482 482 crargs.lgrp_mem_policy_flags = LGRP_MP_FLAG_EXTEND_DOWN;
483 483
484 484 if ((error = as_map(p->p_as, p->p_usrstack - newsize, newsize - oldsize,
485 485 segvn_create, &crargs)) != 0) {
486 486 if (error == EAGAIN) {
487 487 cmn_err(CE_WARN, "Sorry, no swap space to grow stack "
488 488 "for pid %d (%s)", p->p_pid, PTOU(p)->u_comm);
489 489 }
490 490 return (error);
491 491 }
492 492 p->p_stksize = newsize;
493 493 return (0);
494 494 }
495 495
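Editor's note (not part of the webrev): p_stk_ctl above reflects the process stack-size control, i.e. the legacy RLIMIT_STACK. A minimal sketch of inspecting that limit from user space; a stack reference beyond it cannot be satisfied by grow() and typically ends in SIGSEGV.

    #include <sys/resource.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct rlimit rl;

        if (getrlimit(RLIMIT_STACK, &rl) != 0) {
            perror("getrlimit");
            return (1);
        }
        (void) printf("stack soft limit: %llu bytes\n",
            (unsigned long long)rl.rlim_cur);
        return (0);
    }
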
496 496 /*
497 497 * Find address for user to map.
498 498 * If MAP_FIXED is not specified, we can pick any address we want, but we will
499 499 * first try the value in *addrp if it is non-NULL. Thus this is implementing
500 500 * a way to try and get a preferred address.
501 501 */
502 502 int
503 503 choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off,
504 504 int vacalign, uint_t flags)
505 505 {
506 +#if defined(__amd64)
507 + proc_t *p = curproc;
508 +#endif
506 509 caddr_t basep = (caddr_t)(uintptr_t)((uintptr_t)*addrp & PAGEMASK);
507 - size_t lenp = len;
510 + size_t lenp;
508 511
509 512 ASSERT(AS_ISCLAIMGAP(as)); /* searches should be serialized */
513 +
514 + /*
515 + * If we have been provided a hint, we should still expand the lenp
516 + * to be the rest of the address space. This will allow us to
517 + * treat the hint as a strong desire to be "nearby" the provided
518 + * address. If we can't satisfy the hint, as_gap() will walk forward.
519 + */
520 + if (flags & _MAP_LOW32)
521 + lenp = (caddr_t)USERLIMIT32 - basep;
522 +#if defined(__amd64)
523 + else if (p->p_model == DATAMODEL_NATIVE)
524 + lenp = p->p_usrstack - basep -
525 + ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
526 +#endif
527 + else
528 + lenp = as->a_userlimit - basep;
529 +
510 530 if (flags & MAP_FIXED) {
511 531 (void) as_unmap(as, *addrp, len);
512 532 return (0);
513 533 } else if (basep != NULL && ((flags & MAP_ALIGN) == 0) &&
514 534 !as_gap(as, len, &basep, &lenp, 0, *addrp)) {
515 535 /* User supplied address was available */
516 536 *addrp = basep;
517 537 } else {
518 538 /*
519 539 * No user supplied address or the address supplied was not
520 540 * available.
521 541 */
522 542 map_addr(addrp, len, off, vacalign, flags);
523 543 }
524 544 if (*addrp == NULL)
525 545 return (ENOMEM);
526 546 return (0);
527 547 }
528 548
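Editor's note (not part of the webrev): the user-visible effect of the choose_addr() change is that a non-MAP_FIXED address hint now seeds a forward search of the address space instead of being honored only when the exact range is free. A minimal sketch; a 64-bit process is assumed and the hint value is made up.

    #include <sys/mman.h>
    #include <stdio.h>

    int
    main(void)
    {
        void *hint = (void *)0x200000000UL;  /* hypothetical preferred address */
        void *p = mmap(hint, 8192, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANON, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return (1);
        }
        /* p is the hint itself if free, otherwise a nearby gap found by as_gap(). */
        (void) printf("asked for %p, got %p\n", hint, p);
        (void) munmap(p, 8192);
        return (0);
    }
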
529 549 caddr_t
530 550 map_userlimit(proc_t *pp, struct as *as, int flags)
531 551 {
532 552 if (flags & _MAP_LOW32) {
533 553 if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) {
534 554 return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp));
535 555 } else {
536 556 return ((caddr_t)_userlimit32);
537 557 }
538 558 }
539 559
540 560 return (as->a_userlimit);
541 561 }
542 562
543 563
544 564 /*
545 565 * Used for MAP_ANON - fast way to get anonymous pages
546 566 */
547 567 static int
548 568 zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags,
549 569 offset_t pos)
550 570 {
551 571 struct segvn_crargs vn_a;
552 572 int error;
553 573
554 574 if (((PROT_ALL & uprot) != uprot))
555 575 return (EACCES);
556 576
557 577 if ((flags & MAP_FIXED) != 0) {
558 578 /*
559 579 * Use the user address. First verify that
560 580 * the address to be used is page aligned.
561 581 * Then make some simple bounds checks.
562 582 */
563 583 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
564 584 return (EINVAL);
565 585
566 586 switch (valid_usr_range(*addrp, len, uprot, as,
567 587 map_userlimit(as->a_proc, as, flags))) {
568 588 case RANGE_OKAY:
569 589 break;
570 590 case RANGE_BADPROT:
571 591 return (ENOTSUP);
572 592 case RANGE_BADADDR:
573 593 default:
574 594 return (ENOMEM);
575 595 }
576 596 }
577 597 /*
578 598 * No need to worry about vac alignment for anonymous
579 599 * pages since this is a "clone" object that doesn't
580 600 * yet exist.
581 601 */
582 602 error = choose_addr(as, addrp, len, pos, ADDR_NOVACALIGN, flags);
583 603 if (error != 0) {
584 604 return (error);
585 605 }
586 606
587 607 /*
588 608 * Use the seg_vn segment driver; passing in the NULL amp
589 609 * gives the desired "cloning" effect.
590 610 */
591 611 vn_a.vp = NULL;
592 612 vn_a.offset = 0;
593 613 vn_a.type = flags & MAP_TYPE;
594 614 vn_a.prot = uprot;
595 615 vn_a.maxprot = PROT_ALL;
596 616 vn_a.flags = flags & ~MAP_TYPE;
597 617 vn_a.cred = CRED();
598 618 vn_a.amp = NULL;
599 619 vn_a.szc = 0;
600 620 vn_a.lgrp_mem_policy_flags = 0;
601 621
602 622 return (as_map(as, *addrp, len, segvn_create, &vn_a));
603 623 }
604 624
605 625 static int
606 626 smmap_common(caddr_t *addrp, size_t len,
607 627 int prot, int flags, struct file *fp, offset_t pos)
608 628 {
609 629 struct vnode *vp;
610 630 struct as *as = curproc->p_as;
611 631 uint_t uprot, maxprot, type;
612 632 int error;
613 633 int in_crit = 0;
614 634
615 635 if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | _MAP_NEW |
616 636 _MAP_LOW32 | MAP_NORESERVE | MAP_ANON | MAP_ALIGN |
617 637 MAP_TEXT | MAP_INITDATA)) != 0) {
618 638 /* | MAP_RENAME */ /* not implemented, let user know */
619 639 return (EINVAL);
620 640 }
621 641
622 642 if ((flags & MAP_TEXT) && !(prot & PROT_EXEC)) {
623 643 return (EINVAL);
624 644 }
625 645
626 646 if ((flags & (MAP_TEXT | MAP_INITDATA)) == (MAP_TEXT | MAP_INITDATA)) {
627 647 return (EINVAL);
628 648 }
629 649
630 650 #if defined(__sparc)
631 651 /*
632 652 * See if this is an "old mmap call". If so, remember this
633 653 * fact and convert the flags value given to mmap to indicate
634 654 * the specified address in the system call must be used.
 635  655 	 * _MAP_NEW is set by all new uses of mmap.
636 656 */
637 657 if ((flags & _MAP_NEW) == 0)
638 658 flags |= MAP_FIXED;
639 659 #endif
640 660 flags &= ~_MAP_NEW;
641 661
642 662 type = flags & MAP_TYPE;
643 663 if (type != MAP_PRIVATE && type != MAP_SHARED)
644 664 return (EINVAL);
645 665
646 666
647 667 if (flags & MAP_ALIGN) {
648 668
649 669 if (flags & MAP_FIXED)
650 670 return (EINVAL);
651 671
652 672 /* alignment needs to be a power of 2 >= page size */
653 673 if (((uintptr_t)*addrp < PAGESIZE && (uintptr_t)*addrp != 0) ||
654 674 !ISP2((uintptr_t)*addrp))
655 675 return (EINVAL);
656 676 }
657 677 /*
658 678 * Check for bad lengths and file position.
659 679 * We let the VOP_MAP routine check for negative lengths
660 680 * since on some vnode types this might be appropriate.
661 681 */
662 682 if (len == 0 || (pos & (u_offset_t)PAGEOFFSET) != 0)
663 683 return (EINVAL);
664 684
665 685 maxprot = PROT_ALL; /* start out allowing all accesses */
666 686 uprot = prot | PROT_USER;
667 687
668 688 if (fp == NULL) {
669 689 ASSERT(flags & MAP_ANON);
670 690 /* discard lwpchan mappings, like munmap() */
671 691 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
672 692 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
673 693 as_rangelock(as);
674 694 error = zmap(as, addrp, len, uprot, flags, pos);
675 695 as_rangeunlock(as);
676 696 /*
677 697 * Tell machine specific code that lwp has mapped shared memory
678 698 */
679 699 if (error == 0 && (flags & MAP_SHARED)) {
680 700 /* EMPTY */
681 701 LWP_MMODEL_SHARED_AS(*addrp, len);
682 702 }
683 703 return (error);
684 704 } else if ((flags & MAP_ANON) != 0)
685 705 return (EINVAL);
686 706
687 707 vp = fp->f_vnode;
688 708
689 709 /* Can't execute code from "noexec" mounted filesystem. */
690 710 if ((vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0)
691 711 maxprot &= ~PROT_EXEC;
692 712
693 713 /*
694 714 * These checks were added as part of large files.
695 715 *
696 716 * Return ENXIO if the initial position is negative; return EOVERFLOW
697 717 * if (offset + len) would overflow the maximum allowed offset for the
698 718 * type of file descriptor being used.
699 719 */
700 720 if (vp->v_type == VREG) {
701 721 if (pos < 0)
702 722 return (ENXIO);
703 723 if ((offset_t)len > (OFFSET_MAX(fp) - pos))
704 724 return (EOVERFLOW);
705 725 }
706 726
707 727 if (type == MAP_SHARED && (fp->f_flag & FWRITE) == 0) {
708 728 /* no write access allowed */
709 729 maxprot &= ~PROT_WRITE;
710 730 }
711 731
712 732 /*
713 733 * XXX - Do we also adjust maxprot based on protections
714 734 * of the vnode? E.g. if no execute permission is given
715 735 * on the vnode for the current user, maxprot probably
716 736 * should disallow PROT_EXEC also? This is different
717 737 * from the write access as this would be a per vnode
718 738 * test as opposed to a per fd test for writability.
719 739 */
720 740
721 741 /*
722 742 * Verify that the specified protections are not greater than
723 743 * the maximum allowable protections. Also test to make sure
 724  744 	 * that the file descriptor does allow read access since
725 745 * "write only" mappings are hard to do since normally we do
726 746 * the read from the file before the page can be written.
727 747 */
728 748 if (((maxprot & uprot) != uprot) || (fp->f_flag & FREAD) == 0)
729 749 return (EACCES);
730 750
731 751 /*
732 752 * If the user specified an address, do some simple checks here
733 753 */
734 754 if ((flags & MAP_FIXED) != 0) {
735 755 /*
736 756 * Use the user address. First verify that
737 757 * the address to be used is page aligned.
738 758 * Then make some simple bounds checks.
739 759 */
740 760 if (((uintptr_t)*addrp & PAGEOFFSET) != 0)
741 761 return (EINVAL);
742 762 switch (valid_usr_range(*addrp, len, uprot, as,
743 763 map_userlimit(curproc, as, flags))) {
744 764 case RANGE_OKAY:
745 765 break;
746 766 case RANGE_BADPROT:
747 767 return (ENOTSUP);
748 768 case RANGE_BADADDR:
749 769 default:
750 770 return (ENOMEM);
751 771 }
752 772 }
753 773
754 774 if ((prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) &&
755 775 nbl_need_check(vp)) {
756 776 int svmand;
757 777 nbl_op_t nop;
758 778
759 779 nbl_start_crit(vp, RW_READER);
760 780 in_crit = 1;
761 781 error = nbl_svmand(vp, fp->f_cred, &svmand);
762 782 if (error != 0)
763 783 goto done;
764 784 if ((prot & PROT_WRITE) && (type == MAP_SHARED)) {
765 785 if (prot & (PROT_READ | PROT_EXEC)) {
766 786 nop = NBL_READWRITE;
767 787 } else {
768 788 nop = NBL_WRITE;
769 789 }
770 790 } else {
771 791 nop = NBL_READ;
772 792 }
773 793 if (nbl_conflict(vp, nop, 0, LONG_MAX, svmand, NULL)) {
774 794 error = EACCES;
775 795 goto done;
776 796 }
777 797 }
778 798
779 799 /* discard lwpchan mappings, like munmap() */
780 800 if ((flags & MAP_FIXED) && curproc->p_lcp != NULL)
781 801 lwpchan_delete_mapping(curproc, *addrp, *addrp + len);
782 802
783 803 /*
784 804 * Ok, now let the vnode map routine do its thing to set things up.
785 805 */
786 806 error = VOP_MAP(vp, pos, as,
787 807 addrp, len, uprot, maxprot, flags, fp->f_cred, NULL);
788 808
789 809 if (error == 0) {
790 810 /*
791 811 * Tell machine specific code that lwp has mapped shared memory
792 812 */
793 813 if (flags & MAP_SHARED) {
794 814 /* EMPTY */
795 815 LWP_MMODEL_SHARED_AS(*addrp, len);
796 816 }
797 817 if (vp->v_type == VREG &&
798 818 (flags & (MAP_TEXT | MAP_INITDATA)) != 0) {
799 819 /*
800 820 * Mark this as an executable vnode
801 821 */
802 822 mutex_enter(&vp->v_lock);
803 823 vp->v_flag |= VVMEXEC;
804 824 mutex_exit(&vp->v_lock);
805 825 }
806 826 }
807 827
808 828 done:
809 829 if (in_crit)
810 830 nbl_end_crit(vp);
811 831 return (error);
812 832 }
813 833
814 834 #ifdef _LP64
815 835 /*
816 836 * LP64 mmap(2) system call: 64-bit offset, 64-bit address.
817 837 *
818 838 * The "large file" mmap routine mmap64(2) is also mapped to this routine
819 839 * by the 64-bit version of libc.
820 840 *
821 841 * Eventually, this should be the only version, and have smmap_common()
822 842 * folded back into it again. Some day.
823 843 */
824 844 caddr_t
825 845 smmap64(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos)
826 846 {
827 847 struct file *fp;
828 848 int error;
829 849
830 850 if (fd == -1 && (flags & MAP_ANON) != 0)
831 851 error = smmap_common(&addr, len, prot, flags,
832 852 NULL, (offset_t)pos);
833 853 else if ((fp = getf(fd)) != NULL) {
834 854 error = smmap_common(&addr, len, prot, flags,
835 855 fp, (offset_t)pos);
836 856 releasef(fd);
837 857 } else
838 858 error = EBADF;
839 859
840 860 return (error ? (caddr_t)(uintptr_t)set_errno(error) : addr);
841 861 }
842 862 #endif /* _LP64 */
843 863
844 864 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
845 865
846 866 /*
847 867 * ILP32 mmap(2) system call: 32-bit offset, 32-bit address.
848 868 */
849 869 caddr_t
850 870 smmap32(caddr32_t addr, size32_t len, int prot, int flags, int fd, off32_t pos)
851 871 {
852 872 struct file *fp;
853 873 int error;
854 874 caddr_t a = (caddr_t)(uintptr_t)addr;
855 875
856 876 if (flags & _MAP_LOW32)
857 877 error = EINVAL;
858 878 else if (fd == -1 && (flags & MAP_ANON) != 0)
859 879 error = smmap_common(&a, (size_t)len, prot,
860 880 flags | _MAP_LOW32, NULL, (offset_t)pos);
861 881 else if ((fp = getf(fd)) != NULL) {
862 882 error = smmap_common(&a, (size_t)len, prot,
863 883 flags | _MAP_LOW32, fp, (offset_t)pos);
864 884 releasef(fd);
865 885 } else
866 886 error = EBADF;
867 887
868 888 ASSERT(error != 0 || (uintptr_t)(a + len) < (uintptr_t)UINT32_MAX);
869 889
870 890 return (error ? (caddr_t)(uintptr_t)set_errno(error) : a);
871 891 }
872 892
873 893 /*
874 894 * ILP32 mmap64(2) system call: 64-bit offset, 32-bit address.
875 895 *
876 896 * Now things really get ugly because we can't use the C-style
877 897 * calling convention for more than 6 args, and 64-bit parameter
878 898 * passing on 32-bit systems is less than clean.
879 899 */
880 900
881 901 struct mmaplf32a {
882 902 caddr_t addr;
883 903 size_t len;
884 904 #ifdef _LP64
885 905 /*
886 906 * 32-bit contents, 64-bit cells
887 907 */
888 908 uint64_t prot;
889 909 uint64_t flags;
890 910 uint64_t fd;
891 911 uint64_t offhi;
892 912 uint64_t offlo;
893 913 #else
894 914 /*
895 915 * 32-bit contents, 32-bit cells
896 916 */
897 917 uint32_t prot;
898 918 uint32_t flags;
899 919 uint32_t fd;
900 920 uint32_t offhi;
901 921 uint32_t offlo;
902 922 #endif
903 923 };
904 924
905 925 int
906 926 smmaplf32(struct mmaplf32a *uap, rval_t *rvp)
907 927 {
908 928 struct file *fp;
909 929 int error;
910 930 caddr_t a = uap->addr;
911 931 int flags = (int)uap->flags;
912 932 int fd = (int)uap->fd;
913 933 #ifdef _BIG_ENDIAN
914 934 offset_t off = ((u_offset_t)uap->offhi << 32) | (u_offset_t)uap->offlo;
915 935 #else
916 936 offset_t off = ((u_offset_t)uap->offlo << 32) | (u_offset_t)uap->offhi;
917 937 #endif
918 938
919 939 if (flags & _MAP_LOW32)
920 940 error = EINVAL;
921 941 else if (fd == -1 && (flags & MAP_ANON) != 0)
922 942 error = smmap_common(&a, uap->len, (int)uap->prot,
923 943 flags | _MAP_LOW32, NULL, off);
924 944 else if ((fp = getf(fd)) != NULL) {
925 945 error = smmap_common(&a, uap->len, (int)uap->prot,
926 946 flags | _MAP_LOW32, fp, off);
927 947 releasef(fd);
928 948 } else
929 949 error = EBADF;
930 950
931 951 if (error == 0)
932 952 rvp->r_val1 = (uintptr_t)a;
933 953 return (error);
934 954 }
935 955
936 956 #endif /* _SYSCALL32_IMPL || _ILP32 */
937 957
938 958 int
939 959 munmap(caddr_t addr, size_t len)
940 960 {
941 961 struct proc *p = curproc;
942 962 struct as *as = p->p_as;
943 963
944 964 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
945 965 return (set_errno(EINVAL));
946 966
947 967 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
948 968 return (set_errno(EINVAL));
949 969
950 970 /*
951 971 * Discard lwpchan mappings.
952 972 */
953 973 if (p->p_lcp != NULL)
954 974 lwpchan_delete_mapping(p, addr, addr + len);
955 975 if (as_unmap(as, addr, len) != 0)
956 976 return (set_errno(EINVAL));
957 977
958 978 return (0);
959 979 }
960 980
961 981 int
962 982 mprotect(caddr_t addr, size_t len, int prot)
963 983 {
964 984 struct as *as = curproc->p_as;
965 985 uint_t uprot = prot | PROT_USER;
966 986 int error;
967 987
968 988 if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0)
969 989 return (set_errno(EINVAL));
970 990
971 991 switch (valid_usr_range(addr, len, prot, as, as->a_userlimit)) {
972 992 case RANGE_OKAY:
973 993 break;
974 994 case RANGE_BADPROT:
975 995 return (set_errno(ENOTSUP));
976 996 case RANGE_BADADDR:
977 997 default:
978 998 return (set_errno(ENOMEM));
979 999 }
980 1000
981 1001 error = as_setprot(as, addr, len, uprot);
982 1002 if (error)
983 1003 return (set_errno(error));
984 1004 return (0);
985 1005 }
986 1006
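Editor's note (not part of the webrev): a minimal sketch of the alignment rule mprotect() enforces above (addr must be page-aligned, len non-zero). The mapping comes from mmap(), so the address is page-aligned by construction.

    #include <sys/mman.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANON, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return (1);
        }
        p[0] = 'x';
        /* addr comes from mmap(), so it satisfies the page-alignment check. */
        if (mprotect(p, pgsz, PROT_READ) != 0) {
            perror("mprotect");
            return (1);
        }
        /* A store through p would now fault with SIGSEGV. */
        (void) munmap(p, pgsz);
        return (0);
    }
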
987 1007 #define MC_CACHE 128 /* internal result buffer */
988 1008 #define MC_QUANTUM (MC_CACHE * PAGESIZE) /* addresses covered in loop */
989 1009
990 1010 int
991 1011 mincore(caddr_t addr, size_t len, char *vecp)
992 1012 {
993 1013 struct as *as = curproc->p_as;
994 1014 caddr_t ea; /* end address of loop */
995 1015 size_t rl; /* inner result length */
996 1016 char vec[MC_CACHE]; /* local vector cache */
997 1017 int error;
998 1018 model_t model;
999 1019 long llen;
1000 1020
1001 1021 model = get_udatamodel();
1002 1022 /*
1003 1023 * Validate form of address parameters.
1004 1024 */
1005 1025 if (model == DATAMODEL_NATIVE) {
1006 1026 llen = (long)len;
1007 1027 } else {
1008 1028 llen = (int32_t)(size32_t)len;
1009 1029 }
1010 1030 if (((uintptr_t)addr & PAGEOFFSET) != 0 || llen <= 0)
1011 1031 return (set_errno(EINVAL));
1012 1032
1013 1033 if (valid_usr_range(addr, len, 0, as, as->a_userlimit) != RANGE_OKAY)
1014 1034 return (set_errno(ENOMEM));
1015 1035
1016 1036 /*
1017 1037 * Loop over subranges of interval [addr : addr + len), recovering
1018 1038 * results internally and then copying them out to caller. Subrange
1019 1039 * is based on the size of MC_CACHE, defined above.
1020 1040 */
1021 1041 for (ea = addr + len; addr < ea; addr += MC_QUANTUM) {
1022 1042 error = as_incore(as, addr,
1023 1043 (size_t)MIN(MC_QUANTUM, ea - addr), vec, &rl);
1024 1044 if (rl != 0) {
1025 1045 rl = (rl + PAGESIZE - 1) / PAGESIZE;
1026 1046 if (copyout(vec, vecp, rl) != 0)
1027 1047 return (set_errno(EFAULT));
1028 1048 vecp += rl;
1029 1049 }
1030 1050 if (error != 0)
1031 1051 return (set_errno(ENOMEM));
1032 1052 }
1033 1053 return (0);
1034 1054 }
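
Editor's note (not part of the webrev): a minimal sketch of mincore() from user space; prototypes differ slightly across systems (illumos declares the vector as char *), so treat this as illustrative. Only the first page is touched, so only it should report resident.

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        size_t len = 4 * pgsz;
        char vec[4];
        size_t i;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANON, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return (1);
        }
        p[0] = 1;               /* fault in the first page only */
        if (mincore(p, len, vec) != 0) {
            perror("mincore");
            return (1);
        }
        for (i = 0; i < 4; i++)
            (void) printf("page %zu: %s\n", i,
                (vec[i] & 1) ? "resident" : "not resident");
        (void) munmap(p, len);
        return (0);
    }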