OS-803 make phys mem cap a bit harder
OS-1043 minimize vm_getusage impact
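Summary of the change in this file: as_fault() now throttles F_INVAL faults for processes in a zone that is over its physical memory cap (zone_pg_flt_delay != 0), accounting each throttle in zone_pf_throttle and zone_pf_throttle_usec so the user-land memory capper can catch up; <sys/ddi.h> is added, presumably for the drv_usecwait()/drv_usectohz() delay routines used on that path.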
--- old/usr/src/uts/common/vm/vm_as.c
+++ new/usr/src/uts/common/vm/vm_as.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 29 /* All Rights Reserved */
30 30
31 31 /*
32 32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 33 * The Regents of the University of California
34 34 * All Rights Reserved
35 35 *
36 36 * University Acknowledgment- Portions of this document are derived from
37 37 * software developed by the University of California, Berkeley, and its
38 38 * contributors.
39 39 */
40 40
41 41 /*
42 42 * VM - address spaces.
43 43 */
44 44
45 45 #include <sys/types.h>
46 46 #include <sys/t_lock.h>
47 47 #include <sys/param.h>
48 48 #include <sys/errno.h>
49 49 #include <sys/systm.h>
50 50 #include <sys/mman.h>
51 51 #include <sys/sysmacros.h>
52 52 #include <sys/cpuvar.h>
53 53 #include <sys/sysinfo.h>
54 54 #include <sys/kmem.h>
55 55 #include <sys/vnode.h>
56 56 #include <sys/vmsystm.h>
57 57 #include <sys/cmn_err.h>
58 58 #include <sys/debug.h>
59 59 #include <sys/tnf_probe.h>
60 60 #include <sys/vtrace.h>
61 +#include <sys/ddi.h>
61 62
62 63 #include <vm/hat.h>
63 64 #include <vm/as.h>
64 65 #include <vm/seg.h>
65 66 #include <vm/seg_vn.h>
66 67 #include <vm/seg_dev.h>
67 68 #include <vm/seg_kmem.h>
68 69 #include <vm/seg_map.h>
69 70 #include <vm/seg_spt.h>
70 71 #include <vm/page.h>
71 72
72 73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
73 74
74 75 static struct kmem_cache *as_cache;
75 76
76 77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 78 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78 79 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
79 80
80 81
81 82 /*
82 83 * Verifying the segment lists is very time-consuming; it may not be
83 84 * desirable always to define VERIFY_SEGLIST when DEBUG is set.
84 85 */
85 86 #ifdef DEBUG
86 87 #define VERIFY_SEGLIST
87 88 int do_as_verify = 0;
88 89 #endif
89 90
90 91 /*
91 92 * Allocate a new callback data structure entry and fill in the events of
92 93 * interest, the address range of interest, and the callback argument.
93 94 * Link the entry on the as->a_callbacks list. A callback entry for the
94 95 * entire address space may be specified with vaddr = 0 and size = -1.
95 96 *
96 97 * CALLERS RESPONSIBILITY: If not calling from within the process context for
97 98 * the specified as, the caller must guarantee persistence of the specified as
98 99 * for the duration of this function (eg. pages being locked within the as
99 100 * will guarantee persistence).
100 101 */
101 102 int
102 103 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 104 caddr_t vaddr, size_t size, int sleepflag)
104 105 {
105 106 struct as_callback *current_head, *cb;
106 107 caddr_t saddr;
107 108 size_t rsize;
108 109
109 110 /* callback function and an event are mandatory */
110 111 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 112 return (EINVAL);
112 113
113 114 /* Adding a callback after as_free has been called is not allowed */
114 115 if (as == &kas)
115 116 return (ENOMEM);
116 117
117 118 /*
118 119 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 120 * is the entire address space so no rounding is done in that case.
120 121 */
121 122 if (size != -1) {
122 123 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 124 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 125 (size_t)saddr;
125 126 /* check for wraparound */
126 127 if (saddr + rsize < saddr)
127 128 return (ENOMEM);
128 129 } else {
129 130 if (vaddr != 0)
130 131 return (EINVAL);
131 132 saddr = vaddr;
132 133 rsize = size;
133 134 }
134 135
135 136 /* Allocate and initialize a callback entry */
136 137 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 138 if (cb == NULL)
138 139 return (EAGAIN);
139 140
140 141 cb->ascb_func = cb_func;
141 142 cb->ascb_arg = arg;
142 143 cb->ascb_events = events;
143 144 cb->ascb_saddr = saddr;
144 145 cb->ascb_len = rsize;
145 146
146 147 /* Add the entry to the list */
147 148 mutex_enter(&as->a_contents);
148 149 current_head = as->a_callbacks;
149 150 as->a_callbacks = cb;
150 151 cb->ascb_next = current_head;
151 152
152 153 /*
153 154 * The call to this function may lose in a race with
154 155 * a pertinent event - eg. a thread does long term memory locking
155 156 * but before the callback is added another thread executes as_unmap.
156 157 * A broadcast here resolves that.
157 158 */
158 159 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 160 AS_CLRUNMAPWAIT(as);
160 161 cv_broadcast(&as->a_cv);
161 162 }
162 163
163 164 mutex_exit(&as->a_contents);
164 165 return (0);
165 166 }
166 167
167 168 /*
168 169 * Search the callback list for an entry which pertains to arg.
169 170 *
170 171 * This is called from within the client upon completion of the callback.
171 172 * RETURN VALUES:
172 173 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 174 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174 175 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
175 176 * entry will be made in as_do_callbacks)
176 177 *
177 178 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 179 * set, it indicates that as_do_callbacks is processing this entry. The
179 180 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 181 * to unblock as_do_callbacks, in case it is blocked.
181 182 *
182 183 * CALLERS RESPONSIBILITY: If not calling from within the process context for
183 184 * the specified as, the caller must guarantee persistence of the specified as
184 185 * for the duration of this function (eg. pages being locked within the as
185 186 * will guarantee persistence).
186 187 */
187 188 uint_t
188 189 as_delete_callback(struct as *as, void *arg)
189 190 {
190 191 struct as_callback **prevcb = &as->a_callbacks;
191 192 struct as_callback *cb;
192 193 uint_t rc = AS_CALLBACK_NOTFOUND;
193 194
194 195 mutex_enter(&as->a_contents);
195 196 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 197 if (cb->ascb_arg != arg)
197 198 continue;
198 199
199 200 /*
200 201 * If the events indicate AS_CALLBACK_CALLED, just clear
201 202 * AS_ALL_EVENT in the events field and wakeup the thread
202 203 * that may be waiting in as_do_callbacks. as_do_callbacks
203 204 * will take care of removing this entry from the list. In
204 205 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 206 * (AS_CALLBACK_CALLED not set), just remove it from the
206 207 * list, return the memory and return AS_CALLBACK_DELETED.
207 208 */
208 209 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 210 /* leave AS_CALLBACK_CALLED */
210 211 cb->ascb_events &= ~AS_ALL_EVENT;
211 212 rc = AS_CALLBACK_DELETE_DEFERRED;
212 213 cv_broadcast(&as->a_cv);
213 214 } else {
214 215 *prevcb = cb->ascb_next;
215 216 kmem_free(cb, sizeof (struct as_callback));
216 217 rc = AS_CALLBACK_DELETED;
217 218 }
218 219 break;
219 220 }
220 221 mutex_exit(&as->a_contents);
221 222 return (rc);
222 223 }
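For context, a minimal sketch of how a client (for example a driver holding long-term page locks) would use the callback interface above; my_unmap_cb, my_register, and my_state are hypothetical names, not part of this change:

#include <sys/kmem.h>
#include <vm/as.h>

/*
 * Hypothetical callback: invoked by as_do_callbacks() when an unmap
 * intersects the registered range.  Per the comment in
 * as_execute_callback(), the callback must delete its own entry once it
 * is safe for the unmapping thread to continue.
 */
static void
my_unmap_cb(struct as *as, void *arg, uint_t events)
{
	/* ... release this driver's long-term page locks for the range ... */
	(void) as_delete_callback(as, arg);
}

/*
 * Hypothetical registration path: KM_SLEEP is passed as sleepflag since
 * as_add_callback() allocates its entry with kmem_zalloc().
 */
static int
my_register(struct as *as, void *my_state, caddr_t vaddr, size_t size)
{
	return (as_add_callback(as, my_unmap_cb, my_state,
	    AS_UNMAP_EVENT, vaddr, size, KM_SLEEP));
}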
223 224
224 225 /*
225 226 * Searches the as callback list for a matching entry.
226 227 * Returns a pointer to the first matching callback, or NULL if
227 228 * nothing is found.
228 229 * This function never sleeps so it is ok to call it with more
229 230 * locks held but the (required) a_contents mutex.
230 231 *
231 232 * See also comment on as_do_callbacks below.
232 233 */
233 234 static struct as_callback *
234 235 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 236 size_t event_len)
236 237 {
237 238 struct as_callback *cb;
238 239
239 240 ASSERT(MUTEX_HELD(&as->a_contents));
240 241 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
241 242 /*
242 243 * If the callback has not already been called, then
243 244 * check if events or address range pertains. An event_len
244 245 * of zero means do an unconditional callback.
245 246 */
246 247 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 248 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 249 (event_addr + event_len < cb->ascb_saddr) ||
249 250 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 251 continue;
251 252 }
252 253 break;
253 254 }
254 255 return (cb);
255 256 }
256 257
257 258 /*
258 259 * Executes a given callback and removes it from the callback list for
259 260 * this address space.
260 261 * This function may sleep so the caller must drop all locks except
261 262 * a_contents before calling this func.
262 263 *
263 264 * See also comments on as_do_callbacks below.
264 265 */
265 266 static void
266 267 as_execute_callback(struct as *as, struct as_callback *cb,
267 268 uint_t events)
268 269 {
269 270 struct as_callback **prevcb;
270 271 void *cb_arg;
271 272
272 273 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 274 cb->ascb_events |= AS_CALLBACK_CALLED;
274 275 mutex_exit(&as->a_contents);
275 276 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 277 mutex_enter(&as->a_contents);
277 278 /*
278 279 * the callback function is required to delete the callback
279 280 * when the callback function determines it is OK for
280 281 * this thread to continue. as_delete_callback will clear
281 282 * the AS_ALL_EVENT in the events field when it is deleted.
282 283 * If the callback function called as_delete_callback,
283 284 * events will already be cleared and there will be no blocking.
284 285 */
285 286 while ((cb->ascb_events & events) != 0) {
286 287 cv_wait(&as->a_cv, &as->a_contents);
287 288 }
288 289 /*
289 290 * This entry needs to be taken off the list. Normally, the
290 291 * callback func itself does that, but unfortunately the list
291 292 * may have changed while the callback was running because the
292 293 * a_contents mutex was dropped and someone else other than the
293 294 * callback func itself could have called as_delete_callback,
294 295 * so we have to search to find this entry again. The entry
295 296 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
296 297 */
297 298 cb_arg = cb->ascb_arg;
298 299 prevcb = &as->a_callbacks;
299 300 for (cb = as->a_callbacks; cb != NULL;
300 301 prevcb = &cb->ascb_next, cb = *prevcb) {
301 302 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 303 (cb_arg != cb->ascb_arg)) {
303 304 continue;
304 305 }
305 306 *prevcb = cb->ascb_next;
306 307 kmem_free(cb, sizeof (struct as_callback));
307 308 break;
308 309 }
309 310 }
310 311
311 312 /*
312 313 * Check the callback list for a matching event and intersection of
313 314 * address range. If there is a match invoke the callback. Skip an entry if:
314 315 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315 316 * - not event of interest
316 317 * - not address range of interest
317 318 *
318 319 * An event_len of zero indicates a request for an unconditional callback
319 320 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
320 321 * a_contents lock must be dropped before a callback, so only one callback
321 322 * can be done before returning. Return -1 (true) if a callback was
322 323 * executed and removed from the list, else return 0 (false).
323 324 *
324 325 * The logically separate parts, i.e. finding a matching callback and
325 326 * executing a given callback have been separated into two functions
326 327 * so that they can be called with different sets of locks held beyond
327 328 * the always-required a_contents. as_find_callback does not sleep so
328 329 * it is ok to call it if more locks than a_contents (i.e. the a_lock
329 330 * rwlock) are held. as_execute_callback on the other hand may sleep
330 331 * so all locks beyond a_contents must be dropped by the caller if one
331 332 * does not want to end comatose.
332 333 */
333 334 static int
334 335 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 336 size_t event_len)
336 337 {
337 338 struct as_callback *cb;
338 339
339 340 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 341 as_execute_callback(as, cb, events);
341 342 return (-1);
342 343 }
343 344 return (0);
344 345 }
345 346
346 347 /*
347 348 * Search for the segment containing addr. If a segment containing addr
348 349 * exists, that segment is returned. If no such segment exists, and
349 350 * the list spans addresses greater than addr, then the first segment
350 351 * whose base is greater than addr is returned; otherwise, NULL is
351 352 * returned unless tail is true, in which case the last element of the
352 353 * list is returned.
353 354 *
354 355 * a_seglast is used to cache the last found segment for repeated
355 356 * searches to the same addr (which happens frequently).
356 357 */
357 358 struct seg *
358 359 as_findseg(struct as *as, caddr_t addr, int tail)
359 360 {
360 361 struct seg *seg = as->a_seglast;
361 362 avl_index_t where;
362 363
363 364 ASSERT(AS_LOCK_HELD(as));
364 365
365 366 if (seg != NULL &&
366 367 seg->s_base <= addr &&
367 368 addr < seg->s_base + seg->s_size)
368 369 return (seg);
369 370
370 371 seg = avl_find(&as->a_segtree, &addr, &where);
371 372 if (seg != NULL)
372 373 return (as->a_seglast = seg);
373 374
374 375 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 376 if (seg == NULL && tail)
376 377 seg = avl_last(&as->a_segtree);
377 378 return (as->a_seglast = seg);
378 379 }
379 380
380 381 #ifdef VERIFY_SEGLIST
381 382 /*
382 383 * verify that the linked list is coherent
383 384 */
384 385 static void
385 386 as_verify(struct as *as)
386 387 {
387 388 struct seg *seg, *seglast, *p, *n;
388 389 uint_t nsegs = 0;
389 390
390 391 if (do_as_verify == 0)
391 392 return;
392 393
393 394 seglast = as->a_seglast;
394 395
395 396 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 397 ASSERT(seg->s_as == as);
397 398 p = AS_SEGPREV(as, seg);
398 399 n = AS_SEGNEXT(as, seg);
399 400 ASSERT(p == NULL || p->s_as == as);
400 401 ASSERT(p == NULL || p->s_base < seg->s_base);
401 402 ASSERT(n == NULL || n->s_base > seg->s_base);
402 403 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 404 if (seg == seglast)
404 405 seglast = NULL;
405 406 nsegs++;
406 407 }
407 408 ASSERT(seglast == NULL);
408 409 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 410 }
410 411 #endif /* VERIFY_SEGLIST */
411 412
412 413 /*
413 414 * Add a new segment to the address space. The avl_find()
414 415 * may be expensive so we attempt to use last segment accessed
415 416 * in as_gap() as an insertion point.
416 417 */
417 418 int
418 419 as_addseg(struct as *as, struct seg *newseg)
419 420 {
420 421 struct seg *seg;
421 422 caddr_t addr;
422 423 caddr_t eaddr;
423 424 avl_index_t where;
424 425
425 426 ASSERT(AS_WRITE_HELD(as));
426 427
427 428 as->a_updatedir = 1; /* inform /proc */
428 429 gethrestime(&as->a_updatetime);
429 430
430 431 if (as->a_lastgaphl != NULL) {
431 432 struct seg *hseg = NULL;
432 433 struct seg *lseg = NULL;
433 434
434 435 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 436 hseg = as->a_lastgaphl;
436 437 lseg = AVL_PREV(&as->a_segtree, hseg);
437 438 } else {
438 439 lseg = as->a_lastgaphl;
439 440 hseg = AVL_NEXT(&as->a_segtree, lseg);
440 441 }
441 442
442 443 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 444 hseg->s_base > newseg->s_base) {
444 445 avl_insert_here(&as->a_segtree, newseg, lseg,
445 446 AVL_AFTER);
446 447 as->a_lastgaphl = NULL;
447 448 as->a_seglast = newseg;
448 449 return (0);
449 450 }
450 451 as->a_lastgaphl = NULL;
451 452 }
452 453
453 454 addr = newseg->s_base;
454 455 eaddr = addr + newseg->s_size;
455 456 again:
456 457
457 458 seg = avl_find(&as->a_segtree, &addr, &where);
458 459
459 460 if (seg == NULL)
460 461 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
461 462
462 463 if (seg == NULL)
463 464 seg = avl_last(&as->a_segtree);
464 465
465 466 if (seg != NULL) {
466 467 caddr_t base = seg->s_base;
467 468
468 469 /*
469 470 * If top of seg is below the requested address, then
470 471 * the insertion point is at the end of the linked list,
471 472 * and seg points to the tail of the list. Otherwise,
472 473 * the insertion point is immediately before seg.
473 474 */
474 475 if (base + seg->s_size > addr) {
475 476 if (addr >= base || eaddr > base) {
476 477 #ifdef __sparc
477 478 extern struct seg_ops segnf_ops;
478 479
479 480 /*
480 481 * no-fault segs must disappear if overlaid.
481 482 * XXX need new segment type so
482 483 * we don't have to check s_ops
483 484 */
484 485 if (seg->s_ops == &segnf_ops) {
485 486 seg_unmap(seg);
486 487 goto again;
487 488 }
488 489 #endif
489 490 return (-1); /* overlapping segment */
490 491 }
491 492 }
492 493 }
493 494 as->a_seglast = newseg;
494 495 avl_insert(&as->a_segtree, newseg, where);
495 496
496 497 #ifdef VERIFY_SEGLIST
497 498 as_verify(as);
498 499 #endif
499 500 return (0);
500 501 }
501 502
502 503 struct seg *
503 504 as_removeseg(struct as *as, struct seg *seg)
504 505 {
505 506 avl_tree_t *t;
506 507
507 508 ASSERT(AS_WRITE_HELD(as));
508 509
509 510 as->a_updatedir = 1; /* inform /proc */
510 511 gethrestime(&as->a_updatetime);
511 512
512 513 if (seg == NULL)
513 514 return (NULL);
514 515
515 516 t = &as->a_segtree;
516 517 if (as->a_seglast == seg)
517 518 as->a_seglast = NULL;
518 519 as->a_lastgaphl = NULL;
519 520
520 521 /*
521 522 * if this segment is at an address higher than
522 523 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 524 */
524 525 if (as->a_lastgap &&
525 526 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 527 as->a_lastgap = AVL_NEXT(t, seg);
527 528
528 529 /*
529 530 * remove the segment from the seg tree
530 531 */
531 532 avl_remove(t, seg);
532 533
533 534 #ifdef VERIFY_SEGLIST
534 535 as_verify(as);
535 536 #endif
536 537 return (seg);
537 538 }
538 539
539 540 /*
540 541 * Find a segment containing addr.
541 542 */
542 543 struct seg *
543 544 as_segat(struct as *as, caddr_t addr)
544 545 {
545 546 struct seg *seg = as->a_seglast;
546 547
547 548 ASSERT(AS_LOCK_HELD(as));
548 549
549 550 if (seg != NULL && seg->s_base <= addr &&
550 551 addr < seg->s_base + seg->s_size)
551 552 return (seg);
552 553
553 554 seg = avl_find(&as->a_segtree, &addr, NULL);
554 555 return (seg);
555 556 }
556 557
557 558 /*
558 559 * Serialize all searches for holes in an address space to
559 560 * prevent two or more threads from allocating the same virtual
560 561 * address range. The address space must not be "read/write"
561 562 * locked by the caller since we may block.
562 563 */
563 564 void
564 565 as_rangelock(struct as *as)
565 566 {
566 567 mutex_enter(&as->a_contents);
567 568 while (AS_ISCLAIMGAP(as))
568 569 cv_wait(&as->a_cv, &as->a_contents);
569 570 AS_SETCLAIMGAP(as);
570 571 mutex_exit(&as->a_contents);
571 572 }
572 573
573 574 /*
574 575 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
575 576 */
576 577 void
577 578 as_rangeunlock(struct as *as)
578 579 {
579 580 mutex_enter(&as->a_contents);
580 581 AS_CLRCLAIMGAP(as);
581 582 cv_signal(&as->a_cv);
582 583 mutex_exit(&as->a_contents);
583 584 }
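The caller pattern these two routines support looks roughly like the fragment below (an assumption based on the comment above, not code from this change; pick_unmapped_range stands in for whatever address selection the caller does, e.g. via as_gap()):

	/*
	 * Hold the range "claim" across the hole search and the map so
	 * another thread cannot pick the same virtual range; drop it once
	 * the mapping (or the failure) is final.
	 */
	as_rangelock(as);
	addr = pick_unmapped_range(as, len);	/* hypothetical helper */
	error = as_map(as, addr, len, segvn_create, &crargs);
	as_rangeunlock(as);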
584 585
585 586 /*
586 587 * compar segments (or just an address) by segment address range
587 588 */
588 589 static int
589 590 as_segcompar(const void *x, const void *y)
590 591 {
591 592 struct seg *a = (struct seg *)x;
592 593 struct seg *b = (struct seg *)y;
593 594
594 595 if (a->s_base < b->s_base)
595 596 return (-1);
596 597 if (a->s_base >= b->s_base + b->s_size)
597 598 return (1);
598 599 return (0);
599 600 }
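Only the key's s_base is read here, and 0 is returned whenever that base falls inside an existing segment, which is why as_segat() below can pass a bare address as the avl_find() key. A sketch of that idiom (assuming, as in vm/seg.h, that s_base is the first member of struct seg, so a pointer to a caddr_t aliases a minimal key segment):

	caddr_t addr = lookup_addr;	/* hypothetical address to look up */
	struct seg *seg;

	/* &addr acts as a struct seg whose s_base == addr; s_size is never read */
	seg = avl_find(&as->a_segtree, &addr, NULL);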
600 601
601 602
602 603 void
603 604 as_avlinit(struct as *as)
604 605 {
605 606 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
606 607 offsetof(struct seg, s_tree));
607 608 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
608 609 offsetof(struct watched_page, wp_link));
609 610 }
610 611
611 612 /*ARGSUSED*/
612 613 static int
613 614 as_constructor(void *buf, void *cdrarg, int kmflags)
614 615 {
615 616 struct as *as = buf;
616 617
617 618 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
618 619 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
619 620 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
620 621 as_avlinit(as);
621 622 return (0);
622 623 }
623 624
624 625 /*ARGSUSED1*/
625 626 static void
626 627 as_destructor(void *buf, void *cdrarg)
627 628 {
628 629 struct as *as = buf;
629 630
630 631 avl_destroy(&as->a_segtree);
631 632 mutex_destroy(&as->a_contents);
632 633 cv_destroy(&as->a_cv);
633 634 rw_destroy(&as->a_lock);
634 635 }
635 636
636 637 void
637 638 as_init(void)
638 639 {
639 640 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
640 641 as_constructor, as_destructor, NULL, NULL, NULL, 0);
641 642 }
642 643
643 644 /*
644 645 * Allocate and initialize an address space data structure.
645 646 * We call hat_alloc to allow any machine dependent
646 647 * information in the hat structure to be initialized.
647 648 */
648 649 struct as *
649 650 as_alloc(void)
650 651 {
651 652 struct as *as;
652 653
653 654 as = kmem_cache_alloc(as_cache, KM_SLEEP);
654 655
655 656 as->a_flags = 0;
656 657 as->a_vbits = 0;
657 658 as->a_hrm = NULL;
658 659 as->a_seglast = NULL;
659 660 as->a_size = 0;
660 661 as->a_resvsize = 0;
661 662 as->a_updatedir = 0;
662 663 gethrestime(&as->a_updatetime);
663 664 as->a_objectdir = NULL;
664 665 as->a_sizedir = 0;
665 666 as->a_userlimit = (caddr_t)USERLIMIT;
666 667 as->a_lastgap = NULL;
667 668 as->a_lastgaphl = NULL;
668 669 as->a_callbacks = NULL;
669 670 as->a_proc = NULL;
670 671
671 672 AS_LOCK_ENTER(as, RW_WRITER);
672 673 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
673 674 AS_LOCK_EXIT(as);
674 675
675 676 return (as);
676 677 }
677 678
678 679 /*
679 680 * Free an address space data structure.
680 681 * Need to free the hat first and then
681 682 * all the segments on this as and finally
682 683 * the space for the as struct itself.
683 684 */
684 685 void
685 686 as_free(struct as *as)
686 687 {
687 688 struct hat *hat = as->a_hat;
688 689 struct seg *seg, *next;
689 690 boolean_t free_started = B_FALSE;
690 691
691 692 top:
692 693 /*
693 694 * Invoke ALL callbacks. as_do_callbacks will do one callback
694 695 * per call, and not return (-1) until the callback has completed.
695 696 * When as_do_callbacks returns zero, all callbacks have completed.
696 697 */
697 698 mutex_enter(&as->a_contents);
698 699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
699 700 ;
700 701
701 702 mutex_exit(&as->a_contents);
702 703 AS_LOCK_ENTER(as, RW_WRITER);
703 704
704 705 if (!free_started) {
705 706 free_started = B_TRUE;
706 707 hat_free_start(hat);
707 708 }
708 709 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
709 710 int err;
710 711
711 712 next = AS_SEGNEXT(as, seg);
712 713 retry:
713 714 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
714 715 if (err == EAGAIN) {
715 716 mutex_enter(&as->a_contents);
716 717 if (as->a_callbacks) {
717 718 AS_LOCK_EXIT(as);
718 719 } else if (!AS_ISNOUNMAPWAIT(as)) {
719 720 /*
720 721 * Memory is currently locked. Wait for a
721 722 * cv_signal that it has been unlocked, then
722 723 * try the operation again.
723 724 */
724 725 if (AS_ISUNMAPWAIT(as) == 0)
725 726 cv_broadcast(&as->a_cv);
726 727 AS_SETUNMAPWAIT(as);
727 728 AS_LOCK_EXIT(as);
728 729 while (AS_ISUNMAPWAIT(as))
729 730 cv_wait(&as->a_cv, &as->a_contents);
730 731 } else {
731 732 /*
732 733 * We may have raced with
733 734 * segvn_reclaim()/segspt_reclaim(). In this
734 735 * case clean nounmapwait flag and retry since
735 736 * softlockcnt in this segment may be already
736 737 * 0. We don't drop as writer lock so our
737 738 * number of retries without sleeping should
738 739 * be very small. See segvn_reclaim() for
739 740 * more comments.
740 741 */
741 742 AS_CLRNOUNMAPWAIT(as);
742 743 mutex_exit(&as->a_contents);
743 744 goto retry;
744 745 }
745 746 mutex_exit(&as->a_contents);
746 747 goto top;
747 748 } else {
748 749 /*
749 750 * We do not expect any other error return at this
750 751 * time. This is similar to an ASSERT in seg_unmap()
751 752 */
752 753 ASSERT(err == 0);
753 754 }
754 755 }
755 756 hat_free_end(hat);
756 757 AS_LOCK_EXIT(as);
757 758
758 759 /* /proc stuff */
759 760 ASSERT(avl_numnodes(&as->a_wpage) == 0);
760 761 if (as->a_objectdir) {
761 762 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
762 763 as->a_objectdir = NULL;
763 764 as->a_sizedir = 0;
764 765 }
765 766
766 767 /*
767 768 * Free the struct as back to kmem. Assert it has no segments.
768 769 */
769 770 ASSERT(avl_numnodes(&as->a_segtree) == 0);
770 771 kmem_cache_free(as_cache, as);
771 772 }
772 773
773 774 int
774 775 as_dup(struct as *as, struct proc *forkedproc)
775 776 {
776 777 struct as *newas;
777 778 struct seg *seg, *newseg;
778 779 size_t purgesize = 0;
779 780 int error;
780 781
781 782 AS_LOCK_ENTER(as, RW_WRITER);
782 783 as_clearwatch(as);
783 784 newas = as_alloc();
784 785 newas->a_userlimit = as->a_userlimit;
785 786 newas->a_proc = forkedproc;
786 787
787 788 AS_LOCK_ENTER(newas, RW_WRITER);
788 789
789 790 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
790 791
791 792 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
792 793
793 794 if (seg->s_flags & S_PURGE) {
794 795 purgesize += seg->s_size;
795 796 continue;
796 797 }
797 798
798 799 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
799 800 if (newseg == NULL) {
800 801 AS_LOCK_EXIT(newas);
801 802 as_setwatch(as);
802 803 AS_LOCK_EXIT(as);
803 804 as_free(newas);
804 805 return (-1);
805 806 }
806 807 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
807 808 /*
808 809 * We call seg_free() on the new seg
809 810 * because the segment is not set up
810 811 * completely; i.e. it has no ops.
811 812 */
812 813 as_setwatch(as);
813 814 AS_LOCK_EXIT(as);
814 815 seg_free(newseg);
815 816 AS_LOCK_EXIT(newas);
816 817 as_free(newas);
817 818 return (error);
818 819 }
819 820 newas->a_size += seg->s_size;
820 821 }
821 822 newas->a_resvsize = as->a_resvsize - purgesize;
822 823
823 824 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
824 825
825 826 AS_LOCK_EXIT(newas);
826 827
827 828 as_setwatch(as);
828 829 AS_LOCK_EXIT(as);
829 830 if (error != 0) {
830 831 as_free(newas);
831 832 return (error);
832 833 }
833 834 forkedproc->p_as = newas;
834 835 return (0);
835 836 }
836 837
837 838 /*
838 839 * Handle a ``fault'' at addr for size bytes.
839 840 */
840 841 faultcode_t
841 842 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
842 843 enum fault_type type, enum seg_rw rw)
843 844 {
844 845 struct seg *seg;
845 846 caddr_t raddr; /* rounded down addr */
846 847 size_t rsize; /* rounded up size */
847 848 size_t ssize;
848 849 faultcode_t res = 0;
849 850 caddr_t addrsav;
850 851 struct seg *segsav;
851 852 int as_lock_held;
852 853 klwp_t *lwp = ttolwp(curthread);
854 + zone_t *zonep = curzone;
853 855
854 -
855 -
856 856 retry:
857 857 /*
858 858 * Indicate that the lwp is not to be stopped while waiting for a
859 859 * pagefault. This is to avoid deadlock while debugging a process
860 860 * via /proc over NFS (in particular).
861 861 */
862 862 if (lwp != NULL)
863 863 lwp->lwp_nostop++;
864 864
865 865 /*
866 866 * same length must be used when we softlock and softunlock. We
867 867 * don't support softunlocking lengths less than the original length
868 868 * when there is largepage support. See seg_dev.c for more
869 869 * comments.
870 870 */
871 871 switch (type) {
872 872
873 873 case F_SOFTLOCK:
874 874 CPU_STATS_ADD_K(vm, softlock, 1);
875 875 break;
876 876
877 877 case F_SOFTUNLOCK:
878 878 break;
879 879
880 880 case F_PROT:
881 881 CPU_STATS_ADD_K(vm, prot_fault, 1);
882 882 break;
883 883
884 884 case F_INVAL:
885 885 CPU_STATS_ENTER_K();
886 886 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
887 887 if (as == &kas)
888 888 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
889 889 CPU_STATS_EXIT_K();
890 + if (zonep->zone_pg_flt_delay != 0) {
891 + /*
892 + * The zone in which this process is running
 893 + * is currently over its physical memory cap.
894 + * Throttle page faults to help the user-land
895 + * memory capper catch up. Note that
896 + * drv_usectohz() rounds up.
897 + */
898 + atomic_add_64(&zonep->zone_pf_throttle, 1);
899 + atomic_add_64(&zonep->zone_pf_throttle_usec,
900 + zonep->zone_pg_flt_delay);
901 + if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1))
902 + drv_usecwait(zonep->zone_pg_flt_delay);
903 + else
904 + delay(drv_usectohz(zonep->zone_pg_flt_delay));
905 + }
890 906 break;
891 907 }
892 908
893 909 /* Kernel probe */
894 910 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
895 911 tnf_opaque, address, addr,
896 912 tnf_fault_type, fault_type, type,
897 913 tnf_seg_access, access, rw);
898 914
899 915 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
900 916 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
901 917 (size_t)raddr;
902 918
903 919 /*
904 920 * XXX -- Don't grab the as lock for segkmap. We should grab it for
905 921 * correctness, but then we could be stuck holding this lock for
906 922 * a LONG time if the fault needs to be resolved on a slow
907 923 * filesystem, and then no-one will be able to exec new commands,
908 924 * as exec'ing requires the write lock on the as.
909 925 */
910 926 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
911 927 raddr + size < segkmap->s_base + segkmap->s_size) {
912 928 seg = segkmap;
913 929 as_lock_held = 0;
914 930 } else {
915 931 AS_LOCK_ENTER(as, RW_READER);
916 932
917 933 seg = as_segat(as, raddr);
918 934 if (seg == NULL) {
919 935 AS_LOCK_EXIT(as);
920 936 if (lwp != NULL)
921 937 lwp->lwp_nostop--;
922 938 return (FC_NOMAP);
923 939 }
924 940
925 941 as_lock_held = 1;
926 942 }
927 943
928 944 addrsav = raddr;
929 945 segsav = seg;
930 946
931 947 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
932 948 if (raddr >= seg->s_base + seg->s_size) {
933 949 seg = AS_SEGNEXT(as, seg);
934 950 if (seg == NULL || raddr != seg->s_base) {
935 951 res = FC_NOMAP;
936 952 break;
937 953 }
938 954 }
939 955 if (raddr + rsize > seg->s_base + seg->s_size)
940 956 ssize = seg->s_base + seg->s_size - raddr;
941 957 else
942 958 ssize = rsize;
943 959
944 960 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
945 961 if (res != 0)
946 962 break;
947 963 }
948 964
949 965 /*
950 966 * If we were SOFTLOCKing and encountered a failure,
951 967 * we must SOFTUNLOCK the range we already did. (Maybe we
952 968 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
953 969 * right here...)
954 970 */
955 971 if (res != 0 && type == F_SOFTLOCK) {
956 972 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
957 973 if (addrsav >= seg->s_base + seg->s_size)
958 974 seg = AS_SEGNEXT(as, seg);
959 975 ASSERT(seg != NULL);
960 976 /*
961 977 * Now call the fault routine again to perform the
962 978 * unlock using S_OTHER instead of the rw variable
963 979 * since we never got a chance to touch the pages.
964 980 */
965 981 if (raddr > seg->s_base + seg->s_size)
966 982 ssize = seg->s_base + seg->s_size - addrsav;
967 983 else
968 984 ssize = raddr - addrsav;
969 985 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
970 986 F_SOFTUNLOCK, S_OTHER);
971 987 }
972 988 }
973 989 if (as_lock_held)
974 990 AS_LOCK_EXIT(as);
975 991 if (lwp != NULL)
976 992 lwp->lwp_nostop--;
977 993
978 994 /*
979 995 * If the lower levels returned EDEADLK for a fault,
980 996 * It means that we should retry the fault. Let's wait
981 997 * a bit also to let the deadlock causing condition clear.
982 998 * This is part of a gross hack to work around a design flaw
983 999 * in the ufs/sds logging code and should go away when the
984 1000 * logging code is re-designed to fix the problem. See bug
985 1001 * 4125102 for details of the problem.
986 1002 */
987 1003 if (FC_ERRNO(res) == EDEADLK) {
988 1004 delay(deadlk_wait);
989 1005 res = 0;
990 1006 goto retry;
991 1007 }
992 1008 return (res);
993 1009 }
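The throttle added above chooses between two delay mechanisms: sub-tick delays are busy-waited with drv_usecwait(), while anything of a tick or more sleeps via delay(drv_usectohz(...)), which rounds up to whole ticks. A small user-space model of that choice, assuming the default hz = 100 (one tick = 10000 usec); illustration only, not kernel code:

#include <stdio.h>

#define	HZ		100			/* assumed clock rate */
#define	TICK_USEC	(1000000 / HZ)		/* TICK_TO_USEC(1) at 100Hz */

int
main(void)
{
	unsigned long delays[] = { 500, 9999, 10000, 12500, 100000 };
	unsigned int i;

	for (i = 0; i < sizeof (delays) / sizeof (delays[0]); i++) {
		unsigned long us = delays[i];

		if (us < TICK_USEC) {
			/* sub-tick: as_fault() busy-waits (drv_usecwait) */
			printf("%lu usec -> busy-wait\n", us);
		} else {
			/* >= one tick: delay(drv_usectohz(us)), rounded up */
			unsigned long ticks = (us + TICK_USEC - 1) / TICK_USEC;
			printf("%lu usec -> sleep %lu tick(s)\n", us, ticks);
		}
	}
	return (0);
}

Busy-waiting for sub-tick delays avoids losing the whole-tick granularity of delay() for small throttles, at the cost of briefly spinning in the fault path.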
994 1010
995 1011
996 1012
997 1013 /*
998 1014 * Asynchronous ``fault'' at addr for size bytes.
999 1015 */
1000 1016 faultcode_t
1001 1017 as_faulta(struct as *as, caddr_t addr, size_t size)
1002 1018 {
1003 1019 struct seg *seg;
1004 1020 caddr_t raddr; /* rounded down addr */
1005 1021 size_t rsize; /* rounded up size */
1006 1022 faultcode_t res = 0;
1007 1023 klwp_t *lwp = ttolwp(curthread);
1008 1024
1009 1025 retry:
1010 1026 /*
1011 1027 * Indicate that the lwp is not to be stopped while waiting
1012 1028 * for a pagefault. This is to avoid deadlock while debugging
1013 1029 * a process via /proc over NFS (in particular).
1014 1030 */
1015 1031 if (lwp != NULL)
1016 1032 lwp->lwp_nostop++;
1017 1033
1018 1034 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1019 1035 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1020 1036 (size_t)raddr;
1021 1037
1022 1038 AS_LOCK_ENTER(as, RW_READER);
1023 1039 seg = as_segat(as, raddr);
1024 1040 if (seg == NULL) {
1025 1041 AS_LOCK_EXIT(as);
1026 1042 if (lwp != NULL)
1027 1043 lwp->lwp_nostop--;
1028 1044 return (FC_NOMAP);
1029 1045 }
1030 1046
1031 1047 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1032 1048 if (raddr >= seg->s_base + seg->s_size) {
1033 1049 seg = AS_SEGNEXT(as, seg);
1034 1050 if (seg == NULL || raddr != seg->s_base) {
1035 1051 res = FC_NOMAP;
1036 1052 break;
1037 1053 }
1038 1054 }
1039 1055 res = SEGOP_FAULTA(seg, raddr);
1040 1056 if (res != 0)
1041 1057 break;
1042 1058 }
1043 1059 AS_LOCK_EXIT(as);
1044 1060 if (lwp != NULL)
1045 1061 lwp->lwp_nostop--;
1046 1062 /*
1047 1063 * If the lower levels returned EDEADLK for a fault,
1048 1064 * It means that we should retry the fault. Let's wait
1049 1065 * a bit also to let the deadlock causing condition clear.
1050 1066 * This is part of a gross hack to work around a design flaw
1051 1067 * in the ufs/sds logging code and should go away when the
1052 1068 * logging code is re-designed to fix the problem. See bug
1053 1069 * 4125102 for details of the problem.
1054 1070 */
1055 1071 if (FC_ERRNO(res) == EDEADLK) {
1056 1072 delay(deadlk_wait);
1057 1073 res = 0;
1058 1074 goto retry;
1059 1075 }
1060 1076 return (res);
1061 1077 }
1062 1078
1063 1079 /*
1064 1080 * Set the virtual mapping for the interval from [addr : addr + size)
1065 1081 * in address space `as' to have the specified protection.
1066 1082 * It is ok for the range to cross over several segments,
1067 1083 * as long as they are contiguous.
1068 1084 */
1069 1085 int
1070 1086 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1071 1087 {
1072 1088 struct seg *seg;
1073 1089 struct as_callback *cb;
1074 1090 size_t ssize;
1075 1091 caddr_t raddr; /* rounded down addr */
1076 1092 size_t rsize; /* rounded up size */
1077 1093 int error = 0, writer = 0;
1078 1094 caddr_t saveraddr;
1079 1095 size_t saversize;
1080 1096
1081 1097 setprot_top:
1082 1098 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1083 1099 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1084 1100 (size_t)raddr;
1085 1101
1086 1102 if (raddr + rsize < raddr) /* check for wraparound */
1087 1103 return (ENOMEM);
1088 1104
1089 1105 saveraddr = raddr;
1090 1106 saversize = rsize;
1091 1107
1092 1108 /*
1093 1109 * Normally we only lock the as as a reader. But
1094 1110 * if due to setprot the segment driver needs to split
1095 1111 * a segment it will return IE_RETRY. Therefore we re-acquire
1096 1112 * the as lock as a writer so the segment driver can change
1097 1113 * the seg list. Also the segment driver will return IE_RETRY
1098 1114 * after it has changed the segment list so we therefore keep
1099 1115 * locking as a writer. Since these opeartions should be rare
1100 1116 * want to only lock as a writer when necessary.
1101 1117 */
1102 1118 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1103 1119 AS_LOCK_ENTER(as, RW_WRITER);
1104 1120 } else {
1105 1121 AS_LOCK_ENTER(as, RW_READER);
1106 1122 }
1107 1123
1108 1124 as_clearwatchprot(as, raddr, rsize);
1109 1125 seg = as_segat(as, raddr);
1110 1126 if (seg == NULL) {
1111 1127 as_setwatch(as);
1112 1128 AS_LOCK_EXIT(as);
1113 1129 return (ENOMEM);
1114 1130 }
1115 1131
1116 1132 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1117 1133 if (raddr >= seg->s_base + seg->s_size) {
1118 1134 seg = AS_SEGNEXT(as, seg);
1119 1135 if (seg == NULL || raddr != seg->s_base) {
1120 1136 error = ENOMEM;
1121 1137 break;
1122 1138 }
1123 1139 }
1124 1140 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1125 1141 ssize = seg->s_base + seg->s_size - raddr;
1126 1142 else
1127 1143 ssize = rsize;
1128 1144 retry:
1129 1145 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1130 1146
1131 1147 if (error == IE_NOMEM) {
1132 1148 error = EAGAIN;
1133 1149 break;
1134 1150 }
1135 1151
1136 1152 if (error == IE_RETRY) {
1137 1153 AS_LOCK_EXIT(as);
1138 1154 writer = 1;
1139 1155 goto setprot_top;
1140 1156 }
1141 1157
1142 1158 if (error == EAGAIN) {
1143 1159 /*
1144 1160 * Make sure we have a_lock as writer.
1145 1161 */
1146 1162 if (writer == 0) {
1147 1163 AS_LOCK_EXIT(as);
1148 1164 writer = 1;
1149 1165 goto setprot_top;
1150 1166 }
1151 1167
1152 1168 /*
1153 1169 * Memory is currently locked. It must be unlocked
1154 1170 * before this operation can succeed through a retry.
1155 1171 * The possible reasons for locked memory and
1156 1172 * corresponding strategies for unlocking are:
1157 1173 * (1) Normal I/O
1158 1174 * wait for a signal that the I/O operation
1159 1175 * has completed and the memory is unlocked.
1160 1176 * (2) Asynchronous I/O
1161 1177 * The aio subsystem does not unlock pages when
1162 1178 * the I/O is completed. Those pages are unlocked
1163 1179 * when the application calls aiowait/aioerror.
1164 1180 * So, to prevent blocking forever, cv_broadcast()
1165 1181 * is done to wake up aio_cleanup_thread.
1166 1182 * Subsequently, segvn_reclaim will be called, and
1167 1183 * that will do AS_CLRUNMAPWAIT() and wake us up.
1168 1184 * (3) Long term page locking:
1169 1185 * Drivers intending to have pages locked for a
1170 1186 * period considerably longer than for normal I/O
1171 1187 * (essentially forever) may have registered for a
1172 1188 * callback so they may unlock these pages on
1173 1189 * request. This is needed to allow this operation
1174 1190 * to succeed. Each entry on the callback list is
1175 1191 * examined. If the event or address range pertains
1176 1192 * the callback is invoked (unless it already is in
1177 1193 * progress). The a_contents lock must be dropped
1178 1194 * before the callback, so only one callback can
1179 1195 * be done at a time. Go to the top and do more
1180 1196 * until zero is returned. If zero is returned,
1181 1197 * either there were no callbacks for this event
1182 1198 * or they were already in progress.
1183 1199 */
1184 1200 mutex_enter(&as->a_contents);
1185 1201 if (as->a_callbacks &&
1186 1202 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1187 1203 seg->s_base, seg->s_size))) {
1188 1204 AS_LOCK_EXIT(as);
1189 1205 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1190 1206 } else if (!AS_ISNOUNMAPWAIT(as)) {
1191 1207 if (AS_ISUNMAPWAIT(as) == 0)
1192 1208 cv_broadcast(&as->a_cv);
1193 1209 AS_SETUNMAPWAIT(as);
1194 1210 AS_LOCK_EXIT(as);
1195 1211 while (AS_ISUNMAPWAIT(as))
1196 1212 cv_wait(&as->a_cv, &as->a_contents);
1197 1213 } else {
1198 1214 /*
1199 1215 * We may have raced with
1200 1216 * segvn_reclaim()/segspt_reclaim(). In this
1201 1217 * case clean nounmapwait flag and retry since
1202 1218 * softlockcnt in this segment may be already
1203 1219 * 0. We don't drop as writer lock so our
1204 1220 * number of retries without sleeping should
1205 1221 * be very small. See segvn_reclaim() for
1206 1222 * more comments.
1207 1223 */
1208 1224 AS_CLRNOUNMAPWAIT(as);
1209 1225 mutex_exit(&as->a_contents);
1210 1226 goto retry;
1211 1227 }
1212 1228 mutex_exit(&as->a_contents);
1213 1229 goto setprot_top;
1214 1230 } else if (error != 0)
1215 1231 break;
1216 1232 }
1217 1233 if (error != 0) {
1218 1234 as_setwatch(as);
1219 1235 } else {
1220 1236 as_setwatchprot(as, saveraddr, saversize, prot);
1221 1237 }
1222 1238 AS_LOCK_EXIT(as);
1223 1239 return (error);
1224 1240 }
1225 1241
1226 1242 /*
1227 1243 * Check to make sure that the interval [addr, addr + size)
1228 1244 * in address space `as' has at least the specified protection.
1229 1245 * It is ok for the range to cross over several segments, as long
1230 1246 * as they are contiguous.
1231 1247 */
1232 1248 int
1233 1249 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1234 1250 {
1235 1251 struct seg *seg;
1236 1252 size_t ssize;
1237 1253 caddr_t raddr; /* rounded down addr */
1238 1254 size_t rsize; /* rounded up size */
1239 1255 int error = 0;
1240 1256
1241 1257 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1242 1258 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1243 1259 (size_t)raddr;
1244 1260
1245 1261 if (raddr + rsize < raddr) /* check for wraparound */
1246 1262 return (ENOMEM);
1247 1263
1248 1264 /*
1249 1265 * This is ugly as sin...
1250 1266 * Normally, we only acquire the address space readers lock.
1251 1267 * However, if the address space has watchpoints present,
1252 1268 * we must acquire the writer lock on the address space for
1253 1269 * the benefit of as_clearwatchprot() and as_setwatchprot().
1254 1270 */
1255 1271 if (avl_numnodes(&as->a_wpage) != 0)
1256 1272 AS_LOCK_ENTER(as, RW_WRITER);
1257 1273 else
1258 1274 AS_LOCK_ENTER(as, RW_READER);
1259 1275 as_clearwatchprot(as, raddr, rsize);
1260 1276 seg = as_segat(as, raddr);
1261 1277 if (seg == NULL) {
1262 1278 as_setwatch(as);
1263 1279 AS_LOCK_EXIT(as);
1264 1280 return (ENOMEM);
1265 1281 }
1266 1282
1267 1283 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1268 1284 if (raddr >= seg->s_base + seg->s_size) {
1269 1285 seg = AS_SEGNEXT(as, seg);
1270 1286 if (seg == NULL || raddr != seg->s_base) {
1271 1287 error = ENOMEM;
1272 1288 break;
1273 1289 }
1274 1290 }
1275 1291 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1276 1292 ssize = seg->s_base + seg->s_size - raddr;
1277 1293 else
1278 1294 ssize = rsize;
1279 1295
1280 1296 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1281 1297 if (error != 0)
1282 1298 break;
1283 1299 }
1284 1300 as_setwatch(as);
1285 1301 AS_LOCK_EXIT(as);
1286 1302 return (error);
1287 1303 }
1288 1304
1289 1305 int
1290 1306 as_unmap(struct as *as, caddr_t addr, size_t size)
1291 1307 {
1292 1308 struct seg *seg, *seg_next;
1293 1309 struct as_callback *cb;
1294 1310 caddr_t raddr, eaddr;
1295 1311 size_t ssize, rsize = 0;
1296 1312 int err;
1297 1313
1298 1314 top:
1299 1315 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1300 1316 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1301 1317 (uintptr_t)PAGEMASK);
1302 1318
1303 1319 AS_LOCK_ENTER(as, RW_WRITER);
1304 1320
1305 1321 as->a_updatedir = 1; /* inform /proc */
1306 1322 gethrestime(&as->a_updatetime);
1307 1323
1308 1324 /*
1309 1325 * Use as_findseg to find the first segment in the range, then
1310 1326 * step through the segments in order, following s_next.
1311 1327 */
1312 1328 as_clearwatchprot(as, raddr, eaddr - raddr);
1313 1329
1314 1330 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1315 1331 if (eaddr <= seg->s_base)
1316 1332 break; /* eaddr was in a gap; all done */
1317 1333
1318 1334 /* this is implied by the test above */
1319 1335 ASSERT(raddr < eaddr);
1320 1336
1321 1337 if (raddr < seg->s_base)
1322 1338 raddr = seg->s_base; /* raddr was in a gap */
1323 1339
1324 1340 if (eaddr > (seg->s_base + seg->s_size))
1325 1341 ssize = seg->s_base + seg->s_size - raddr;
1326 1342 else
1327 1343 ssize = eaddr - raddr;
1328 1344
1329 1345 /*
1330 1346 * Save next segment pointer since seg can be
1331 1347 * destroyed during the segment unmap operation.
1332 1348 */
1333 1349 seg_next = AS_SEGNEXT(as, seg);
1334 1350
1335 1351 /*
1336 1352 * We didn't count /dev/null mappings, so ignore them here.
1337 1353 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1338 1354 * we have to do this check here while we have seg.)
1339 1355 */
1340 1356 rsize = 0;
1341 1357 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1342 1358 !SEG_IS_PARTIAL_RESV(seg))
1343 1359 rsize = ssize;
1344 1360
1345 1361 retry:
1346 1362 err = SEGOP_UNMAP(seg, raddr, ssize);
1347 1363 if (err == EAGAIN) {
1348 1364 /*
1349 1365 * Memory is currently locked. It must be unlocked
1350 1366 * before this operation can succeed through a retry.
1351 1367 * The possible reasons for locked memory and
1352 1368 * corresponding strategies for unlocking are:
1353 1369 * (1) Normal I/O
1354 1370 * wait for a signal that the I/O operation
1355 1371 * has completed and the memory is unlocked.
1356 1372 * (2) Asynchronous I/O
1357 1373 * The aio subsystem does not unlock pages when
1358 1374 * the I/O is completed. Those pages are unlocked
1359 1375 * when the application calls aiowait/aioerror.
1360 1376 * So, to prevent blocking forever, cv_broadcast()
1361 1377 * is done to wake up aio_cleanup_thread.
1362 1378 * Subsequently, segvn_reclaim will be called, and
1363 1379 * that will do AS_CLRUNMAPWAIT() and wake us up.
1364 1380 * (3) Long term page locking:
1365 1381 * Drivers intending to have pages locked for a
1366 1382 * period considerably longer than for normal I/O
1367 1383 * (essentially forever) may have registered for a
1368 1384 * callback so they may unlock these pages on
1369 1385 * request. This is needed to allow this operation
1370 1386 * to succeed. Each entry on the callback list is
1371 1387 * examined. If the event or address range pertains
1372 1388 * the callback is invoked (unless it already is in
1373 1389 * progress). The a_contents lock must be dropped
1374 1390 * before the callback, so only one callback can
1375 1391 * be done at a time. Go to the top and do more
1376 1392 * until zero is returned. If zero is returned,
1377 1393 * either there were no callbacks for this event
1378 1394 * or they were already in progress.
1379 1395 */
1380 1396 mutex_enter(&as->a_contents);
1381 1397 if (as->a_callbacks &&
1382 1398 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1383 1399 seg->s_base, seg->s_size))) {
1384 1400 AS_LOCK_EXIT(as);
1385 1401 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1386 1402 } else if (!AS_ISNOUNMAPWAIT(as)) {
1387 1403 if (AS_ISUNMAPWAIT(as) == 0)
1388 1404 cv_broadcast(&as->a_cv);
1389 1405 AS_SETUNMAPWAIT(as);
1390 1406 AS_LOCK_EXIT(as);
1391 1407 while (AS_ISUNMAPWAIT(as))
1392 1408 cv_wait(&as->a_cv, &as->a_contents);
1393 1409 } else {
1394 1410 /*
1395 1411 * We may have raced with
1396 1412 * segvn_reclaim()/segspt_reclaim(). In this
1397 1413 * case clean nounmapwait flag and retry since
1398 1414 * softlockcnt in this segment may be already
1399 1415 * 0. We don't drop as writer lock so our
1400 1416 * number of retries without sleeping should
1401 1417 * be very small. See segvn_reclaim() for
1402 1418 * more comments.
1403 1419 */
1404 1420 AS_CLRNOUNMAPWAIT(as);
1405 1421 mutex_exit(&as->a_contents);
1406 1422 goto retry;
1407 1423 }
1408 1424 mutex_exit(&as->a_contents);
1409 1425 goto top;
1410 1426 } else if (err == IE_RETRY) {
1411 1427 AS_LOCK_EXIT(as);
1412 1428 goto top;
1413 1429 } else if (err) {
1414 1430 as_setwatch(as);
1415 1431 AS_LOCK_EXIT(as);
1416 1432 return (-1);
1417 1433 }
1418 1434
1419 1435 as->a_size -= ssize;
1420 1436 if (rsize)
1421 1437 as->a_resvsize -= rsize;
1422 1438 raddr += ssize;
1423 1439 }
1424 1440 AS_LOCK_EXIT(as);
1425 1441 return (0);
1426 1442 }
1427 1443
1428 1444 static int
1429 1445 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1430 1446 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1431 1447 {
1432 1448 uint_t szc;
1433 1449 uint_t nszc;
1434 1450 int error;
1435 1451 caddr_t a;
1436 1452 caddr_t eaddr;
1437 1453 size_t segsize;
1438 1454 struct seg *seg;
1439 1455 size_t pgsz;
1440 1456 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1441 1457 uint_t save_szcvec;
1442 1458
1443 1459 ASSERT(AS_WRITE_HELD(as));
1444 1460 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1445 1461 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1446 1462 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1447 1463 if (!do_off) {
1448 1464 vn_a->offset = 0;
1449 1465 }
1450 1466
1451 1467 if (szcvec <= 1) {
1452 1468 seg = seg_alloc(as, addr, size);
1453 1469 if (seg == NULL) {
1454 1470 return (ENOMEM);
1455 1471 }
1456 1472 vn_a->szc = 0;
1457 1473 error = (*crfp)(seg, vn_a);
1458 1474 if (error != 0) {
1459 1475 seg_free(seg);
1460 1476 } else {
1461 1477 as->a_size += size;
1462 1478 as->a_resvsize += size;
1463 1479 }
1464 1480 return (error);
1465 1481 }
1466 1482
1467 1483 eaddr = addr + size;
1468 1484 save_szcvec = szcvec;
1469 1485 szcvec >>= 1;
1470 1486 szc = 0;
1471 1487 nszc = 0;
1472 1488 while (szcvec) {
1473 1489 if ((szcvec & 0x1) == 0) {
1474 1490 nszc++;
1475 1491 szcvec >>= 1;
1476 1492 continue;
1477 1493 }
1478 1494 nszc++;
1479 1495 pgsz = page_get_pagesize(nszc);
1480 1496 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1481 1497 if (a != addr) {
1482 1498 ASSERT(a < eaddr);
1483 1499 segsize = a - addr;
1484 1500 seg = seg_alloc(as, addr, segsize);
1485 1501 if (seg == NULL) {
1486 1502 return (ENOMEM);
1487 1503 }
1488 1504 vn_a->szc = szc;
1489 1505 error = (*crfp)(seg, vn_a);
1490 1506 if (error != 0) {
1491 1507 seg_free(seg);
1492 1508 return (error);
1493 1509 }
1494 1510 as->a_size += segsize;
1495 1511 as->a_resvsize += segsize;
1496 1512 *segcreated = 1;
1497 1513 if (do_off) {
1498 1514 vn_a->offset += segsize;
1499 1515 }
1500 1516 addr = a;
1501 1517 }
1502 1518 szc = nszc;
1503 1519 szcvec >>= 1;
1504 1520 }
1505 1521
1506 1522 ASSERT(addr < eaddr);
1507 1523 szcvec = save_szcvec | 1; /* add 8K pages */
1508 1524 while (szcvec) {
1509 1525 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1510 1526 ASSERT(a >= addr);
1511 1527 if (a != addr) {
1512 1528 segsize = a - addr;
1513 1529 seg = seg_alloc(as, addr, segsize);
1514 1530 if (seg == NULL) {
1515 1531 return (ENOMEM);
1516 1532 }
1517 1533 vn_a->szc = szc;
1518 1534 error = (*crfp)(seg, vn_a);
1519 1535 if (error != 0) {
1520 1536 seg_free(seg);
1521 1537 return (error);
1522 1538 }
1523 1539 as->a_size += segsize;
1524 1540 as->a_resvsize += segsize;
1525 1541 *segcreated = 1;
1526 1542 if (do_off) {
1527 1543 vn_a->offset += segsize;
1528 1544 }
1529 1545 addr = a;
1530 1546 }
1531 1547 szcvec &= ~(1 << szc);
1532 1548 if (szcvec) {
1533 1549 szc = highbit(szcvec) - 1;
1534 1550 pgsz = page_get_pagesize(szc);
1535 1551 }
1536 1552 }
1537 1553 ASSERT(addr == eaddr);
1538 1554
1539 1555 return (0);
1540 1556 }
1541 1557
1542 1558 static int
1543 1559 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1544 1560 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1545 1561 {
1546 1562 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1547 1563 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1548 1564 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1549 1565 type, 0);
1550 1566 int error;
1551 1567 struct seg *seg;
1552 1568 struct vattr va;
1553 1569 u_offset_t eoff;
1554 1570 size_t save_size = 0;
1555 1571 extern size_t textrepl_size_thresh;
1556 1572
1557 1573 ASSERT(AS_WRITE_HELD(as));
1558 1574 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1559 1575 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1560 1576 ASSERT(vn_a->vp != NULL);
1561 1577 ASSERT(vn_a->amp == NULL);
1562 1578
1563 1579 again:
1564 1580 if (szcvec <= 1) {
1565 1581 seg = seg_alloc(as, addr, size);
1566 1582 if (seg == NULL) {
1567 1583 return (ENOMEM);
1568 1584 }
1569 1585 vn_a->szc = 0;
1570 1586 error = (*crfp)(seg, vn_a);
1571 1587 if (error != 0) {
1572 1588 seg_free(seg);
1573 1589 } else {
1574 1590 as->a_size += size;
1575 1591 as->a_resvsize += size;
1576 1592 }
1577 1593 return (error);
1578 1594 }
1579 1595
1580 1596 va.va_mask = AT_SIZE;
1581 1597 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1582 1598 szcvec = 0;
1583 1599 goto again;
1584 1600 }
1585 1601 eoff = vn_a->offset & PAGEMASK;
1586 1602 if (eoff >= va.va_size) {
1587 1603 szcvec = 0;
1588 1604 goto again;
1589 1605 }
1590 1606 eoff += size;
1591 1607 if (btopr(va.va_size) < btopr(eoff)) {
1592 1608 save_size = size;
1593 1609 size = va.va_size - (vn_a->offset & PAGEMASK);
1594 1610 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1595 1611 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1596 1612 type, 0);
1597 1613 if (szcvec <= 1) {
1598 1614 size = save_size;
1599 1615 goto again;
1600 1616 }
1601 1617 }
1602 1618
1603 1619 if (size > textrepl_size_thresh) {
1604 1620 vn_a->flags |= _MAP_TEXTREPL;
1605 1621 }
1606 1622 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1607 1623 segcreated);
1608 1624 if (error != 0) {
1609 1625 return (error);
1610 1626 }
1611 1627 if (save_size) {
1612 1628 addr += size;
1613 1629 size = save_size - size;
1614 1630 szcvec = 0;
1615 1631 goto again;
1616 1632 }
1617 1633 return (0);
1618 1634 }
1619 1635
1620 1636 /*
1621 1637 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1622 1638 * passed to map_pgszvec cannot be MAP_INITDATA, for anon.
1623 1639 */
1624 1640 static int
1625 1641 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1626 1642 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1627 1643 {
1628 1644 uint_t szcvec;
1629 1645 uchar_t type;
1630 1646
1631 1647 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1632 1648 if (vn_a->type == MAP_SHARED) {
1633 1649 type = MAPPGSZC_SHM;
1634 1650 } else if (vn_a->type == MAP_PRIVATE) {
1635 1651 if (vn_a->szc == AS_MAP_HEAP) {
1636 1652 type = MAPPGSZC_HEAP;
1637 1653 } else if (vn_a->szc == AS_MAP_STACK) {
1638 1654 type = MAPPGSZC_STACK;
1639 1655 } else {
1640 1656 type = MAPPGSZC_PRIVM;
1641 1657 }
1642 1658 }
1643 1659 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1644 1660 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1645 1661 (vn_a->flags & MAP_TEXT), type, 0);
1646 1662 ASSERT(AS_WRITE_HELD(as));
1647 1663 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 1664 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 1665 ASSERT(vn_a->vp == NULL);
1650 1666
1651 1667 return (as_map_segvn_segs(as, addr, size, szcvec,
1652 1668 crfp, vn_a, segcreated));
1653 1669 }
1654 1670
1655 1671 int
1656 1672 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1657 1673 {
1658 1674 AS_LOCK_ENTER(as, RW_WRITER);
1659 1675 return (as_map_locked(as, addr, size, crfp, argsp));
1660 1676 }
1661 1677
1662 1678 int
1663 1679 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1664 1680 void *argsp)
1665 1681 {
1666 1682 struct seg *seg = NULL;
1667 1683 caddr_t raddr; /* rounded down addr */
1668 1684 size_t rsize; /* rounded up size */
1669 1685 int error;
1670 1686 int unmap = 0;
1671 1687 /*
1672 1688 * The use of a_proc is preferred to handle the case where curproc is
1673 1689 * a door_call server and is allocating memory in the client's (a_proc)
1674 1690 * address space.
1675 1691 * When creating a shared memory segment a_proc will be NULL so we
1676 1692	 * When creating a shared memory segment, a_proc will be NULL, so we
1677 1693	 * fall back to curproc in that case.
1678 1694 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1679 1695 struct segvn_crargs crargs;
1680 1696
1681 1697 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1682 1698 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1683 1699 (size_t)raddr;
1684 1700
1685 1701 /*
1686 1702 * check for wrap around
1687 1703 */
1688 1704 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1689 1705 AS_LOCK_EXIT(as);
1690 1706 return (ENOMEM);
1691 1707 }
1692 1708
1693 1709 as->a_updatedir = 1; /* inform /proc */
1694 1710 gethrestime(&as->a_updatetime);
1695 1711
1696 1712 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1697 1713 AS_LOCK_EXIT(as);
1698 1714
1699 1715 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1700 1716 RCA_UNSAFE_ALL);
1701 1717
1702 1718 return (ENOMEM);
1703 1719 }
1704 1720
1705 1721 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1706 1722 crargs = *(struct segvn_crargs *)argsp;
1707 1723 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1708 1724 if (error != 0) {
1709 1725 AS_LOCK_EXIT(as);
1710 1726 if (unmap) {
1711 1727 (void) as_unmap(as, addr, size);
1712 1728 }
1713 1729 return (error);
1714 1730 }
1715 1731 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1716 1732 crargs = *(struct segvn_crargs *)argsp;
1717 1733 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1718 1734 if (error != 0) {
1719 1735 AS_LOCK_EXIT(as);
1720 1736 if (unmap) {
1721 1737 (void) as_unmap(as, addr, size);
1722 1738 }
1723 1739 return (error);
1724 1740 }
1725 1741 } else {
1726 1742 seg = seg_alloc(as, addr, size);
1727 1743 if (seg == NULL) {
1728 1744 AS_LOCK_EXIT(as);
1729 1745 return (ENOMEM);
1730 1746 }
1731 1747
1732 1748 error = (*crfp)(seg, argsp);
1733 1749 if (error != 0) {
1734 1750 seg_free(seg);
1735 1751 AS_LOCK_EXIT(as);
1736 1752 return (error);
1737 1753 }
1738 1754 /*
1739 1755 * Add size now so as_unmap will work if as_ctl fails.
1740 1756 */
1741 1757 as->a_size += rsize;
1742 1758 as->a_resvsize += rsize;
1743 1759 }
1744 1760
1745 1761 as_setwatch(as);
1746 1762
1747 1763 /*
1748 1764 * If the address space is locked,
1749 1765 * establish memory locks for the new segment.
1750 1766 */
1751 1767 mutex_enter(&as->a_contents);
1752 1768 if (AS_ISPGLCK(as)) {
1753 1769 mutex_exit(&as->a_contents);
1754 1770 AS_LOCK_EXIT(as);
1755 1771 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1756 1772 if (error != 0)
1757 1773 (void) as_unmap(as, addr, size);
1758 1774 } else {
1759 1775 mutex_exit(&as->a_contents);
1760 1776 AS_LOCK_EXIT(as);
1761 1777 }
1762 1778 return (error);
1763 1779 }
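/*
 * Illustrative sketch only, not part of this change: a minimal caller of
 * as_map().  The segment-create routine my_segcreate() and its argument
 * block my_crargs are hypothetical stand-ins.  The point is the locking
 * contract shown above: as_map() enters the AS writer lock itself and
 * as_map_locked() drops it on every return path, so the caller only has
 * to look at the errno.
 */
#if 0	/* example only, not compiled */
static int
example_as_map(struct as *as, caddr_t addr, size_t len)
{
	int err;

	err = as_map(as, addr, len, my_segcreate, &my_crargs);
	if (err != 0) {
		/* e.g. ENOMEM: no room in the hole, or RLIMIT_VMEM hit */
		return (err);
	}
	return (0);
}
#endif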
1764 1780
1765 1781
1766 1782 /*
1767 1783 * Delete all segments in the address space marked with S_PURGE.
1768 1784 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1769 1785 * These segments are deleted as a first step before calls to as_gap(), so
1770 1786 * that they don't affect mmap() or shmat().
1771 1787 */
1772 1788 void
1773 1789 as_purge(struct as *as)
1774 1790 {
1775 1791 struct seg *seg;
1776 1792 struct seg *next_seg;
1777 1793
1778 1794 /*
1779 1795	 * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1780 1796	 * there is no need to grab the a_contents mutex for this check.
1781 1797 */
1782 1798 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1783 1799 return;
1784 1800
1785 1801 AS_LOCK_ENTER(as, RW_WRITER);
1786 1802 next_seg = NULL;
1787 1803 seg = AS_SEGFIRST(as);
1788 1804 while (seg != NULL) {
1789 1805 next_seg = AS_SEGNEXT(as, seg);
1790 1806 if (seg->s_flags & S_PURGE)
1791 1807 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1792 1808 seg = next_seg;
1793 1809 }
1794 1810 AS_LOCK_EXIT(as);
1795 1811
1796 1812 mutex_enter(&as->a_contents);
1797 1813 as->a_flags &= ~AS_NEEDSPURGE;
1798 1814 mutex_exit(&as->a_contents);
1799 1815 }
1800 1816
1801 1817 /*
1802 1818 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1803 1819 * range of addresses at least "minlen" long, where the base of the range is
1804 1820 * at "off" phase from an "align" boundary and there is space for a
1805 1821	 * "redzone"-sized redzone on either side of the range.  Thus,
1806 1822 * if align was 4M and off was 16k, the user wants a hole which will start
1807 1823 * 16k into a 4M page.
1808 1824 *
1809 1825 * If flags specifies AH_HI, the hole will have the highest possible address
1810 1826 * in the range. We use the as->a_lastgap field to figure out where to
1811 1827 * start looking for a gap.
1812 1828 *
1813 1829 * Otherwise, the gap will have the lowest possible address.
1814 1830 *
1815 1831 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1816 1832 *
1817 1833 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1818 1834 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1819 1835 *
1820 1836 * NOTE: This routine is not correct when base+len overflows caddr_t.
1821 1837 */
1822 1838 int
1823 1839 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1824 1840 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1825 1841 {
1826 1842 caddr_t lobound = *basep;
1827 1843 caddr_t hibound = lobound + *lenp;
1828 1844 struct seg *lseg, *hseg;
1829 1845 caddr_t lo, hi;
1830 1846 int forward;
1831 1847 caddr_t save_base;
1832 1848 size_t save_len;
1833 1849 size_t save_minlen;
1834 1850 size_t save_redzone;
1835 1851 int fast_path = 1;
1836 1852
1837 1853 save_base = *basep;
1838 1854 save_len = *lenp;
1839 1855 save_minlen = minlen;
1840 1856 save_redzone = redzone;
1841 1857
1842 1858 /*
1843 1859 * For the first pass/fast_path, just add align and redzone into
1844 1860 * minlen since if we get an allocation, we can guarantee that it
1845 1861 * will fit the alignment and redzone requested.
1846 1862 * This increases the chance that hibound will be adjusted to
1847 1863 * a_lastgap->s_base which will likely allow us to find an
1848 1864 * acceptable hole in the address space quicker.
1849 1865 * If we can't find a hole with this fast_path, then we look for
1850 1866 * smaller holes in which the alignment and offset may allow
1851 1867 * the allocation to fit.
1852 1868 */
1853 1869 minlen += align;
1854 1870 minlen += 2 * redzone;
1855 1871 redzone = 0;
1856 1872
1857 1873 AS_LOCK_ENTER(as, RW_READER);
1858 1874 if (AS_SEGFIRST(as) == NULL) {
1859 1875 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1860 1876 align, redzone, off)) {
1861 1877 AS_LOCK_EXIT(as);
1862 1878 return (0);
1863 1879 } else {
1864 1880 AS_LOCK_EXIT(as);
1865 1881 *basep = save_base;
1866 1882 *lenp = save_len;
1867 1883 return (-1);
1868 1884 }
1869 1885 }
1870 1886
1871 1887 retry:
1872 1888 /*
1873 1889 * Set up to iterate over all the inter-segment holes in the given
1874 1890 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1875 1891 * NULL for the highest-addressed hole. If moving backwards, we reset
1876 1892	 * hseg to denote the highest-addressed segment.
1877 1893 */
1878 1894 forward = (flags & AH_DIR) == AH_LO;
1879 1895 if (forward) {
1880 1896 hseg = as_findseg(as, lobound, 1);
1881 1897 lseg = AS_SEGPREV(as, hseg);
1882 1898 } else {
1883 1899
1884 1900 /*
1885 1901 * If allocating at least as much as the last allocation,
1886 1902 * use a_lastgap's base as a better estimate of hibound.
1887 1903 */
1888 1904 if (as->a_lastgap &&
1889 1905 minlen >= as->a_lastgap->s_size &&
1890 1906 hibound >= as->a_lastgap->s_base)
1891 1907 hibound = as->a_lastgap->s_base;
1892 1908
1893 1909 hseg = as_findseg(as, hibound, 1);
1894 1910 if (hseg->s_base + hseg->s_size < hibound) {
1895 1911 lseg = hseg;
1896 1912 hseg = NULL;
1897 1913 } else {
1898 1914 lseg = AS_SEGPREV(as, hseg);
1899 1915 }
1900 1916 }
1901 1917
1902 1918 for (;;) {
1903 1919 /*
1904 1920 * Set lo and hi to the hole's boundaries. (We should really
1905 1921 * use MAXADDR in place of hibound in the expression below,
1906 1922 * but can't express it easily; using hibound in its place is
1907 1923 * harmless.)
1908 1924 */
1909 1925 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1910 1926 hi = (hseg == NULL) ? hibound : hseg->s_base;
1911 1927 /*
1912 1928 * If the iteration has moved past the interval from lobound
1913 1929 * to hibound it's pointless to continue.
1914 1930 */
1915 1931 if ((forward && lo > hibound) || (!forward && hi < lobound))
1916 1932 break;
1917 1933 else if (lo > hibound || hi < lobound)
1918 1934 goto cont;
1919 1935 /*
1920 1936 * Candidate hole lies at least partially within the allowable
1921 1937 * range. Restrict it to fall completely within that range,
1922 1938 * i.e., to [max(lo, lobound), min(hi, hibound)].
1923 1939 */
1924 1940 if (lo < lobound)
1925 1941 lo = lobound;
1926 1942 if (hi > hibound)
1927 1943 hi = hibound;
1928 1944 /*
1929 1945 * Verify that the candidate hole is big enough and meets
1930 1946 * hardware constraints. If the hole is too small, no need
1931 1947 * to do the further checks since they will fail.
1932 1948 */
1933 1949 *basep = lo;
1934 1950 *lenp = hi - lo;
1935 1951 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1936 1952 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1937 1953 ((flags & AH_CONTAIN) == 0 ||
1938 1954 (*basep <= addr && *basep + *lenp > addr))) {
1939 1955 if (!forward)
1940 1956 as->a_lastgap = hseg;
1941 1957 if (hseg != NULL)
1942 1958 as->a_lastgaphl = hseg;
1943 1959 else
1944 1960 as->a_lastgaphl = lseg;
1945 1961 AS_LOCK_EXIT(as);
1946 1962 return (0);
1947 1963 }
1948 1964 cont:
1949 1965 /*
1950 1966 * Move to the next hole.
1951 1967 */
1952 1968 if (forward) {
1953 1969 lseg = hseg;
1954 1970 if (lseg == NULL)
1955 1971 break;
1956 1972 hseg = AS_SEGNEXT(as, hseg);
1957 1973 } else {
1958 1974 hseg = lseg;
1959 1975 if (hseg == NULL)
1960 1976 break;
1961 1977 lseg = AS_SEGPREV(as, lseg);
1962 1978 }
1963 1979 }
1964 1980 if (fast_path && (align != 0 || save_redzone != 0)) {
1965 1981 fast_path = 0;
1966 1982 minlen = save_minlen;
1967 1983 redzone = save_redzone;
1968 1984 goto retry;
1969 1985 }
1970 1986 *basep = save_base;
1971 1987 *lenp = save_len;
1972 1988 AS_LOCK_EXIT(as);
1973 1989 return (-1);
1974 1990 }
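/*
 * Illustrative sketch only, not part of this change: a worked example of the
 * align/off/redzone semantics described in the block comment above, wrapped
 * in a hypothetical helper.  The 4M/16K/one-page values and the lo_addr/
 * hi_addr search bounds are examples, not requirements.
 */
#if 0	/* example only, not compiled */
static int
example_find_aligned_hole(struct as *as, caddr_t lo_addr, caddr_t hi_addr)
{
	caddr_t base = lo_addr;
	size_t len = (size_t)(hi_addr - lo_addr);
	size_t minlen = 8 * PAGESIZE;

	if (as_gap_aligned(as, minlen, &base, &len, AH_LO, NULL,
	    4 * 1024 * 1024, PAGESIZE, 16 * 1024) != 0)
		return (-1);
	/*
	 * [base, base + len) is a hole in which a minlen allocation can be
	 * placed 16K into a 4M boundary with a one-page redzone on each side.
	 */
	return (0);
}
#endif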
1975 1991
1976 1992 /*
1977 1993 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1978 1994 *
1979 1995 * If flags specifies AH_HI, the hole will have the highest possible address
1980 1996 * in the range. We use the as->a_lastgap field to figure out where to
1981 1997 * start looking for a gap.
1982 1998 *
1983 1999 * Otherwise, the gap will have the lowest possible address.
1984 2000 *
1985 2001 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1986 2002 *
1987 2003 * If an adequate hole is found, base and len are set to reflect the part of
1988 2004 * the hole that is within range, and 0 is returned, otherwise,
1989 2005 * -1 is returned.
1990 2006 *
1991 2007 * NOTE: This routine is not correct when base+len overflows caddr_t.
1992 2008 */
1993 2009 int
1994 2010 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1995 2011 caddr_t addr)
1996 2012 {
1997 2013
1998 2014 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1999 2015 }
2000 2016
2001 2017 /*
2002 2018 * Return the next range within [base, base + len) that is backed
2003 2019 * with "real memory". Skip holes and non-seg_vn segments.
2004 2020 * We're lazy and only return one segment at a time.
2005 2021 */
2006 2022 int
2007 2023 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2008 2024 {
2009 2025 extern struct seg_ops segspt_shmops; /* needs a header file */
2010 2026 struct seg *seg;
2011 2027 caddr_t addr, eaddr;
2012 2028 caddr_t segend;
2013 2029
2014 2030 AS_LOCK_ENTER(as, RW_READER);
2015 2031
2016 2032 addr = *basep;
2017 2033 eaddr = addr + *lenp;
2018 2034
2019 2035 seg = as_findseg(as, addr, 0);
2020 2036 if (seg != NULL)
2021 2037 addr = MAX(seg->s_base, addr);
2022 2038
2023 2039 for (;;) {
2024 2040 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2025 2041 AS_LOCK_EXIT(as);
2026 2042 return (EINVAL);
2027 2043 }
2028 2044
2029 2045 if (seg->s_ops == &segvn_ops) {
2030 2046 segend = seg->s_base + seg->s_size;
2031 2047 break;
2032 2048 }
2033 2049
2034 2050 /*
2035 2051 * We do ISM by looking into the private data
2036 2052 * to determine the real size of the segment.
2037 2053 */
2038 2054 if (seg->s_ops == &segspt_shmops) {
2039 2055 segend = seg->s_base + spt_realsize(seg);
2040 2056 if (addr < segend)
2041 2057 break;
2042 2058 }
2043 2059
2044 2060 seg = AS_SEGNEXT(as, seg);
2045 2061
2046 2062 if (seg != NULL)
2047 2063 addr = seg->s_base;
2048 2064 }
2049 2065
2050 2066 *basep = addr;
2051 2067
2052 2068 if (segend > eaddr)
2053 2069 *lenp = eaddr - addr;
2054 2070 else
2055 2071 *lenp = segend - addr;
2056 2072
2057 2073 AS_LOCK_EXIT(as);
2058 2074 return (0);
2059 2075 }
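/*
 * Illustrative sketch only, not part of this change: because as_memory()
 * returns at most one backed range per call, a caller that wants every
 * "real memory" range in a region loops, advancing the search window past
 * each range it gets back.  example_walk_memory() is a hypothetical helper.
 */
#if 0	/* example only, not compiled */
static void
example_walk_memory(struct as *as, caddr_t addr, size_t size)
{
	caddr_t base = addr;
	size_t len = size;
	caddr_t end = addr + size;

	while (base < end && as_memory(as, &base, &len) == 0) {
		/* [base, base + len) is backed by real memory */
		base += len;
		len = end - base;
	}
}
#endif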
2060 2076
2061 2077 /*
2062 2078 * Swap the pages associated with the address space as out to
2063 2079 * secondary storage, returning the number of bytes actually
2064 2080 * swapped.
2065 2081 *
2066 2082 * The value returned is intended to correlate well with the process's
2067 2083 * memory requirements. Its usefulness for this purpose depends on
2068 2084 * how well the segment-level routines do at returning accurate
2069 2085 * information.
2070 2086 */
2071 2087 size_t
2072 2088 as_swapout(struct as *as)
2073 2089 {
2074 2090 struct seg *seg;
2075 2091 size_t swpcnt = 0;
2076 2092
2077 2093 /*
2078 2094 * Kernel-only processes have given up their address
2079 2095 * spaces. Of course, we shouldn't be attempting to
2080 2096 * swap out such processes in the first place...
2081 2097 */
2082 2098 if (as == NULL)
2083 2099 return (0);
2084 2100
2085 2101 AS_LOCK_ENTER(as, RW_READER);
2086 2102
2087 2103 /*
2088 2104 * Free all mapping resources associated with the address
2089 2105 * space. The segment-level swapout routines capitalize
2090 2106 * on this unmapping by scavanging pages that have become
2091 2107	 * on this unmapping by scavenging pages that have become
2092 2108 */
2093 2109 hat_swapout(as->a_hat);
2094 2110
2095 2111 /*
2096 2112 * Call the swapout routines of all segments in the address
2097 2113 * space to do the actual work, accumulating the amount of
2098 2114 * space reclaimed.
2099 2115 */
2100 2116 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2101 2117 struct seg_ops *ov = seg->s_ops;
2102 2118
2103 2119 /*
2104 2120 * We have to check to see if the seg has
2105 2121 * an ops vector because the seg may have
2106 2122 * been in the middle of being set up when
2107 2123 * the process was picked for swapout.
2108 2124 */
2109 2125 if ((ov != NULL) && (ov->swapout != NULL))
2110 2126 swpcnt += SEGOP_SWAPOUT(seg);
2111 2127 }
2112 2128 AS_LOCK_EXIT(as);
2113 2129 return (swpcnt);
2114 2130 }
2115 2131
2116 2132 /*
2117 2133 * Determine whether data from the mappings in interval [addr, addr + size)
2118 2134 * are in the primary memory (core) cache.
2119 2135 */
2120 2136 int
2121 2137 as_incore(struct as *as, caddr_t addr,
2122 2138 size_t size, char *vec, size_t *sizep)
2123 2139 {
2124 2140 struct seg *seg;
2125 2141 size_t ssize;
2126 2142 caddr_t raddr; /* rounded down addr */
2127 2143 size_t rsize; /* rounded up size */
2128 2144 size_t isize; /* iteration size */
2129 2145 int error = 0; /* result, assume success */
2130 2146
2131 2147 *sizep = 0;
2132 2148 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2133 2149 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2134 2150 (size_t)raddr;
2135 2151
2136 2152 if (raddr + rsize < raddr) /* check for wraparound */
2137 2153 return (ENOMEM);
2138 2154
2139 2155 AS_LOCK_ENTER(as, RW_READER);
2140 2156 seg = as_segat(as, raddr);
2141 2157 if (seg == NULL) {
2142 2158 AS_LOCK_EXIT(as);
2143 2159 return (-1);
2144 2160 }
2145 2161
2146 2162 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2147 2163 if (raddr >= seg->s_base + seg->s_size) {
2148 2164 seg = AS_SEGNEXT(as, seg);
2149 2165 if (seg == NULL || raddr != seg->s_base) {
2150 2166 error = -1;
2151 2167 break;
2152 2168 }
2153 2169 }
2154 2170 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2155 2171 ssize = seg->s_base + seg->s_size - raddr;
2156 2172 else
2157 2173 ssize = rsize;
2158 2174 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2159 2175 if (isize != ssize) {
2160 2176 error = -1;
2161 2177 break;
2162 2178 }
2163 2179 vec += btopr(ssize);
2164 2180 }
2165 2181 AS_LOCK_EXIT(as);
2166 2182 return (error);
2167 2183 }
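/*
 * Illustrative sketch only, not part of this change: the "vec" argument is
 * consumed one byte per page (note the vec += btopr(ssize) above), so a
 * caller sizes it from the page-rounded length of the region.  The helper
 * below is hypothetical.
 */
#if 0	/* example only, not compiled */
static int
example_incore(struct as *as, caddr_t addr, size_t size)
{
	size_t pages = btopr(size);
	char *vec = kmem_zalloc(pages, KM_SLEEP);
	size_t incore_bytes;
	int err;

	err = as_incore(as, addr, size, vec, &incore_bytes);
	/* on success, vec[i] describes page i of the (page-rounded) range */
	kmem_free(vec, pages);
	return (err);
}
#endif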
2168 2184
2169 2185 static void
2170 2186 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2171 2187 ulong_t *bitmap, size_t position, size_t npages)
2172 2188 {
2173 2189 caddr_t range_start;
2174 2190 size_t pos1 = position;
2175 2191 size_t pos2;
2176 2192 size_t size;
2177 2193 size_t end_pos = npages + position;
2178 2194
2179 2195 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2180 2196 size = ptob((pos2 - pos1));
2181 2197 range_start = (caddr_t)((uintptr_t)addr +
2182 2198 ptob(pos1 - position));
2183 2199
2184 2200 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2185 2201 (ulong_t *)NULL, (size_t)NULL);
2186 2202 pos1 = pos2;
2187 2203 }
2188 2204 }
2189 2205
2190 2206 static void
2191 2207 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2192 2208 caddr_t raddr, size_t rsize)
2193 2209 {
2194 2210 struct seg *seg = as_segat(as, raddr);
2195 2211 size_t ssize;
2196 2212
2197 2213 while (rsize != 0) {
2198 2214 if (raddr >= seg->s_base + seg->s_size)
2199 2215 seg = AS_SEGNEXT(as, seg);
2200 2216
2201 2217 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2202 2218 ssize = seg->s_base + seg->s_size - raddr;
2203 2219 else
2204 2220 ssize = rsize;
2205 2221
2206 2222 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2207 2223
2208 2224 rsize -= ssize;
2209 2225 raddr += ssize;
2210 2226 }
2211 2227 }
2212 2228
2213 2229 /*
2214 2230 * Cache control operations over the interval [addr, addr + size) in
2215 2231 * address space "as".
2216 2232 */
2217 2233 /*ARGSUSED*/
2218 2234 int
2219 2235 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2220 2236 uintptr_t arg, ulong_t *lock_map, size_t pos)
2221 2237 {
2222 2238 struct seg *seg; /* working segment */
2223 2239 caddr_t raddr; /* rounded down addr */
2224 2240 caddr_t initraddr; /* saved initial rounded down addr */
2225 2241 size_t rsize; /* rounded up size */
2226 2242 size_t initrsize; /* saved initial rounded up size */
2227 2243 size_t ssize; /* size of seg */
2228 2244 int error = 0; /* result */
2229 2245 size_t mlock_size; /* size of bitmap */
2230 2246 ulong_t *mlock_map; /* pointer to bitmap used */
2231 2247 /* to represent the locked */
2232 2248 /* pages. */
2233 2249 retry:
2234 2250 if (error == IE_RETRY)
2235 2251 AS_LOCK_ENTER(as, RW_WRITER);
2236 2252 else
2237 2253 AS_LOCK_ENTER(as, RW_READER);
2238 2254
2239 2255 /*
2240 2256 * If these are address space lock/unlock operations, loop over
2241 2257 * all segments in the address space, as appropriate.
2242 2258 */
2243 2259 if (func == MC_LOCKAS) {
2244 2260 size_t npages, idx;
2245 2261 size_t rlen = 0; /* rounded as length */
2246 2262
2247 2263 idx = pos;
2248 2264
2249 2265 if (arg & MCL_FUTURE) {
2250 2266 mutex_enter(&as->a_contents);
2251 2267 AS_SETPGLCK(as);
2252 2268 mutex_exit(&as->a_contents);
2253 2269 }
2254 2270 if ((arg & MCL_CURRENT) == 0) {
2255 2271 AS_LOCK_EXIT(as);
2256 2272 return (0);
2257 2273 }
2258 2274
2259 2275 seg = AS_SEGFIRST(as);
2260 2276 if (seg == NULL) {
2261 2277 AS_LOCK_EXIT(as);
2262 2278 return (0);
2263 2279 }
2264 2280
2265 2281 do {
2266 2282 raddr = (caddr_t)((uintptr_t)seg->s_base &
2267 2283 (uintptr_t)PAGEMASK);
2268 2284 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2269 2285 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2270 2286 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2271 2287
2272 2288 mlock_size = BT_BITOUL(btopr(rlen));
2273 2289 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2274 2290 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2275 2291 AS_LOCK_EXIT(as);
2276 2292 return (EAGAIN);
2277 2293 }
2278 2294
2279 2295 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2280 2296 error = SEGOP_LOCKOP(seg, seg->s_base,
2281 2297 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2282 2298 if (error != 0)
2283 2299 break;
2284 2300 pos += seg_pages(seg);
2285 2301 }
2286 2302
2287 2303 if (error) {
2288 2304 for (seg = AS_SEGFIRST(as); seg != NULL;
2289 2305 seg = AS_SEGNEXT(as, seg)) {
2290 2306
2291 2307 raddr = (caddr_t)((uintptr_t)seg->s_base &
2292 2308 (uintptr_t)PAGEMASK);
2293 2309 npages = seg_pages(seg);
2294 2310 as_segunlock(seg, raddr, attr, mlock_map,
2295 2311 idx, npages);
2296 2312 idx += npages;
2297 2313 }
2298 2314 }
2299 2315
2300 2316 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2301 2317 AS_LOCK_EXIT(as);
2302 2318 goto lockerr;
2303 2319 } else if (func == MC_UNLOCKAS) {
2304 2320 mutex_enter(&as->a_contents);
2305 2321 AS_CLRPGLCK(as);
2306 2322 mutex_exit(&as->a_contents);
2307 2323
2308 2324 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2309 2325 error = SEGOP_LOCKOP(seg, seg->s_base,
2310 2326 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2311 2327 if (error != 0)
2312 2328 break;
2313 2329 }
2314 2330
2315 2331 AS_LOCK_EXIT(as);
2316 2332 goto lockerr;
2317 2333 }
2318 2334
2319 2335 /*
2320 2336 * Normalize addresses and sizes.
2321 2337 */
2322 2338 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2323 2339 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2324 2340 (size_t)raddr;
2325 2341
2326 2342 if (raddr + rsize < raddr) { /* check for wraparound */
2327 2343 AS_LOCK_EXIT(as);
2328 2344 return (ENOMEM);
2329 2345 }
2330 2346
2331 2347 /*
2332 2348 * Get initial segment.
2333 2349 */
2334 2350 if ((seg = as_segat(as, raddr)) == NULL) {
2335 2351 AS_LOCK_EXIT(as);
2336 2352 return (ENOMEM);
2337 2353 }
2338 2354
2339 2355 if (func == MC_LOCK) {
2340 2356 mlock_size = BT_BITOUL(btopr(rsize));
2341 2357 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2342 2358 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2343 2359 AS_LOCK_EXIT(as);
2344 2360 return (EAGAIN);
2345 2361 }
2346 2362 }
2347 2363
2348 2364 /*
2349 2365 * Loop over all segments. If a hole in the address range is
2350 2366 * discovered, then fail. For each segment, perform the appropriate
2351 2367 * control operation.
2352 2368 */
2353 2369 while (rsize != 0) {
2354 2370
2355 2371 /*
2356 2372 * Make sure there's no hole, calculate the portion
2357 2373 * of the next segment to be operated over.
2358 2374 */
2359 2375 if (raddr >= seg->s_base + seg->s_size) {
2360 2376 seg = AS_SEGNEXT(as, seg);
2361 2377 if (seg == NULL || raddr != seg->s_base) {
2362 2378 if (func == MC_LOCK) {
2363 2379 as_unlockerr(as, attr, mlock_map,
2364 2380 initraddr, initrsize - rsize);
2365 2381 kmem_free(mlock_map,
2366 2382 mlock_size * sizeof (ulong_t));
2367 2383 }
2368 2384 AS_LOCK_EXIT(as);
2369 2385 return (ENOMEM);
2370 2386 }
2371 2387 }
2372 2388 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2373 2389 ssize = seg->s_base + seg->s_size - raddr;
2374 2390 else
2375 2391 ssize = rsize;
2376 2392
2377 2393 /*
2378 2394 * Dispatch on specific function.
2379 2395 */
2380 2396 switch (func) {
2381 2397
2382 2398 /*
2383 2399 * Synchronize cached data from mappings with backing
2384 2400 * objects.
2385 2401 */
2386 2402 case MC_SYNC:
2387 2403 if (error = SEGOP_SYNC(seg, raddr, ssize,
2388 2404 attr, (uint_t)arg)) {
2389 2405 AS_LOCK_EXIT(as);
2390 2406 return (error);
2391 2407 }
2392 2408 break;
2393 2409
2394 2410 /*
2395 2411 * Lock pages in memory.
2396 2412 */
2397 2413 case MC_LOCK:
2398 2414 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2399 2415 attr, func, mlock_map, pos)) {
2400 2416 as_unlockerr(as, attr, mlock_map, initraddr,
2401 2417 initrsize - rsize + ssize);
2402 2418 kmem_free(mlock_map, mlock_size *
2403 2419 sizeof (ulong_t));
2404 2420 AS_LOCK_EXIT(as);
2405 2421 goto lockerr;
2406 2422 }
2407 2423 break;
2408 2424
2409 2425 /*
2410 2426 * Unlock mapped pages.
2411 2427 */
2412 2428 case MC_UNLOCK:
2413 2429 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2414 2430 (ulong_t *)NULL, (size_t)NULL);
2415 2431 break;
2416 2432
2417 2433 /*
2418 2434 * Store VM advise for mapped pages in segment layer.
2419 2435 */
2420 2436 case MC_ADVISE:
2421 2437 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2422 2438
2423 2439 /*
2424 2440 * Check for regular errors and special retry error
2425 2441 */
2426 2442 if (error) {
2427 2443 if (error == IE_RETRY) {
2428 2444 /*
2429 2445 * Need to acquire writers lock, so
2430 2446 * have to drop readers lock and start
2431 2447 * all over again
2432 2448 */
2433 2449 AS_LOCK_EXIT(as);
2434 2450 goto retry;
2435 2451 } else if (error == IE_REATTACH) {
2436 2452 /*
2437 2453 * Find segment for current address
2438 2454 * because current segment just got
2439 2455 * split or concatenated
2440 2456 */
2441 2457 seg = as_segat(as, raddr);
2442 2458 if (seg == NULL) {
2443 2459 AS_LOCK_EXIT(as);
2444 2460 return (ENOMEM);
2445 2461 }
2446 2462 } else {
2447 2463 /*
2448 2464 * Regular error
2449 2465 */
2450 2466 AS_LOCK_EXIT(as);
2451 2467 return (error);
2452 2468 }
2453 2469 }
2454 2470 break;
2455 2471
2456 2472 case MC_INHERIT_ZERO:
2457 2473 if (seg->s_ops->inherit == NULL) {
2458 2474 error = ENOTSUP;
2459 2475 } else {
2460 2476 error = SEGOP_INHERIT(seg, raddr, ssize,
2461 2477 SEGP_INH_ZERO);
2462 2478 }
2463 2479 if (error != 0) {
2464 2480 AS_LOCK_EXIT(as);
2465 2481 return (error);
2466 2482 }
2467 2483 break;
2468 2484
2469 2485 /*
2470 2486 * Can't happen.
2471 2487 */
2472 2488 default:
2473 2489 panic("as_ctl: bad operation %d", func);
2474 2490 /*NOTREACHED*/
2475 2491 }
2476 2492
2477 2493 rsize -= ssize;
2478 2494 raddr += ssize;
2479 2495 }
2480 2496
2481 2497 if (func == MC_LOCK)
2482 2498 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2483 2499 AS_LOCK_EXIT(as);
2484 2500 return (0);
2485 2501 lockerr:
2486 2502
2487 2503 /*
2488 2504 * If the lower levels returned EDEADLK for a segment lockop,
2489 2505 * it means that we should retry the operation. Let's wait
2490 2506 * a bit also to let the deadlock causing condition clear.
2491 2507 * This is part of a gross hack to work around a design flaw
2492 2508 * in the ufs/sds logging code and should go away when the
2493 2509 * logging code is re-designed to fix the problem. See bug
2494 2510 * 4125102 for details of the problem.
2495 2511 */
2496 2512 if (error == EDEADLK) {
2497 2513 delay(deadlk_wait);
2498 2514 error = 0;
2499 2515 goto retry;
2500 2516 }
2501 2517 return (error);
2502 2518 }
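/*
 * Illustrative sketch only, not part of this change: mlockall()-style
 * locking of an entire address space goes through as_ctl() with MC_LOCKAS.
 * The addr/size arguments are ignored for that function, the MCL_* flags
 * travel in "arg", and the EDEADLK retry is handled by as_ctl() itself via
 * the lockerr path above.  Passing 0 for "attr" is an assumption of this
 * sketch.
 */
#if 0	/* example only, not compiled */
static int
example_lock_whole_as(struct as *as)
{
	return (as_ctl(as, NULL, 0, MC_LOCKAS, 0,
	    (uintptr_t)(MCL_CURRENT | MCL_FUTURE), NULL, 0));
}
#endif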
2503 2519
2504 2520 int
2505 2521 fc_decode(faultcode_t fault_err)
2506 2522 {
2507 2523 int error = 0;
2508 2524
2509 2525 switch (FC_CODE(fault_err)) {
2510 2526 case FC_OBJERR:
2511 2527 error = FC_ERRNO(fault_err);
2512 2528 break;
2513 2529 case FC_PROT:
2514 2530 error = EACCES;
2515 2531 break;
2516 2532 default:
2517 2533 error = EFAULT;
2518 2534 break;
2519 2535 }
2520 2536 return (error);
2521 2537 }
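/*
 * Illustrative sketch only, not part of this change: fc_decode() is the
 * usual way to turn an as_fault() faultcode_t into an errno, exactly as
 * as_pagelock() and as_pagelock_segs() do in this file.  The helper below
 * is hypothetical.
 */
#if 0	/* example only, not compiled */
static int
example_softlock(struct as *as, caddr_t addr, size_t size, enum seg_rw rw)
{
	faultcode_t fc;

	fc = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
	return (fc != 0 ? fc_decode(fc) : 0);
}
#endif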
2522 2538
2523 2539 /*
2524 2540 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2525 2541 * lists from each segment and copy them to one contiguous shadow list (plist)
2526 2542 * as expected by the caller. Save pointers to per segment shadow lists at
2527 2543 * the tail of plist so that they can be used during as_pageunlock().
2528 2544 */
2529 2545 static int
2530 2546 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2531 2547 caddr_t addr, size_t size, enum seg_rw rw)
2532 2548 {
2533 2549 caddr_t sv_addr = addr;
2534 2550 size_t sv_size = size;
2535 2551 struct seg *sv_seg = seg;
2536 2552 ulong_t segcnt = 1;
2537 2553 ulong_t cnt;
2538 2554 size_t ssize;
2539 2555 pgcnt_t npages = btop(size);
2540 2556 page_t **plist;
2541 2557 page_t **pl;
2542 2558 int error;
2543 2559 caddr_t eaddr;
2544 2560 faultcode_t fault_err = 0;
2545 2561 pgcnt_t pl_off;
2546 2562 extern struct seg_ops segspt_shmops;
2547 2563
2548 2564 ASSERT(AS_LOCK_HELD(as));
2549 2565 ASSERT(seg != NULL);
2550 2566 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2551 2567 ASSERT(addr + size > seg->s_base + seg->s_size);
2552 2568 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2553 2569 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2554 2570
2555 2571 /*
2556 2572 * Count the number of segments covered by the range we are about to
2557 2573	 * lock.  The segment count is used to size the shadow list we return
2558 2574	 * to the caller.
2559 2575 */
2560 2576 for (; size != 0; size -= ssize, addr += ssize) {
2561 2577 if (addr >= seg->s_base + seg->s_size) {
2562 2578
2563 2579 seg = AS_SEGNEXT(as, seg);
2564 2580 if (seg == NULL || addr != seg->s_base) {
2565 2581 AS_LOCK_EXIT(as);
2566 2582 return (EFAULT);
2567 2583 }
2568 2584 /*
2569 2585 * Do a quick check if subsequent segments
2570 2586 * will most likely support pagelock.
2571 2587 */
2572 2588 if (seg->s_ops == &segvn_ops) {
2573 2589 vnode_t *vp;
2574 2590
2575 2591 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2576 2592 vp != NULL) {
2577 2593 AS_LOCK_EXIT(as);
2578 2594 goto slow;
2579 2595 }
2580 2596 } else if (seg->s_ops != &segspt_shmops) {
2581 2597 AS_LOCK_EXIT(as);
2582 2598 goto slow;
2583 2599 }
2584 2600 segcnt++;
2585 2601 }
2586 2602 if (addr + size > seg->s_base + seg->s_size) {
2587 2603 ssize = seg->s_base + seg->s_size - addr;
2588 2604 } else {
2589 2605 ssize = size;
2590 2606 }
2591 2607 }
2592 2608 ASSERT(segcnt > 1);
2593 2609
2594 2610 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2595 2611
2596 2612 addr = sv_addr;
2597 2613 size = sv_size;
2598 2614 seg = sv_seg;
2599 2615
2600 2616 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2601 2617 if (addr >= seg->s_base + seg->s_size) {
2602 2618 seg = AS_SEGNEXT(as, seg);
2603 2619 ASSERT(seg != NULL && addr == seg->s_base);
2604 2620 cnt++;
2605 2621 ASSERT(cnt < segcnt);
2606 2622 }
2607 2623 if (addr + size > seg->s_base + seg->s_size) {
2608 2624 ssize = seg->s_base + seg->s_size - addr;
2609 2625 } else {
2610 2626 ssize = size;
2611 2627 }
2612 2628 pl = &plist[npages + cnt];
2613 2629 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2614 2630 L_PAGELOCK, rw);
2615 2631 if (error) {
2616 2632 break;
2617 2633 }
2618 2634 ASSERT(plist[npages + cnt] != NULL);
2619 2635 ASSERT(pl_off + btop(ssize) <= npages);
2620 2636 bcopy(plist[npages + cnt], &plist[pl_off],
2621 2637 btop(ssize) * sizeof (page_t *));
2622 2638 pl_off += btop(ssize);
2623 2639 }
2624 2640
2625 2641 if (size == 0) {
2626 2642 AS_LOCK_EXIT(as);
2627 2643 ASSERT(cnt == segcnt - 1);
2628 2644 *ppp = plist;
2629 2645 return (0);
2630 2646 }
2631 2647
2632 2648 /*
2633 2649	 * One of the pagelock calls failed; the error type is in the error
2634 2650	 * variable.  Unlock what we've locked so far and retry with F_SOFTLOCK
2635 2651	 * if the error type is either EFAULT or ENOTSUP.  Otherwise just
2636 2652	 * return the error to the caller.
2637 2653 */
2638 2654
2639 2655 eaddr = addr;
2640 2656 seg = sv_seg;
2641 2657
2642 2658 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2643 2659 if (addr >= seg->s_base + seg->s_size) {
2644 2660 seg = AS_SEGNEXT(as, seg);
2645 2661 ASSERT(seg != NULL && addr == seg->s_base);
2646 2662 cnt++;
2647 2663 ASSERT(cnt < segcnt);
2648 2664 }
2649 2665 if (eaddr > seg->s_base + seg->s_size) {
2650 2666 ssize = seg->s_base + seg->s_size - addr;
2651 2667 } else {
2652 2668 ssize = eaddr - addr;
2653 2669 }
2654 2670 pl = &plist[npages + cnt];
2655 2671 ASSERT(*pl != NULL);
2656 2672 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2657 2673 L_PAGEUNLOCK, rw);
2658 2674 }
2659 2675
2660 2676 AS_LOCK_EXIT(as);
2661 2677
2662 2678 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2663 2679
2664 2680 if (error != ENOTSUP && error != EFAULT) {
2665 2681 return (error);
2666 2682 }
2667 2683
2668 2684 slow:
2669 2685 /*
2670 2686	 * If we are here because pagelock failed due to the need to cow-fault
2671 2687	 * in the pages we want to lock, F_SOFTLOCK will do that job, and in the
2672 2688	 * next as_pagelock() call for this address range pagelock will
2673 2689	 * hopefully succeed.
2674 2690 */
2675 2691 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2676 2692 if (fault_err != 0) {
2677 2693 return (fc_decode(fault_err));
2678 2694 }
2679 2695 *ppp = NULL;
2680 2696
2681 2697 return (0);
2682 2698 }
2683 2699
2684 2700 /*
2685 2701 * lock pages in a given address space. Return shadow list. If
2686 2702 * the list is NULL, the MMU mapping is also locked.
2687 2703 */
2688 2704 int
2689 2705 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2690 2706 size_t size, enum seg_rw rw)
2691 2707 {
2692 2708 size_t rsize;
2693 2709 caddr_t raddr;
2694 2710 faultcode_t fault_err;
2695 2711 struct seg *seg;
2696 2712 int err;
2697 2713
2698 2714 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2699 2715 "as_pagelock_start: addr %p size %ld", addr, size);
2700 2716
2701 2717 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2702 2718 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2703 2719 (size_t)raddr;
2704 2720
2705 2721 /*
2706 2722	 * If the request crosses more than one segment,
2707 2723	 * let as_pagelock_segs() handle it.
2708 2724 */
2709 2725 AS_LOCK_ENTER(as, RW_READER);
2710 2726
2711 2727 seg = as_segat(as, raddr);
2712 2728 if (seg == NULL) {
2713 2729 AS_LOCK_EXIT(as);
2714 2730 return (EFAULT);
2715 2731 }
2716 2732 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2717 2733 if (raddr + rsize > seg->s_base + seg->s_size) {
2718 2734 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2719 2735 }
2720 2736 if (raddr + rsize <= raddr) {
2721 2737 AS_LOCK_EXIT(as);
2722 2738 return (EFAULT);
2723 2739 }
2724 2740
2725 2741 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2726 2742 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2727 2743
2728 2744 /*
2729 2745 * try to lock pages and pass back shadow list
2730 2746 */
2731 2747 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2732 2748
2733 2749 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2734 2750
2735 2751 AS_LOCK_EXIT(as);
2736 2752
2737 2753 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2738 2754 return (err);
2739 2755 }
2740 2756
2741 2757 /*
2742 2758 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2743 2759	 * to no pagelock support for this segment or because the pages need to
2744 2760	 * be cow-faulted in.  If a fault is needed, F_SOFTLOCK will do the job
2745 2761	 * for this as_pagelock() call, and in the next as_pagelock() call for
2746 2762	 * the same address range the pagelock call will hopefully succeed.
2747 2763 */
2748 2764 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2749 2765 if (fault_err != 0) {
2750 2766 return (fc_decode(fault_err));
2751 2767 }
2752 2768 *ppp = NULL;
2753 2769
2754 2770 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2755 2771 return (0);
2756 2772 }
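/*
 * Illustrative sketch only, not part of this change: physio-style callers
 * pair as_pagelock() with as_pageunlock() over the same range and seg_rw.
 * When the returned shadow list is NULL the pages were locked via the
 * F_SOFTLOCK fallback; as_pageunlock() detects that case itself, so the
 * caller does not need to distinguish it.  The helper below is hypothetical.
 */
#if 0	/* example only, not compiled */
static int
example_locked_io(struct as *as, caddr_t addr, size_t size)
{
	struct page **pplist;
	int err;

	err = as_pagelock(as, &pplist, addr, size, S_WRITE);
	if (err != 0)
		return (err);
	/* ... perform the transfer against the locked pages ... */
	as_pageunlock(as, pplist, addr, size, S_WRITE);
	return (0);
}
#endif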
2757 2773
2758 2774 /*
2759 2775 * unlock pages locked by as_pagelock_segs(). Retrieve per segment shadow
2760 2776 * lists from the end of plist and call pageunlock interface for each segment.
2761 2777 * Drop as lock and free plist.
2762 2778 */
2763 2779 static void
2764 2780 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2765 2781 struct page **plist, enum seg_rw rw)
2766 2782 {
2767 2783 ulong_t cnt;
2768 2784 caddr_t eaddr = addr + size;
2769 2785 pgcnt_t npages = btop(size);
2770 2786 size_t ssize;
2771 2787 page_t **pl;
2772 2788
2773 2789 ASSERT(AS_LOCK_HELD(as));
2774 2790 ASSERT(seg != NULL);
2775 2791 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2776 2792 ASSERT(addr + size > seg->s_base + seg->s_size);
2777 2793 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2778 2794 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2779 2795 ASSERT(plist != NULL);
2780 2796
2781 2797 for (cnt = 0; addr < eaddr; addr += ssize) {
2782 2798 if (addr >= seg->s_base + seg->s_size) {
2783 2799 seg = AS_SEGNEXT(as, seg);
2784 2800 ASSERT(seg != NULL && addr == seg->s_base);
2785 2801 cnt++;
2786 2802 }
2787 2803 if (eaddr > seg->s_base + seg->s_size) {
2788 2804 ssize = seg->s_base + seg->s_size - addr;
2789 2805 } else {
2790 2806 ssize = eaddr - addr;
2791 2807 }
2792 2808 pl = &plist[npages + cnt];
2793 2809 ASSERT(*pl != NULL);
2794 2810 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2795 2811 L_PAGEUNLOCK, rw);
2796 2812 }
2797 2813 ASSERT(cnt > 0);
2798 2814 AS_LOCK_EXIT(as);
2799 2815
2800 2816 cnt++;
2801 2817 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2802 2818 }
2803 2819
2804 2820 /*
2805 2821 * unlock pages in a given address range
2806 2822 */
2807 2823 void
2808 2824 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2809 2825 enum seg_rw rw)
2810 2826 {
2811 2827 struct seg *seg;
2812 2828 size_t rsize;
2813 2829 caddr_t raddr;
2814 2830
2815 2831 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2816 2832 "as_pageunlock_start: addr %p size %ld", addr, size);
2817 2833
2818 2834 /*
2819 2835	 * If the shadow list is NULL, as_pagelock() fell back to
2820 2836	 * as_fault(); undo that here with F_SOFTUNLOCK.
2821 2837 */
2822 2838 if (pp == NULL) {
2823 2839 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2824 2840 return;
2825 2841 }
2826 2842
2827 2843 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2828 2844 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2829 2845 (size_t)raddr;
2830 2846
2831 2847 AS_LOCK_ENTER(as, RW_READER);
2832 2848 seg = as_segat(as, raddr);
2833 2849 ASSERT(seg != NULL);
2834 2850
2835 2851 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2836 2852 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2837 2853
2838 2854 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2839 2855 if (raddr + rsize <= seg->s_base + seg->s_size) {
2840 2856 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2841 2857 } else {
2842 2858 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2843 2859 return;
2844 2860 }
2845 2861 AS_LOCK_EXIT(as);
2846 2862 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2847 2863 }
2848 2864
2849 2865 int
2850 2866 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2851 2867 boolean_t wait)
2852 2868 {
2853 2869 struct seg *seg;
2854 2870 size_t ssize;
2855 2871 caddr_t raddr; /* rounded down addr */
2856 2872 size_t rsize; /* rounded up size */
2857 2873 int error = 0;
2858 2874 size_t pgsz = page_get_pagesize(szc);
2859 2875
2860 2876 setpgsz_top:
2861 2877 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2862 2878 return (EINVAL);
2863 2879 }
2864 2880
2865 2881 raddr = addr;
2866 2882 rsize = size;
2867 2883
2868 2884 if (raddr + rsize < raddr) /* check for wraparound */
2869 2885 return (ENOMEM);
2870 2886
2871 2887 AS_LOCK_ENTER(as, RW_WRITER);
2872 2888 as_clearwatchprot(as, raddr, rsize);
2873 2889 seg = as_segat(as, raddr);
2874 2890 if (seg == NULL) {
2875 2891 as_setwatch(as);
2876 2892 AS_LOCK_EXIT(as);
2877 2893 return (ENOMEM);
2878 2894 }
2879 2895
2880 2896 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2881 2897 if (raddr >= seg->s_base + seg->s_size) {
2882 2898 seg = AS_SEGNEXT(as, seg);
2883 2899 if (seg == NULL || raddr != seg->s_base) {
2884 2900 error = ENOMEM;
2885 2901 break;
2886 2902 }
2887 2903 }
2888 2904 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2889 2905 ssize = seg->s_base + seg->s_size - raddr;
2890 2906 } else {
2891 2907 ssize = rsize;
2892 2908 }
2893 2909
2894 2910 retry:
2895 2911 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2896 2912
2897 2913 if (error == IE_NOMEM) {
2898 2914 error = EAGAIN;
2899 2915 break;
2900 2916 }
2901 2917
2902 2918 if (error == IE_RETRY) {
2903 2919 AS_LOCK_EXIT(as);
2904 2920 goto setpgsz_top;
2905 2921 }
2906 2922
2907 2923 if (error == ENOTSUP) {
2908 2924 error = EINVAL;
2909 2925 break;
2910 2926 }
2911 2927
2912 2928 if (wait && (error == EAGAIN)) {
2913 2929 /*
2914 2930 * Memory is currently locked. It must be unlocked
2915 2931 * before this operation can succeed through a retry.
2916 2932 * The possible reasons for locked memory and
2917 2933 * corresponding strategies for unlocking are:
2918 2934 * (1) Normal I/O
2919 2935 * wait for a signal that the I/O operation
2920 2936 * has completed and the memory is unlocked.
2921 2937 * (2) Asynchronous I/O
2922 2938 * The aio subsystem does not unlock pages when
2923 2939 * the I/O is completed. Those pages are unlocked
2924 2940 * when the application calls aiowait/aioerror.
2925 2941 * So, to prevent blocking forever, cv_broadcast()
2926 2942 * is done to wake up aio_cleanup_thread.
2927 2943 * Subsequently, segvn_reclaim will be called, and
2928 2944 * that will do AS_CLRUNMAPWAIT() and wake us up.
2929 2945 * (3) Long term page locking:
2930 2946 * This is not relevant for as_setpagesize()
2931 2947 * because we cannot change the page size for
2932 2948 * driver memory. The attempt to do so will
2933 2949 * fail with a different error than EAGAIN so
2934 2950 * there's no need to trigger as callbacks like
2935 2951 * as_unmap, as_setprot or as_free would do.
2936 2952 */
2937 2953 mutex_enter(&as->a_contents);
2938 2954 if (!AS_ISNOUNMAPWAIT(as)) {
2939 2955 if (AS_ISUNMAPWAIT(as) == 0) {
2940 2956 cv_broadcast(&as->a_cv);
2941 2957 }
2942 2958 AS_SETUNMAPWAIT(as);
2943 2959 AS_LOCK_EXIT(as);
2944 2960 while (AS_ISUNMAPWAIT(as)) {
2945 2961 cv_wait(&as->a_cv, &as->a_contents);
2946 2962 }
2947 2963 } else {
2948 2964 /*
2949 2965 * We may have raced with
2950 2966 * segvn_reclaim()/segspt_reclaim(). In this
2951 2967 * case clean nounmapwait flag and retry since
2952 2968 * softlockcnt in this segment may be already
2953 2969 * 0. We don't drop as writer lock so our
2954 2970 * number of retries without sleeping should
2955 2971 * be very small. See segvn_reclaim() for
2956 2972 * more comments.
2957 2973 */
2958 2974 AS_CLRNOUNMAPWAIT(as);
2959 2975 mutex_exit(&as->a_contents);
2960 2976 goto retry;
2961 2977 }
2962 2978 mutex_exit(&as->a_contents);
2963 2979 goto setpgsz_top;
2964 2980 } else if (error != 0) {
2965 2981 break;
2966 2982 }
2967 2983 }
2968 2984 as_setwatch(as);
2969 2985 AS_LOCK_EXIT(as);
2970 2986 return (error);
2971 2987 }
2972 2988
2973 2989 /*
2974 2990 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2975 2991 * in its chunk where s_szc is less than the szc we want to set.
2976 2992 */
2977 2993 static int
2978 2994 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2979 2995 int *retry)
2980 2996 {
2981 2997 struct seg *seg;
2982 2998 size_t ssize;
2983 2999 int error;
2984 3000
2985 3001 ASSERT(AS_WRITE_HELD(as));
2986 3002
2987 3003 seg = as_segat(as, raddr);
2988 3004 if (seg == NULL) {
2989 3005 panic("as_iset3_default_lpsize: no seg");
2990 3006 }
2991 3007
2992 3008 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2993 3009 if (raddr >= seg->s_base + seg->s_size) {
2994 3010 seg = AS_SEGNEXT(as, seg);
2995 3011 if (seg == NULL || raddr != seg->s_base) {
2996 3012 panic("as_iset3_default_lpsize: as changed");
2997 3013 }
2998 3014 }
2999 3015 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3000 3016 ssize = seg->s_base + seg->s_size - raddr;
3001 3017 } else {
3002 3018 ssize = rsize;
3003 3019 }
3004 3020
3005 3021 if (szc > seg->s_szc) {
3006 3022 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3007 3023 /* Only retry on EINVAL segments that have no vnode. */
3008 3024 if (error == EINVAL) {
3009 3025 vnode_t *vp = NULL;
3010 3026 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3011 3027 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3012 3028 vp == NULL)) {
3013 3029 *retry = 1;
3014 3030 } else {
3015 3031 *retry = 0;
3016 3032 }
3017 3033 }
3018 3034 if (error) {
3019 3035 return (error);
3020 3036 }
3021 3037 }
3022 3038 }
3023 3039 return (0);
3024 3040 }
3025 3041
3026 3042 /*
3027 3043 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3028 3044 * pagesize on each segment in its range, but if any fails with EINVAL,
3029 3045 * then it reduces the pagesizes to the next size in the bitmap and
3030 3046	 * retries as_iset3_default_lpsize().  The code retries smaller allowed
3031 3047	 * sizes on EINVAL because (a) the anon offset may not match the bigger
3032 3048	 * sizes, and (b) it's hard to get this offset (to begin with) to pass
3033 3049	 * to map_pgszcvec().
3034 3050 */
3035 3051 static int
3036 3052 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3037 3053 uint_t szcvec)
3038 3054 {
3039 3055 int error;
3040 3056 int retry;
3041 3057
3042 3058 ASSERT(AS_WRITE_HELD(as));
3043 3059
3044 3060 for (;;) {
3045 3061 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3046 3062 if (error == EINVAL && retry) {
3047 3063 szcvec &= ~(1 << szc);
3048 3064 if (szcvec <= 1) {
3049 3065 return (EINVAL);
3050 3066 }
3051 3067 szc = highbit(szcvec) - 1;
3052 3068 } else {
3053 3069 return (error);
3054 3070 }
3055 3071 }
3056 3072 }
3057 3073
3058 3074 /*
3059 3075 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3060 3076 * segments have a smaller szc than we want to set. For each such area,
3061 3077	 * it calls as_iset2_default_lpsize().
3062 3078 */
3063 3079 static int
3064 3080 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3065 3081 uint_t szcvec)
3066 3082 {
3067 3083 struct seg *seg;
3068 3084 size_t ssize;
3069 3085 caddr_t setaddr = raddr;
3070 3086 size_t setsize = 0;
3071 3087 int set;
3072 3088 int error;
3073 3089
3074 3090 ASSERT(AS_WRITE_HELD(as));
3075 3091
3076 3092 seg = as_segat(as, raddr);
3077 3093 if (seg == NULL) {
3078 3094 panic("as_iset1_default_lpsize: no seg");
3079 3095 }
3080 3096 if (seg->s_szc < szc) {
3081 3097 set = 1;
3082 3098 } else {
3083 3099 set = 0;
3084 3100 }
3085 3101
3086 3102 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3087 3103 if (raddr >= seg->s_base + seg->s_size) {
3088 3104 seg = AS_SEGNEXT(as, seg);
3089 3105 if (seg == NULL || raddr != seg->s_base) {
3090 3106 panic("as_iset1_default_lpsize: as changed");
3091 3107 }
3092 3108 if (seg->s_szc >= szc && set) {
3093 3109 ASSERT(setsize != 0);
3094 3110 error = as_iset2_default_lpsize(as,
3095 3111 setaddr, setsize, szc, szcvec);
3096 3112 if (error) {
3097 3113 return (error);
3098 3114 }
3099 3115 set = 0;
3100 3116 } else if (seg->s_szc < szc && !set) {
3101 3117 setaddr = raddr;
3102 3118 setsize = 0;
3103 3119 set = 1;
3104 3120 }
3105 3121 }
3106 3122 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3107 3123 ssize = seg->s_base + seg->s_size - raddr;
3108 3124 } else {
3109 3125 ssize = rsize;
3110 3126 }
3111 3127 }
3112 3128 error = 0;
3113 3129 if (set) {
3114 3130 ASSERT(setsize != 0);
3115 3131 error = as_iset2_default_lpsize(as, setaddr, setsize,
3116 3132 szc, szcvec);
3117 3133 }
3118 3134 return (error);
3119 3135 }
3120 3136
3121 3137 /*
3122 3138 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3123 3139 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3124 3140 * chunk to as_iset1_default_lpsize().
3125 3141 */
3126 3142 static int
3127 3143 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3128 3144 int type)
3129 3145 {
3130 3146 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3131 3147 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3132 3148 flags, rtype, 1);
3133 3149 uint_t szc;
3134 3150 uint_t nszc;
3135 3151 int error;
3136 3152 caddr_t a;
3137 3153 caddr_t eaddr;
3138 3154 size_t segsize;
3139 3155 size_t pgsz;
3140 3156 uint_t save_szcvec;
3141 3157
3142 3158 ASSERT(AS_WRITE_HELD(as));
3143 3159 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3144 3160 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3145 3161
3146 3162 szcvec &= ~1;
3147 3163 if (szcvec <= 1) { /* skip if base page size */
3148 3164 return (0);
3149 3165 }
3150 3166
3151 3167 /* Get the pagesize of the first larger page size. */
3152 3168 szc = lowbit(szcvec) - 1;
3153 3169 pgsz = page_get_pagesize(szc);
3154 3170 eaddr = addr + size;
3155 3171 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3156 3172 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3157 3173
3158 3174 save_szcvec = szcvec;
3159 3175 szcvec >>= (szc + 1);
3160 3176 nszc = szc;
3161 3177 while (szcvec) {
3162 3178 if ((szcvec & 0x1) == 0) {
3163 3179 nszc++;
3164 3180 szcvec >>= 1;
3165 3181 continue;
3166 3182 }
3167 3183 nszc++;
3168 3184 pgsz = page_get_pagesize(nszc);
3169 3185 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3170 3186 if (a != addr) {
3171 3187 ASSERT(szc > 0);
3172 3188 ASSERT(a < eaddr);
3173 3189 segsize = a - addr;
3174 3190 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3175 3191 save_szcvec);
3176 3192 if (error) {
3177 3193 return (error);
3178 3194 }
3179 3195 addr = a;
3180 3196 }
3181 3197 szc = nszc;
3182 3198 szcvec >>= 1;
3183 3199 }
3184 3200
3185 3201 ASSERT(addr < eaddr);
3186 3202 szcvec = save_szcvec;
3187 3203 while (szcvec) {
3188 3204 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3189 3205 ASSERT(a >= addr);
3190 3206 if (a != addr) {
3191 3207 ASSERT(szc > 0);
3192 3208 segsize = a - addr;
3193 3209 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3194 3210 save_szcvec);
3195 3211 if (error) {
3196 3212 return (error);
3197 3213 }
3198 3214 addr = a;
3199 3215 }
3200 3216 szcvec &= ~(1 << szc);
3201 3217 if (szcvec) {
3202 3218 szc = highbit(szcvec) - 1;
3203 3219 pgsz = page_get_pagesize(szc);
3204 3220 }
3205 3221 }
3206 3222 ASSERT(addr == eaddr);
3207 3223
3208 3224 return (0);
3209 3225 }
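/*
 * Illustrative note, not part of this change: map_pgszcvec() returns a
 * bitmap in which bit i set means page size code i is usable, and lowbit()/
 * highbit() are 1-based, hence the "- 1" adjustments above.  For example:
 */
#if 0	/* example only, not compiled */
	uint_t szcvec = 0x09;		/* page size codes 0 and 3 usable */
	uint_t szc;

	szcvec &= ~1;			/* discard the base page size bit */
	szc = lowbit(szcvec) - 1;	/* lowbit(0x08) == 4, so szc == 3 */
#endif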
3210 3226
3211 3227 /*
3212 3228 * Set the default large page size for the range. Called via memcntl with
3213 3229 * page size set to 0. as_set_default_lpsize breaks the range down into
3214 3230	 * chunks with the same type/flags, ignores non-segvn segments, and passes
3215 3231 * each chunk to as_iset_default_lpsize().
3216 3232 */
3217 3233 int
3218 3234 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3219 3235 {
3220 3236 struct seg *seg;
3221 3237 caddr_t raddr;
3222 3238 size_t rsize;
3223 3239 size_t ssize;
3224 3240 int rtype, rflags;
3225 3241 int stype, sflags;
3226 3242 int error;
3227 3243 caddr_t setaddr;
3228 3244 size_t setsize;
3229 3245 int segvn;
3230 3246
3231 3247 if (size == 0)
3232 3248 return (0);
3233 3249
3234 3250 AS_LOCK_ENTER(as, RW_WRITER);
3235 3251 again:
3236 3252 error = 0;
3237 3253
3238 3254 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3239 3255 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3240 3256 (size_t)raddr;
3241 3257
3242 3258 if (raddr + rsize < raddr) { /* check for wraparound */
3243 3259 AS_LOCK_EXIT(as);
3244 3260 return (ENOMEM);
3245 3261 }
3246 3262 as_clearwatchprot(as, raddr, rsize);
3247 3263 seg = as_segat(as, raddr);
3248 3264 if (seg == NULL) {
3249 3265 as_setwatch(as);
3250 3266 AS_LOCK_EXIT(as);
3251 3267 return (ENOMEM);
3252 3268 }
3253 3269 if (seg->s_ops == &segvn_ops) {
3254 3270 rtype = SEGOP_GETTYPE(seg, addr);
3255 3271 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3256 3272 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3257 3273 segvn = 1;
3258 3274 } else {
3259 3275 segvn = 0;
3260 3276 }
3261 3277 setaddr = raddr;
3262 3278 setsize = 0;
3263 3279
3264 3280 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3265 3281 if (raddr >= (seg->s_base + seg->s_size)) {
3266 3282 seg = AS_SEGNEXT(as, seg);
3267 3283 if (seg == NULL || raddr != seg->s_base) {
3268 3284 error = ENOMEM;
3269 3285 break;
3270 3286 }
3271 3287 if (seg->s_ops == &segvn_ops) {
3272 3288 stype = SEGOP_GETTYPE(seg, raddr);
3273 3289 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3274 3290 stype &= (MAP_SHARED | MAP_PRIVATE);
3275 3291 if (segvn && (rflags != sflags ||
3276 3292 rtype != stype)) {
3277 3293 /*
3278 3294 * The next segment is also segvn but
3279 3295 * has different flags and/or type.
3280 3296 */
3281 3297 ASSERT(setsize != 0);
3282 3298 error = as_iset_default_lpsize(as,
3283 3299 setaddr, setsize, rflags, rtype);
3284 3300 if (error) {
3285 3301 break;
3286 3302 }
3287 3303 rflags = sflags;
3288 3304 rtype = stype;
3289 3305 setaddr = raddr;
3290 3306 setsize = 0;
3291 3307 } else if (!segvn) {
3292 3308 rflags = sflags;
3293 3309 rtype = stype;
3294 3310 setaddr = raddr;
3295 3311 setsize = 0;
3296 3312 segvn = 1;
3297 3313 }
3298 3314 } else if (segvn) {
3299 3315 /* The next segment is not segvn. */
3300 3316 ASSERT(setsize != 0);
3301 3317 error = as_iset_default_lpsize(as,
3302 3318 setaddr, setsize, rflags, rtype);
3303 3319 if (error) {
3304 3320 break;
3305 3321 }
3306 3322 segvn = 0;
3307 3323 }
3308 3324 }
3309 3325 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3310 3326 ssize = seg->s_base + seg->s_size - raddr;
3311 3327 } else {
3312 3328 ssize = rsize;
3313 3329 }
3314 3330 }
3315 3331 if (error == 0 && segvn) {
3316 3332 /* The last chunk when rsize == 0. */
3317 3333 ASSERT(setsize != 0);
3318 3334 error = as_iset_default_lpsize(as, setaddr, setsize,
3319 3335 rflags, rtype);
3320 3336 }
3321 3337
3322 3338 if (error == IE_RETRY) {
3323 3339 goto again;
3324 3340 } else if (error == IE_NOMEM) {
3325 3341 error = EAGAIN;
3326 3342 } else if (error == ENOTSUP) {
3327 3343 error = EINVAL;
3328 3344 } else if (error == EAGAIN) {
3329 3345 mutex_enter(&as->a_contents);
3330 3346 if (!AS_ISNOUNMAPWAIT(as)) {
3331 3347 if (AS_ISUNMAPWAIT(as) == 0) {
3332 3348 cv_broadcast(&as->a_cv);
3333 3349 }
3334 3350 AS_SETUNMAPWAIT(as);
3335 3351 AS_LOCK_EXIT(as);
3336 3352 while (AS_ISUNMAPWAIT(as)) {
3337 3353 cv_wait(&as->a_cv, &as->a_contents);
3338 3354 }
3339 3355 mutex_exit(&as->a_contents);
3340 3356 AS_LOCK_ENTER(as, RW_WRITER);
3341 3357 } else {
3342 3358 /*
3343 3359 * We may have raced with
3344 3360 * segvn_reclaim()/segspt_reclaim(). In this case
3345 3361 * clean nounmapwait flag and retry since softlockcnt
3346 3362 * in this segment may be already 0. We don't drop as
3347 3363 * writer lock so our number of retries without
3348 3364 * sleeping should be very small. See segvn_reclaim()
3349 3365 * for more comments.
3350 3366 */
3351 3367 AS_CLRNOUNMAPWAIT(as);
3352 3368 mutex_exit(&as->a_contents);
3353 3369 }
3354 3370 goto again;
3355 3371 }
3356 3372
3357 3373 as_setwatch(as);
3358 3374 AS_LOCK_EXIT(as);
3359 3375 return (error);
3360 3376 }
3361 3377
3362 3378 /*
3363 3379 * Setup all of the uninitialized watched pages that we can.
3364 3380 */
3365 3381 void
3366 3382 as_setwatch(struct as *as)
3367 3383 {
3368 3384 struct watched_page *pwp;
3369 3385 struct seg *seg;
3370 3386 caddr_t vaddr;
3371 3387 uint_t prot;
3372 3388 int err, retrycnt;
3373 3389
3374 3390 if (avl_numnodes(&as->a_wpage) == 0)
3375 3391 return;
3376 3392
3377 3393 ASSERT(AS_WRITE_HELD(as));
3378 3394
3379 3395 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3380 3396 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3381 3397 retrycnt = 0;
3382 3398 retry:
3383 3399 vaddr = pwp->wp_vaddr;
3384 3400 if (pwp->wp_oprot != 0 || /* already set up */
3385 3401 (seg = as_segat(as, vaddr)) == NULL ||
3386 3402 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3387 3403 continue;
3388 3404
3389 3405 pwp->wp_oprot = prot;
3390 3406 if (pwp->wp_read)
3391 3407 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3392 3408 if (pwp->wp_write)
3393 3409 prot &= ~PROT_WRITE;
3394 3410 if (pwp->wp_exec)
3395 3411 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3396 3412 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3397 3413 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3398 3414 if (err == IE_RETRY) {
3399 3415 pwp->wp_oprot = 0;
3400 3416 ASSERT(retrycnt == 0);
3401 3417 retrycnt++;
3402 3418 goto retry;
3403 3419 }
3404 3420 }
3405 3421 pwp->wp_prot = prot;
3406 3422 }
3407 3423 }
3408 3424
3409 3425 /*
3410 3426 * Clear all of the watched pages in the address space.
3411 3427 */
3412 3428 void
3413 3429 as_clearwatch(struct as *as)
3414 3430 {
3415 3431 struct watched_page *pwp;
3416 3432 struct seg *seg;
3417 3433 caddr_t vaddr;
3418 3434 uint_t prot;
3419 3435 int err, retrycnt;
3420 3436
3421 3437 if (avl_numnodes(&as->a_wpage) == 0)
3422 3438 return;
3423 3439
3424 3440 ASSERT(AS_WRITE_HELD(as));
3425 3441
3426 3442 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3427 3443 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3428 3444 retrycnt = 0;
3429 3445 retry:
3430 3446 vaddr = pwp->wp_vaddr;
3431 3447 if (pwp->wp_oprot == 0 || /* not set up */
3432 3448 (seg = as_segat(as, vaddr)) == NULL)
3433 3449 continue;
3434 3450
3435 3451 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3436 3452 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3437 3453 if (err == IE_RETRY) {
3438 3454 ASSERT(retrycnt == 0);
3439 3455 retrycnt++;
3440 3456 goto retry;
3441 3457 }
3442 3458 }
3443 3459 pwp->wp_oprot = 0;
3444 3460 pwp->wp_prot = 0;
3445 3461 }
3446 3462 }
3447 3463
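Both as_setwatch() and as_clearwatch() key off wp_oprot: a nonzero value means the original protections have been saved and the watchpoint is installed, while zero means it still needs to be set up. A minimal sketch of that guard, with wpage_is_installed() being a hypothetical helper rather than an existing routine:

	#include <sys/types.h>
	#include <vm/as.h>	/* struct watched_page */

	/*
	 * A watched page counts as installed once its original protections
	 * have been recorded in wp_oprot; as_clearwatch() resets it to 0.
	 */
	static boolean_t
	wpage_is_installed(const struct watched_page *pwp)
	{
		return (pwp->wp_oprot != 0 ? B_TRUE : B_FALSE);
	}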
3448 3464 /*
3449 3465 * Force a new setup for all the watched pages in the range.
3450 3466 */
3451 3467 static void
3452 3468 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3453 3469 {
3454 3470 struct watched_page *pwp;
3455 3471 struct watched_page tpw;
3456 3472 caddr_t eaddr = addr + size;
3457 3473 caddr_t vaddr;
3458 3474 struct seg *seg;
3459 3475 int err, retrycnt;
3460 3476 uint_t wprot;
3461 3477 avl_index_t where;
3462 3478
3463 3479 if (avl_numnodes(&as->a_wpage) == 0)
3464 3480 return;
3465 3481
3466 3482 ASSERT(AS_WRITE_HELD(as));
3467 3483
3468 3484 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3469 3485 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3470 3486 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3471 3487
3472 3488 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3473 3489 retrycnt = 0;
3474 3490 vaddr = pwp->wp_vaddr;
3475 3491
3476 3492 wprot = prot;
3477 3493 if (pwp->wp_read)
3478 3494 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3479 3495 if (pwp->wp_write)
3480 3496 wprot &= ~PROT_WRITE;
3481 3497 if (pwp->wp_exec)
3482 3498 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3483 3499 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3484 3500 retry:
3485 3501 seg = as_segat(as, vaddr);
3486 3502 if (seg == NULL) {
3487 3503 panic("as_setwatchprot: no seg");
3488 3504 /*NOTREACHED*/
3489 3505 }
3490 3506 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3491 3507 if (err == IE_RETRY) {
3492 3508 ASSERT(retrycnt == 0);
3493 3509 retrycnt++;
3494 3510 goto retry;
3495 3511 }
3496 3512 }
3497 3513 pwp->wp_oprot = prot;
3498 3514 pwp->wp_prot = wprot;
3499 3515
3500 3516 pwp = AVL_NEXT(&as->a_wpage, pwp);
3501 3517 }
3502 3518 }
3503 3519
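The walk over the watched-page tree in as_setwatchprot() (and in as_clearwatchprot() below) starts with a standard AVL range-lookup idiom: search for the page-aligned start address and, if there is no exact match, take the nearest node after the insertion point. A minimal sketch of just that lookup, with first_wpage_in_range() being a hypothetical name:

	#include <sys/types.h>
	#include <sys/param.h>	/* PAGEMASK */
	#include <sys/avl.h>
	#include <vm/as.h>	/* struct watched_page */

	/* Find the first watched page at or after addr, or NULL if none. */
	static struct watched_page *
	first_wpage_in_range(avl_tree_t *wpage, caddr_t addr)
	{
		struct watched_page tpw;
		struct watched_page *pwp;
		avl_index_t where;

		/* Key the search on the page-aligned start of the range. */
		tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
		if ((pwp = avl_find(wpage, &tpw, &where)) == NULL)
			pwp = avl_nearest(wpage, where, AVL_AFTER);
		return (pwp);
	}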
3504 3520 /*
3505 3521 * Clear all of the watched pages in the range.
3506 3522 */
3507 3523 static void
3508 3524 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3509 3525 {
3510 3526 caddr_t eaddr = addr + size;
3511 3527 struct watched_page *pwp;
3512 3528 struct watched_page tpw;
3513 3529 uint_t prot;
3514 3530 struct seg *seg;
3515 3531 int err, retrycnt;
3516 3532 avl_index_t where;
3517 3533
3518 3534 if (avl_numnodes(&as->a_wpage) == 0)
3519 3535 return;
3520 3536
3521 3537 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3522 3538 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3523 3539 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3524 3540
3525 3541 ASSERT(AS_WRITE_HELD(as));
3526 3542
3527 3543 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3528 3544
3529 3545 if ((prot = pwp->wp_oprot) != 0) {
3530 3546 retrycnt = 0;
3531 3547
3532 3548 if (prot != pwp->wp_prot) {
3533 3549 retry:
3534 3550 seg = as_segat(as, pwp->wp_vaddr);
3535 3551 if (seg == NULL)
3536 3552 continue;
3537 3553 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3538 3554 PAGESIZE, prot);
3539 3555 if (err == IE_RETRY) {
3540 3556 ASSERT(retrycnt == 0);
3541 3557 retrycnt++;
3542 3558 goto retry;
3543 3559
3544 3560 }
3545 3561 }
3546 3562 pwp->wp_oprot = 0;
3547 3563 pwp->wp_prot = 0;
3548 3564 }
3549 3565
3550 3566 pwp = AVL_NEXT(&as->a_wpage, pwp);
3551 3567 }
3552 3568 }
3553 3569
3554 3570 void
3555 3571 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3556 3572 {
3557 3573 struct proc *p;
3558 3574
3559 3575 mutex_enter(&pidlock);
3560 3576 for (p = practive; p; p = p->p_next) {
3561 3577 if (p->p_as == as) {
3562 3578 mutex_enter(&p->p_lock);
3563 3579 if (p->p_as == as)
3564 3580 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3565 3581 mutex_exit(&p->p_lock);
3566 3582 }
3567 3583 }
3568 3584 mutex_exit(&pidlock);
3569 3585 }
3570 3586
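as_signal_proc() walks every active process and queues the signal on each one whose p_as matches, rechecking p_as under p_lock to close the race with a process switching address spaces between the two checks. A minimal usage sketch, where notify_as_owners() is hypothetical and the signal number and code are chosen purely for illustration:

	#include <sys/types.h>
	#include <sys/systm.h>	/* bzero */
	#include <sys/signal.h>
	#include <sys/siginfo.h>
	#include <vm/as.h>

	static void
	notify_as_owners(struct as *as, caddr_t fault_addr)
	{
		k_siginfo_t si;

		bzero(&si, sizeof (si));
		si.si_signo = SIGSEGV;		/* illustrative choice only */
		si.si_code = SEGV_ACCERR;
		si.si_addr = fault_addr;

		/* Deliver to every process sharing this address space. */
		as_signal_proc(as, &si);
	}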
3571 3587 /*
3572 3588 * return memory object ID
3573 3589 */
3574 3590 int
3575 3591 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3576 3592 {
3577 3593 struct seg *seg;
3578 3594 int sts;
3579 3595
3580 3596 AS_LOCK_ENTER(as, RW_READER);
3581 3597 seg = as_segat(as, addr);
3582 3598 if (seg == NULL) {
3583 3599 AS_LOCK_EXIT(as);
3584 3600 return (EFAULT);
3585 3601 }
3586 3602 /*
3587 3603 * catch old drivers which may not support getmemid
3588 3604 */
3589 3605 if (seg->s_ops->getmemid == NULL) {
3590 3606 AS_LOCK_EXIT(as);
3591 3607 return (ENODEV);
3592 3608 }
3593 3609
3594 3610 sts = SEGOP_GETMEMID(seg, addr, memidp);
3595 3611
3596 3612 AS_LOCK_EXIT(as);
3597 3613 return (sts);
3598 3614 }
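A caller of as_getmemid() typically wants to treat ENODEV (a segment driver that predates getmemid support) differently from a genuine failure. A minimal usage sketch under that assumption; lookup_memid() is a hypothetical wrapper, not an existing routine:

	#include <sys/types.h>
	#include <sys/errno.h>
	#include <vm/as.h>

	/*
	 * Fetch the memory object id for addr.  *validp reports whether midp
	 * was filled in; an old driver without getmemid is not an error.
	 */
	static int
	lookup_memid(struct as *as, caddr_t addr, memid_t *midp,
	    boolean_t *validp)
	{
		int err;

		*validp = B_FALSE;
		err = as_getmemid(as, addr, midp);
		if (err == 0)
			*validp = B_TRUE;
		else if (err == ENODEV)
			err = 0;
		return (err);
	}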
| 2699 lines elided |