1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
30
31 /*
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
35 *
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
39 */
40
41 /*
42 * VM - address spaces.
43 */
44
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/vtrace.h>
61 #include <sys/ddi.h>
62
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_vn.h>
67 #include <vm/seg_dev.h>
68 #include <vm/seg_kmem.h>
69 #include <vm/seg_map.h>
70 #include <vm/seg_spt.h>
71 #include <vm/page.h>
72
73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
74
75 static struct kmem_cache *as_cache;
76
77 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
78 static void as_clearwatchprot(struct as *, caddr_t, size_t);
79 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
80
81
82 /*
 * Verifying the segment lists is very time-consuming; it may not always
 * be desirable to define VERIFY_SEGLIST when DEBUG is set.
85 */
86 #ifdef DEBUG
87 #define VERIFY_SEGLIST
88 int do_as_verify = 0;
89 #endif
90
91 /*
92 * Allocate a new callback data structure entry and fill in the events of
93 * interest, the address range of interest, and the callback argument.
94 * Link the entry on the as->a_callbacks list. A callback entry for the
95 * entire address space may be specified with vaddr = 0 and size = -1.
96 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g., pages being locked within the as
100 * will guarantee persistence).
101 */
102 int
103 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
104 caddr_t vaddr, size_t size, int sleepflag)
105 {
106 struct as_callback *current_head, *cb;
107 caddr_t saddr;
108 size_t rsize;
109
110 /* callback function and an event are mandatory */
111 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
112 return (EINVAL);
113
114 /* Adding a callback after as_free has been called is not allowed */
115 if (as == &kas)
116 return (ENOMEM);
117
118 /*
119 * vaddr = 0 and size = -1 is used to indicate that the callback range
120 * is the entire address space so no rounding is done in that case.
121 */
122 if (size != -1) {
123 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
124 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
125 (size_t)saddr;
126 /* check for wraparound */
127 if (saddr + rsize < saddr)
128 return (ENOMEM);
129 } else {
130 if (vaddr != 0)
131 return (EINVAL);
132 saddr = vaddr;
133 rsize = size;
134 }
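
	/*
	 * Example of the rounding above (assuming, hypothetically, 4K
	 * pages): vaddr = 0x12f00 and size = 0x200 cover [0x12f00,
	 * 0x13100), so saddr = 0x12000 and rsize = 0x2000 (two pages).
	 */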
135
136 /* Allocate and initialize a callback entry */
137 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
138 if (cb == NULL)
139 return (EAGAIN);
140
141 cb->ascb_func = cb_func;
142 cb->ascb_arg = arg;
143 cb->ascb_events = events;
144 cb->ascb_saddr = saddr;
145 cb->ascb_len = rsize;
146
147 /* Add the entry to the list */
148 mutex_enter(&as->a_contents);
149 current_head = as->a_callbacks;
150 as->a_callbacks = cb;
151 cb->ascb_next = current_head;
152
153 /*
	 * The call to this function may lose a race with
	 * a pertinent event - e.g., a thread does long-term memory locking
	 * but before the callback is added another thread executes as_unmap.
	 * A broadcast here resolves that.
158 */
159 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
160 AS_CLRUNMAPWAIT(as);
161 cv_broadcast(&as->a_cv);
162 }
163
164 mutex_exit(&as->a_contents);
165 return (0);
166 }
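
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * a driver that keeps pages locked for a long time can register for unmap
 * events so that it can release its locks on demand.  The callback signature
 * matches the way as_execute_callback() invokes it, and the callback (or some
 * other thread) must eventually call as_delete_callback() so that the waiter
 * in as_execute_callback() is released.  xx_unlock_cb, xx_state_t,
 * xx_release_locked_pages and xsp are hypothetical names.
 *
 *	static void
 *	xx_unlock_cb(struct as *as, void *arg, uint_t events)
 *	{
 *		xx_state_t *xsp = arg;
 *
 *		xx_release_locked_pages(xsp);
 *		(void) as_delete_callback(as, arg);
 *	}
 *
 *	error = as_add_callback(as, xx_unlock_cb, xsp,
 *	    AS_UNMAP_EVENT | AS_UNMAPWAIT_EVENT, vaddr, size, KM_SLEEP);
 */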
167
168 /*
169 * Search the callback list for an entry which pertains to arg.
170 *
171 * This is called from within the client upon completion of the callback.
172 * RETURN VALUES:
173 * AS_CALLBACK_DELETED (callback entry found and deleted)
174 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
175 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
176 * entry will be made in as_do_callbacks)
177 *
178 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
179 * set, it indicates that as_do_callbacks is processing this entry. The
180 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
181 * to unblock as_do_callbacks, in case it is blocked.
182 *
 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
 * the specified as, the caller must guarantee persistence of the specified as
 * for the duration of this function (e.g., pages being locked within the as
186 * will guarantee persistence).
187 */
188 uint_t
189 as_delete_callback(struct as *as, void *arg)
190 {
191 struct as_callback **prevcb = &as->a_callbacks;
192 struct as_callback *cb;
193 uint_t rc = AS_CALLBACK_NOTFOUND;
194
195 mutex_enter(&as->a_contents);
196 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
197 if (cb->ascb_arg != arg)
198 continue;
199
200 /*
201 * If the events indicate AS_CALLBACK_CALLED, just clear
202 * AS_ALL_EVENT in the events field and wakeup the thread
203 * that may be waiting in as_do_callbacks. as_do_callbacks
204 * will take care of removing this entry from the list. In
205 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
206 * (AS_CALLBACK_CALLED not set), just remove it from the
207 * list, return the memory and return AS_CALLBACK_DELETED.
208 */
209 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
210 /* leave AS_CALLBACK_CALLED */
211 cb->ascb_events &= ~AS_ALL_EVENT;
212 rc = AS_CALLBACK_DELETE_DEFERRED;
213 cv_broadcast(&as->a_cv);
214 } else {
215 *prevcb = cb->ascb_next;
216 kmem_free(cb, sizeof (struct as_callback));
217 rc = AS_CALLBACK_DELETED;
218 }
219 break;
220 }
221 mutex_exit(&as->a_contents);
222 return (rc);
223 }
224
225 /*
226 * Searches the as callback list for a matching entry.
227 * Returns a pointer to the first matching callback, or NULL if
228 * nothing is found.
229 * This function never sleeps so it is ok to call it with more
 * locks held than the (required) a_contents mutex.
231 *
232 * See also comment on as_do_callbacks below.
233 */
234 static struct as_callback *
235 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
236 size_t event_len)
237 {
238 struct as_callback *cb;
239
240 ASSERT(MUTEX_HELD(&as->a_contents));
241 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
242 /*
243 * If the callback has not already been called, then
244 * check if events or address range pertains. An event_len
245 * of zero means do an unconditional callback.
246 */
247 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
248 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
249 (event_addr + event_len < cb->ascb_saddr) ||
250 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
251 continue;
252 }
253 break;
254 }
255 return (cb);
256 }
257
258 /*
259 * Executes a given callback and removes it from the callback list for
260 * this address space.
261 * This function may sleep so the caller must drop all locks except
262 * a_contents before calling this func.
263 *
264 * See also comments on as_do_callbacks below.
265 */
266 static void
267 as_execute_callback(struct as *as, struct as_callback *cb,
268 uint_t events)
269 {
270 struct as_callback **prevcb;
271 void *cb_arg;
272
273 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
274 cb->ascb_events |= AS_CALLBACK_CALLED;
275 mutex_exit(&as->a_contents);
276 (*cb->ascb_func)(as, cb->ascb_arg, events);
277 mutex_enter(&as->a_contents);
278 /*
	 * The callback function is required to delete the callback
	 * when it determines that it is OK for
	 * this thread to continue. as_delete_callback will clear
	 * AS_ALL_EVENT in the events field when the entry is deleted.
283 * If the callback function called as_delete_callback,
284 * events will already be cleared and there will be no blocking.
285 */
286 while ((cb->ascb_events & events) != 0) {
287 cv_wait(&as->a_cv, &as->a_contents);
288 }
289 /*
290 * This entry needs to be taken off the list. Normally, the
291 * callback func itself does that, but unfortunately the list
292 * may have changed while the callback was running because the
293 * a_contents mutex was dropped and someone else other than the
294 * callback func itself could have called as_delete_callback,
295 * so we have to search to find this entry again. The entry
296 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
297 */
298 cb_arg = cb->ascb_arg;
299 prevcb = &as->a_callbacks;
300 for (cb = as->a_callbacks; cb != NULL;
301 prevcb = &cb->ascb_next, cb = *prevcb) {
302 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
303 (cb_arg != cb->ascb_arg)) {
304 continue;
305 }
306 *prevcb = cb->ascb_next;
307 kmem_free(cb, sizeof (struct as_callback));
308 break;
309 }
310 }
311
312 /*
313 * Check the callback list for a matching event and intersection of
 * address range. If there is a match, invoke the callback. Skip an entry if:
 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
 * - the event is not of interest
 * - the address range is not of interest
318 *
319 * An event_len of zero indicates a request for an unconditional callback
 * (regardless of event); only AS_CALLBACK_CALLED is checked. The
321 * a_contents lock must be dropped before a callback, so only one callback
322 * can be done before returning. Return -1 (true) if a callback was
323 * executed and removed from the list, else return 0 (false).
324 *
325 * The logically separate parts, i.e. finding a matching callback and
326 * executing a given callback have been separated into two functions
327 * so that they can be called with different sets of locks held beyond
328 * the always-required a_contents. as_find_callback does not sleep so
329 * it is ok to call it if more locks than a_contents (i.e. the a_lock
330 * rwlock) are held. as_execute_callback on the other hand may sleep
331 * so all locks beyond a_contents must be dropped by the caller if one
 * does not want to end up comatose.
333 */
334 static int
335 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
336 size_t event_len)
337 {
338 struct as_callback *cb;
339
340 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
341 as_execute_callback(as, cb, events);
342 return (-1);
343 }
344 return (0);
345 }
346
347 /*
348 * Search for the segment containing addr. If a segment containing addr
349 * exists, that segment is returned. If no such segment exists, and
350 * the list spans addresses greater than addr, then the first segment
351 * whose base is greater than addr is returned; otherwise, NULL is
352 * returned unless tail is true, in which case the last element of the
353 * list is returned.
354 *
355 * a_seglast is used to cache the last found segment for repeated
356 * searches to the same addr (which happens frequently).
357 */
358 struct seg *
359 as_findseg(struct as *as, caddr_t addr, int tail)
360 {
361 struct seg *seg = as->a_seglast;
362 avl_index_t where;
363
364 ASSERT(AS_LOCK_HELD(as));
365
366 if (seg != NULL &&
367 seg->s_base <= addr &&
368 addr < seg->s_base + seg->s_size)
369 return (seg);
370
371 seg = avl_find(&as->a_segtree, &addr, &where);
372 if (seg != NULL)
373 return (as->a_seglast = seg);
374
375 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
376 if (seg == NULL && tail)
377 seg = avl_last(&as->a_segtree);
378 return (as->a_seglast = seg);
379 }
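
/*
 * Example (hypothetical addresses): with segments at [0x10000, 0x20000) and
 * [0x40000, 0x50000), as_findseg(as, (caddr_t)0x15000, 0) returns the first
 * segment, as_findseg(as, (caddr_t)0x30000, 0) returns the second (the first
 * segment whose base is greater than addr), and
 * as_findseg(as, (caddr_t)0x60000, 0) returns NULL while
 * as_findseg(as, (caddr_t)0x60000, 1) returns the second (last) segment.
 */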
380
381 #ifdef VERIFY_SEGLIST
382 /*
383 * verify that the linked list is coherent
384 */
385 static void
386 as_verify(struct as *as)
387 {
388 struct seg *seg, *seglast, *p, *n;
389 uint_t nsegs = 0;
390
391 if (do_as_verify == 0)
392 return;
393
394 seglast = as->a_seglast;
395
396 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
397 ASSERT(seg->s_as == as);
398 p = AS_SEGPREV(as, seg);
399 n = AS_SEGNEXT(as, seg);
400 ASSERT(p == NULL || p->s_as == as);
401 ASSERT(p == NULL || p->s_base < seg->s_base);
402 ASSERT(n == NULL || n->s_base > seg->s_base);
403 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
404 if (seg == seglast)
405 seglast = NULL;
406 nsegs++;
407 }
408 ASSERT(seglast == NULL);
409 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
410 }
411 #endif /* VERIFY_SEGLIST */
412
413 /*
414 * Add a new segment to the address space. The avl_find()
 * may be expensive so we attempt to use the last segment accessed
416 * in as_gap() as an insertion point.
417 */
418 int
419 as_addseg(struct as *as, struct seg *newseg)
420 {
421 struct seg *seg;
422 caddr_t addr;
423 caddr_t eaddr;
424 avl_index_t where;
425
426 ASSERT(AS_WRITE_HELD(as));
427
428 as->a_updatedir = 1; /* inform /proc */
429 gethrestime(&as->a_updatetime);
430
431 if (as->a_lastgaphl != NULL) {
432 struct seg *hseg = NULL;
433 struct seg *lseg = NULL;
434
435 if (as->a_lastgaphl->s_base > newseg->s_base) {
436 hseg = as->a_lastgaphl;
437 lseg = AVL_PREV(&as->a_segtree, hseg);
438 } else {
439 lseg = as->a_lastgaphl;
440 hseg = AVL_NEXT(&as->a_segtree, lseg);
441 }
442
443 if (hseg && lseg && lseg->s_base < newseg->s_base &&
444 hseg->s_base > newseg->s_base) {
445 avl_insert_here(&as->a_segtree, newseg, lseg,
446 AVL_AFTER);
447 as->a_lastgaphl = NULL;
448 as->a_seglast = newseg;
449 return (0);
450 }
451 as->a_lastgaphl = NULL;
452 }
453
454 addr = newseg->s_base;
455 eaddr = addr + newseg->s_size;
456 again:
457
458 seg = avl_find(&as->a_segtree, &addr, &where);
459
460 if (seg == NULL)
461 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
462
463 if (seg == NULL)
464 seg = avl_last(&as->a_segtree);
465
466 if (seg != NULL) {
467 caddr_t base = seg->s_base;
468
469 /*
470 * If top of seg is below the requested address, then
471 * the insertion point is at the end of the linked list,
472 * and seg points to the tail of the list. Otherwise,
473 * the insertion point is immediately before seg.
474 */
475 if (base + seg->s_size > addr) {
476 if (addr >= base || eaddr > base) {
477 #ifdef __sparc
478 extern struct seg_ops segnf_ops;
479
480 /*
481 * no-fault segs must disappear if overlaid.
482 * XXX need new segment type so
483 * we don't have to check s_ops
484 */
485 if (seg->s_ops == &segnf_ops) {
486 seg_unmap(seg);
487 goto again;
488 }
489 #endif
490 return (-1); /* overlapping segment */
491 }
492 }
493 }
494 as->a_seglast = newseg;
495 avl_insert(&as->a_segtree, newseg, where);
496
497 #ifdef VERIFY_SEGLIST
498 as_verify(as);
499 #endif
500 return (0);
501 }
502
503 struct seg *
504 as_removeseg(struct as *as, struct seg *seg)
505 {
506 avl_tree_t *t;
507
508 ASSERT(AS_WRITE_HELD(as));
509
510 as->a_updatedir = 1; /* inform /proc */
511 gethrestime(&as->a_updatetime);
512
513 if (seg == NULL)
514 return (NULL);
515
516 t = &as->a_segtree;
517 if (as->a_seglast == seg)
518 as->a_seglast = NULL;
519 as->a_lastgaphl = NULL;
520
521 /*
522 * if this segment is at an address higher than
523 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
524 */
525 if (as->a_lastgap &&
526 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
527 as->a_lastgap = AVL_NEXT(t, seg);
528
529 /*
530 * remove the segment from the seg tree
531 */
532 avl_remove(t, seg);
533
534 #ifdef VERIFY_SEGLIST
535 as_verify(as);
536 #endif
537 return (seg);
538 }
539
540 /*
541 * Find a segment containing addr.
542 */
543 struct seg *
544 as_segat(struct as *as, caddr_t addr)
545 {
546 struct seg *seg = as->a_seglast;
547
548 ASSERT(AS_LOCK_HELD(as));
549
550 if (seg != NULL && seg->s_base <= addr &&
551 addr < seg->s_base + seg->s_size)
552 return (seg);
553
554 seg = avl_find(&as->a_segtree, &addr, NULL);
555 return (seg);
556 }
557
558 /*
559 * Serialize all searches for holes in an address space to
560 * prevent two or more threads from allocating the same virtual
561 * address range. The address space must not be "read/write"
562 * locked by the caller since we may block.
563 */
564 void
565 as_rangelock(struct as *as)
566 {
567 mutex_enter(&as->a_contents);
568 while (AS_ISCLAIMGAP(as))
569 cv_wait(&as->a_cv, &as->a_contents);
570 AS_SETCLAIMGAP(as);
571 mutex_exit(&as->a_contents);
572 }
573
574 /*
575 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
576 */
577 void
578 as_rangeunlock(struct as *as)
579 {
580 mutex_enter(&as->a_contents);
581 AS_CLRCLAIMGAP(as);
582 cv_signal(&as->a_cv);
583 mutex_exit(&as->a_contents);
584 }
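
/*
 * Illustrative sketch of the usual caller pattern (hypothetical locals and a
 * hypothetical crargs; see the mmap()/shmat() paths, e.g. smmap_common(), for
 * the real thing).  The hole search and the subsequent as_map() are bracketed
 * by as_rangelock()/as_rangeunlock() so that two threads cannot pick the same
 * gap:
 *
 *	as_rangelock(as);
 *	if (addr == NULL) {
 *		map_addr(&addr, len, off, 1, flags);
 *		if (addr == NULL) {
 *			as_rangeunlock(as);
 *			return (ENOMEM);
 *		}
 *	}
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */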
585
586 /*
 * compare segments (or just an address) by segment address range
588 */
589 static int
590 as_segcompar(const void *x, const void *y)
591 {
592 struct seg *a = (struct seg *)x;
593 struct seg *b = (struct seg *)y;
594
595 if (a->s_base < b->s_base)
596 return (-1);
597 if (a->s_base >= b->s_base + b->s_size)
598 return (1);
599 return (0);
600 }
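
/*
 * Note that an address that falls anywhere within a segment compares equal
 * to that segment, which is what lets as_segat() and as_findseg() pass a
 * bare &addr to avl_find() as the search key.  This relies on s_base being
 * the first member of struct seg, so that a caddr_t * can stand in for a
 * struct seg * whose only valid field is s_base.
 */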
601
602
603 void
604 as_avlinit(struct as *as)
605 {
606 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
607 offsetof(struct seg, s_tree));
608 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
609 offsetof(struct watched_page, wp_link));
610 }
611
612 /*ARGSUSED*/
613 static int
614 as_constructor(void *buf, void *cdrarg, int kmflags)
615 {
616 struct as *as = buf;
617
618 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
619 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
620 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
621 as_avlinit(as);
622 return (0);
623 }
624
625 /*ARGSUSED1*/
626 static void
627 as_destructor(void *buf, void *cdrarg)
628 {
629 struct as *as = buf;
630
631 avl_destroy(&as->a_segtree);
632 mutex_destroy(&as->a_contents);
633 cv_destroy(&as->a_cv);
634 rw_destroy(&as->a_lock);
635 }
636
637 void
638 as_init(void)
639 {
640 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
641 as_constructor, as_destructor, NULL, NULL, NULL, 0);
642 }
643
644 /*
645 * Allocate and initialize an address space data structure.
646 * We call hat_alloc to allow any machine dependent
647 * information in the hat structure to be initialized.
648 */
649 struct as *
650 as_alloc(void)
651 {
652 struct as *as;
653
654 as = kmem_cache_alloc(as_cache, KM_SLEEP);
655
656 as->a_flags = 0;
657 as->a_vbits = 0;
658 as->a_hrm = NULL;
659 as->a_seglast = NULL;
660 as->a_size = 0;
661 as->a_resvsize = 0;
662 as->a_updatedir = 0;
663 gethrestime(&as->a_updatetime);
664 as->a_objectdir = NULL;
665 as->a_sizedir = 0;
666 as->a_userlimit = (caddr_t)USERLIMIT;
667 as->a_lastgap = NULL;
668 as->a_lastgaphl = NULL;
669 as->a_callbacks = NULL;
670 as->a_proc = NULL;
671
672 AS_LOCK_ENTER(as, RW_WRITER);
673 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
674 AS_LOCK_EXIT(as);
675
676 return (as);
677 }
678
679 /*
680 * Free an address space data structure.
 * We need to free the hat first, then all the segments
 * on this as, and finally the space for the as struct itself.
684 */
685 void
686 as_free(struct as *as)
687 {
688 struct hat *hat = as->a_hat;
689 struct seg *seg, *next;
690 boolean_t free_started = B_FALSE;
691
692 top:
693 /*
694 * Invoke ALL callbacks. as_do_callbacks will do one callback
695 * per call, and not return (-1) until the callback has completed.
696 * When as_do_callbacks returns zero, all callbacks have completed.
697 */
698 mutex_enter(&as->a_contents);
699 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
700 ;
701
702 mutex_exit(&as->a_contents);
703 AS_LOCK_ENTER(as, RW_WRITER);
704
705 if (!free_started) {
706 free_started = B_TRUE;
707 hat_free_start(hat);
708 }
709 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
710 int err;
711
712 next = AS_SEGNEXT(as, seg);
713 retry:
714 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
715 if (err == EAGAIN) {
716 mutex_enter(&as->a_contents);
717 if (as->a_callbacks) {
718 AS_LOCK_EXIT(as);
719 } else if (!AS_ISNOUNMAPWAIT(as)) {
720 /*
721 * Memory is currently locked. Wait for a
722 * cv_signal that it has been unlocked, then
723 * try the operation again.
724 */
725 if (AS_ISUNMAPWAIT(as) == 0)
726 cv_broadcast(&as->a_cv);
727 AS_SETUNMAPWAIT(as);
728 AS_LOCK_EXIT(as);
729 while (AS_ISUNMAPWAIT(as))
730 cv_wait(&as->a_cv, &as->a_contents);
731 } else {
732 /*
733 * We may have raced with
734 * segvn_reclaim()/segspt_reclaim(). In this
735 * case clean nounmapwait flag and retry since
736 * softlockcnt in this segment may be already
737 * 0. We don't drop as writer lock so our
738 * number of retries without sleeping should
739 * be very small. See segvn_reclaim() for
740 * more comments.
741 */
742 AS_CLRNOUNMAPWAIT(as);
743 mutex_exit(&as->a_contents);
744 goto retry;
745 }
746 mutex_exit(&as->a_contents);
747 goto top;
748 } else {
749 /*
750 * We do not expect any other error return at this
751 * time. This is similar to an ASSERT in seg_unmap()
752 */
753 ASSERT(err == 0);
754 }
755 }
756 hat_free_end(hat);
757 AS_LOCK_EXIT(as);
758
759 /* /proc stuff */
760 ASSERT(avl_numnodes(&as->a_wpage) == 0);
761 if (as->a_objectdir) {
762 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
763 as->a_objectdir = NULL;
764 as->a_sizedir = 0;
765 }
766
767 /*
768 * Free the struct as back to kmem. Assert it has no segments.
769 */
770 ASSERT(avl_numnodes(&as->a_segtree) == 0);
771 kmem_cache_free(as_cache, as);
772 }
773
774 int
775 as_dup(struct as *as, struct proc *forkedproc)
776 {
777 struct as *newas;
778 struct seg *seg, *newseg;
779 size_t purgesize = 0;
780 int error;
781
782 AS_LOCK_ENTER(as, RW_WRITER);
783 as_clearwatch(as);
784 newas = as_alloc();
785 newas->a_userlimit = as->a_userlimit;
786 newas->a_proc = forkedproc;
787
788 AS_LOCK_ENTER(newas, RW_WRITER);
789
790 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
791
792 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
793
794 if (seg->s_flags & S_PURGE) {
795 purgesize += seg->s_size;
796 continue;
797 }
798
799 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
800 if (newseg == NULL) {
801 AS_LOCK_EXIT(newas);
802 as_setwatch(as);
803 AS_LOCK_EXIT(as);
804 as_free(newas);
805 return (-1);
806 }
807 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
808 /*
809 * We call seg_free() on the new seg
810 * because the segment is not set up
811 * completely; i.e. it has no ops.
812 */
813 as_setwatch(as);
814 AS_LOCK_EXIT(as);
815 seg_free(newseg);
816 AS_LOCK_EXIT(newas);
817 as_free(newas);
818 return (error);
819 }
820 newas->a_size += seg->s_size;
821 }
822 newas->a_resvsize = as->a_resvsize - purgesize;
823
824 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
825
826 AS_LOCK_EXIT(newas);
827
828 as_setwatch(as);
829 AS_LOCK_EXIT(as);
830 if (error != 0) {
831 as_free(newas);
832 return (error);
833 }
834 forkedproc->p_as = newas;
835 return (0);
836 }
837
838 /*
839 * Handle a ``fault'' at addr for size bytes.
840 */
841 faultcode_t
842 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
843 enum fault_type type, enum seg_rw rw)
844 {
845 struct seg *seg;
846 caddr_t raddr; /* rounded down addr */
847 size_t rsize; /* rounded up size */
848 size_t ssize;
849 faultcode_t res = 0;
850 caddr_t addrsav;
851 struct seg *segsav;
852 int as_lock_held;
853 klwp_t *lwp = ttolwp(curthread);
854 zone_t *zonep = curzone;
855
856 retry:
857 /*
858 * Indicate that the lwp is not to be stopped while waiting for a
859 * pagefault. This is to avoid deadlock while debugging a process
860 * via /proc over NFS (in particular).
861 */
862 if (lwp != NULL)
863 lwp->lwp_nostop++;
864
865 /*
 * The same length must be used when we softlock and softunlock. We
867 * don't support softunlocking lengths less than the original length
868 * when there is largepage support. See seg_dev.c for more
869 * comments.
870 */
871 switch (type) {
872
873 case F_SOFTLOCK:
874 CPU_STATS_ADD_K(vm, softlock, 1);
875 break;
876
877 case F_SOFTUNLOCK:
878 break;
879
880 case F_PROT:
881 CPU_STATS_ADD_K(vm, prot_fault, 1);
882 break;
883
884 case F_INVAL:
885 CPU_STATS_ENTER_K();
886 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
887 if (as == &kas)
888 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
889 CPU_STATS_EXIT_K();
890 if (zonep->zone_pg_flt_delay != 0) {
891 /*
892 * The zone in which this process is running
 * is currently over its physical memory cap.
894 * Throttle page faults to help the user-land
895 * memory capper catch up. Note that
896 * drv_usectohz() rounds up.
897 */
898 atomic_add_64(&zonep->zone_pf_throttle, 1);
899 atomic_add_64(&zonep->zone_pf_throttle_usec,
900 zonep->zone_pg_flt_delay);
901 if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1))
902 drv_usecwait(zonep->zone_pg_flt_delay);
903 else
904 delay(drv_usectohz(zonep->zone_pg_flt_delay));
905 }
906 break;
907 }
908
909 /* Kernel probe */
910 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
911 tnf_opaque, address, addr,
912 tnf_fault_type, fault_type, type,
913 tnf_seg_access, access, rw);
914
915 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
916 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
917 (size_t)raddr;
918
919 /*
920 * XXX -- Don't grab the as lock for segkmap. We should grab it for
921 * correctness, but then we could be stuck holding this lock for
922 * a LONG time if the fault needs to be resolved on a slow
923 * filesystem, and then no-one will be able to exec new commands,
924 * as exec'ing requires the write lock on the as.
925 */
926 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
927 raddr + size < segkmap->s_base + segkmap->s_size) {
928 seg = segkmap;
929 as_lock_held = 0;
930 } else {
931 AS_LOCK_ENTER(as, RW_READER);
932
933 seg = as_segat(as, raddr);
934 if (seg == NULL) {
935 AS_LOCK_EXIT(as);
936 if (lwp != NULL)
937 lwp->lwp_nostop--;
938 return (FC_NOMAP);
939 }
940
941 as_lock_held = 1;
942 }
943
944 addrsav = raddr;
945 segsav = seg;
946
947 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
948 if (raddr >= seg->s_base + seg->s_size) {
949 seg = AS_SEGNEXT(as, seg);
950 if (seg == NULL || raddr != seg->s_base) {
951 res = FC_NOMAP;
952 break;
953 }
954 }
955 if (raddr + rsize > seg->s_base + seg->s_size)
956 ssize = seg->s_base + seg->s_size - raddr;
957 else
958 ssize = rsize;
959
960 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
961 if (res != 0)
962 break;
963 }
964
965 /*
966 * If we were SOFTLOCKing and encountered a failure,
967 * we must SOFTUNLOCK the range we already did. (Maybe we
968 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
969 * right here...)
970 */
971 if (res != 0 && type == F_SOFTLOCK) {
972 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
973 if (addrsav >= seg->s_base + seg->s_size)
974 seg = AS_SEGNEXT(as, seg);
975 ASSERT(seg != NULL);
976 /*
977 * Now call the fault routine again to perform the
978 * unlock using S_OTHER instead of the rw variable
979 * since we never got a chance to touch the pages.
980 */
981 if (raddr > seg->s_base + seg->s_size)
982 ssize = seg->s_base + seg->s_size - addrsav;
983 else
984 ssize = raddr - addrsav;
985 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
986 F_SOFTUNLOCK, S_OTHER);
987 }
988 }
989 if (as_lock_held)
990 AS_LOCK_EXIT(as);
991 if (lwp != NULL)
992 lwp->lwp_nostop--;
993
994 /*
995 * If the lower levels returned EDEADLK for a fault,
 * it means that we should retry the fault. Let's also wait
 * a bit to let the deadlock-causing condition clear.
998 * This is part of a gross hack to work around a design flaw
999 * in the ufs/sds logging code and should go away when the
1000 * logging code is re-designed to fix the problem. See bug
1001 * 4125102 for details of the problem.
1002 */
1003 if (FC_ERRNO(res) == EDEADLK) {
1004 delay(deadlk_wait);
1005 res = 0;
1006 goto retry;
1007 }
1008 return (res);
1009 }
1010
1011
1012
1013 /*
1014 * Asynchronous ``fault'' at addr for size bytes.
1015 */
1016 faultcode_t
1017 as_faulta(struct as *as, caddr_t addr, size_t size)
1018 {
1019 struct seg *seg;
1020 caddr_t raddr; /* rounded down addr */
1021 size_t rsize; /* rounded up size */
1022 faultcode_t res = 0;
1023 klwp_t *lwp = ttolwp(curthread);
1024
1025 retry:
1026 /*
1027 * Indicate that the lwp is not to be stopped while waiting
1028 * for a pagefault. This is to avoid deadlock while debugging
1029 * a process via /proc over NFS (in particular).
1030 */
1031 if (lwp != NULL)
1032 lwp->lwp_nostop++;
1033
1034 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1035 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1036 (size_t)raddr;
1037
1038 AS_LOCK_ENTER(as, RW_READER);
1039 seg = as_segat(as, raddr);
1040 if (seg == NULL) {
1041 AS_LOCK_EXIT(as);
1042 if (lwp != NULL)
1043 lwp->lwp_nostop--;
1044 return (FC_NOMAP);
1045 }
1046
1047 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1048 if (raddr >= seg->s_base + seg->s_size) {
1049 seg = AS_SEGNEXT(as, seg);
1050 if (seg == NULL || raddr != seg->s_base) {
1051 res = FC_NOMAP;
1052 break;
1053 }
1054 }
1055 res = SEGOP_FAULTA(seg, raddr);
1056 if (res != 0)
1057 break;
1058 }
1059 AS_LOCK_EXIT(as);
1060 if (lwp != NULL)
1061 lwp->lwp_nostop--;
1062 /*
1063 * If the lower levels returned EDEADLK for a fault,
 * it means that we should retry the fault. Let's also wait
 * a bit to let the deadlock-causing condition clear.
1066 * This is part of a gross hack to work around a design flaw
1067 * in the ufs/sds logging code and should go away when the
1068 * logging code is re-designed to fix the problem. See bug
1069 * 4125102 for details of the problem.
1070 */
1071 if (FC_ERRNO(res) == EDEADLK) {
1072 delay(deadlk_wait);
1073 res = 0;
1074 goto retry;
1075 }
1076 return (res);
1077 }
1078
1079 /*
1080 * Set the virtual mapping for the interval from [addr : addr + size)
1081 * in address space `as' to have the specified protection.
1082 * It is ok for the range to cross over several segments,
1083 * as long as they are contiguous.
1084 */
1085 int
1086 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1087 {
1088 struct seg *seg;
1089 struct as_callback *cb;
1090 size_t ssize;
1091 caddr_t raddr; /* rounded down addr */
1092 size_t rsize; /* rounded up size */
1093 int error = 0, writer = 0;
1094 caddr_t saveraddr;
1095 size_t saversize;
1096
1097 setprot_top:
1098 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1099 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1100 (size_t)raddr;
1101
1102 if (raddr + rsize < raddr) /* check for wraparound */
1103 return (ENOMEM);
1104
1105 saveraddr = raddr;
1106 saversize = rsize;
1107
1108 /*
1109 * Normally we only lock the as as a reader. But
1110 * if due to setprot the segment driver needs to split
1111 * a segment it will return IE_RETRY. Therefore we re-acquire
1112 * the as lock as a writer so the segment driver can change
1113 * the seg list. Also the segment driver will return IE_RETRY
 * after it has changed the segment list, so we keep
 * locking as a writer. Since these operations should be rare,
 * we want to lock as a writer only when necessary.
1117 */
1118 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1119 AS_LOCK_ENTER(as, RW_WRITER);
1120 } else {
1121 AS_LOCK_ENTER(as, RW_READER);
1122 }
1123
1124 as_clearwatchprot(as, raddr, rsize);
1125 seg = as_segat(as, raddr);
1126 if (seg == NULL) {
1127 as_setwatch(as);
1128 AS_LOCK_EXIT(as);
1129 return (ENOMEM);
1130 }
1131
1132 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1133 if (raddr >= seg->s_base + seg->s_size) {
1134 seg = AS_SEGNEXT(as, seg);
1135 if (seg == NULL || raddr != seg->s_base) {
1136 error = ENOMEM;
1137 break;
1138 }
1139 }
1140 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1141 ssize = seg->s_base + seg->s_size - raddr;
1142 else
1143 ssize = rsize;
1144 retry:
1145 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1146
1147 if (error == IE_NOMEM) {
1148 error = EAGAIN;
1149 break;
1150 }
1151
1152 if (error == IE_RETRY) {
1153 AS_LOCK_EXIT(as);
1154 writer = 1;
1155 goto setprot_top;
1156 }
1157
1158 if (error == EAGAIN) {
1159 /*
1160 * Make sure we have a_lock as writer.
1161 */
1162 if (writer == 0) {
1163 AS_LOCK_EXIT(as);
1164 writer = 1;
1165 goto setprot_top;
1166 }
1167
1168 /*
1169 * Memory is currently locked. It must be unlocked
1170 * before this operation can succeed through a retry.
1171 * The possible reasons for locked memory and
1172 * corresponding strategies for unlocking are:
1173 * (1) Normal I/O
1174 * wait for a signal that the I/O operation
1175 * has completed and the memory is unlocked.
1176 * (2) Asynchronous I/O
1177 * The aio subsystem does not unlock pages when
1178 * the I/O is completed. Those pages are unlocked
1179 * when the application calls aiowait/aioerror.
1180 * So, to prevent blocking forever, cv_broadcast()
1181 * is done to wake up aio_cleanup_thread.
1182 * Subsequently, segvn_reclaim will be called, and
1183 * that will do AS_CLRUNMAPWAIT() and wake us up.
1184 * (3) Long term page locking:
1185 * Drivers intending to have pages locked for a
1186 * period considerably longer than for normal I/O
1187 * (essentially forever) may have registered for a
1188 * callback so they may unlock these pages on
1189 * request. This is needed to allow this operation
1190 * to succeed. Each entry on the callback list is
 * examined. If the event or address range pertains,
1192 * the callback is invoked (unless it already is in
1193 * progress). The a_contents lock must be dropped
1194 * before the callback, so only one callback can
1195 * be done at a time. Go to the top and do more
1196 * until zero is returned. If zero is returned,
1197 * either there were no callbacks for this event
1198 * or they were already in progress.
1199 */
1200 mutex_enter(&as->a_contents);
1201 if (as->a_callbacks &&
1202 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1203 seg->s_base, seg->s_size))) {
1204 AS_LOCK_EXIT(as);
1205 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1206 } else if (!AS_ISNOUNMAPWAIT(as)) {
1207 if (AS_ISUNMAPWAIT(as) == 0)
1208 cv_broadcast(&as->a_cv);
1209 AS_SETUNMAPWAIT(as);
1210 AS_LOCK_EXIT(as);
1211 while (AS_ISUNMAPWAIT(as))
1212 cv_wait(&as->a_cv, &as->a_contents);
1213 } else {
1214 /*
1215 * We may have raced with
1216 * segvn_reclaim()/segspt_reclaim(). In this
1217 * case clean nounmapwait flag and retry since
1218 * softlockcnt in this segment may be already
1219 * 0. We don't drop as writer lock so our
1220 * number of retries without sleeping should
1221 * be very small. See segvn_reclaim() for
1222 * more comments.
1223 */
1224 AS_CLRNOUNMAPWAIT(as);
1225 mutex_exit(&as->a_contents);
1226 goto retry;
1227 }
1228 mutex_exit(&as->a_contents);
1229 goto setprot_top;
1230 } else if (error != 0)
1231 break;
1232 }
1233 if (error != 0) {
1234 as_setwatch(as);
1235 } else {
1236 as_setwatchprot(as, saveraddr, saversize, prot);
1237 }
1238 AS_LOCK_EXIT(as);
1239 return (error);
1240 }
1241
1242 /*
1243 * Check to make sure that the interval [addr, addr + size)
1244 * in address space `as' has at least the specified protection.
1245 * It is ok for the range to cross over several segments, as long
1246 * as they are contiguous.
1247 */
1248 int
1249 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1250 {
1251 struct seg *seg;
1252 size_t ssize;
1253 caddr_t raddr; /* rounded down addr */
1254 size_t rsize; /* rounded up size */
1255 int error = 0;
1256
1257 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1258 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1259 (size_t)raddr;
1260
1261 if (raddr + rsize < raddr) /* check for wraparound */
1262 return (ENOMEM);
1263
1264 /*
1265 * This is ugly as sin...
1266 * Normally, we only acquire the address space readers lock.
1267 * However, if the address space has watchpoints present,
1268 * we must acquire the writer lock on the address space for
1269 * the benefit of as_clearwatchprot() and as_setwatchprot().
1270 */
1271 if (avl_numnodes(&as->a_wpage) != 0)
1272 AS_LOCK_ENTER(as, RW_WRITER);
1273 else
1274 AS_LOCK_ENTER(as, RW_READER);
1275 as_clearwatchprot(as, raddr, rsize);
1276 seg = as_segat(as, raddr);
1277 if (seg == NULL) {
1278 as_setwatch(as);
1279 AS_LOCK_EXIT(as);
1280 return (ENOMEM);
1281 }
1282
1283 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1284 if (raddr >= seg->s_base + seg->s_size) {
1285 seg = AS_SEGNEXT(as, seg);
1286 if (seg == NULL || raddr != seg->s_base) {
1287 error = ENOMEM;
1288 break;
1289 }
1290 }
1291 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1292 ssize = seg->s_base + seg->s_size - raddr;
1293 else
1294 ssize = rsize;
1295
1296 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1297 if (error != 0)
1298 break;
1299 }
1300 as_setwatch(as);
1301 AS_LOCK_EXIT(as);
1302 return (error);
1303 }
1304
1305 int
1306 as_unmap(struct as *as, caddr_t addr, size_t size)
1307 {
1308 struct seg *seg, *seg_next;
1309 struct as_callback *cb;
1310 caddr_t raddr, eaddr;
1311 size_t ssize, rsize = 0;
1312 int err;
1313
1314 top:
1315 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1316 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1317 (uintptr_t)PAGEMASK);
1318
1319 AS_LOCK_ENTER(as, RW_WRITER);
1320
1321 as->a_updatedir = 1; /* inform /proc */
1322 gethrestime(&as->a_updatetime);
1323
1324 /*
1325 * Use as_findseg to find the first segment in the range, then
1326 * step through the segments in order, following s_next.
1327 */
1328 as_clearwatchprot(as, raddr, eaddr - raddr);
1329
1330 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1331 if (eaddr <= seg->s_base)
1332 break; /* eaddr was in a gap; all done */
1333
1334 /* this is implied by the test above */
1335 ASSERT(raddr < eaddr);
1336
1337 if (raddr < seg->s_base)
1338 raddr = seg->s_base; /* raddr was in a gap */
1339
1340 if (eaddr > (seg->s_base + seg->s_size))
1341 ssize = seg->s_base + seg->s_size - raddr;
1342 else
1343 ssize = eaddr - raddr;
1344
1345 /*
1346 * Save next segment pointer since seg can be
1347 * destroyed during the segment unmap operation.
1348 */
1349 seg_next = AS_SEGNEXT(as, seg);
1350
1351 /*
1352 * We didn't count /dev/null mappings, so ignore them here.
1353 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1354 * we have to do this check here while we have seg.)
1355 */
1356 rsize = 0;
1357 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1358 !SEG_IS_PARTIAL_RESV(seg))
1359 rsize = ssize;
1360
1361 retry:
1362 err = SEGOP_UNMAP(seg, raddr, ssize);
1363 if (err == EAGAIN) {
1364 /*
1365 * Memory is currently locked. It must be unlocked
1366 * before this operation can succeed through a retry.
1367 * The possible reasons for locked memory and
1368 * corresponding strategies for unlocking are:
1369 * (1) Normal I/O
1370 * wait for a signal that the I/O operation
1371 * has completed and the memory is unlocked.
1372 * (2) Asynchronous I/O
1373 * The aio subsystem does not unlock pages when
1374 * the I/O is completed. Those pages are unlocked
1375 * when the application calls aiowait/aioerror.
1376 * So, to prevent blocking forever, cv_broadcast()
1377 * is done to wake up aio_cleanup_thread.
1378 * Subsequently, segvn_reclaim will be called, and
1379 * that will do AS_CLRUNMAPWAIT() and wake us up.
1380 * (3) Long term page locking:
1381 * Drivers intending to have pages locked for a
1382 * period considerably longer than for normal I/O
1383 * (essentially forever) may have registered for a
1384 * callback so they may unlock these pages on
1385 * request. This is needed to allow this operation
1386 * to succeed. Each entry on the callback list is
 * examined. If the event or address range pertains,
1388 * the callback is invoked (unless it already is in
1389 * progress). The a_contents lock must be dropped
1390 * before the callback, so only one callback can
1391 * be done at a time. Go to the top and do more
1392 * until zero is returned. If zero is returned,
1393 * either there were no callbacks for this event
1394 * or they were already in progress.
1395 */
1396 mutex_enter(&as->a_contents);
1397 if (as->a_callbacks &&
1398 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1399 seg->s_base, seg->s_size))) {
1400 AS_LOCK_EXIT(as);
1401 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1402 } else if (!AS_ISNOUNMAPWAIT(as)) {
1403 if (AS_ISUNMAPWAIT(as) == 0)
1404 cv_broadcast(&as->a_cv);
1405 AS_SETUNMAPWAIT(as);
1406 AS_LOCK_EXIT(as);
1407 while (AS_ISUNMAPWAIT(as))
1408 cv_wait(&as->a_cv, &as->a_contents);
1409 } else {
1410 /*
1411 * We may have raced with
1412 * segvn_reclaim()/segspt_reclaim(). In this
1413 * case clean nounmapwait flag and retry since
1414 * softlockcnt in this segment may be already
1415 * 0. We don't drop as writer lock so our
1416 * number of retries without sleeping should
1417 * be very small. See segvn_reclaim() for
1418 * more comments.
1419 */
1420 AS_CLRNOUNMAPWAIT(as);
1421 mutex_exit(&as->a_contents);
1422 goto retry;
1423 }
1424 mutex_exit(&as->a_contents);
1425 goto top;
1426 } else if (err == IE_RETRY) {
1427 AS_LOCK_EXIT(as);
1428 goto top;
1429 } else if (err) {
1430 as_setwatch(as);
1431 AS_LOCK_EXIT(as);
1432 return (-1);
1433 }
1434
1435 as->a_size -= ssize;
1436 if (rsize)
1437 as->a_resvsize -= rsize;
1438 raddr += ssize;
1439 }
1440 AS_LOCK_EXIT(as);
1441 return (0);
1442 }
1443
1444 static int
1445 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1446 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1447 {
1448 uint_t szc;
1449 uint_t nszc;
1450 int error;
1451 caddr_t a;
1452 caddr_t eaddr;
1453 size_t segsize;
1454 struct seg *seg;
1455 size_t pgsz;
1456 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1457 uint_t save_szcvec;
1458
1459 ASSERT(AS_WRITE_HELD(as));
1460 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1461 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1462 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1463 if (!do_off) {
1464 vn_a->offset = 0;
1465 }
1466
1467 if (szcvec <= 1) {
1468 seg = seg_alloc(as, addr, size);
1469 if (seg == NULL) {
1470 return (ENOMEM);
1471 }
1472 vn_a->szc = 0;
1473 error = (*crfp)(seg, vn_a);
1474 if (error != 0) {
1475 seg_free(seg);
1476 } else {
1477 as->a_size += size;
1478 as->a_resvsize += size;
1479 }
1480 return (error);
1481 }
1482
1483 eaddr = addr + size;
1484 save_szcvec = szcvec;
1485 szcvec >>= 1;
1486 szc = 0;
1487 nszc = 0;
1488 while (szcvec) {
1489 if ((szcvec & 0x1) == 0) {
1490 nszc++;
1491 szcvec >>= 1;
1492 continue;
1493 }
1494 nszc++;
1495 pgsz = page_get_pagesize(nszc);
1496 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1497 if (a != addr) {
1498 ASSERT(a < eaddr);
1499 segsize = a - addr;
1500 seg = seg_alloc(as, addr, segsize);
1501 if (seg == NULL) {
1502 return (ENOMEM);
1503 }
1504 vn_a->szc = szc;
1505 error = (*crfp)(seg, vn_a);
1506 if (error != 0) {
1507 seg_free(seg);
1508 return (error);
1509 }
1510 as->a_size += segsize;
1511 as->a_resvsize += segsize;
1512 *segcreated = 1;
1513 if (do_off) {
1514 vn_a->offset += segsize;
1515 }
1516 addr = a;
1517 }
1518 szc = nszc;
1519 szcvec >>= 1;
1520 }
1521
1522 ASSERT(addr < eaddr);
1523 szcvec = save_szcvec | 1; /* add 8K pages */
1524 while (szcvec) {
1525 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1526 ASSERT(a >= addr);
1527 if (a != addr) {
1528 segsize = a - addr;
1529 seg = seg_alloc(as, addr, segsize);
1530 if (seg == NULL) {
1531 return (ENOMEM);
1532 }
1533 vn_a->szc = szc;
1534 error = (*crfp)(seg, vn_a);
1535 if (error != 0) {
1536 seg_free(seg);
1537 return (error);
1538 }
1539 as->a_size += segsize;
1540 as->a_resvsize += segsize;
1541 *segcreated = 1;
1542 if (do_off) {
1543 vn_a->offset += segsize;
1544 }
1545 addr = a;
1546 }
1547 szcvec &= ~(1 << szc);
1548 if (szcvec) {
1549 szc = highbit(szcvec) - 1;
1550 pgsz = page_get_pagesize(szc);
1551 }
1552 }
1553 ASSERT(addr == eaddr);
1554
1555 return (0);
1556 }
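
/*
 * Worked example of the carving above (hypothetical page sizes of 8K, 64K
 * and 512K for size codes 0, 1 and 2, and szcvec = 0x7): mapping
 * [0x10000, 0x110000) produces three segments -
 *
 *	[0x10000, 0x80000)   szc 1	(ramp up to the first 512K boundary)
 *	[0x80000, 0x100000)  szc 2	(the 512K-aligned middle)
 *	[0x100000, 0x110000) szc 1	(ramp back down toward eaddr)
 *
 * A trailing piece that was not 64K-aligned would get its own szc 0 segment.
 */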
1557
1558 static int
1559 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1560 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1561 {
1562 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1563 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1564 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1565 type, 0);
1566 int error;
1567 struct seg *seg;
1568 struct vattr va;
1569 u_offset_t eoff;
1570 size_t save_size = 0;
1571 extern size_t textrepl_size_thresh;
1572
1573 ASSERT(AS_WRITE_HELD(as));
1574 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1575 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1576 ASSERT(vn_a->vp != NULL);
1577 ASSERT(vn_a->amp == NULL);
1578
1579 again:
1580 if (szcvec <= 1) {
1581 seg = seg_alloc(as, addr, size);
1582 if (seg == NULL) {
1583 return (ENOMEM);
1584 }
1585 vn_a->szc = 0;
1586 error = (*crfp)(seg, vn_a);
1587 if (error != 0) {
1588 seg_free(seg);
1589 } else {
1590 as->a_size += size;
1591 as->a_resvsize += size;
1592 }
1593 return (error);
1594 }
1595
1596 va.va_mask = AT_SIZE;
1597 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1598 szcvec = 0;
1599 goto again;
1600 }
1601 eoff = vn_a->offset & PAGEMASK;
1602 if (eoff >= va.va_size) {
1603 szcvec = 0;
1604 goto again;
1605 }
1606 eoff += size;
1607 if (btopr(va.va_size) < btopr(eoff)) {
1608 save_size = size;
1609 size = va.va_size - (vn_a->offset & PAGEMASK);
1610 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1611 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1612 type, 0);
1613 if (szcvec <= 1) {
1614 size = save_size;
1615 goto again;
1616 }
1617 }
1618
1619 if (size > textrepl_size_thresh) {
1620 vn_a->flags |= _MAP_TEXTREPL;
1621 }
1622 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1623 segcreated);
1624 if (error != 0) {
1625 return (error);
1626 }
1627 if (save_size) {
1628 addr += size;
1629 size = save_size - size;
1630 szcvec = 0;
1631 goto again;
1632 }
1633 return (0);
1634 }
1635
1636 /*
1637 * as_map_ansegs: shared or private anonymous memory. Note that the flags
 * passed to map_pgszcvec() cannot be MAP_INITDATA, for anon.
1639 */
1640 static int
1641 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1642 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1643 {
1644 uint_t szcvec;
1645 uchar_t type;
1646
1647 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1648 if (vn_a->type == MAP_SHARED) {
1649 type = MAPPGSZC_SHM;
1650 } else if (vn_a->type == MAP_PRIVATE) {
1651 if (vn_a->szc == AS_MAP_HEAP) {
1652 type = MAPPGSZC_HEAP;
1653 } else if (vn_a->szc == AS_MAP_STACK) {
1654 type = MAPPGSZC_STACK;
1655 } else {
1656 type = MAPPGSZC_PRIVM;
1657 }
1658 }
1659 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1660 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1661 (vn_a->flags & MAP_TEXT), type, 0);
1662 ASSERT(AS_WRITE_HELD(as));
1663 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1664 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1665 ASSERT(vn_a->vp == NULL);
1666
1667 return (as_map_segvn_segs(as, addr, size, szcvec,
1668 crfp, vn_a, segcreated));
1669 }
1670
1671 int
1672 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1673 {
1674 AS_LOCK_ENTER(as, RW_WRITER);
1675 return (as_map_locked(as, addr, size, crfp, argsp));
1676 }
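
/*
 * Illustrative sketch of an as_map() call (hypothetical locals; the
 * segvn_crargs field names follow <vm/seg_vn.h>).  A private, copy-on-write
 * vnode mapping might be set up roughly like this:
 *
 *	struct segvn_crargs vn_a;
 *
 *	vn_a.vp = vp;
 *	vn_a.offset = (u_offset_t)off;
 *	vn_a.type = MAP_PRIVATE;
 *	vn_a.prot = PROT_READ | PROT_WRITE;
 *	vn_a.maxprot = PROT_ALL;
 *	vn_a.flags = 0;
 *	vn_a.cred = CRED();
 *	vn_a.amp = NULL;
 *	vn_a.szc = 0;
 *	vn_a.lgrp_mem_policy_flags = 0;
 *
 *	error = as_map(as, addr, len, segvn_create, &vn_a);
 */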
1677
1678 int
1679 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1680 void *argsp)
1681 {
1682 struct seg *seg = NULL;
1683 caddr_t raddr; /* rounded down addr */
1684 size_t rsize; /* rounded up size */
1685 int error;
1686 int unmap = 0;
1687 /*
1688 * The use of a_proc is preferred to handle the case where curproc is
1689 * a door_call server and is allocating memory in the client's (a_proc)
1690 * address space.
1691 * When creating a shared memory segment a_proc will be NULL so we
 * fall back to curproc in that case.
1693 */
1694 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1695 struct segvn_crargs crargs;
1696
1697 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1698 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1699 (size_t)raddr;
1700
1701 /*
1702 * check for wrap around
1703 */
1704 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1705 AS_LOCK_EXIT(as);
1706 return (ENOMEM);
1707 }
1708
1709 as->a_updatedir = 1; /* inform /proc */
1710 gethrestime(&as->a_updatetime);
1711
1712 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1713 AS_LOCK_EXIT(as);
1714
1715 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1716 RCA_UNSAFE_ALL);
1717
1718 return (ENOMEM);
1719 }
1720
1721 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1722 crargs = *(struct segvn_crargs *)argsp;
1723 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1724 if (error != 0) {
1725 AS_LOCK_EXIT(as);
1726 if (unmap) {
1727 (void) as_unmap(as, addr, size);
1728 }
1729 return (error);
1730 }
1731 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1732 crargs = *(struct segvn_crargs *)argsp;
1733 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1734 if (error != 0) {
1735 AS_LOCK_EXIT(as);
1736 if (unmap) {
1737 (void) as_unmap(as, addr, size);
1738 }
1739 return (error);
1740 }
1741 } else {
1742 seg = seg_alloc(as, addr, size);
1743 if (seg == NULL) {
1744 AS_LOCK_EXIT(as);
1745 return (ENOMEM);
1746 }
1747
1748 error = (*crfp)(seg, argsp);
1749 if (error != 0) {
1750 seg_free(seg);
1751 AS_LOCK_EXIT(as);
1752 return (error);
1753 }
1754 /*
1755 * Add size now so as_unmap will work if as_ctl fails.
1756 */
1757 as->a_size += rsize;
1758 as->a_resvsize += rsize;
1759 }
1760
1761 as_setwatch(as);
1762
1763 /*
1764 * If the address space is locked,
1765 * establish memory locks for the new segment.
1766 */
1767 mutex_enter(&as->a_contents);
1768 if (AS_ISPGLCK(as)) {
1769 mutex_exit(&as->a_contents);
1770 AS_LOCK_EXIT(as);
1771 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1772 if (error != 0)
1773 (void) as_unmap(as, addr, size);
1774 } else {
1775 mutex_exit(&as->a_contents);
1776 AS_LOCK_EXIT(as);
1777 }
1778 return (error);
1779 }
1780
1781
1782 /*
1783 * Delete all segments in the address space marked with S_PURGE.
1784 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1785 * These segments are deleted as a first step before calls to as_gap(), so
1786 * that they don't affect mmap() or shmat().
1787 */
1788 void
1789 as_purge(struct as *as)
1790 {
1791 struct seg *seg;
1792 struct seg *next_seg;
1793
1794 /*
 * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
 * there is no need to grab the a_contents mutex for this check
1797 */
1798 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1799 return;
1800
1801 AS_LOCK_ENTER(as, RW_WRITER);
1802 next_seg = NULL;
1803 seg = AS_SEGFIRST(as);
1804 while (seg != NULL) {
1805 next_seg = AS_SEGNEXT(as, seg);
1806 if (seg->s_flags & S_PURGE)
1807 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1808 seg = next_seg;
1809 }
1810 AS_LOCK_EXIT(as);
1811
1812 mutex_enter(&as->a_contents);
1813 as->a_flags &= ~AS_NEEDSPURGE;
1814 mutex_exit(&as->a_contents);
1815 }
1816
1817 /*
1818 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1819 * range of addresses at least "minlen" long, where the base of the range is
1820 * at "off" phase from an "align" boundary and there is space for a
 * "redzone"-sized redzone on either side of the range. Thus,
1822 * if align was 4M and off was 16k, the user wants a hole which will start
1823 * 16k into a 4M page.
1824 *
1825 * If flags specifies AH_HI, the hole will have the highest possible address
1826 * in the range. We use the as->a_lastgap field to figure out where to
1827 * start looking for a gap.
1828 *
1829 * Otherwise, the gap will have the lowest possible address.
1830 *
1831 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1832 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part
 * of the hole that is within range, and 0 is returned. On failure, -1 is
 * returned.
1835 *
1836 * NOTE: This routine is not correct when base+len overflows caddr_t.
1837 */
1838 int
1839 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1840 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1841 {
1842 caddr_t lobound = *basep;
1843 caddr_t hibound = lobound + *lenp;
1844 struct seg *lseg, *hseg;
1845 caddr_t lo, hi;
1846 int forward;
1847 caddr_t save_base;
1848 size_t save_len;
1849 size_t save_minlen;
1850 size_t save_redzone;
1851 int fast_path = 1;
1852
1853 save_base = *basep;
1854 save_len = *lenp;
1855 save_minlen = minlen;
1856 save_redzone = redzone;
1857
1858 /*
1859 * For the first pass/fast_path, just add align and redzone into
1860 * minlen since if we get an allocation, we can guarantee that it
1861 * will fit the alignment and redzone requested.
1862 * This increases the chance that hibound will be adjusted to
1863 * a_lastgap->s_base which will likely allow us to find an
1864 * acceptable hole in the address space quicker.
1865 * If we can't find a hole with this fast_path, then we look for
1866 * smaller holes in which the alignment and offset may allow
1867 * the allocation to fit.
1868 */
1869 minlen += align;
1870 minlen += 2 * redzone;
1871 redzone = 0;
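
	/*
	 * For example (hypothetical numbers): a request for minlen = 64K
	 * with align = 4M and redzone = 8K is first tried as a request for
	 * a hole of at least 4M + 80K with no redzone; only if that fails
	 * do we retry below with the original minlen and redzone, accepting
	 * smaller holes in which the alignment and offset may still allow
	 * the allocation to fit.
	 */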
1872
1873 AS_LOCK_ENTER(as, RW_READER);
1874 if (AS_SEGFIRST(as) == NULL) {
1875 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1876 align, redzone, off)) {
1877 AS_LOCK_EXIT(as);
1878 return (0);
1879 } else {
1880 AS_LOCK_EXIT(as);
1881 *basep = save_base;
1882 *lenp = save_len;
1883 return (-1);
1884 }
1885 }
1886
1887 retry:
1888 /*
1889 * Set up to iterate over all the inter-segment holes in the given
1890 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1891 * NULL for the highest-addressed hole. If moving backwards, we reset
 * hseg to denote the highest-addressed segment.
1893 */
1894 forward = (flags & AH_DIR) == AH_LO;
1895 if (forward) {
1896 hseg = as_findseg(as, lobound, 1);
1897 lseg = AS_SEGPREV(as, hseg);
1898 } else {
1899
1900 /*
1901 * If allocating at least as much as the last allocation,
1902 * use a_lastgap's base as a better estimate of hibound.
1903 */
1904 if (as->a_lastgap &&
1905 minlen >= as->a_lastgap->s_size &&
1906 hibound >= as->a_lastgap->s_base)
1907 hibound = as->a_lastgap->s_base;
1908
1909 hseg = as_findseg(as, hibound, 1);
1910 if (hseg->s_base + hseg->s_size < hibound) {
1911 lseg = hseg;
1912 hseg = NULL;
1913 } else {
1914 lseg = AS_SEGPREV(as, hseg);
1915 }
1916 }
1917
1918 for (;;) {
1919 /*
1920 * Set lo and hi to the hole's boundaries. (We should really
1921 * use MAXADDR in place of hibound in the expression below,
1922 * but can't express it easily; using hibound in its place is
1923 * harmless.)
1924 */
1925 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1926 hi = (hseg == NULL) ? hibound : hseg->s_base;
1927 /*
1928 * If the iteration has moved past the interval from lobound
1929 * to hibound it's pointless to continue.
1930 */
1931 if ((forward && lo > hibound) || (!forward && hi < lobound))
1932 break;
1933 else if (lo > hibound || hi < lobound)
1934 goto cont;
1935 /*
1936 * Candidate hole lies at least partially within the allowable
1937 * range. Restrict it to fall completely within that range,
1938 * i.e., to [max(lo, lobound), min(hi, hibound)].
1939 */
1940 if (lo < lobound)
1941 lo = lobound;
1942 if (hi > hibound)
1943 hi = hibound;
1944 /*
1945 * Verify that the candidate hole is big enough and meets
1946 * hardware constraints. If the hole is too small, no need
1947 * to do the further checks since they will fail.
1948 */
1949 *basep = lo;
1950 *lenp = hi - lo;
1951 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1952 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1953 ((flags & AH_CONTAIN) == 0 ||
1954 (*basep <= addr && *basep + *lenp > addr))) {
1955 if (!forward)
1956 as->a_lastgap = hseg;
1957 if (hseg != NULL)
1958 as->a_lastgaphl = hseg;
1959 else
1960 as->a_lastgaphl = lseg;
1961 AS_LOCK_EXIT(as);
1962 return (0);
1963 }
1964 cont:
1965 /*
1966 * Move to the next hole.
1967 */
1968 if (forward) {
1969 lseg = hseg;
1970 if (lseg == NULL)
1971 break;
1972 hseg = AS_SEGNEXT(as, hseg);
1973 } else {
1974 hseg = lseg;
1975 if (hseg == NULL)
1976 break;
1977 lseg = AS_SEGPREV(as, lseg);
1978 }
1979 }
1980 if (fast_path && (align != 0 || save_redzone != 0)) {
1981 fast_path = 0;
1982 minlen = save_minlen;
1983 redzone = save_redzone;
1984 goto retry;
1985 }
1986 *basep = save_base;
1987 *lenp = save_len;
1988 AS_LOCK_EXIT(as);
1989 return (-1);
1990 }
1991
1992 /*
1993 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1994 *
1995 * If flags specifies AH_HI, the hole will have the highest possible address
1996 * in the range. We use the as->a_lastgap field to figure out where to
1997 * start looking for a gap.
1998 *
1999 * Otherwise, the gap will have the lowest possible address.
2000 *
2001 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2002 *
 * If an adequate hole is found, *basep and *lenp are set to reflect the part
 * of the hole that is within range, and 0 is returned. On failure, -1 is
 * returned.
2006 *
2007 * NOTE: This routine is not correct when base+len overflows caddr_t.
2008 */
2009 int
2010 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2011 caddr_t addr)
2012 {
2013
2014 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2015 }
2016
2017 /*
2018 * Return the next range within [base, base + len) that is backed
2019 * with "real memory". Skip holes and non-seg_vn segments.
2020 * We're lazy and only return one segment at a time.
2021 */
2022 int
2023 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2024 {
2025 extern struct seg_ops segspt_shmops; /* needs a header file */
2026 struct seg *seg;
2027 caddr_t addr, eaddr;
2028 caddr_t segend;
2029
2030 AS_LOCK_ENTER(as, RW_READER);
2031
2032 addr = *basep;
2033 eaddr = addr + *lenp;
2034
2035 seg = as_findseg(as, addr, 0);
2036 if (seg != NULL)
2037 addr = MAX(seg->s_base, addr);
2038
2039 for (;;) {
2040 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2041 AS_LOCK_EXIT(as);
2042 return (EINVAL);
2043 }
2044
2045 if (seg->s_ops == &segvn_ops) {
2046 segend = seg->s_base + seg->s_size;
2047 break;
2048 }
2049
2050 /*
2051 * We do ISM by looking into the private data
2052 * to determine the real size of the segment.
2053 */
2054 if (seg->s_ops == &segspt_shmops) {
2055 segend = seg->s_base + spt_realsize(seg);
2056 if (addr < segend)
2057 break;
2058 }
2059
2060 seg = AS_SEGNEXT(as, seg);
2061
2062 if (seg != NULL)
2063 addr = seg->s_base;
2064 }
2065
2066 *basep = addr;
2067
2068 if (segend > eaddr)
2069 *lenp = eaddr - addr;
2070 else
2071 *lenp = segend - addr;
2072
2073 AS_LOCK_EXIT(as);
2074 return (0);
2075 }
2076
2077 /*
2078 * Swap the pages associated with the address space as out to
2079 * secondary storage, returning the number of bytes actually
2080 * swapped.
2081 *
2082 * The value returned is intended to correlate well with the process's
2083 * memory requirements. Its usefulness for this purpose depends on
2084 * how well the segment-level routines do at returning accurate
2085 * information.
2086 */
2087 size_t
2088 as_swapout(struct as *as)
2089 {
2090 struct seg *seg;
2091 size_t swpcnt = 0;
2092
2093 /*
2094 * Kernel-only processes have given up their address
2095 * spaces. Of course, we shouldn't be attempting to
2096 * swap out such processes in the first place...
2097 */
2098 if (as == NULL)
2099 return (0);
2100
2101 AS_LOCK_ENTER(as, RW_READER);
2102
2103 /*
2104 * Free all mapping resources associated with the address
2105 * space. The segment-level swapout routines capitalize
	 * on this unmapping by scavenging pages that have become
2107 * unmapped here.
2108 */
2109 hat_swapout(as->a_hat);
2110
2111 /*
2112 * Call the swapout routines of all segments in the address
2113 * space to do the actual work, accumulating the amount of
2114 * space reclaimed.
2115 */
2116 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2117 struct seg_ops *ov = seg->s_ops;
2118
2119 /*
2120 * We have to check to see if the seg has
2121 * an ops vector because the seg may have
2122 * been in the middle of being set up when
2123 * the process was picked for swapout.
2124 */
2125 if ((ov != NULL) && (ov->swapout != NULL))
2126 swpcnt += SEGOP_SWAPOUT(seg);
2127 }
2128 AS_LOCK_EXIT(as);
2129 return (swpcnt);
2130 }
2131
2132 /*
2133 * Determine whether data from the mappings in interval [addr, addr + size)
2134 * are in the primary memory (core) cache.
2135 */
2136 int
2137 as_incore(struct as *as, caddr_t addr,
2138 size_t size, char *vec, size_t *sizep)
2139 {
2140 struct seg *seg;
2141 size_t ssize;
2142 caddr_t raddr; /* rounded down addr */
2143 size_t rsize; /* rounded up size */
2144 size_t isize; /* iteration size */
2145 int error = 0; /* result, assume success */
2146
2147 *sizep = 0;
2148 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2149 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2150 (size_t)raddr;
2151
2152 if (raddr + rsize < raddr) /* check for wraparound */
2153 return (ENOMEM);
2154
2155 AS_LOCK_ENTER(as, RW_READER);
2156 seg = as_segat(as, raddr);
2157 if (seg == NULL) {
2158 AS_LOCK_EXIT(as);
2159 return (-1);
2160 }
2161
2162 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2163 if (raddr >= seg->s_base + seg->s_size) {
2164 seg = AS_SEGNEXT(as, seg);
2165 if (seg == NULL || raddr != seg->s_base) {
2166 error = -1;
2167 break;
2168 }
2169 }
2170 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2171 ssize = seg->s_base + seg->s_size - raddr;
2172 else
2173 ssize = rsize;
2174 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2175 if (isize != ssize) {
2176 error = -1;
2177 break;
2178 }
2179 vec += btopr(ssize);
2180 }
2181 AS_LOCK_EXIT(as);
2182 return (error);
2183 }
2184
2185 static void
2186 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2187 ulong_t *bitmap, size_t position, size_t npages)
2188 {
2189 caddr_t range_start;
2190 size_t pos1 = position;
2191 size_t pos2;
2192 size_t size;
2193 size_t end_pos = npages + position;
2194
2195 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2196 size = ptob((pos2 - pos1));
2197 range_start = (caddr_t)((uintptr_t)addr +
2198 ptob(pos1 - position));
2199
2200 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2201 (ulong_t *)NULL, (size_t)NULL);
2202 pos1 = pos2;
2203 }
2204 }
2205
2206 static void
2207 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2208 caddr_t raddr, size_t rsize)
2209 {
2210 struct seg *seg = as_segat(as, raddr);
2211 size_t ssize;
2212
2213 while (rsize != 0) {
2214 if (raddr >= seg->s_base + seg->s_size)
2215 seg = AS_SEGNEXT(as, seg);
2216
2217 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2218 ssize = seg->s_base + seg->s_size - raddr;
2219 else
2220 ssize = rsize;
2221
2222 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2223
2224 rsize -= ssize;
2225 raddr += ssize;
2226 }
2227 }
2228
2229 /*
2230 * Cache control operations over the interval [addr, addr + size) in
2231 * address space "as".
2232 */
2233 /*ARGSUSED*/
2234 int
2235 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2236 uintptr_t arg, ulong_t *lock_map, size_t pos)
2237 {
2238 struct seg *seg; /* working segment */
2239 caddr_t raddr; /* rounded down addr */
2240 caddr_t initraddr; /* saved initial rounded down addr */
2241 size_t rsize; /* rounded up size */
2242 size_t initrsize; /* saved initial rounded up size */
2243 size_t ssize; /* size of seg */
2244 int error = 0; /* result */
2245 size_t mlock_size; /* size of bitmap */
2246 ulong_t *mlock_map; /* pointer to bitmap used */
2247 /* to represent the locked */
2248 /* pages. */
2249 retry:
2250 if (error == IE_RETRY)
2251 AS_LOCK_ENTER(as, RW_WRITER);
2252 else
2253 AS_LOCK_ENTER(as, RW_READER);
2254
2255 /*
2256 * If these are address space lock/unlock operations, loop over
2257 * all segments in the address space, as appropriate.
2258 */
2259 if (func == MC_LOCKAS) {
2260 size_t npages, idx;
2261 size_t rlen = 0; /* rounded as length */
2262
2263 idx = pos;
2264
2265 if (arg & MCL_FUTURE) {
2266 mutex_enter(&as->a_contents);
2267 AS_SETPGLCK(as);
2268 mutex_exit(&as->a_contents);
2269 }
2270 if ((arg & MCL_CURRENT) == 0) {
2271 AS_LOCK_EXIT(as);
2272 return (0);
2273 }
2274
2275 seg = AS_SEGFIRST(as);
2276 if (seg == NULL) {
2277 AS_LOCK_EXIT(as);
2278 return (0);
2279 }
2280
2281 do {
2282 raddr = (caddr_t)((uintptr_t)seg->s_base &
2283 (uintptr_t)PAGEMASK);
2284 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2285 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2286 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2287
2288 mlock_size = BT_BITOUL(btopr(rlen));
2289 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2290 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2291 AS_LOCK_EXIT(as);
2292 return (EAGAIN);
2293 }
2294
2295 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2296 error = SEGOP_LOCKOP(seg, seg->s_base,
2297 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2298 if (error != 0)
2299 break;
2300 pos += seg_pages(seg);
2301 }
2302
2303 if (error) {
2304 for (seg = AS_SEGFIRST(as); seg != NULL;
2305 seg = AS_SEGNEXT(as, seg)) {
2306
2307 raddr = (caddr_t)((uintptr_t)seg->s_base &
2308 (uintptr_t)PAGEMASK);
2309 npages = seg_pages(seg);
2310 as_segunlock(seg, raddr, attr, mlock_map,
2311 idx, npages);
2312 idx += npages;
2313 }
2314 }
2315
2316 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2317 AS_LOCK_EXIT(as);
2318 goto lockerr;
2319 } else if (func == MC_UNLOCKAS) {
2320 mutex_enter(&as->a_contents);
2321 AS_CLRPGLCK(as);
2322 mutex_exit(&as->a_contents);
2323
2324 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2325 error = SEGOP_LOCKOP(seg, seg->s_base,
2326 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2327 if (error != 0)
2328 break;
2329 }
2330
2331 AS_LOCK_EXIT(as);
2332 goto lockerr;
2333 }
2334
2335 /*
2336 * Normalize addresses and sizes.
2337 */
2338 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2339 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2340 (size_t)raddr;
2341
2342 if (raddr + rsize < raddr) { /* check for wraparound */
2343 AS_LOCK_EXIT(as);
2344 return (ENOMEM);
2345 }
2346
2347 /*
2348 * Get initial segment.
2349 */
2350 if ((seg = as_segat(as, raddr)) == NULL) {
2351 AS_LOCK_EXIT(as);
2352 return (ENOMEM);
2353 }
2354
2355 if (func == MC_LOCK) {
2356 mlock_size = BT_BITOUL(btopr(rsize));
2357 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2358 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2359 AS_LOCK_EXIT(as);
2360 return (EAGAIN);
2361 }
2362 }
2363
2364 /*
2365 * Loop over all segments. If a hole in the address range is
2366 * discovered, then fail. For each segment, perform the appropriate
2367 * control operation.
2368 */
2369 while (rsize != 0) {
2370
2371 /*
		 * Make sure there's no hole, and calculate the portion
		 * of the next segment to be operated over.
2374 */
2375 if (raddr >= seg->s_base + seg->s_size) {
2376 seg = AS_SEGNEXT(as, seg);
2377 if (seg == NULL || raddr != seg->s_base) {
2378 if (func == MC_LOCK) {
2379 as_unlockerr(as, attr, mlock_map,
2380 initraddr, initrsize - rsize);
2381 kmem_free(mlock_map,
2382 mlock_size * sizeof (ulong_t));
2383 }
2384 AS_LOCK_EXIT(as);
2385 return (ENOMEM);
2386 }
2387 }
2388 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2389 ssize = seg->s_base + seg->s_size - raddr;
2390 else
2391 ssize = rsize;
2392
2393 /*
2394 * Dispatch on specific function.
2395 */
2396 switch (func) {
2397
2398 /*
2399 * Synchronize cached data from mappings with backing
2400 * objects.
2401 */
2402 case MC_SYNC:
2403 if (error = SEGOP_SYNC(seg, raddr, ssize,
2404 attr, (uint_t)arg)) {
2405 AS_LOCK_EXIT(as);
2406 return (error);
2407 }
2408 break;
2409
2410 /*
2411 * Lock pages in memory.
2412 */
2413 case MC_LOCK:
2414 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2415 attr, func, mlock_map, pos)) {
2416 as_unlockerr(as, attr, mlock_map, initraddr,
2417 initrsize - rsize + ssize);
2418 kmem_free(mlock_map, mlock_size *
2419 sizeof (ulong_t));
2420 AS_LOCK_EXIT(as);
2421 goto lockerr;
2422 }
2423 break;
2424
2425 /*
2426 * Unlock mapped pages.
2427 */
2428 case MC_UNLOCK:
2429 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2430 (ulong_t *)NULL, (size_t)NULL);
2431 break;
2432
2433 /*
2434 * Store VM advise for mapped pages in segment layer.
2435 */
2436 case MC_ADVISE:
2437 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2438
2439 /*
2440 * Check for regular errors and special retry error
2441 */
2442 if (error) {
2443 if (error == IE_RETRY) {
2444 /*
2445 * Need to acquire writers lock, so
2446 * have to drop readers lock and start
2447 * all over again
2448 */
2449 AS_LOCK_EXIT(as);
2450 goto retry;
2451 } else if (error == IE_REATTACH) {
2452 /*
2453 * Find segment for current address
2454 * because current segment just got
2455 * split or concatenated
2456 */
2457 seg = as_segat(as, raddr);
2458 if (seg == NULL) {
2459 AS_LOCK_EXIT(as);
2460 return (ENOMEM);
2461 }
2462 } else {
2463 /*
2464 * Regular error
2465 */
2466 AS_LOCK_EXIT(as);
2467 return (error);
2468 }
2469 }
2470 break;
2471
2472 case MC_INHERIT_ZERO:
2473 if (seg->s_ops->inherit == NULL) {
2474 error = ENOTSUP;
2475 } else {
2476 error = SEGOP_INHERIT(seg, raddr, ssize,
2477 SEGP_INH_ZERO);
2478 }
2479 if (error != 0) {
2480 AS_LOCK_EXIT(as);
2481 return (error);
2482 }
2483 break;
2484
2485 /*
2486 * Can't happen.
2487 */
2488 default:
2489 panic("as_ctl: bad operation %d", func);
2490 /*NOTREACHED*/
2491 }
2492
2493 rsize -= ssize;
2494 raddr += ssize;
2495 }
2496
2497 if (func == MC_LOCK)
2498 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2499 AS_LOCK_EXIT(as);
2500 return (0);
2501 lockerr:
2502
2503 /*
2504 * If the lower levels returned EDEADLK for a segment lockop,
	 * it means that we should retry the operation. We also wait
	 * a bit to let the deadlock-causing condition clear.
2507 * This is part of a gross hack to work around a design flaw
2508 * in the ufs/sds logging code and should go away when the
2509 * logging code is re-designed to fix the problem. See bug
2510 * 4125102 for details of the problem.
2511 */
2512 if (error == EDEADLK) {
2513 delay(deadlk_wait);
2514 error = 0;
2515 goto retry;
2516 }
2517 return (error);
2518 }
2519
2520 int
2521 fc_decode(faultcode_t fault_err)
2522 {
2523 int error = 0;
2524
2525 switch (FC_CODE(fault_err)) {
2526 case FC_OBJERR:
2527 error = FC_ERRNO(fault_err);
2528 break;
2529 case FC_PROT:
2530 error = EACCES;
2531 break;
2532 default:
2533 error = EFAULT;
2534 break;
2535 }
2536 return (error);
2537 }
2538
2539 /*
2540 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2541 * lists from each segment and copy them to one contiguous shadow list (plist)
2542 * as expected by the caller. Save pointers to per segment shadow lists at
2543 * the tail of plist so that they can be used during as_pageunlock().
2544 */
2545 static int
2546 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2547 caddr_t addr, size_t size, enum seg_rw rw)
2548 {
2549 caddr_t sv_addr = addr;
2550 size_t sv_size = size;
2551 struct seg *sv_seg = seg;
2552 ulong_t segcnt = 1;
2553 ulong_t cnt;
2554 size_t ssize;
2555 pgcnt_t npages = btop(size);
2556 page_t **plist;
2557 page_t **pl;
2558 int error;
2559 caddr_t eaddr;
2560 faultcode_t fault_err = 0;
2561 pgcnt_t pl_off;
2562 extern struct seg_ops segspt_shmops;
2563
2564 ASSERT(AS_LOCK_HELD(as));
2565 ASSERT(seg != NULL);
2566 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2567 ASSERT(addr + size > seg->s_base + seg->s_size);
2568 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2569 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2570
2571 /*
2572 * Count the number of segments covered by the range we are about to
2573 * lock. The segment count is used to size the shadow list we return
2574 * back to the caller.
2575 */
2576 for (; size != 0; size -= ssize, addr += ssize) {
2577 if (addr >= seg->s_base + seg->s_size) {
2578
2579 seg = AS_SEGNEXT(as, seg);
2580 if (seg == NULL || addr != seg->s_base) {
2581 AS_LOCK_EXIT(as);
2582 return (EFAULT);
2583 }
2584 /*
			 * Do a quick check whether the subsequent
			 * segments are likely to support pagelock.
2587 */
2588 if (seg->s_ops == &segvn_ops) {
2589 vnode_t *vp;
2590
2591 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2592 vp != NULL) {
2593 AS_LOCK_EXIT(as);
2594 goto slow;
2595 }
2596 } else if (seg->s_ops != &segspt_shmops) {
2597 AS_LOCK_EXIT(as);
2598 goto slow;
2599 }
2600 segcnt++;
2601 }
2602 if (addr + size > seg->s_base + seg->s_size) {
2603 ssize = seg->s_base + seg->s_size - addr;
2604 } else {
2605 ssize = size;
2606 }
2607 }
2608 ASSERT(segcnt > 1);
2609
2610 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2611
2612 addr = sv_addr;
2613 size = sv_size;
2614 seg = sv_seg;
2615
2616 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2617 if (addr >= seg->s_base + seg->s_size) {
2618 seg = AS_SEGNEXT(as, seg);
2619 ASSERT(seg != NULL && addr == seg->s_base);
2620 cnt++;
2621 ASSERT(cnt < segcnt);
2622 }
2623 if (addr + size > seg->s_base + seg->s_size) {
2624 ssize = seg->s_base + seg->s_size - addr;
2625 } else {
2626 ssize = size;
2627 }
2628 pl = &plist[npages + cnt];
2629 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2630 L_PAGELOCK, rw);
2631 if (error) {
2632 break;
2633 }
2634 ASSERT(plist[npages + cnt] != NULL);
2635 ASSERT(pl_off + btop(ssize) <= npages);
2636 bcopy(plist[npages + cnt], &plist[pl_off],
2637 btop(ssize) * sizeof (page_t *));
2638 pl_off += btop(ssize);
2639 }
2640
2641 if (size == 0) {
2642 AS_LOCK_EXIT(as);
2643 ASSERT(cnt == segcnt - 1);
2644 *ppp = plist;
2645 return (0);
2646 }
2647
2648 /*
	 * One of the pagelock calls failed. The error is in the error
	 * variable. Unlock what we've locked so far and retry with
	 * F_SOFTLOCK if the error is either EFAULT or ENOTSUP. Otherwise
	 * just return the error back to the caller.
2653 */
2654
2655 eaddr = addr;
2656 seg = sv_seg;
2657
2658 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2659 if (addr >= seg->s_base + seg->s_size) {
2660 seg = AS_SEGNEXT(as, seg);
2661 ASSERT(seg != NULL && addr == seg->s_base);
2662 cnt++;
2663 ASSERT(cnt < segcnt);
2664 }
2665 if (eaddr > seg->s_base + seg->s_size) {
2666 ssize = seg->s_base + seg->s_size - addr;
2667 } else {
2668 ssize = eaddr - addr;
2669 }
2670 pl = &plist[npages + cnt];
2671 ASSERT(*pl != NULL);
2672 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2673 L_PAGEUNLOCK, rw);
2674 }
2675
2676 AS_LOCK_EXIT(as);
2677
2678 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2679
2680 if (error != ENOTSUP && error != EFAULT) {
2681 return (error);
2682 }
2683
2684 slow:
2685 /*
	 * If we got here because pagelock failed due to the need to cow-fault
	 * in the pages we want to lock, F_SOFTLOCK will do that job, and the
	 * next as_pagelock() call for this address range will hopefully
	 * succeed.
2690 */
2691 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2692 if (fault_err != 0) {
2693 return (fc_decode(fault_err));
2694 }
2695 *ppp = NULL;
2696
2697 return (0);
2698 }
2699
2700 /*
2701 * lock pages in a given address space. Return shadow list. If
2702 * the list is NULL, the MMU mapping is also locked.
2703 */
2704 int
2705 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2706 size_t size, enum seg_rw rw)
2707 {
2708 size_t rsize;
2709 caddr_t raddr;
2710 faultcode_t fault_err;
2711 struct seg *seg;
2712 int err;
2713
2714 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2715 "as_pagelock_start: addr %p size %ld", addr, size);
2716
2717 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2718 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2719 (size_t)raddr;
2720
2721 /*
	 * if the request crosses more than one segment,
	 * let as_pagelock_segs() handle it.
2724 */
2725 AS_LOCK_ENTER(as, RW_READER);
2726
2727 seg = as_segat(as, raddr);
2728 if (seg == NULL) {
2729 AS_LOCK_EXIT(as);
2730 return (EFAULT);
2731 }
2732 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2733 if (raddr + rsize > seg->s_base + seg->s_size) {
2734 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2735 }
2736 if (raddr + rsize <= raddr) {
2737 AS_LOCK_EXIT(as);
2738 return (EFAULT);
2739 }
2740
2741 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2742 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2743
2744 /*
2745 * try to lock pages and pass back shadow list
2746 */
2747 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2748
2749 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2750
2751 AS_LOCK_EXIT(as);
2752
2753 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2754 return (err);
2755 }
2756
2757 /*
	 * Use F_SOFTLOCK to lock the pages because pagelock failed, either
	 * because this segment has no pagelock support or because the pages
	 * need to be cow-faulted in. If a fault is needed, F_SOFTLOCK does
	 * that work for this as_pagelock() call, and the next as_pagelock()
	 * call for the same address range will hopefully succeed.
2763 */
2764 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2765 if (fault_err != 0) {
2766 return (fc_decode(fault_err));
2767 }
2768 *ppp = NULL;
2769
2770 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2771 return (0);
2772 }
2773
2774 /*
 * unlock pages locked by as_pagelock_segs(). Retrieve the per-segment shadow
 * lists from the end of plist and call the pageunlock interface for each
 * segment. Drop the as lock and free plist.
2778 */
2779 static void
2780 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2781 struct page **plist, enum seg_rw rw)
2782 {
2783 ulong_t cnt;
2784 caddr_t eaddr = addr + size;
2785 pgcnt_t npages = btop(size);
2786 size_t ssize;
2787 page_t **pl;
2788
2789 ASSERT(AS_LOCK_HELD(as));
2790 ASSERT(seg != NULL);
2791 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2792 ASSERT(addr + size > seg->s_base + seg->s_size);
2793 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2794 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2795 ASSERT(plist != NULL);
2796
2797 for (cnt = 0; addr < eaddr; addr += ssize) {
2798 if (addr >= seg->s_base + seg->s_size) {
2799 seg = AS_SEGNEXT(as, seg);
2800 ASSERT(seg != NULL && addr == seg->s_base);
2801 cnt++;
2802 }
2803 if (eaddr > seg->s_base + seg->s_size) {
2804 ssize = seg->s_base + seg->s_size - addr;
2805 } else {
2806 ssize = eaddr - addr;
2807 }
2808 pl = &plist[npages + cnt];
2809 ASSERT(*pl != NULL);
2810 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2811 L_PAGEUNLOCK, rw);
2812 }
2813 ASSERT(cnt > 0);
2814 AS_LOCK_EXIT(as);
2815
2816 cnt++;
2817 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2818 }
2819
2820 /*
2821 * unlock pages in a given address range
2822 */
2823 void
2824 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2825 enum seg_rw rw)
2826 {
2827 struct seg *seg;
2828 size_t rsize;
2829 caddr_t raddr;
2830
2831 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2832 "as_pageunlock_start: addr %p size %ld", addr, size);
2833
2834 /*
	 * if the shadow list is NULL, as_pagelock() fell back
	 * to as_fault()
2837 */
2838 if (pp == NULL) {
2839 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2840 return;
2841 }
2842
2843 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2844 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2845 (size_t)raddr;
2846
2847 AS_LOCK_ENTER(as, RW_READER);
2848 seg = as_segat(as, raddr);
2849 ASSERT(seg != NULL);
2850
2851 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2852 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2853
2854 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2855 if (raddr + rsize <= seg->s_base + seg->s_size) {
2856 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2857 } else {
2858 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2859 return;
2860 }
2861 AS_LOCK_EXIT(as);
2862 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2863 }
2864
2865 int
2866 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2867 boolean_t wait)
2868 {
2869 struct seg *seg;
2870 size_t ssize;
2871 caddr_t raddr; /* rounded down addr */
2872 size_t rsize; /* rounded up size */
2873 int error = 0;
2874 size_t pgsz = page_get_pagesize(szc);
2875
2876 setpgsz_top:
2877 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2878 return (EINVAL);
2879 }
2880
2881 raddr = addr;
2882 rsize = size;
2883
2884 if (raddr + rsize < raddr) /* check for wraparound */
2885 return (ENOMEM);
2886
2887 AS_LOCK_ENTER(as, RW_WRITER);
2888 as_clearwatchprot(as, raddr, rsize);
2889 seg = as_segat(as, raddr);
2890 if (seg == NULL) {
2891 as_setwatch(as);
2892 AS_LOCK_EXIT(as);
2893 return (ENOMEM);
2894 }
2895
2896 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2897 if (raddr >= seg->s_base + seg->s_size) {
2898 seg = AS_SEGNEXT(as, seg);
2899 if (seg == NULL || raddr != seg->s_base) {
2900 error = ENOMEM;
2901 break;
2902 }
2903 }
2904 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2905 ssize = seg->s_base + seg->s_size - raddr;
2906 } else {
2907 ssize = rsize;
2908 }
2909
2910 retry:
2911 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2912
2913 if (error == IE_NOMEM) {
2914 error = EAGAIN;
2915 break;
2916 }
2917
2918 if (error == IE_RETRY) {
2919 AS_LOCK_EXIT(as);
2920 goto setpgsz_top;
2921 }
2922
2923 if (error == ENOTSUP) {
2924 error = EINVAL;
2925 break;
2926 }
2927
2928 if (wait && (error == EAGAIN)) {
2929 /*
2930 * Memory is currently locked. It must be unlocked
2931 * before this operation can succeed through a retry.
2932 * The possible reasons for locked memory and
2933 * corresponding strategies for unlocking are:
2934 * (1) Normal I/O
2935 * wait for a signal that the I/O operation
2936 * has completed and the memory is unlocked.
2937 * (2) Asynchronous I/O
2938 * The aio subsystem does not unlock pages when
2939 * the I/O is completed. Those pages are unlocked
2940 * when the application calls aiowait/aioerror.
2941 * So, to prevent blocking forever, cv_broadcast()
2942 * is done to wake up aio_cleanup_thread.
2943 * Subsequently, segvn_reclaim will be called, and
2944 * that will do AS_CLRUNMAPWAIT() and wake us up.
2945 * (3) Long term page locking:
2946 * This is not relevant for as_setpagesize()
2947 * because we cannot change the page size for
2948 * driver memory. The attempt to do so will
2949 * fail with a different error than EAGAIN so
2950 * there's no need to trigger as callbacks like
2951 * as_unmap, as_setprot or as_free would do.
2952 */
2953 mutex_enter(&as->a_contents);
2954 if (!AS_ISNOUNMAPWAIT(as)) {
2955 if (AS_ISUNMAPWAIT(as) == 0) {
2956 cv_broadcast(&as->a_cv);
2957 }
2958 AS_SETUNMAPWAIT(as);
2959 AS_LOCK_EXIT(as);
2960 while (AS_ISUNMAPWAIT(as)) {
2961 cv_wait(&as->a_cv, &as->a_contents);
2962 }
2963 } else {
2964 /*
2965 * We may have raced with
2966 * segvn_reclaim()/segspt_reclaim(). In this
2967 * case clean nounmapwait flag and retry since
2968 * softlockcnt in this segment may be already
2969 * 0. We don't drop as writer lock so our
2970 * number of retries without sleeping should
2971 * be very small. See segvn_reclaim() for
2972 * more comments.
2973 */
2974 AS_CLRNOUNMAPWAIT(as);
2975 mutex_exit(&as->a_contents);
2976 goto retry;
2977 }
2978 mutex_exit(&as->a_contents);
2979 goto setpgsz_top;
2980 } else if (error != 0) {
2981 break;
2982 }
2983 }
2984 as_setwatch(as);
2985 AS_LOCK_EXIT(as);
2986 return (error);
2987 }
2988
2989 /*
2990 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2991 * in its chunk where s_szc is less than the szc we want to set.
2992 */
2993 static int
2994 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2995 int *retry)
2996 {
2997 struct seg *seg;
2998 size_t ssize;
2999 int error;
3000
3001 ASSERT(AS_WRITE_HELD(as));
3002
3003 seg = as_segat(as, raddr);
3004 if (seg == NULL) {
3005 panic("as_iset3_default_lpsize: no seg");
3006 }
3007
3008 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3009 if (raddr >= seg->s_base + seg->s_size) {
3010 seg = AS_SEGNEXT(as, seg);
3011 if (seg == NULL || raddr != seg->s_base) {
3012 panic("as_iset3_default_lpsize: as changed");
3013 }
3014 }
3015 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3016 ssize = seg->s_base + seg->s_size - raddr;
3017 } else {
3018 ssize = rsize;
3019 }
3020
3021 if (szc > seg->s_szc) {
3022 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3023 /* Only retry on EINVAL segments that have no vnode. */
3024 if (error == EINVAL) {
3025 vnode_t *vp = NULL;
3026 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3027 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3028 vp == NULL)) {
3029 *retry = 1;
3030 } else {
3031 *retry = 0;
3032 }
3033 }
3034 if (error) {
3035 return (error);
3036 }
3037 }
3038 }
3039 return (0);
3040 }
3041
3042 /*
3043 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3044 * pagesize on each segment in its range, but if any fails with EINVAL,
3045 * then it reduces the pagesizes to the next size in the bitmap and
 * retries as_iset3_default_lpsize(). The code retries smaller allowed
 * sizes on EINVAL because (a) the anon offset may not match the bigger
 * sizes, and (b) it's hard to get this offset (to begin with) to pass
 * to map_pgszcvec().
3050 */
3051 static int
3052 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3053 uint_t szcvec)
3054 {
3055 int error;
3056 int retry;
3057
3058 ASSERT(AS_WRITE_HELD(as));
3059
3060 for (;;) {
3061 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3062 if (error == EINVAL && retry) {
3063 szcvec &= ~(1 << szc);
3064 if (szcvec <= 1) {
3065 return (EINVAL);
3066 }
3067 szc = highbit(szcvec) - 1;
3068 } else {
3069 return (error);
3070 }
3071 }
3072 }
3073
3074 /*
3075 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3076 * segments have a smaller szc than we want to set. For each such area,
 * it calls as_iset2_default_lpsize().
3078 */
3079 static int
3080 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3081 uint_t szcvec)
3082 {
3083 struct seg *seg;
3084 size_t ssize;
3085 caddr_t setaddr = raddr;
3086 size_t setsize = 0;
3087 int set;
3088 int error;
3089
3090 ASSERT(AS_WRITE_HELD(as));
3091
3092 seg = as_segat(as, raddr);
3093 if (seg == NULL) {
3094 panic("as_iset1_default_lpsize: no seg");
3095 }
3096 if (seg->s_szc < szc) {
3097 set = 1;
3098 } else {
3099 set = 0;
3100 }
3101
3102 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3103 if (raddr >= seg->s_base + seg->s_size) {
3104 seg = AS_SEGNEXT(as, seg);
3105 if (seg == NULL || raddr != seg->s_base) {
3106 panic("as_iset1_default_lpsize: as changed");
3107 }
3108 if (seg->s_szc >= szc && set) {
3109 ASSERT(setsize != 0);
3110 error = as_iset2_default_lpsize(as,
3111 setaddr, setsize, szc, szcvec);
3112 if (error) {
3113 return (error);
3114 }
3115 set = 0;
3116 } else if (seg->s_szc < szc && !set) {
3117 setaddr = raddr;
3118 setsize = 0;
3119 set = 1;
3120 }
3121 }
3122 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3123 ssize = seg->s_base + seg->s_size - raddr;
3124 } else {
3125 ssize = rsize;
3126 }
3127 }
3128 error = 0;
3129 if (set) {
3130 ASSERT(setsize != 0);
3131 error = as_iset2_default_lpsize(as, setaddr, setsize,
3132 szc, szcvec);
3133 }
3134 return (error);
3135 }
3136
3137 /*
3138 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3139 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3140 * chunk to as_iset1_default_lpsize().
3141 */
3142 static int
3143 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3144 int type)
3145 {
3146 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3147 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3148 flags, rtype, 1);
3149 uint_t szc;
3150 uint_t nszc;
3151 int error;
3152 caddr_t a;
3153 caddr_t eaddr;
3154 size_t segsize;
3155 size_t pgsz;
3156 uint_t save_szcvec;
3157
3158 ASSERT(AS_WRITE_HELD(as));
3159 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3160 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3161
3162 szcvec &= ~1;
3163 if (szcvec <= 1) { /* skip if base page size */
3164 return (0);
3165 }
3166
3167 /* Get the pagesize of the first larger page size. */
3168 szc = lowbit(szcvec) - 1;
3169 pgsz = page_get_pagesize(szc);
3170 eaddr = addr + size;
3171 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3172 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3173
3174 save_szcvec = szcvec;
3175 szcvec >>= (szc + 1);
3176 nszc = szc;
3177 while (szcvec) {
3178 if ((szcvec & 0x1) == 0) {
3179 nszc++;
3180 szcvec >>= 1;
3181 continue;
3182 }
3183 nszc++;
3184 pgsz = page_get_pagesize(nszc);
3185 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3186 if (a != addr) {
3187 ASSERT(szc > 0);
3188 ASSERT(a < eaddr);
3189 segsize = a - addr;
3190 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3191 save_szcvec);
3192 if (error) {
3193 return (error);
3194 }
3195 addr = a;
3196 }
3197 szc = nszc;
3198 szcvec >>= 1;
3199 }
3200
3201 ASSERT(addr < eaddr);
3202 szcvec = save_szcvec;
3203 while (szcvec) {
3204 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3205 ASSERT(a >= addr);
3206 if (a != addr) {
3207 ASSERT(szc > 0);
3208 segsize = a - addr;
3209 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3210 save_szcvec);
3211 if (error) {
3212 return (error);
3213 }
3214 addr = a;
3215 }
3216 szcvec &= ~(1 << szc);
3217 if (szcvec) {
3218 szc = highbit(szcvec) - 1;
3219 pgsz = page_get_pagesize(szc);
3220 }
3221 }
3222 ASSERT(addr == eaddr);
3223
3224 return (0);
3225 }
3226
3227 /*
3228 * Set the default large page size for the range. Called via memcntl with
3229 * page size set to 0. as_set_default_lpsize breaks the range down into
 * chunks with the same type/flags, ignores non-segvn segments, and passes
3231 * each chunk to as_iset_default_lpsize().
3232 */
3233 int
3234 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3235 {
3236 struct seg *seg;
3237 caddr_t raddr;
3238 size_t rsize;
3239 size_t ssize;
3240 int rtype, rflags;
3241 int stype, sflags;
3242 int error;
3243 caddr_t setaddr;
3244 size_t setsize;
3245 int segvn;
3246
3247 if (size == 0)
3248 return (0);
3249
3250 AS_LOCK_ENTER(as, RW_WRITER);
3251 again:
3252 error = 0;
3253
3254 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3255 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3256 (size_t)raddr;
3257
3258 if (raddr + rsize < raddr) { /* check for wraparound */
3259 AS_LOCK_EXIT(as);
3260 return (ENOMEM);
3261 }
3262 as_clearwatchprot(as, raddr, rsize);
3263 seg = as_segat(as, raddr);
3264 if (seg == NULL) {
3265 as_setwatch(as);
3266 AS_LOCK_EXIT(as);
3267 return (ENOMEM);
3268 }
3269 if (seg->s_ops == &segvn_ops) {
3270 rtype = SEGOP_GETTYPE(seg, addr);
3271 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3272 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3273 segvn = 1;
3274 } else {
3275 segvn = 0;
3276 }
3277 setaddr = raddr;
3278 setsize = 0;
3279
3280 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3281 if (raddr >= (seg->s_base + seg->s_size)) {
3282 seg = AS_SEGNEXT(as, seg);
3283 if (seg == NULL || raddr != seg->s_base) {
3284 error = ENOMEM;
3285 break;
3286 }
3287 if (seg->s_ops == &segvn_ops) {
3288 stype = SEGOP_GETTYPE(seg, raddr);
3289 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3290 stype &= (MAP_SHARED | MAP_PRIVATE);
3291 if (segvn && (rflags != sflags ||
3292 rtype != stype)) {
3293 /*
3294 * The next segment is also segvn but
3295 * has different flags and/or type.
3296 */
3297 ASSERT(setsize != 0);
3298 error = as_iset_default_lpsize(as,
3299 setaddr, setsize, rflags, rtype);
3300 if (error) {
3301 break;
3302 }
3303 rflags = sflags;
3304 rtype = stype;
3305 setaddr = raddr;
3306 setsize = 0;
3307 } else if (!segvn) {
3308 rflags = sflags;
3309 rtype = stype;
3310 setaddr = raddr;
3311 setsize = 0;
3312 segvn = 1;
3313 }
3314 } else if (segvn) {
3315 /* The next segment is not segvn. */
3316 ASSERT(setsize != 0);
3317 error = as_iset_default_lpsize(as,
3318 setaddr, setsize, rflags, rtype);
3319 if (error) {
3320 break;
3321 }
3322 segvn = 0;
3323 }
3324 }
3325 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3326 ssize = seg->s_base + seg->s_size - raddr;
3327 } else {
3328 ssize = rsize;
3329 }
3330 }
3331 if (error == 0 && segvn) {
3332 /* The last chunk when rsize == 0. */
3333 ASSERT(setsize != 0);
3334 error = as_iset_default_lpsize(as, setaddr, setsize,
3335 rflags, rtype);
3336 }
3337
3338 if (error == IE_RETRY) {
3339 goto again;
3340 } else if (error == IE_NOMEM) {
3341 error = EAGAIN;
3342 } else if (error == ENOTSUP) {
3343 error = EINVAL;
3344 } else if (error == EAGAIN) {
3345 mutex_enter(&as->a_contents);
3346 if (!AS_ISNOUNMAPWAIT(as)) {
3347 if (AS_ISUNMAPWAIT(as) == 0) {
3348 cv_broadcast(&as->a_cv);
3349 }
3350 AS_SETUNMAPWAIT(as);
3351 AS_LOCK_EXIT(as);
3352 while (AS_ISUNMAPWAIT(as)) {
3353 cv_wait(&as->a_cv, &as->a_contents);
3354 }
3355 mutex_exit(&as->a_contents);
3356 AS_LOCK_ENTER(as, RW_WRITER);
3357 } else {
3358 /*
3359 * We may have raced with
3360 * segvn_reclaim()/segspt_reclaim(). In this case
3361 * clean nounmapwait flag and retry since softlockcnt
3362 * in this segment may be already 0. We don't drop as
3363 * writer lock so our number of retries without
3364 * sleeping should be very small. See segvn_reclaim()
3365 * for more comments.
3366 */
3367 AS_CLRNOUNMAPWAIT(as);
3368 mutex_exit(&as->a_contents);
3369 }
3370 goto again;
3371 }
3372
3373 as_setwatch(as);
3374 AS_LOCK_EXIT(as);
3375 return (error);
3376 }
3377
3378 /*
 * Set up all of the uninitialized watched pages that we can.
3380 */
3381 void
3382 as_setwatch(struct as *as)
3383 {
3384 struct watched_page *pwp;
3385 struct seg *seg;
3386 caddr_t vaddr;
3387 uint_t prot;
3388 int err, retrycnt;
3389
3390 if (avl_numnodes(&as->a_wpage) == 0)
3391 return;
3392
3393 ASSERT(AS_WRITE_HELD(as));
3394
3395 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3396 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3397 retrycnt = 0;
3398 retry:
3399 vaddr = pwp->wp_vaddr;
3400 if (pwp->wp_oprot != 0 || /* already set up */
3401 (seg = as_segat(as, vaddr)) == NULL ||
3402 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3403 continue;
3404
3405 pwp->wp_oprot = prot;
3406 if (pwp->wp_read)
3407 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3408 if (pwp->wp_write)
3409 prot &= ~PROT_WRITE;
3410 if (pwp->wp_exec)
3411 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3412 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3413 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3414 if (err == IE_RETRY) {
3415 pwp->wp_oprot = 0;
3416 ASSERT(retrycnt == 0);
3417 retrycnt++;
3418 goto retry;
3419 }
3420 }
3421 pwp->wp_prot = prot;
3422 }
3423 }
3424
3425 /*
3426 * Clear all of the watched pages in the address space.
3427 */
3428 void
3429 as_clearwatch(struct as *as)
3430 {
3431 struct watched_page *pwp;
3432 struct seg *seg;
3433 caddr_t vaddr;
3434 uint_t prot;
3435 int err, retrycnt;
3436
3437 if (avl_numnodes(&as->a_wpage) == 0)
3438 return;
3439
3440 ASSERT(AS_WRITE_HELD(as));
3441
3442 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3443 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3444 retrycnt = 0;
3445 retry:
3446 vaddr = pwp->wp_vaddr;
3447 if (pwp->wp_oprot == 0 || /* not set up */
3448 (seg = as_segat(as, vaddr)) == NULL)
3449 continue;
3450
3451 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3452 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3453 if (err == IE_RETRY) {
3454 ASSERT(retrycnt == 0);
3455 retrycnt++;
3456 goto retry;
3457 }
3458 }
3459 pwp->wp_oprot = 0;
3460 pwp->wp_prot = 0;
3461 }
3462 }
3463
3464 /*
3465 * Force a new setup for all the watched pages in the range.
3466 */
3467 static void
3468 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3469 {
3470 struct watched_page *pwp;
3471 struct watched_page tpw;
3472 caddr_t eaddr = addr + size;
3473 caddr_t vaddr;
3474 struct seg *seg;
3475 int err, retrycnt;
3476 uint_t wprot;
3477 avl_index_t where;
3478
3479 if (avl_numnodes(&as->a_wpage) == 0)
3480 return;
3481
3482 ASSERT(AS_WRITE_HELD(as));
3483
3484 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3485 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3486 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3487
3488 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3489 retrycnt = 0;
3490 vaddr = pwp->wp_vaddr;
3491
3492 wprot = prot;
3493 if (pwp->wp_read)
3494 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3495 if (pwp->wp_write)
3496 wprot &= ~PROT_WRITE;
3497 if (pwp->wp_exec)
3498 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3499 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3500 retry:
3501 seg = as_segat(as, vaddr);
3502 if (seg == NULL) {
3503 panic("as_setwatchprot: no seg");
3504 /*NOTREACHED*/
3505 }
3506 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3507 if (err == IE_RETRY) {
3508 ASSERT(retrycnt == 0);
3509 retrycnt++;
3510 goto retry;
3511 }
3512 }
3513 pwp->wp_oprot = prot;
3514 pwp->wp_prot = wprot;
3515
3516 pwp = AVL_NEXT(&as->a_wpage, pwp);
3517 }
3518 }
3519
3520 /*
3521 * Clear all of the watched pages in the range.
3522 */
3523 static void
3524 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3525 {
3526 caddr_t eaddr = addr + size;
3527 struct watched_page *pwp;
3528 struct watched_page tpw;
3529 uint_t prot;
3530 struct seg *seg;
3531 int err, retrycnt;
3532 avl_index_t where;
3533
3534 if (avl_numnodes(&as->a_wpage) == 0)
3535 return;
3536
3537 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3538 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3539 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3540
3541 ASSERT(AS_WRITE_HELD(as));
3542
3543 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3544
3545 if ((prot = pwp->wp_oprot) != 0) {
3546 retrycnt = 0;
3547
3548 if (prot != pwp->wp_prot) {
3549 retry:
3550 seg = as_segat(as, pwp->wp_vaddr);
3551 if (seg == NULL)
3552 continue;
3553 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3554 PAGESIZE, prot);
3555 if (err == IE_RETRY) {
3556 ASSERT(retrycnt == 0);
3557 retrycnt++;
3558 goto retry;
3559
3560 }
3561 }
3562 pwp->wp_oprot = 0;
3563 pwp->wp_prot = 0;
3564 }
3565
3566 pwp = AVL_NEXT(&as->a_wpage, pwp);
3567 }
3568 }
3569
3570 void
3571 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3572 {
3573 struct proc *p;
3574
3575 mutex_enter(&pidlock);
3576 for (p = practive; p; p = p->p_next) {
3577 if (p->p_as == as) {
3578 mutex_enter(&p->p_lock);
3579 if (p->p_as == as)
3580 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3581 mutex_exit(&p->p_lock);
3582 }
3583 }
3584 mutex_exit(&pidlock);
3585 }
3586
3587 /*
3588 * return memory object ID
3589 */
3590 int
3591 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3592 {
3593 struct seg *seg;
3594 int sts;
3595
3596 AS_LOCK_ENTER(as, RW_READER);
3597 seg = as_segat(as, addr);
3598 if (seg == NULL) {
3599 AS_LOCK_EXIT(as);
3600 return (EFAULT);
3601 }
3602 /*
3603 * catch old drivers which may not support getmemid
3604 */
3605 if (seg->s_ops->getmemid == NULL) {
3606 AS_LOCK_EXIT(as);
3607 return (ENODEV);
3608 }
3609
3610 sts = SEGOP_GETMEMID(seg, addr, memidp);
3611
3612 AS_LOCK_EXIT(as);
3613 return (sts);
3614 }