1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2015, Joyent, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
30
31 /*
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
35 *
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
39 */
40
41 /*
42 * VM - address spaces.
43 */
44
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/vtrace.h>
61
62 #include <vm/hat.h>
63 #include <vm/as.h>
64 #include <vm/seg.h>
65 #include <vm/seg_vn.h>
66 #include <vm/seg_dev.h>
67 #include <vm/seg_kmem.h>
68 #include <vm/seg_map.h>
69 #include <vm/seg_spt.h>
70 #include <vm/page.h>
71
72 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
73
74 static struct kmem_cache *as_cache;
75
76 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
77 static void as_clearwatchprot(struct as *, caddr_t, size_t);
78 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
79
80
81 /*
82  * Verifying the segment lists is very time-consuming; it may not always
83  * be desirable to define VERIFY_SEGLIST when DEBUG is set.
84 */
85 #ifdef DEBUG
86 #define VERIFY_SEGLIST
87 int do_as_verify = 0;
88 #endif
89
90 /*
91 * Allocate a new callback data structure entry and fill in the events of
92 * interest, the address range of interest, and the callback argument.
93 * Link the entry on the as->a_callbacks list. A callback entry for the
94 * entire address space may be specified with vaddr = 0 and size = -1.
95 *
96  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
97  * the specified as, the caller must guarantee persistence of the specified as
98  * for the duration of this function (e.g. pages being locked within the as
99  * will guarantee persistence).
100 */
101 int
102 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
103 caddr_t vaddr, size_t size, int sleepflag)
104 {
105 struct as_callback *current_head, *cb;
106 caddr_t saddr;
107 size_t rsize;
108
109 /* callback function and an event are mandatory */
110 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
111 return (EINVAL);
112
113 /* Adding a callback after as_free has been called is not allowed */
114 if (as == &kas)
115 return (ENOMEM);
116
117 /*
118 * vaddr = 0 and size = -1 is used to indicate that the callback range
119 * is the entire address space so no rounding is done in that case.
120 */
121 if (size != -1) {
122 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
123 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
124 (size_t)saddr;
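/*
 * For example (assuming a hypothetical 4K PAGESIZE): vaddr = 0x12345
 * and size = 0x100 round to saddr = 0x12000 and rsize = 0x1000, the
 * smallest page-aligned range covering [vaddr, vaddr + size).
 */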
125 /* check for wraparound */
126 if (saddr + rsize < saddr)
127 return (ENOMEM);
128 } else {
129 if (vaddr != 0)
130 return (EINVAL);
131 saddr = vaddr;
132 rsize = size;
133 }
134
135 /* Allocate and initialize a callback entry */
136 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
137 if (cb == NULL)
138 return (EAGAIN);
139
140 cb->ascb_func = cb_func;
141 cb->ascb_arg = arg;
142 cb->ascb_events = events;
143 cb->ascb_saddr = saddr;
144 cb->ascb_len = rsize;
145
146 /* Add the entry to the list */
147 mutex_enter(&as->a_contents);
148 current_head = as->a_callbacks;
149 as->a_callbacks = cb;
150 cb->ascb_next = current_head;
151
152 /*
153  * The call to this function may lose a race with a pertinent
154  * event - e.g. a thread does long-term memory locking, but before
155  * the callback is added another thread executes as_unmap.
156 * A broadcast here resolves that.
157 */
158 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
159 AS_CLRUNMAPWAIT(as);
160 cv_broadcast(&as->a_cv);
161 }
162
163 mutex_exit(&as->a_contents);
164 return (0);
165 }
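/*
 * Illustrative sketch only (not part of the original code): a driver that
 * keeps pages locked long term might register and later remove a callback
 * roughly as follows (mydrv_unlock_cb and mydrv_state are hypothetical):
 *
 *	error = as_add_callback(as, mydrv_unlock_cb, mydrv_state,
 *	    AS_UNMAP_EVENT | AS_SETPROT_EVENT, addr, len, KM_SLEEP);
 *	...
 *	rc = as_delete_callback(as, mydrv_state);
 *
 * where rc is AS_CALLBACK_DELETED (entry freed here),
 * AS_CALLBACK_DELETE_DEFERRED (as_do_callbacks will free it) or
 * AS_CALLBACK_NOTFOUND (nothing was registered for mydrv_state).
 */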
166
167 /*
168 * Search the callback list for an entry which pertains to arg.
169 *
170 * This is called from within the client upon completion of the callback.
171 * RETURN VALUES:
172 * AS_CALLBACK_DELETED (callback entry found and deleted)
173 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
174  * AS_CALLBACK_DELETE_DEFERRED (callback is in progress, delete of this
175  * entry will be made in as_do_callbacks)
176 *
177 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
178 * set, it indicates that as_do_callbacks is processing this entry. The
179 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
180 * to unblock as_do_callbacks, in case it is blocked.
181 *
182  * CALLER'S RESPONSIBILITY: If not calling from within the process context for
183  * the specified as, the caller must guarantee persistence of the specified as
184  * for the duration of this function (e.g. pages being locked within the as
185  * will guarantee persistence).
186 */
187 uint_t
188 as_delete_callback(struct as *as, void *arg)
189 {
190 struct as_callback **prevcb = &as->a_callbacks;
191 struct as_callback *cb;
192 uint_t rc = AS_CALLBACK_NOTFOUND;
193
194 mutex_enter(&as->a_contents);
195 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
196 if (cb->ascb_arg != arg)
197 continue;
198
199 /*
200 * If the events indicate AS_CALLBACK_CALLED, just clear
201 * AS_ALL_EVENT in the events field and wakeup the thread
202 * that may be waiting in as_do_callbacks. as_do_callbacks
203 * will take care of removing this entry from the list. In
204 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
205 * (AS_CALLBACK_CALLED not set), just remove it from the
206 * list, return the memory and return AS_CALLBACK_DELETED.
207 */
208 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
209 /* leave AS_CALLBACK_CALLED */
210 cb->ascb_events &= ~AS_ALL_EVENT;
211 rc = AS_CALLBACK_DELETE_DEFERRED;
212 cv_broadcast(&as->a_cv);
213 } else {
214 *prevcb = cb->ascb_next;
215 kmem_free(cb, sizeof (struct as_callback));
216 rc = AS_CALLBACK_DELETED;
217 }
218 break;
219 }
220 mutex_exit(&as->a_contents);
221 return (rc);
222 }
223
224 /*
225 * Searches the as callback list for a matching entry.
226 * Returns a pointer to the first matching callback, or NULL if
227 * nothing is found.
228  * This function never sleeps, so it is ok to call it with locks
229  * held in addition to the (required) a_contents mutex.
230 *
231 * See also comment on as_do_callbacks below.
232 */
233 static struct as_callback *
234 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
235 size_t event_len)
236 {
237 struct as_callback *cb;
238
239 ASSERT(MUTEX_HELD(&as->a_contents));
240 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
241 /*
242 * If the callback has not already been called, then
243 * check if events or address range pertains. An event_len
244 * of zero means do an unconditional callback.
245 */
246 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
247 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
248 (event_addr + event_len < cb->ascb_saddr) ||
249 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
250 continue;
251 }
252 break;
253 }
254 return (cb);
255 }
256
257 /*
258 * Executes a given callback and removes it from the callback list for
259 * this address space.
260 * This function may sleep so the caller must drop all locks except
261 * a_contents before calling this func.
262 *
263 * See also comments on as_do_callbacks below.
264 */
265 static void
266 as_execute_callback(struct as *as, struct as_callback *cb,
267 uint_t events)
268 {
269 struct as_callback **prevcb;
270 void *cb_arg;
271
272 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
273 cb->ascb_events |= AS_CALLBACK_CALLED;
274 mutex_exit(&as->a_contents);
275 (*cb->ascb_func)(as, cb->ascb_arg, events);
276 mutex_enter(&as->a_contents);
277 /*
278  * The callback function is required to delete the callback
279  * when it determines it is OK for this thread to continue.
280  * as_delete_callback will clear the AS_ALL_EVENT bits in the
281  * events field when the entry is deleted. If the callback
282  * function already called as_delete_callback, the events will
283  * already be cleared and there will be no blocking.
284 */
285 while ((cb->ascb_events & events) != 0) {
286 cv_wait(&as->a_cv, &as->a_contents);
287 }
288 /*
289 * This entry needs to be taken off the list. Normally, the
290 * callback func itself does that, but unfortunately the list
291 * may have changed while the callback was running because the
292 * a_contents mutex was dropped and someone else other than the
293 * callback func itself could have called as_delete_callback,
294 * so we have to search to find this entry again. The entry
295 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
296 */
297 cb_arg = cb->ascb_arg;
298 prevcb = &as->a_callbacks;
299 for (cb = as->a_callbacks; cb != NULL;
300 prevcb = &cb->ascb_next, cb = *prevcb) {
301 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
302 (cb_arg != cb->ascb_arg)) {
303 continue;
304 }
305 *prevcb = cb->ascb_next;
306 kmem_free(cb, sizeof (struct as_callback));
307 break;
308 }
309 }
310
311 /*
312 * Check the callback list for a matching event and intersection of
313  * address range. If there is a match, invoke the callback. Skip an entry if:
314  * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
315  * - the event is not of interest
316  * - the address range is not of interest
317 *
318  * An event_len of zero indicates a request for an unconditional callback
319  * (regardless of event); only the AS_CALLBACK_CALLED flag is checked. The
320 * a_contents lock must be dropped before a callback, so only one callback
321 * can be done before returning. Return -1 (true) if a callback was
322 * executed and removed from the list, else return 0 (false).
323 *
324  * The logically separate parts, i.e. finding a matching callback and
325  * executing a given callback, have been separated into two functions
326  * so that they can be called with different sets of locks held beyond
327  * the always-required a_contents. as_find_callback does not sleep, so
328  * it is ok to call it when more locks than a_contents (i.e. the a_lock
329  * rwlock) are held. as_execute_callback, on the other hand, may sleep,
330  * so all locks beyond a_contents must be dropped by the caller if one
331  * does not want to end up comatose.
332 */
333 static int
334 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
335 size_t event_len)
336 {
337 struct as_callback *cb;
338
339 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
340 as_execute_callback(as, cb, events);
341 return (-1);
342 }
343 return (0);
344 }
345
346 /*
347 * Search for the segment containing addr. If a segment containing addr
348 * exists, that segment is returned. If no such segment exists, and
349 * the list spans addresses greater than addr, then the first segment
350 * whose base is greater than addr is returned; otherwise, NULL is
351 * returned unless tail is true, in which case the last element of the
352 * list is returned.
353 *
354 * a_seglast is used to cache the last found segment for repeated
355 * searches to the same addr (which happens frequently).
356 */
357 struct seg *
358 as_findseg(struct as *as, caddr_t addr, int tail)
359 {
360 struct seg *seg = as->a_seglast;
361 avl_index_t where;
362
363 ASSERT(AS_LOCK_HELD(as));
364
365 if (seg != NULL &&
366 seg->s_base <= addr &&
367 addr < seg->s_base + seg->s_size)
368 return (seg);
369
370 seg = avl_find(&as->a_segtree, &addr, &where);
371 if (seg != NULL)
372 return (as->a_seglast = seg);
373
374 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
375 if (seg == NULL && tail)
376 seg = avl_last(&as->a_segtree);
377 return (as->a_seglast = seg);
378 }
379
380 #ifdef VERIFY_SEGLIST
381 /*
382 * verify that the linked list is coherent
383 */
384 static void
385 as_verify(struct as *as)
386 {
387 struct seg *seg, *seglast, *p, *n;
388 uint_t nsegs = 0;
389
390 if (do_as_verify == 0)
391 return;
392
393 seglast = as->a_seglast;
394
395 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
396 ASSERT(seg->s_as == as);
397 p = AS_SEGPREV(as, seg);
398 n = AS_SEGNEXT(as, seg);
399 ASSERT(p == NULL || p->s_as == as);
400 ASSERT(p == NULL || p->s_base < seg->s_base);
401 ASSERT(n == NULL || n->s_base > seg->s_base);
402 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
403 if (seg == seglast)
404 seglast = NULL;
405 nsegs++;
406 }
407 ASSERT(seglast == NULL);
408 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
409 }
410 #endif /* VERIFY_SEGLIST */
411
412 /*
413 * Add a new segment to the address space. The avl_find()
414 * may be expensive so we attempt to use last segment accessed
415 * in as_gap() as an insertion point.
416 */
417 int
418 as_addseg(struct as *as, struct seg *newseg)
419 {
420 struct seg *seg;
421 caddr_t addr;
422 caddr_t eaddr;
423 avl_index_t where;
424
425 ASSERT(AS_WRITE_HELD(as));
426
427 as->a_updatedir = 1; /* inform /proc */
428 gethrestime(&as->a_updatetime);
429
430 if (as->a_lastgaphl != NULL) {
431 struct seg *hseg = NULL;
432 struct seg *lseg = NULL;
433
434 if (as->a_lastgaphl->s_base > newseg->s_base) {
435 hseg = as->a_lastgaphl;
436 lseg = AVL_PREV(&as->a_segtree, hseg);
437 } else {
438 lseg = as->a_lastgaphl;
439 hseg = AVL_NEXT(&as->a_segtree, lseg);
440 }
441
442 if (hseg && lseg && lseg->s_base < newseg->s_base &&
443 hseg->s_base > newseg->s_base) {
444 avl_insert_here(&as->a_segtree, newseg, lseg,
445 AVL_AFTER);
446 as->a_lastgaphl = NULL;
447 as->a_seglast = newseg;
448 return (0);
449 }
450 as->a_lastgaphl = NULL;
451 }
452
453 addr = newseg->s_base;
454 eaddr = addr + newseg->s_size;
455 again:
456
457 seg = avl_find(&as->a_segtree, &addr, &where);
458
459 if (seg == NULL)
460 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
461
462 if (seg == NULL)
463 seg = avl_last(&as->a_segtree);
464
465 if (seg != NULL) {
466 caddr_t base = seg->s_base;
467
468 /*
469 * If top of seg is below the requested address, then
470 * the insertion point is at the end of the linked list,
471 * and seg points to the tail of the list. Otherwise,
472 * the insertion point is immediately before seg.
473 */
474 if (base + seg->s_size > addr) {
475 if (addr >= base || eaddr > base) {
476 #ifdef __sparc
477 extern struct seg_ops segnf_ops;
478
479 /*
480 * no-fault segs must disappear if overlaid.
481 * XXX need new segment type so
482 * we don't have to check s_ops
483 */
484 if (seg->s_ops == &segnf_ops) {
485 seg_unmap(seg);
486 goto again;
487 }
488 #endif
489 return (-1); /* overlapping segment */
490 }
491 }
492 }
493 as->a_seglast = newseg;
494 avl_insert(&as->a_segtree, newseg, where);
495
496 #ifdef VERIFY_SEGLIST
497 as_verify(as);
498 #endif
499 return (0);
500 }
501
502 struct seg *
503 as_removeseg(struct as *as, struct seg *seg)
504 {
505 avl_tree_t *t;
506
507 ASSERT(AS_WRITE_HELD(as));
508
509 as->a_updatedir = 1; /* inform /proc */
510 gethrestime(&as->a_updatetime);
511
512 if (seg == NULL)
513 return (NULL);
514
515 t = &as->a_segtree;
516 if (as->a_seglast == seg)
517 as->a_seglast = NULL;
518 as->a_lastgaphl = NULL;
519
520 /*
521 * if this segment is at an address higher than
522 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
523 */
524 if (as->a_lastgap &&
525 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
526 as->a_lastgap = AVL_NEXT(t, seg);
527
528 /*
529 * remove the segment from the seg tree
530 */
531 avl_remove(t, seg);
532
533 #ifdef VERIFY_SEGLIST
534 as_verify(as);
535 #endif
536 return (seg);
537 }
538
539 /*
540 * Find a segment containing addr.
541 */
542 struct seg *
543 as_segat(struct as *as, caddr_t addr)
544 {
545 struct seg *seg = as->a_seglast;
546
547 ASSERT(AS_LOCK_HELD(as));
548
549 if (seg != NULL && seg->s_base <= addr &&
550 addr < seg->s_base + seg->s_size)
551 return (seg);
552
553 seg = avl_find(&as->a_segtree, &addr, NULL);
554 return (seg);
555 }
556
557 /*
558 * Serialize all searches for holes in an address space to
559 * prevent two or more threads from allocating the same virtual
560 * address range. The address space must not be "read/write"
561 * locked by the caller since we may block.
562 */
563 void
564 as_rangelock(struct as *as)
565 {
566 mutex_enter(&as->a_contents);
567 while (AS_ISCLAIMGAP(as))
568 cv_wait(&as->a_cv, &as->a_contents);
569 AS_SETCLAIMGAP(as);
570 mutex_exit(&as->a_contents);
571 }
572
573 /*
574 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
575 */
576 void
577 as_rangeunlock(struct as *as)
578 {
579 mutex_enter(&as->a_contents);
580 AS_CLRCLAIMGAP(as);
581 cv_signal(&as->a_cv);
582 mutex_exit(&as->a_contents);
583 }
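/*
 * Illustrative sketch only (not part of the original code): mapping code
 * typically brackets address selection and segment creation with the
 * range lock so that concurrent threads cannot choose the same hole:
 *
 *	as_rangelock(as);
 *	map_addr(&addr, len, off, vacalign, flags);
 *	if (addr == NULL) {
 *		as_rangeunlock(as);
 *		return (ENOMEM);
 *	}
 *	error = as_map(as, addr, len, segvn_create, &crargs);
 *	as_rangeunlock(as);
 */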
584
585 /*
586  * compare segments (or just an address) by segment address range
587 */
588 static int
589 as_segcompar(const void *x, const void *y)
590 {
591 struct seg *a = (struct seg *)x;
592 struct seg *b = (struct seg *)y;
593
594 if (a->s_base < b->s_base)
595 return (-1);
596 if (a->s_base >= b->s_base + b->s_size)
597 return (1);
598 return (0);
599 }
600
601
602 void
603 as_avlinit(struct as *as)
604 {
605 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
606 offsetof(struct seg, s_tree));
607 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
608 offsetof(struct watched_page, wp_link));
609 }
610
611 /*ARGSUSED*/
612 static int
613 as_constructor(void *buf, void *cdrarg, int kmflags)
614 {
615 struct as *as = buf;
616
617 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
618 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
619 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
620 as_avlinit(as);
621 return (0);
622 }
623
624 /*ARGSUSED1*/
625 static void
626 as_destructor(void *buf, void *cdrarg)
627 {
628 struct as *as = buf;
629
630 avl_destroy(&as->a_segtree);
631 mutex_destroy(&as->a_contents);
632 cv_destroy(&as->a_cv);
633 rw_destroy(&as->a_lock);
634 }
635
636 void
637 as_init(void)
638 {
639 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
640 as_constructor, as_destructor, NULL, NULL, NULL, 0);
641 }
642
643 /*
644 * Allocate and initialize an address space data structure.
645 * We call hat_alloc to allow any machine dependent
646 * information in the hat structure to be initialized.
647 */
648 struct as *
649 as_alloc(void)
650 {
651 struct as *as;
652
653 as = kmem_cache_alloc(as_cache, KM_SLEEP);
654
655 as->a_flags = 0;
656 as->a_vbits = 0;
657 as->a_hrm = NULL;
658 as->a_seglast = NULL;
659 as->a_size = 0;
660 as->a_resvsize = 0;
661 as->a_updatedir = 0;
662 gethrestime(&as->a_updatetime);
663 as->a_objectdir = NULL;
664 as->a_sizedir = 0;
665 as->a_userlimit = (caddr_t)USERLIMIT;
666 as->a_lastgap = NULL;
667 as->a_lastgaphl = NULL;
668 as->a_callbacks = NULL;
669 as->a_proc = NULL;
670
671 AS_LOCK_ENTER(as, RW_WRITER);
672 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
673 AS_LOCK_EXIT(as);
674
675 return (as);
676 }
677
678 /*
679 * Free an address space data structure.
680 * Need to free the hat first and then
681 * all the segments on this as and finally
682 * the space for the as struct itself.
683 */
684 void
685 as_free(struct as *as)
686 {
687 struct hat *hat = as->a_hat;
688 struct seg *seg, *next;
689 boolean_t free_started = B_FALSE;
690
691 top:
692 /*
693 * Invoke ALL callbacks. as_do_callbacks will do one callback
694 * per call, and not return (-1) until the callback has completed.
695 * When as_do_callbacks returns zero, all callbacks have completed.
696 */
697 mutex_enter(&as->a_contents);
698 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
699 ;
700
701 mutex_exit(&as->a_contents);
702 AS_LOCK_ENTER(as, RW_WRITER);
703
704 if (!free_started) {
705 free_started = B_TRUE;
706 hat_free_start(hat);
707 }
708 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
709 int err;
710
711 next = AS_SEGNEXT(as, seg);
712 retry:
713 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
714 if (err == EAGAIN) {
715 mutex_enter(&as->a_contents);
716 if (as->a_callbacks) {
717 AS_LOCK_EXIT(as);
718 } else if (!AS_ISNOUNMAPWAIT(as)) {
719 /*
720 * Memory is currently locked. Wait for a
721 * cv_signal that it has been unlocked, then
722 * try the operation again.
723 */
724 if (AS_ISUNMAPWAIT(as) == 0)
725 cv_broadcast(&as->a_cv);
726 AS_SETUNMAPWAIT(as);
727 AS_LOCK_EXIT(as);
728 while (AS_ISUNMAPWAIT(as))
729 cv_wait(&as->a_cv, &as->a_contents);
730 } else {
731 /*
732 * We may have raced with
733 * segvn_reclaim()/segspt_reclaim(). In this
734 * case clean nounmapwait flag and retry since
735 * softlockcnt in this segment may be already
736 * 0. We don't drop as writer lock so our
737 * number of retries without sleeping should
738 * be very small. See segvn_reclaim() for
739 * more comments.
740 */
741 AS_CLRNOUNMAPWAIT(as);
742 mutex_exit(&as->a_contents);
743 goto retry;
744 }
745 mutex_exit(&as->a_contents);
746 goto top;
747 } else {
748 /*
749 * We do not expect any other error return at this
750 * time. This is similar to an ASSERT in seg_unmap()
751 */
752 ASSERT(err == 0);
753 }
754 }
755 hat_free_end(hat);
756 AS_LOCK_EXIT(as);
757
758 /* /proc stuff */
759 ASSERT(avl_numnodes(&as->a_wpage) == 0);
760 if (as->a_objectdir) {
761 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
762 as->a_objectdir = NULL;
763 as->a_sizedir = 0;
764 }
765
766 /*
767 * Free the struct as back to kmem. Assert it has no segments.
768 */
769 ASSERT(avl_numnodes(&as->a_segtree) == 0);
770 kmem_cache_free(as_cache, as);
771 }
772
773 int
774 as_dup(struct as *as, struct proc *forkedproc)
775 {
776 struct as *newas;
777 struct seg *seg, *newseg;
778 size_t purgesize = 0;
779 int error;
780
781 AS_LOCK_ENTER(as, RW_WRITER);
782 as_clearwatch(as);
783 newas = as_alloc();
784 newas->a_userlimit = as->a_userlimit;
785 newas->a_proc = forkedproc;
786
787 AS_LOCK_ENTER(newas, RW_WRITER);
788
789 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
790
791 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
792
793 if (seg->s_flags & S_PURGE) {
794 purgesize += seg->s_size;
795 continue;
796 }
797
798 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
799 if (newseg == NULL) {
800 AS_LOCK_EXIT(newas);
801 as_setwatch(as);
802 AS_LOCK_EXIT(as);
803 as_free(newas);
804 return (-1);
805 }
806 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
807 /*
808 * We call seg_free() on the new seg
809 * because the segment is not set up
810 * completely; i.e. it has no ops.
811 */
812 as_setwatch(as);
813 AS_LOCK_EXIT(as);
814 seg_free(newseg);
815 AS_LOCK_EXIT(newas);
816 as_free(newas);
817 return (error);
818 }
819 newas->a_size += seg->s_size;
820 }
821 newas->a_resvsize = as->a_resvsize - purgesize;
822
823 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
824
825 AS_LOCK_EXIT(newas);
826
827 as_setwatch(as);
828 AS_LOCK_EXIT(as);
829 if (error != 0) {
830 as_free(newas);
831 return (error);
832 }
833 forkedproc->p_as = newas;
834 return (0);
835 }
836
837 /*
838 * Handle a ``fault'' at addr for size bytes.
839 */
840 faultcode_t
841 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
842 enum fault_type type, enum seg_rw rw)
843 {
844 struct seg *seg;
845 caddr_t raddr; /* rounded down addr */
846 size_t rsize; /* rounded up size */
847 size_t ssize;
848 faultcode_t res = 0;
849 caddr_t addrsav;
850 struct seg *segsav;
851 int as_lock_held;
852 klwp_t *lwp = ttolwp(curthread);
853
854
855
856 retry:
857 /*
858 * Indicate that the lwp is not to be stopped while waiting for a
859 * pagefault. This is to avoid deadlock while debugging a process
860 * via /proc over NFS (in particular).
861 */
862 if (lwp != NULL)
863 lwp->lwp_nostop++;
864
865 /*
866 * same length must be used when we softlock and softunlock. We
867 * don't support softunlocking lengths less than the original length
868 * when there is largepage support. See seg_dev.c for more
869 * comments.
870 */
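/*
 * For example, a caller that soft-locked [addr, addr + size) with
 * F_SOFTLOCK must later undo it with F_SOFTUNLOCK using the same
 * addr and size, not a sub-range of it.
 */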
871 switch (type) {
872
873 case F_SOFTLOCK:
874 CPU_STATS_ADD_K(vm, softlock, 1);
875 break;
876
877 case F_SOFTUNLOCK:
878 break;
879
880 case F_PROT:
881 CPU_STATS_ADD_K(vm, prot_fault, 1);
882 break;
883
884 case F_INVAL:
885 CPU_STATS_ENTER_K();
886 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
887 if (as == &kas)
888 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
889 CPU_STATS_EXIT_K();
890 break;
891 }
892
893 /* Kernel probe */
894 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
895 tnf_opaque, address, addr,
896 tnf_fault_type, fault_type, type,
897 tnf_seg_access, access, rw);
898
899 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
900 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
901 (size_t)raddr;
902
903 /*
904 * XXX -- Don't grab the as lock for segkmap. We should grab it for
905 * correctness, but then we could be stuck holding this lock for
906 * a LONG time if the fault needs to be resolved on a slow
907 * filesystem, and then no-one will be able to exec new commands,
908 * as exec'ing requires the write lock on the as.
909 */
910 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
911 raddr + size < segkmap->s_base + segkmap->s_size) {
912 seg = segkmap;
913 as_lock_held = 0;
914 } else {
915 AS_LOCK_ENTER(as, RW_READER);
916
917 seg = as_segat(as, raddr);
918 if (seg == NULL) {
919 AS_LOCK_EXIT(as);
920 if (lwp != NULL)
921 lwp->lwp_nostop--;
922 return (FC_NOMAP);
923 }
924
925 as_lock_held = 1;
926 }
927
928 addrsav = raddr;
929 segsav = seg;
930
931 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
932 if (raddr >= seg->s_base + seg->s_size) {
933 seg = AS_SEGNEXT(as, seg);
934 if (seg == NULL || raddr != seg->s_base) {
935 res = FC_NOMAP;
936 break;
937 }
938 }
939 if (raddr + rsize > seg->s_base + seg->s_size)
940 ssize = seg->s_base + seg->s_size - raddr;
941 else
942 ssize = rsize;
943
944 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
945 if (res != 0)
946 break;
947 }
948
949 /*
950 * If we were SOFTLOCKing and encountered a failure,
951 * we must SOFTUNLOCK the range we already did. (Maybe we
952 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
953 * right here...)
954 */
955 if (res != 0 && type == F_SOFTLOCK) {
956 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
957 if (addrsav >= seg->s_base + seg->s_size)
958 seg = AS_SEGNEXT(as, seg);
959 ASSERT(seg != NULL);
960 /*
961 * Now call the fault routine again to perform the
962 * unlock using S_OTHER instead of the rw variable
963 * since we never got a chance to touch the pages.
964 */
965 if (raddr > seg->s_base + seg->s_size)
966 ssize = seg->s_base + seg->s_size - addrsav;
967 else
968 ssize = raddr - addrsav;
969 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
970 F_SOFTUNLOCK, S_OTHER);
971 }
972 }
973 if (as_lock_held)
974 AS_LOCK_EXIT(as);
975 if (lwp != NULL)
976 lwp->lwp_nostop--;
977
978 /*
979  * If the lower levels returned EDEADLK for a fault, it means
980  * that we should retry the fault. Let's also wait a bit to let
981  * the deadlock-causing condition clear.
982 * This is part of a gross hack to work around a design flaw
983 * in the ufs/sds logging code and should go away when the
984 * logging code is re-designed to fix the problem. See bug
985 * 4125102 for details of the problem.
986 */
987 if (FC_ERRNO(res) == EDEADLK) {
988 delay(deadlk_wait);
989 res = 0;
990 goto retry;
991 }
992 return (res);
993 }
994
995
996
997 /*
998 * Asynchronous ``fault'' at addr for size bytes.
999 */
1000 faultcode_t
1001 as_faulta(struct as *as, caddr_t addr, size_t size)
1002 {
1003 struct seg *seg;
1004 caddr_t raddr; /* rounded down addr */
1005 size_t rsize; /* rounded up size */
1006 faultcode_t res = 0;
1007 klwp_t *lwp = ttolwp(curthread);
1008
1009 retry:
1010 /*
1011 * Indicate that the lwp is not to be stopped while waiting
1012 * for a pagefault. This is to avoid deadlock while debugging
1013 * a process via /proc over NFS (in particular).
1014 */
1015 if (lwp != NULL)
1016 lwp->lwp_nostop++;
1017
1018 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1019 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1020 (size_t)raddr;
1021
1022 AS_LOCK_ENTER(as, RW_READER);
1023 seg = as_segat(as, raddr);
1024 if (seg == NULL) {
1025 AS_LOCK_EXIT(as);
1026 if (lwp != NULL)
1027 lwp->lwp_nostop--;
1028 return (FC_NOMAP);
1029 }
1030
1031 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1032 if (raddr >= seg->s_base + seg->s_size) {
1033 seg = AS_SEGNEXT(as, seg);
1034 if (seg == NULL || raddr != seg->s_base) {
1035 res = FC_NOMAP;
1036 break;
1037 }
1038 }
1039 res = SEGOP_FAULTA(seg, raddr);
1040 if (res != 0)
1041 break;
1042 }
1043 AS_LOCK_EXIT(as);
1044 if (lwp != NULL)
1045 lwp->lwp_nostop--;
1046 /*
1047  * If the lower levels returned EDEADLK for a fault, it means
1048  * that we should retry the fault. Let's also wait a bit to let
1049  * the deadlock-causing condition clear.
1050 * This is part of a gross hack to work around a design flaw
1051 * in the ufs/sds logging code and should go away when the
1052 * logging code is re-designed to fix the problem. See bug
1053 * 4125102 for details of the problem.
1054 */
1055 if (FC_ERRNO(res) == EDEADLK) {
1056 delay(deadlk_wait);
1057 res = 0;
1058 goto retry;
1059 }
1060 return (res);
1061 }
1062
1063 /*
1064 * Set the virtual mapping for the interval from [addr : addr + size)
1065 * in address space `as' to have the specified protection.
1066 * It is ok for the range to cross over several segments,
1067 * as long as they are contiguous.
1068 */
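/*
 * Illustrative sketch only (not part of the original code): an
 * mprotect(2)-style caller would typically invoke this as
 *
 *	error = as_setprot(p->p_as, (caddr_t)addr, len,
 *	    PROT_READ | PROT_USER);
 *
 * and hand any EAGAIN/ENOMEM back to the application.
 */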
1069 int
1070 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1071 {
1072 struct seg *seg;
1073 struct as_callback *cb;
1074 size_t ssize;
1075 caddr_t raddr; /* rounded down addr */
1076 size_t rsize; /* rounded up size */
1077 int error = 0, writer = 0;
1078 caddr_t saveraddr;
1079 size_t saversize;
1080
1081 setprot_top:
1082 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1083 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1084 (size_t)raddr;
1085
1086 if (raddr + rsize < raddr) /* check for wraparound */
1087 return (ENOMEM);
1088
1089 saveraddr = raddr;
1090 saversize = rsize;
1091
1092 /*
1093 * Normally we only lock the as as a reader. But
1094 * if due to setprot the segment driver needs to split
1095 * a segment it will return IE_RETRY. Therefore we re-acquire
1096 * the as lock as a writer so the segment driver can change
1097 * the seg list. Also the segment driver will return IE_RETRY
1098  * after it has changed the segment list, so we keep
1099  * locking as a writer. Since these operations should be rare, we
1100  * want to only lock as a writer when necessary.
1101 */
1102 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1103 AS_LOCK_ENTER(as, RW_WRITER);
1104 } else {
1105 AS_LOCK_ENTER(as, RW_READER);
1106 }
1107
1108 as_clearwatchprot(as, raddr, rsize);
1109 seg = as_segat(as, raddr);
1110 if (seg == NULL) {
1111 as_setwatch(as);
1112 AS_LOCK_EXIT(as);
1113 return (ENOMEM);
1114 }
1115
1116 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1117 if (raddr >= seg->s_base + seg->s_size) {
1118 seg = AS_SEGNEXT(as, seg);
1119 if (seg == NULL || raddr != seg->s_base) {
1120 error = ENOMEM;
1121 break;
1122 }
1123 }
1124 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1125 ssize = seg->s_base + seg->s_size - raddr;
1126 else
1127 ssize = rsize;
1128 retry:
1129 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1130
1131 if (error == IE_NOMEM) {
1132 error = EAGAIN;
1133 break;
1134 }
1135
1136 if (error == IE_RETRY) {
1137 AS_LOCK_EXIT(as);
1138 writer = 1;
1139 goto setprot_top;
1140 }
1141
1142 if (error == EAGAIN) {
1143 /*
1144 * Make sure we have a_lock as writer.
1145 */
1146 if (writer == 0) {
1147 AS_LOCK_EXIT(as);
1148 writer = 1;
1149 goto setprot_top;
1150 }
1151
1152 /*
1153 * Memory is currently locked. It must be unlocked
1154 * before this operation can succeed through a retry.
1155 * The possible reasons for locked memory and
1156 * corresponding strategies for unlocking are:
1157 * (1) Normal I/O
1158 * wait for a signal that the I/O operation
1159 * has completed and the memory is unlocked.
1160 * (2) Asynchronous I/O
1161 * The aio subsystem does not unlock pages when
1162 * the I/O is completed. Those pages are unlocked
1163 * when the application calls aiowait/aioerror.
1164 * So, to prevent blocking forever, cv_broadcast()
1165 * is done to wake up aio_cleanup_thread.
1166 * Subsequently, segvn_reclaim will be called, and
1167 * that will do AS_CLRUNMAPWAIT() and wake us up.
1168 * (3) Long term page locking:
1169 * Drivers intending to have pages locked for a
1170 * period considerably longer than for normal I/O
1171 * (essentially forever) may have registered for a
1172 * callback so they may unlock these pages on
1173 * request. This is needed to allow this operation
1174 * to succeed. Each entry on the callback list is
1175 * examined. If the event or address range pertains
1176 * the callback is invoked (unless it already is in
1177 * progress). The a_contents lock must be dropped
1178 * before the callback, so only one callback can
1179 * be done at a time. Go to the top and do more
1180 * until zero is returned. If zero is returned,
1181 * either there were no callbacks for this event
1182 * or they were already in progress.
1183 */
1184 mutex_enter(&as->a_contents);
1185 if (as->a_callbacks &&
1186 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1187 seg->s_base, seg->s_size))) {
1188 AS_LOCK_EXIT(as);
1189 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1190 } else if (!AS_ISNOUNMAPWAIT(as)) {
1191 if (AS_ISUNMAPWAIT(as) == 0)
1192 cv_broadcast(&as->a_cv);
1193 AS_SETUNMAPWAIT(as);
1194 AS_LOCK_EXIT(as);
1195 while (AS_ISUNMAPWAIT(as))
1196 cv_wait(&as->a_cv, &as->a_contents);
1197 } else {
1198 /*
1199 * We may have raced with
1200 * segvn_reclaim()/segspt_reclaim(). In this
1201 * case clean nounmapwait flag and retry since
1202 * softlockcnt in this segment may be already
1203 * 0. We don't drop as writer lock so our
1204 * number of retries without sleeping should
1205 * be very small. See segvn_reclaim() for
1206 * more comments.
1207 */
1208 AS_CLRNOUNMAPWAIT(as);
1209 mutex_exit(&as->a_contents);
1210 goto retry;
1211 }
1212 mutex_exit(&as->a_contents);
1213 goto setprot_top;
1214 } else if (error != 0)
1215 break;
1216 }
1217 if (error != 0) {
1218 as_setwatch(as);
1219 } else {
1220 as_setwatchprot(as, saveraddr, saversize, prot);
1221 }
1222 AS_LOCK_EXIT(as);
1223 return (error);
1224 }
1225
1226 /*
1227 * Check to make sure that the interval [addr, addr + size)
1228 * in address space `as' has at least the specified protection.
1229 * It is ok for the range to cross over several segments, as long
1230 * as they are contiguous.
1231 */
1232 int
1233 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1234 {
1235 struct seg *seg;
1236 size_t ssize;
1237 caddr_t raddr; /* rounded down addr */
1238 size_t rsize; /* rounded up size */
1239 int error = 0;
1240
1241 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1242 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1243 (size_t)raddr;
1244
1245 if (raddr + rsize < raddr) /* check for wraparound */
1246 return (ENOMEM);
1247
1248 /*
1249 * This is ugly as sin...
1250 * Normally, we only acquire the address space readers lock.
1251 * However, if the address space has watchpoints present,
1252 * we must acquire the writer lock on the address space for
1253 * the benefit of as_clearwatchprot() and as_setwatchprot().
1254 */
1255 if (avl_numnodes(&as->a_wpage) != 0)
1256 AS_LOCK_ENTER(as, RW_WRITER);
1257 else
1258 AS_LOCK_ENTER(as, RW_READER);
1259 as_clearwatchprot(as, raddr, rsize);
1260 seg = as_segat(as, raddr);
1261 if (seg == NULL) {
1262 as_setwatch(as);
1263 AS_LOCK_EXIT(as);
1264 return (ENOMEM);
1265 }
1266
1267 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1268 if (raddr >= seg->s_base + seg->s_size) {
1269 seg = AS_SEGNEXT(as, seg);
1270 if (seg == NULL || raddr != seg->s_base) {
1271 error = ENOMEM;
1272 break;
1273 }
1274 }
1275 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1276 ssize = seg->s_base + seg->s_size - raddr;
1277 else
1278 ssize = rsize;
1279
1280 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1281 if (error != 0)
1282 break;
1283 }
1284 as_setwatch(as);
1285 AS_LOCK_EXIT(as);
1286 return (error);
1287 }
1288
1289 int
1290 as_unmap(struct as *as, caddr_t addr, size_t size)
1291 {
1292 struct seg *seg, *seg_next;
1293 struct as_callback *cb;
1294 caddr_t raddr, eaddr;
1295 size_t ssize, rsize = 0;
1296 int err;
1297
1298 top:
1299 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1300 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1301 (uintptr_t)PAGEMASK);
1302
1303 AS_LOCK_ENTER(as, RW_WRITER);
1304
1305 as->a_updatedir = 1; /* inform /proc */
1306 gethrestime(&as->a_updatetime);
1307
1308 /*
1309 * Use as_findseg to find the first segment in the range, then
1310 * step through the segments in order, following s_next.
1311 */
1312 as_clearwatchprot(as, raddr, eaddr - raddr);
1313
1314 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1315 if (eaddr <= seg->s_base)
1316 break; /* eaddr was in a gap; all done */
1317
1318 /* this is implied by the test above */
1319 ASSERT(raddr < eaddr);
1320
1321 if (raddr < seg->s_base)
1322 raddr = seg->s_base; /* raddr was in a gap */
1323
1324 if (eaddr > (seg->s_base + seg->s_size))
1325 ssize = seg->s_base + seg->s_size - raddr;
1326 else
1327 ssize = eaddr - raddr;
1328
1329 /*
1330 * Save next segment pointer since seg can be
1331 * destroyed during the segment unmap operation.
1332 */
1333 seg_next = AS_SEGNEXT(as, seg);
1334
1335 /*
1336 * We didn't count /dev/null mappings, so ignore them here.
1337 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1338 * we have to do this check here while we have seg.)
1339 */
1340 rsize = 0;
1341 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1342 !SEG_IS_PARTIAL_RESV(seg))
1343 rsize = ssize;
1344
1345 retry:
1346 err = SEGOP_UNMAP(seg, raddr, ssize);
1347 if (err == EAGAIN) {
1348 /*
1349 * Memory is currently locked. It must be unlocked
1350 * before this operation can succeed through a retry.
1351 * The possible reasons for locked memory and
1352 * corresponding strategies for unlocking are:
1353 * (1) Normal I/O
1354 * wait for a signal that the I/O operation
1355 * has completed and the memory is unlocked.
1356 * (2) Asynchronous I/O
1357 * The aio subsystem does not unlock pages when
1358 * the I/O is completed. Those pages are unlocked
1359 * when the application calls aiowait/aioerror.
1360 * So, to prevent blocking forever, cv_broadcast()
1361 * is done to wake up aio_cleanup_thread.
1362 * Subsequently, segvn_reclaim will be called, and
1363 * that will do AS_CLRUNMAPWAIT() and wake us up.
1364 * (3) Long term page locking:
1365 * Drivers intending to have pages locked for a
1366 * period considerably longer than for normal I/O
1367 * (essentially forever) may have registered for a
1368 * callback so they may unlock these pages on
1369 * request. This is needed to allow this operation
1370 * to succeed. Each entry on the callback list is
1371 * examined. If the event or address range pertains
1372 * the callback is invoked (unless it already is in
1373 * progress). The a_contents lock must be dropped
1374 * before the callback, so only one callback can
1375 * be done at a time. Go to the top and do more
1376 * until zero is returned. If zero is returned,
1377 * either there were no callbacks for this event
1378 * or they were already in progress.
1379 */
1380 mutex_enter(&as->a_contents);
1381 if (as->a_callbacks &&
1382 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1383 seg->s_base, seg->s_size))) {
1384 AS_LOCK_EXIT(as);
1385 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1386 } else if (!AS_ISNOUNMAPWAIT(as)) {
1387 if (AS_ISUNMAPWAIT(as) == 0)
1388 cv_broadcast(&as->a_cv);
1389 AS_SETUNMAPWAIT(as);
1390 AS_LOCK_EXIT(as);
1391 while (AS_ISUNMAPWAIT(as))
1392 cv_wait(&as->a_cv, &as->a_contents);
1393 } else {
1394 /*
1395 * We may have raced with
1396 * segvn_reclaim()/segspt_reclaim(). In this
1397 * case clean nounmapwait flag and retry since
1398 * softlockcnt in this segment may be already
1399 * 0. We don't drop as writer lock so our
1400 * number of retries without sleeping should
1401 * be very small. See segvn_reclaim() for
1402 * more comments.
1403 */
1404 AS_CLRNOUNMAPWAIT(as);
1405 mutex_exit(&as->a_contents);
1406 goto retry;
1407 }
1408 mutex_exit(&as->a_contents);
1409 goto top;
1410 } else if (err == IE_RETRY) {
1411 AS_LOCK_EXIT(as);
1412 goto top;
1413 } else if (err) {
1414 as_setwatch(as);
1415 AS_LOCK_EXIT(as);
1416 return (-1);
1417 }
1418
1419 as->a_size -= ssize;
1420 if (rsize)
1421 as->a_resvsize -= rsize;
1422 raddr += ssize;
1423 }
1424 AS_LOCK_EXIT(as);
1425 return (0);
1426 }
1427
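/*
 * Descriptive comment added for clarity (see the code for the authoritative
 * behavior): carve [addr, addr + size) into segvn segments whose page sizes
 * are drawn from the szcvec bit vector (bit N set means page size code N may
 * be used). Small page sizes are used for the unaligned head and tail of the
 * range and progressively larger ones for the aligned middle. *segcreated is
 * set once any segment has been created so that the caller knows it must
 * unmap the range on failure.
 */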
1428 static int
1429 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1430 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1431 {
1432 uint_t szc;
1433 uint_t nszc;
1434 int error;
1435 caddr_t a;
1436 caddr_t eaddr;
1437 size_t segsize;
1438 struct seg *seg;
1439 size_t pgsz;
1440 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1441 uint_t save_szcvec;
1442
1443 ASSERT(AS_WRITE_HELD(as));
1444 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1445 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1446 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1447 if (!do_off) {
1448 vn_a->offset = 0;
1449 }
1450
1451 if (szcvec <= 1) {
1452 seg = seg_alloc(as, addr, size);
1453 if (seg == NULL) {
1454 return (ENOMEM);
1455 }
1456 vn_a->szc = 0;
1457 error = (*crfp)(seg, vn_a);
1458 if (error != 0) {
1459 seg_free(seg);
1460 } else {
1461 as->a_size += size;
1462 as->a_resvsize += size;
1463 }
1464 return (error);
1465 }
1466
1467 eaddr = addr + size;
1468 save_szcvec = szcvec;
1469 szcvec >>= 1;
1470 szc = 0;
1471 nszc = 0;
1472 while (szcvec) {
1473 if ((szcvec & 0x1) == 0) {
1474 nszc++;
1475 szcvec >>= 1;
1476 continue;
1477 }
1478 nszc++;
1479 pgsz = page_get_pagesize(nszc);
1480 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1481 if (a != addr) {
1482 ASSERT(a < eaddr);
1483 segsize = a - addr;
1484 seg = seg_alloc(as, addr, segsize);
1485 if (seg == NULL) {
1486 return (ENOMEM);
1487 }
1488 vn_a->szc = szc;
1489 error = (*crfp)(seg, vn_a);
1490 if (error != 0) {
1491 seg_free(seg);
1492 return (error);
1493 }
1494 as->a_size += segsize;
1495 as->a_resvsize += segsize;
1496 *segcreated = 1;
1497 if (do_off) {
1498 vn_a->offset += segsize;
1499 }
1500 addr = a;
1501 }
1502 szc = nszc;
1503 szcvec >>= 1;
1504 }
1505
1506 ASSERT(addr < eaddr);
1507 szcvec = save_szcvec | 1; /* add 8K pages */
1508 while (szcvec) {
1509 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1510 ASSERT(a >= addr);
1511 if (a != addr) {
1512 segsize = a - addr;
1513 seg = seg_alloc(as, addr, segsize);
1514 if (seg == NULL) {
1515 return (ENOMEM);
1516 }
1517 vn_a->szc = szc;
1518 error = (*crfp)(seg, vn_a);
1519 if (error != 0) {
1520 seg_free(seg);
1521 return (error);
1522 }
1523 as->a_size += segsize;
1524 as->a_resvsize += segsize;
1525 *segcreated = 1;
1526 if (do_off) {
1527 vn_a->offset += segsize;
1528 }
1529 addr = a;
1530 }
1531 szcvec &= ~(1 << szc);
1532 if (szcvec) {
1533 szc = highbit(szcvec) - 1;
1534 pgsz = page_get_pagesize(szc);
1535 }
1536 }
1537 ASSERT(addr == eaddr);
1538
1539 return (0);
1540 }
1541
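/*
 * Descriptive comment added for clarity (see the code for the authoritative
 * behavior): wrapper around as_map_segvn_segs() for vnode-backed mappings.
 * It asks map_pgszcvec() for the allowed page size codes, trims the
 * large-page request to the portion of the mapping actually backed by the
 * file (VOP_GETATTR/va_size), maps any remainder past EOF with small pages,
 * and turns on text replication (_MAP_TEXTREPL) for mappings larger than
 * textrepl_size_thresh.
 */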
1542 static int
1543 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1544 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1545 {
1546 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1547 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1548 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1549 type, 0);
1550 int error;
1551 struct seg *seg;
1552 struct vattr va;
1553 u_offset_t eoff;
1554 size_t save_size = 0;
1555 extern size_t textrepl_size_thresh;
1556
1557 ASSERT(AS_WRITE_HELD(as));
1558 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1559 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1560 ASSERT(vn_a->vp != NULL);
1561 ASSERT(vn_a->amp == NULL);
1562
1563 again:
1564 if (szcvec <= 1) {
1565 seg = seg_alloc(as, addr, size);
1566 if (seg == NULL) {
1567 return (ENOMEM);
1568 }
1569 vn_a->szc = 0;
1570 error = (*crfp)(seg, vn_a);
1571 if (error != 0) {
1572 seg_free(seg);
1573 } else {
1574 as->a_size += size;
1575 as->a_resvsize += size;
1576 }
1577 return (error);
1578 }
1579
1580 va.va_mask = AT_SIZE;
1581 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1582 szcvec = 0;
1583 goto again;
1584 }
1585 eoff = vn_a->offset & PAGEMASK;
1586 if (eoff >= va.va_size) {
1587 szcvec = 0;
1588 goto again;
1589 }
1590 eoff += size;
1591 if (btopr(va.va_size) < btopr(eoff)) {
1592 save_size = size;
1593 size = va.va_size - (vn_a->offset & PAGEMASK);
1594 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1595 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1596 type, 0);
1597 if (szcvec <= 1) {
1598 size = save_size;
1599 goto again;
1600 }
1601 }
1602
1603 if (size > textrepl_size_thresh) {
1604 vn_a->flags |= _MAP_TEXTREPL;
1605 }
1606 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1607 segcreated);
1608 if (error != 0) {
1609 return (error);
1610 }
1611 if (save_size) {
1612 addr += size;
1613 size = save_size - size;
1614 szcvec = 0;
1615 goto again;
1616 }
1617 return (0);
1618 }
1619
1620 /*
1621 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1622  * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1623 */
1624 static int
1625 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1626 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1627 {
1628 uint_t szcvec;
1629 uchar_t type;
1630
1631 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1632 if (vn_a->type == MAP_SHARED) {
1633 type = MAPPGSZC_SHM;
1634 } else if (vn_a->type == MAP_PRIVATE) {
1635 if (vn_a->szc == AS_MAP_HEAP) {
1636 type = MAPPGSZC_HEAP;
1637 } else if (vn_a->szc == AS_MAP_STACK) {
1638 type = MAPPGSZC_STACK;
1639 } else {
1640 type = MAPPGSZC_PRIVM;
1641 }
1642 }
1643 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1644 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1645 (vn_a->flags & MAP_TEXT), type, 0);
1646 ASSERT(AS_WRITE_HELD(as));
1647 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1648 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1649 ASSERT(vn_a->vp == NULL);
1650
1651 return (as_map_segvn_segs(as, addr, size, szcvec,
1652 crfp, vn_a, segcreated));
1653 }
1654
1655 int
1656 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1657 {
1658 AS_LOCK_ENTER(as, RW_WRITER);
1659 return (as_map_locked(as, addr, size, crfp, argsp));
1660 }
1661
1662 int
1663 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1664 void *argsp)
1665 {
1666 struct seg *seg = NULL;
1667 caddr_t raddr; /* rounded down addr */
1668 size_t rsize; /* rounded up size */
1669 int error;
1670 int unmap = 0;
1671 /*
1672 * The use of a_proc is preferred to handle the case where curproc is
1673 * a door_call server and is allocating memory in the client's (a_proc)
1674 * address space.
1675 * When creating a shared memory segment a_proc will be NULL so we
1676  * fall back to curproc in that case.
1677 */
1678 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1679 struct segvn_crargs crargs;
1680
1681 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1682 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1683 (size_t)raddr;
1684
1685 /*
1686 * check for wrap around
1687 */
1688 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1689 AS_LOCK_EXIT(as);
1690 return (ENOMEM);
1691 }
1692
1693 as->a_updatedir = 1; /* inform /proc */
1694 gethrestime(&as->a_updatetime);
1695
1696 if (as != &kas && as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1697 AS_LOCK_EXIT(as);
1698
1699 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM], p->p_rctls, p,
1700 RCA_UNSAFE_ALL);
1701
1702 return (ENOMEM);
1703 }
1704
1705 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1706 crargs = *(struct segvn_crargs *)argsp;
1707 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1708 if (error != 0) {
1709 AS_LOCK_EXIT(as);
1710 if (unmap) {
1711 (void) as_unmap(as, addr, size);
1712 }
1713 return (error);
1714 }
1715 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1716 crargs = *(struct segvn_crargs *)argsp;
1717 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1718 if (error != 0) {
1719 AS_LOCK_EXIT(as);
1720 if (unmap) {
1721 (void) as_unmap(as, addr, size);
1722 }
1723 return (error);
1724 }
1725 } else {
1726 seg = seg_alloc(as, addr, size);
1727 if (seg == NULL) {
1728 AS_LOCK_EXIT(as);
1729 return (ENOMEM);
1730 }
1731
1732 error = (*crfp)(seg, argsp);
1733 if (error != 0) {
1734 seg_free(seg);
1735 AS_LOCK_EXIT(as);
1736 return (error);
1737 }
1738 /*
1739 * Add size now so as_unmap will work if as_ctl fails.
1740 */
1741 as->a_size += rsize;
1742 as->a_resvsize += rsize;
1743 }
1744
1745 as_setwatch(as);
1746
1747 /*
1748 * If the address space is locked,
1749 * establish memory locks for the new segment.
1750 */
1751 mutex_enter(&as->a_contents);
1752 if (AS_ISPGLCK(as)) {
1753 mutex_exit(&as->a_contents);
1754 AS_LOCK_EXIT(as);
1755 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1756 if (error != 0)
1757 (void) as_unmap(as, addr, size);
1758 } else {
1759 mutex_exit(&as->a_contents);
1760 AS_LOCK_EXIT(as);
1761 }
1762 return (error);
1763 }
1764
1765
1766 /*
1767 * Delete all segments in the address space marked with S_PURGE.
1768 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1769 * These segments are deleted as a first step before calls to as_gap(), so
1770 * that they don't affect mmap() or shmat().
1771 */
1772 void
1773 as_purge(struct as *as)
1774 {
1775 struct seg *seg;
1776 struct seg *next_seg;
1777
1778 /*
1779  * the setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1780  * there is no need to grab the a_contents mutex for this check
1781 */
1782 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1783 return;
1784
1785 AS_LOCK_ENTER(as, RW_WRITER);
1786 next_seg = NULL;
1787 seg = AS_SEGFIRST(as);
1788 while (seg != NULL) {
1789 next_seg = AS_SEGNEXT(as, seg);
1790 if (seg->s_flags & S_PURGE)
1791 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1792 seg = next_seg;
1793 }
1794 AS_LOCK_EXIT(as);
1795
1796 mutex_enter(&as->a_contents);
1797 as->a_flags &= ~AS_NEEDSPURGE;
1798 mutex_exit(&as->a_contents);
1799 }
1800
1801 /*
1802 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1803 * range of addresses at least "minlen" long, where the base of the range is
1804 * at "off" phase from an "align" boundary and there is space for a
1805  * "redzone"-sized redzone on either side of the range. Thus,
1806 * if align was 4M and off was 16k, the user wants a hole which will start
1807 * 16k into a 4M page.
1808 *
1809 * If flags specifies AH_HI, the hole will have the highest possible address
1810 * in the range. We use the as->a_lastgap field to figure out where to
1811 * start looking for a gap.
1812 *
1813 * Otherwise, the gap will have the lowest possible address.
1814 *
1815 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1816 *
1817 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1818 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1819 *
1820 * NOTE: This routine is not correct when base+len overflows caddr_t.
1821 */
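/*
 * Worked example (values are illustrative only): with align = 4M,
 * off = 16k and redzone = 8k, a returned *basep B satisfies
 * (B - off) % align == 0, i.e. B lies 16k past a 4M boundary, and the
 * hole must also leave 8k of unused space below B and above B + minlen.
 */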
1822 int
1823 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1824 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1825 {
1826 caddr_t lobound = *basep;
1827 caddr_t hibound = lobound + *lenp;
1828 struct seg *lseg, *hseg;
1829 caddr_t lo, hi;
1830 int forward;
1831 caddr_t save_base;
1832 size_t save_len;
1833 size_t save_minlen;
1834 size_t save_redzone;
1835 int fast_path = 1;
1836
1837 save_base = *basep;
1838 save_len = *lenp;
1839 save_minlen = minlen;
1840 save_redzone = redzone;
1841
1842 /*
1843 * For the first pass/fast_path, just add align and redzone into
1844 * minlen since if we get an allocation, we can guarantee that it
1845 * will fit the alignment and redzone requested.
1846 * This increases the chance that hibound will be adjusted to
1847 * a_lastgap->s_base which will likely allow us to find an
1848 * acceptable hole in the address space quicker.
1849 * If we can't find a hole with this fast_path, then we look for
1850 * smaller holes in which the alignment and offset may allow
1851 * the allocation to fit.
1852 */
1853 minlen += align;
1854 minlen += 2 * redzone;
1855 redzone = 0;
1856
1857 AS_LOCK_ENTER(as, RW_READER);
1858 if (AS_SEGFIRST(as) == NULL) {
1859 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1860 align, redzone, off)) {
1861 AS_LOCK_EXIT(as);
1862 return (0);
1863 } else {
1864 AS_LOCK_EXIT(as);
1865 *basep = save_base;
1866 *lenp = save_len;
1867 return (-1);
1868 }
1869 }
1870
1871 retry:
1872 /*
1873 * Set up to iterate over all the inter-segment holes in the given
1874 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1875 * NULL for the highest-addressed hole. If moving backwards, we reset
1876  * hseg to denote the highest-addressed segment.
1877 */
1878 forward = (flags & AH_DIR) == AH_LO;
1879 if (forward) {
1880 hseg = as_findseg(as, lobound, 1);
1881 lseg = AS_SEGPREV(as, hseg);
1882 } else {
1883
1884 /*
1885 * If allocating at least as much as the last allocation,
1886 * use a_lastgap's base as a better estimate of hibound.
1887 */
1888 if (as->a_lastgap &&
1889 minlen >= as->a_lastgap->s_size &&
1890 hibound >= as->a_lastgap->s_base)
1891 hibound = as->a_lastgap->s_base;
1892
1893 hseg = as_findseg(as, hibound, 1);
1894 if (hseg->s_base + hseg->s_size < hibound) {
1895 lseg = hseg;
1896 hseg = NULL;
1897 } else {
1898 lseg = AS_SEGPREV(as, hseg);
1899 }
1900 }
1901
1902 for (;;) {
1903 /*
1904 * Set lo and hi to the hole's boundaries. (We should really
1905 * use MAXADDR in place of hibound in the expression below,
1906 * but can't express it easily; using hibound in its place is
1907 * harmless.)
1908 */
1909 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1910 hi = (hseg == NULL) ? hibound : hseg->s_base;
1911 /*
1912 * If the iteration has moved past the interval from lobound
1913 * to hibound it's pointless to continue.
1914 */
1915 if ((forward && lo > hibound) || (!forward && hi < lobound))
1916 break;
1917 else if (lo > hibound || hi < lobound)
1918 goto cont;
1919 /*
1920 * Candidate hole lies at least partially within the allowable
1921 * range. Restrict it to fall completely within that range,
1922 * i.e., to [max(lo, lobound), min(hi, hibound)].
1923 */
1924 if (lo < lobound)
1925 lo = lobound;
1926 if (hi > hibound)
1927 hi = hibound;
1928 /*
1929 * Verify that the candidate hole is big enough and meets
1930 * hardware constraints. If the hole is too small, no need
1931 * to do the further checks since they will fail.
1932 */
1933 *basep = lo;
1934 *lenp = hi - lo;
1935 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1936 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1937 ((flags & AH_CONTAIN) == 0 ||
1938 (*basep <= addr && *basep + *lenp > addr))) {
1939 if (!forward)
1940 as->a_lastgap = hseg;
1941 if (hseg != NULL)
1942 as->a_lastgaphl = hseg;
1943 else
1944 as->a_lastgaphl = lseg;
1945 AS_LOCK_EXIT(as);
1946 return (0);
1947 }
1948 cont:
1949 /*
1950 * Move to the next hole.
1951 */
1952 if (forward) {
1953 lseg = hseg;
1954 if (lseg == NULL)
1955 break;
1956 hseg = AS_SEGNEXT(as, hseg);
1957 } else {
1958 hseg = lseg;
1959 if (hseg == NULL)
1960 break;
1961 lseg = AS_SEGPREV(as, lseg);
1962 }
1963 }
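	/*
	 * The fast path (minlen inflated by align and redzone) found no
	 * hole.  Retry once with the caller's original minlen and redzone,
	 * so that holes which only fit once the alignment and offset are
	 * applied can still be considered.
	 */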
1964 if (fast_path && (align != 0 || save_redzone != 0)) {
1965 fast_path = 0;
1966 minlen = save_minlen;
1967 redzone = save_redzone;
1968 goto retry;
1969 }
1970 *basep = save_base;
1971 *lenp = save_len;
1972 AS_LOCK_EXIT(as);
1973 return (-1);
1974 }
1975
1976 /*
1977 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
1978 *
1979 * If flags specifies AH_HI, the hole will have the highest possible address
1980 * in the range. We use the as->a_lastgap field to figure out where to
1981 * start looking for a gap.
1982 *
1983 * Otherwise, the gap will have the lowest possible address.
1984 *
1985 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1986 *
1987 * If an adequate hole is found, base and len are set to reflect the part of
1988 * the hole that is within range, and 0 is returned, otherwise,
1989 * -1 is returned.
1990 *
1991 * NOTE: This routine is not correct when base+len overflows caddr_t.
1992 */
1993 int
1994 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
1995 caddr_t addr)
1996 {
1997
1998 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
1999 }
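
/*
 * Illustrative sketch only (hypothetical caller, not taken from this file):
 * to look for the highest-addressed hole of at least "len" bytes within
 * [lolimit, hilimit), a caller might do something like
 *
 *	caddr_t base = lolimit;
 *	size_t size = (size_t)(hilimit - lolimit);
 *
 *	if (as_gap(as, len, &base, &size, AH_HI, NULL) == 0)
 *		addr = base + size - len;
 *
 * where lolimit, hilimit, len, and addr are hypothetical caller variables,
 * and the mapping is placed at the top of the returned hole.
 */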
2000
2001 /*
2002 * Return the next range within [base, base + len) that is backed
2003 * with "real memory". Skip holes and non-seg_vn segments.
2004 * We're lazy and only return one segment at a time.
2005 */
2006 int
2007 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2008 {
2009 extern struct seg_ops segspt_shmops; /* needs a header file */
2010 struct seg *seg;
2011 caddr_t addr, eaddr;
2012 caddr_t segend;
2013
2014 AS_LOCK_ENTER(as, RW_READER);
2015
2016 addr = *basep;
2017 eaddr = addr + *lenp;
2018
2019 seg = as_findseg(as, addr, 0);
2020 if (seg != NULL)
2021 addr = MAX(seg->s_base, addr);
2022
2023 for (;;) {
2024 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2025 AS_LOCK_EXIT(as);
2026 return (EINVAL);
2027 }
2028
2029 if (seg->s_ops == &segvn_ops) {
2030 segend = seg->s_base + seg->s_size;
2031 break;
2032 }
2033
2034 /*
2035 * We do ISM by looking into the private data
2036 * to determine the real size of the segment.
2037 */
2038 if (seg->s_ops == &segspt_shmops) {
2039 segend = seg->s_base + spt_realsize(seg);
2040 if (addr < segend)
2041 break;
2042 }
2043
2044 seg = AS_SEGNEXT(as, seg);
2045
2046 if (seg != NULL)
2047 addr = seg->s_base;
2048 }
2049
2050 *basep = addr;
2051
2052 if (segend > eaddr)
2053 *lenp = eaddr - addr;
2054 else
2055 *lenp = segend - addr;
2056
2057 AS_LOCK_EXIT(as);
2058 return (0);
2059 }
2060
2061 /*
2062 * Swap the pages associated with the address space as out to
2063 * secondary storage, returning the number of bytes actually
2064 * swapped.
2065 *
2066 * The value returned is intended to correlate well with the process's
2067 * memory requirements. Its usefulness for this purpose depends on
2068 * how well the segment-level routines do at returning accurate
2069 * information.
2070 */
2071 size_t
2072 as_swapout(struct as *as)
2073 {
2074 struct seg *seg;
2075 size_t swpcnt = 0;
2076
2077 /*
2078 * Kernel-only processes have given up their address
2079 * spaces. Of course, we shouldn't be attempting to
2080 * swap out such processes in the first place...
2081 */
2082 if (as == NULL)
2083 return (0);
2084
2085 AS_LOCK_ENTER(as, RW_READER);
2086
2087 /*
2088 * Free all mapping resources associated with the address
2089 * space. The segment-level swapout routines capitalize
2090	 * on this unmapping by scavenging pages that have become
2091 * unmapped here.
2092 */
2093 hat_swapout(as->a_hat);
2094
2095 /*
2096 * Call the swapout routines of all segments in the address
2097 * space to do the actual work, accumulating the amount of
2098 * space reclaimed.
2099 */
2100 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2101 struct seg_ops *ov = seg->s_ops;
2102
2103 /*
2104 * We have to check to see if the seg has
2105 * an ops vector because the seg may have
2106 * been in the middle of being set up when
2107 * the process was picked for swapout.
2108 */
2109 if ((ov != NULL) && (ov->swapout != NULL))
2110 swpcnt += SEGOP_SWAPOUT(seg);
2111 }
2112 AS_LOCK_EXIT(as);
2113 return (swpcnt);
2114 }
2115
2116 /*
2117 * Determine whether data from the mappings in interval [addr, addr + size)
2118 * are in the primary memory (core) cache.
2119 */
2120 int
2121 as_incore(struct as *as, caddr_t addr,
2122 size_t size, char *vec, size_t *sizep)
2123 {
2124 struct seg *seg;
2125 size_t ssize;
2126 caddr_t raddr; /* rounded down addr */
2127 size_t rsize; /* rounded up size */
2128 size_t isize; /* iteration size */
2129 int error = 0; /* result, assume success */
2130
2131 *sizep = 0;
2132 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2133 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2134 (size_t)raddr;
2135
2136 if (raddr + rsize < raddr) /* check for wraparound */
2137 return (ENOMEM);
2138
2139 AS_LOCK_ENTER(as, RW_READER);
2140 seg = as_segat(as, raddr);
2141 if (seg == NULL) {
2142 AS_LOCK_EXIT(as);
2143 return (-1);
2144 }
2145
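	/*
	 * Walk the segments spanning [raddr, raddr + rsize), accumulating
	 * each segment's in-core vector into *sizep/vec.  A hole in the
	 * range or a short count from SEGOP_INCORE() ends the walk with -1.
	 */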
2146 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2147 if (raddr >= seg->s_base + seg->s_size) {
2148 seg = AS_SEGNEXT(as, seg);
2149 if (seg == NULL || raddr != seg->s_base) {
2150 error = -1;
2151 break;
2152 }
2153 }
2154 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2155 ssize = seg->s_base + seg->s_size - raddr;
2156 else
2157 ssize = rsize;
2158 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2159 if (isize != ssize) {
2160 error = -1;
2161 break;
2162 }
2163 vec += btopr(ssize);
2164 }
2165 AS_LOCK_EXIT(as);
2166 return (error);
2167 }
2168
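/*
 * Unlock the pages of a segment that are recorded as locked in "bitmap":
 * walk each contiguous run of set bits and issue an MC_UNLOCK lockop
 * covering the corresponding address range.
 */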
2169 static void
2170 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2171 ulong_t *bitmap, size_t position, size_t npages)
2172 {
2173 caddr_t range_start;
2174 size_t pos1 = position;
2175 size_t pos2;
2176 size_t size;
2177 size_t end_pos = npages + position;
2178
2179 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2180 size = ptob((pos2 - pos1));
2181 range_start = (caddr_t)((uintptr_t)addr +
2182 ptob(pos1 - position));
2183
2184 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2185 (ulong_t *)NULL, (size_t)NULL);
2186 pos1 = pos2;
2187 }
2188 }
2189
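/*
 * Back out a partially completed MC_LOCK operation: walk the segments
 * covering [raddr, raddr + rsize) and unlock whatever pages mlock_map
 * records as having been locked so far.
 */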
2190 static void
2191 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2192 caddr_t raddr, size_t rsize)
2193 {
2194 struct seg *seg = as_segat(as, raddr);
2195 size_t ssize;
2196
2197 while (rsize != 0) {
2198 if (raddr >= seg->s_base + seg->s_size)
2199 seg = AS_SEGNEXT(as, seg);
2200
2201 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2202 ssize = seg->s_base + seg->s_size - raddr;
2203 else
2204 ssize = rsize;
2205
2206 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2207
2208 rsize -= ssize;
2209 raddr += ssize;
2210 }
2211 }
2212
2213 /*
2214 * Cache control operations over the interval [addr, addr + size) in
2215 * address space "as".
2216 */
2217 /*ARGSUSED*/
2218 int
2219 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2220 uintptr_t arg, ulong_t *lock_map, size_t pos)
2221 {
2222 struct seg *seg; /* working segment */
2223 caddr_t raddr; /* rounded down addr */
2224 caddr_t initraddr; /* saved initial rounded down addr */
2225 size_t rsize; /* rounded up size */
2226 size_t initrsize; /* saved initial rounded up size */
2227 size_t ssize; /* size of seg */
2228 int error = 0; /* result */
2229 size_t mlock_size; /* size of bitmap */
2230 ulong_t *mlock_map; /* pointer to bitmap used */
2231 /* to represent the locked */
2232 /* pages. */
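	/*
	 * The address space lock is normally taken as reader.  If a segment
	 * driver returns IE_RETRY, we come back here and reacquire it as
	 * writer, since the retried operation may need to modify the
	 * segment list.
	 */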
2233 retry:
2234 if (error == IE_RETRY)
2235 AS_LOCK_ENTER(as, RW_WRITER);
2236 else
2237 AS_LOCK_ENTER(as, RW_READER);
2238
2239 /*
2240 * If these are address space lock/unlock operations, loop over
2241 * all segments in the address space, as appropriate.
2242 */
2243 if (func == MC_LOCKAS) {
2244 size_t npages, idx;
2245 size_t rlen = 0; /* rounded as length */
2246
2247 idx = pos;
2248
2249 if (arg & MCL_FUTURE) {
2250 mutex_enter(&as->a_contents);
2251 AS_SETPGLCK(as);
2252 mutex_exit(&as->a_contents);
2253 }
2254 if ((arg & MCL_CURRENT) == 0) {
2255 AS_LOCK_EXIT(as);
2256 return (0);
2257 }
2258
2259 seg = AS_SEGFIRST(as);
2260 if (seg == NULL) {
2261 AS_LOCK_EXIT(as);
2262 return (0);
2263 }
2264
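		/*
		 * Compute the page-rounded total length of all segments so
		 * that the MC_LOCK bitmap can be sized to cover the whole
		 * address space.
		 */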
2265 do {
2266 raddr = (caddr_t)((uintptr_t)seg->s_base &
2267 (uintptr_t)PAGEMASK);
2268 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2269 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2270 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2271
2272 mlock_size = BT_BITOUL(btopr(rlen));
2273 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2274 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2275 AS_LOCK_EXIT(as);
2276 return (EAGAIN);
2277 }
2278
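		/*
		 * Lock every segment in turn.  On failure, mlock_map records
		 * which pages were successfully locked so they can be backed
		 * out below.
		 */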
2279 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2280 error = SEGOP_LOCKOP(seg, seg->s_base,
2281 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2282 if (error != 0)
2283 break;
2284 pos += seg_pages(seg);
2285 }
2286
2287 if (error) {
2288 for (seg = AS_SEGFIRST(as); seg != NULL;
2289 seg = AS_SEGNEXT(as, seg)) {
2290
2291 raddr = (caddr_t)((uintptr_t)seg->s_base &
2292 (uintptr_t)PAGEMASK);
2293 npages = seg_pages(seg);
2294 as_segunlock(seg, raddr, attr, mlock_map,
2295 idx, npages);
2296 idx += npages;
2297 }
2298 }
2299
2300 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2301 AS_LOCK_EXIT(as);
2302 goto lockerr;
2303 } else if (func == MC_UNLOCKAS) {
2304 mutex_enter(&as->a_contents);
2305 AS_CLRPGLCK(as);
2306 mutex_exit(&as->a_contents);
2307
2308 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2309 error = SEGOP_LOCKOP(seg, seg->s_base,
2310 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2311 if (error != 0)
2312 break;
2313 }
2314
2315 AS_LOCK_EXIT(as);
2316 goto lockerr;
2317 }
2318
2319 /*
2320 * Normalize addresses and sizes.
2321 */
2322 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2323 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2324 (size_t)raddr;
2325
2326 if (raddr + rsize < raddr) { /* check for wraparound */
2327 AS_LOCK_EXIT(as);
2328 return (ENOMEM);
2329 }
2330
2331 /*
2332 * Get initial segment.
2333 */
2334 if ((seg = as_segat(as, raddr)) == NULL) {
2335 AS_LOCK_EXIT(as);
2336 return (ENOMEM);
2337 }
2338
2339 if (func == MC_LOCK) {
2340 mlock_size = BT_BITOUL(btopr(rsize));
2341 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2342 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2343 AS_LOCK_EXIT(as);
2344 return (EAGAIN);
2345 }
2346 }
2347
2348 /*
2349 * Loop over all segments. If a hole in the address range is
2350 * discovered, then fail. For each segment, perform the appropriate
2351 * control operation.
2352 */
2353 while (rsize != 0) {
2354
2355 /*
2356		 * Make sure there's no hole, and calculate the portion
2357		 * of the next segment to be operated over.
2358 */
2359 if (raddr >= seg->s_base + seg->s_size) {
2360 seg = AS_SEGNEXT(as, seg);
2361 if (seg == NULL || raddr != seg->s_base) {
2362 if (func == MC_LOCK) {
2363 as_unlockerr(as, attr, mlock_map,
2364 initraddr, initrsize - rsize);
2365 kmem_free(mlock_map,
2366 mlock_size * sizeof (ulong_t));
2367 }
2368 AS_LOCK_EXIT(as);
2369 return (ENOMEM);
2370 }
2371 }
2372 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2373 ssize = seg->s_base + seg->s_size - raddr;
2374 else
2375 ssize = rsize;
2376
2377 /*
2378 * Dispatch on specific function.
2379 */
2380 switch (func) {
2381
2382 /*
2383 * Synchronize cached data from mappings with backing
2384 * objects.
2385 */
2386 case MC_SYNC:
2387 if (error = SEGOP_SYNC(seg, raddr, ssize,
2388 attr, (uint_t)arg)) {
2389 AS_LOCK_EXIT(as);
2390 return (error);
2391 }
2392 break;
2393
2394 /*
2395 * Lock pages in memory.
2396 */
2397 case MC_LOCK:
2398 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2399 attr, func, mlock_map, pos)) {
2400 as_unlockerr(as, attr, mlock_map, initraddr,
2401 initrsize - rsize + ssize);
2402 kmem_free(mlock_map, mlock_size *
2403 sizeof (ulong_t));
2404 AS_LOCK_EXIT(as);
2405 goto lockerr;
2406 }
2407 break;
2408
2409 /*
2410 * Unlock mapped pages.
2411 */
2412 case MC_UNLOCK:
2413 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2414 (ulong_t *)NULL, (size_t)NULL);
2415 break;
2416
2417 /*
2418 * Store VM advise for mapped pages in segment layer.
2419 */
2420 case MC_ADVISE:
2421 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2422
2423 /*
2424			 * Check for regular errors and the special retry error.
2425 */
2426 if (error) {
2427 if (error == IE_RETRY) {
2428 /*
2429 * Need to acquire writers lock, so
2430 * have to drop readers lock and start
2431 * all over again
2432 */
2433 AS_LOCK_EXIT(as);
2434 goto retry;
2435 } else if (error == IE_REATTACH) {
2436 /*
2437 * Find segment for current address
2438 * because current segment just got
2439 * split or concatenated
2440 */
2441 seg = as_segat(as, raddr);
2442 if (seg == NULL) {
2443 AS_LOCK_EXIT(as);
2444 return (ENOMEM);
2445 }
2446 } else {
2447 /*
2448 * Regular error
2449 */
2450 AS_LOCK_EXIT(as);
2451 return (error);
2452 }
2453 }
2454 break;
2455
2456 case MC_INHERIT_ZERO:
2457 if (seg->s_ops->inherit == NULL) {
2458 error = ENOTSUP;
2459 } else {
2460 error = SEGOP_INHERIT(seg, raddr, ssize,
2461 SEGP_INH_ZERO);
2462 }
2463 if (error != 0) {
2464 AS_LOCK_EXIT(as);
2465 return (error);
2466 }
2467 break;
2468
2469 /*
2470 * Can't happen.
2471 */
2472 default:
2473 panic("as_ctl: bad operation %d", func);
2474 /*NOTREACHED*/
2475 }
2476
2477 rsize -= ssize;
2478 raddr += ssize;
2479 }
2480
2481 if (func == MC_LOCK)
2482 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2483 AS_LOCK_EXIT(as);
2484 return (0);
2485 lockerr:
2486
2487 /*
2488 * If the lower levels returned EDEADLK for a segment lockop,
2489 * it means that we should retry the operation. Let's wait
2490	 * a bit as well to let the deadlock-causing condition clear.
2491 * This is part of a gross hack to work around a design flaw
2492 * in the ufs/sds logging code and should go away when the
2493 * logging code is re-designed to fix the problem. See bug
2494 * 4125102 for details of the problem.
2495 */
2496 if (error == EDEADLK) {
2497 delay(deadlk_wait);
2498 error = 0;
2499 goto retry;
2500 }
2501 return (error);
2502 }
2503
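/*
 * Translate a faultcode_t, as returned by as_fault(), into an errno value.
 */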
2504 int
2505 fc_decode(faultcode_t fault_err)
2506 {
2507 int error = 0;
2508
2509 switch (FC_CODE(fault_err)) {
2510 case FC_OBJERR:
2511 error = FC_ERRNO(fault_err);
2512 break;
2513 case FC_PROT:
2514 error = EACCES;
2515 break;
2516 default:
2517 error = EFAULT;
2518 break;
2519 }
2520 return (error);
2521 }
2522
2523 /*
2524 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2525 * lists from each segment and copy them to one contiguous shadow list (plist)
2526 * as expected by the caller. Save pointers to per segment shadow lists at
2527 * the tail of plist so that they can be used during as_pageunlock().
2528 */
2529 static int
2530 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2531 caddr_t addr, size_t size, enum seg_rw rw)
2532 {
2533 caddr_t sv_addr = addr;
2534 size_t sv_size = size;
2535 struct seg *sv_seg = seg;
2536 ulong_t segcnt = 1;
2537 ulong_t cnt;
2538 size_t ssize;
2539 pgcnt_t npages = btop(size);
2540 page_t **plist;
2541 page_t **pl;
2542 int error;
2543 caddr_t eaddr;
2544 faultcode_t fault_err = 0;
2545 pgcnt_t pl_off;
2546 extern struct seg_ops segspt_shmops;
2547
2548 ASSERT(AS_LOCK_HELD(as));
2549 ASSERT(seg != NULL);
2550 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2551 ASSERT(addr + size > seg->s_base + seg->s_size);
2552 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2553 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2554
2555 /*
2556 * Count the number of segments covered by the range we are about to
2557 * lock. The segment count is used to size the shadow list we return
2558 * back to the caller.
2559 */
2560 for (; size != 0; size -= ssize, addr += ssize) {
2561 if (addr >= seg->s_base + seg->s_size) {
2562
2563 seg = AS_SEGNEXT(as, seg);
2564 if (seg == NULL || addr != seg->s_base) {
2565 AS_LOCK_EXIT(as);
2566 return (EFAULT);
2567 }
2568 /*
2569 * Do a quick check if subsequent segments
2570 * will most likely support pagelock.
2571 */
2572 if (seg->s_ops == &segvn_ops) {
2573 vnode_t *vp;
2574
2575 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2576 vp != NULL) {
2577 AS_LOCK_EXIT(as);
2578 goto slow;
2579 }
2580 } else if (seg->s_ops != &segspt_shmops) {
2581 AS_LOCK_EXIT(as);
2582 goto slow;
2583 }
2584 segcnt++;
2585 }
2586 if (addr + size > seg->s_base + seg->s_size) {
2587 ssize = seg->s_base + seg->s_size - addr;
2588 } else {
2589 ssize = size;
2590 }
2591 }
2592 ASSERT(segcnt > 1);
2593
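	/*
	 * Allocate one contiguous shadow list of npages entries, followed by
	 * segcnt slots used to save each segment's own shadow list pointer
	 * for later use by as_pageunlock_segs().
	 */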
2594 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2595
2596 addr = sv_addr;
2597 size = sv_size;
2598 seg = sv_seg;
2599
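	/*
	 * Pagelock each segment in turn, saving its private shadow list
	 * pointer at the tail of plist and copying its entries into the
	 * contiguous portion of plist.
	 */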
2600 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2601 if (addr >= seg->s_base + seg->s_size) {
2602 seg = AS_SEGNEXT(as, seg);
2603 ASSERT(seg != NULL && addr == seg->s_base);
2604 cnt++;
2605 ASSERT(cnt < segcnt);
2606 }
2607 if (addr + size > seg->s_base + seg->s_size) {
2608 ssize = seg->s_base + seg->s_size - addr;
2609 } else {
2610 ssize = size;
2611 }
2612 pl = &plist[npages + cnt];
2613 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2614 L_PAGELOCK, rw);
2615 if (error) {
2616 break;
2617 }
2618 ASSERT(plist[npages + cnt] != NULL);
2619 ASSERT(pl_off + btop(ssize) <= npages);
2620 bcopy(plist[npages + cnt], &plist[pl_off],
2621 btop(ssize) * sizeof (page_t *));
2622 pl_off += btop(ssize);
2623 }
2624
2625 if (size == 0) {
2626 AS_LOCK_EXIT(as);
2627 ASSERT(cnt == segcnt - 1);
2628 *ppp = plist;
2629 return (0);
2630 }
2631
2632 /*
2633	 * One of the pagelock calls failed.  The error type is in the error
2634	 * variable.  Unlock what we've locked so far and retry with F_SOFTLOCK
2635	 * if the error type is either EFAULT or ENOTSUP.  Otherwise just
2636	 * return the error back to the caller.
2637 */
2638
2639 eaddr = addr;
2640 seg = sv_seg;
2641
2642 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2643 if (addr >= seg->s_base + seg->s_size) {
2644 seg = AS_SEGNEXT(as, seg);
2645 ASSERT(seg != NULL && addr == seg->s_base);
2646 cnt++;
2647 ASSERT(cnt < segcnt);
2648 }
2649 if (eaddr > seg->s_base + seg->s_size) {
2650 ssize = seg->s_base + seg->s_size - addr;
2651 } else {
2652 ssize = eaddr - addr;
2653 }
2654 pl = &plist[npages + cnt];
2655 ASSERT(*pl != NULL);
2656 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2657 L_PAGEUNLOCK, rw);
2658 }
2659
2660 AS_LOCK_EXIT(as);
2661
2662 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2663
2664 if (error != ENOTSUP && error != EFAULT) {
2665 return (error);
2666 }
2667
2668 slow:
2669 /*
2670	 * If we are here because pagelock failed due to the need to cow-fault
2671	 * in the pages we want to lock, F_SOFTLOCK will do this job, and the
2672	 * next as_pagelock() call for this address range will hopefully
2673	 * succeed.
2674 */
2675 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2676 if (fault_err != 0) {
2677 return (fc_decode(fault_err));
2678 }
2679 *ppp = NULL;
2680
2681 return (0);
2682 }
2683
2684 /*
2685 * lock pages in a given address space. Return shadow list. If
2686 * the list is NULL, the MMU mapping is also locked.
2687 */
2688 int
2689 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2690 size_t size, enum seg_rw rw)
2691 {
2692 size_t rsize;
2693 caddr_t raddr;
2694 faultcode_t fault_err;
2695 struct seg *seg;
2696 int err;
2697
2698 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2699 "as_pagelock_start: addr %p size %ld", addr, size);
2700
2701 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2702 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2703 (size_t)raddr;
2704
2705 /*
2706	 * if the request spans more than one segment, let
2707	 * as_pagelock_segs() handle it.
2708 */
2709 AS_LOCK_ENTER(as, RW_READER);
2710
2711 seg = as_segat(as, raddr);
2712 if (seg == NULL) {
2713 AS_LOCK_EXIT(as);
2714 return (EFAULT);
2715 }
2716 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2717 if (raddr + rsize > seg->s_base + seg->s_size) {
2718 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2719 }
2720 if (raddr + rsize <= raddr) {
2721 AS_LOCK_EXIT(as);
2722 return (EFAULT);
2723 }
2724
2725 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2726 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2727
2728 /*
2729 * try to lock pages and pass back shadow list
2730 */
2731 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2732
2733 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2734
2735 AS_LOCK_EXIT(as);
2736
2737 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2738 return (err);
2739 }
2740
2741 /*
2742	 * Use F_SOFTLOCK to lock the pages because pagelock failed either due
2743	 * to no pagelock support for this segment or because pages need to be
2744	 * cow-faulted in.  If a fault is needed, F_SOFTLOCK will do this job
2745	 * for this as_pagelock() call, and in the next as_pagelock() call for
2746	 * the same address range the pagelock call will hopefully succeed.
2747 */
2748 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2749 if (fault_err != 0) {
2750 return (fc_decode(fault_err));
2751 }
2752 *ppp = NULL;
2753
2754 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2755 return (0);
2756 }
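
/*
 * Illustrative sketch only (hypothetical caller, not taken from this file):
 * a typical pagelock/pageunlock pairing looks roughly like
 *
 *	page_t **pplist;
 *
 *	if (as_pagelock(as, &pplist, uaddr, len, S_WRITE) != 0)
 *		return (EFAULT);
 *	... access [uaddr, uaddr + len) ...
 *	as_pageunlock(as, pplist, uaddr, len, S_WRITE);
 *
 * where "uaddr" and "len" are hypothetical caller variables.  Note that
 * pplist may come back NULL, in which case as_pageunlock() undoes the
 * F_SOFTLOCK fallback instead of calling the segment pagelock interface.
 */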
2757
2758 /*
2759 * unlock pages locked by as_pagelock_segs(). Retrieve the per-segment shadow
2760 * lists from the end of plist and call the pageunlock interface for each
2761 * segment. Drop the as lock and free plist.
2762 */
2763 static void
2764 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2765 struct page **plist, enum seg_rw rw)
2766 {
2767 ulong_t cnt;
2768 caddr_t eaddr = addr + size;
2769 pgcnt_t npages = btop(size);
2770 size_t ssize;
2771 page_t **pl;
2772
2773 ASSERT(AS_LOCK_HELD(as));
2774 ASSERT(seg != NULL);
2775 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2776 ASSERT(addr + size > seg->s_base + seg->s_size);
2777 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2778 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2779 ASSERT(plist != NULL);
2780
2781 for (cnt = 0; addr < eaddr; addr += ssize) {
2782 if (addr >= seg->s_base + seg->s_size) {
2783 seg = AS_SEGNEXT(as, seg);
2784 ASSERT(seg != NULL && addr == seg->s_base);
2785 cnt++;
2786 }
2787 if (eaddr > seg->s_base + seg->s_size) {
2788 ssize = seg->s_base + seg->s_size - addr;
2789 } else {
2790 ssize = eaddr - addr;
2791 }
2792 pl = &plist[npages + cnt];
2793 ASSERT(*pl != NULL);
2794 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2795 L_PAGEUNLOCK, rw);
2796 }
2797 ASSERT(cnt > 0);
2798 AS_LOCK_EXIT(as);
2799
2800 cnt++;
2801 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2802 }
2803
2804 /*
2805 * unlock pages in a given address range
2806 */
2807 void
2808 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2809 enum seg_rw rw)
2810 {
2811 struct seg *seg;
2812 size_t rsize;
2813 caddr_t raddr;
2814
2815 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2816 "as_pageunlock_start: addr %p size %ld", addr, size);
2817
2818 /*
2819	 * if the shadow list is NULL, as_pagelock() fell
2820	 * back to as_fault()
2821 */
2822 if (pp == NULL) {
2823 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2824 return;
2825 }
2826
2827 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2828 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2829 (size_t)raddr;
2830
2831 AS_LOCK_ENTER(as, RW_READER);
2832 seg = as_segat(as, raddr);
2833 ASSERT(seg != NULL);
2834
2835 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2836 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2837
2838 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2839 if (raddr + rsize <= seg->s_base + seg->s_size) {
2840 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2841 } else {
2842 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2843 return;
2844 }
2845 AS_LOCK_EXIT(as);
2846 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2847 }
2848
2849 int
2850 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2851 boolean_t wait)
2852 {
2853 struct seg *seg;
2854 size_t ssize;
2855 caddr_t raddr; /* rounded down addr */
2856 size_t rsize; /* rounded up size */
2857 int error = 0;
2858 size_t pgsz = page_get_pagesize(szc);
2859
2860 setpgsz_top:
2861 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2862 return (EINVAL);
2863 }
2864
2865 raddr = addr;
2866 rsize = size;
2867
2868 if (raddr + rsize < raddr) /* check for wraparound */
2869 return (ENOMEM);
2870
2871 AS_LOCK_ENTER(as, RW_WRITER);
2872 as_clearwatchprot(as, raddr, rsize);
2873 seg = as_segat(as, raddr);
2874 if (seg == NULL) {
2875 as_setwatch(as);
2876 AS_LOCK_EXIT(as);
2877 return (ENOMEM);
2878 }
2879
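	/*
	 * Walk the segments covering [raddr, raddr + rsize) and ask each to
	 * change its preferred page size code to szc, handling the internal
	 * IE_* return codes along the way.
	 */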
2880 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2881 if (raddr >= seg->s_base + seg->s_size) {
2882 seg = AS_SEGNEXT(as, seg);
2883 if (seg == NULL || raddr != seg->s_base) {
2884 error = ENOMEM;
2885 break;
2886 }
2887 }
2888 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2889 ssize = seg->s_base + seg->s_size - raddr;
2890 } else {
2891 ssize = rsize;
2892 }
2893
2894 retry:
2895 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2896
2897 if (error == IE_NOMEM) {
2898 error = EAGAIN;
2899 break;
2900 }
2901
2902 if (error == IE_RETRY) {
2903 AS_LOCK_EXIT(as);
2904 goto setpgsz_top;
2905 }
2906
2907 if (error == ENOTSUP) {
2908 error = EINVAL;
2909 break;
2910 }
2911
2912 if (wait && (error == EAGAIN)) {
2913 /*
2914 * Memory is currently locked. It must be unlocked
2915 * before this operation can succeed through a retry.
2916 * The possible reasons for locked memory and
2917 * corresponding strategies for unlocking are:
2918 * (1) Normal I/O
2919 * wait for a signal that the I/O operation
2920 * has completed and the memory is unlocked.
2921 * (2) Asynchronous I/O
2922 * The aio subsystem does not unlock pages when
2923 * the I/O is completed. Those pages are unlocked
2924 * when the application calls aiowait/aioerror.
2925 * So, to prevent blocking forever, cv_broadcast()
2926 * is done to wake up aio_cleanup_thread.
2927 * Subsequently, segvn_reclaim will be called, and
2928 * that will do AS_CLRUNMAPWAIT() and wake us up.
2929 * (3) Long term page locking:
2930 * This is not relevant for as_setpagesize()
2931 * because we cannot change the page size for
2932 * driver memory. The attempt to do so will
2933 * fail with a different error than EAGAIN so
2934 * there's no need to trigger as callbacks like
2935 * as_unmap, as_setprot or as_free would do.
2936 */
2937 mutex_enter(&as->a_contents);
2938 if (!AS_ISNOUNMAPWAIT(as)) {
2939 if (AS_ISUNMAPWAIT(as) == 0) {
2940 cv_broadcast(&as->a_cv);
2941 }
2942 AS_SETUNMAPWAIT(as);
2943 AS_LOCK_EXIT(as);
2944 while (AS_ISUNMAPWAIT(as)) {
2945 cv_wait(&as->a_cv, &as->a_contents);
2946 }
2947 } else {
2948 /*
2949 * We may have raced with
2950 * segvn_reclaim()/segspt_reclaim(). In this
2951 * case clean nounmapwait flag and retry since
2952 * softlockcnt in this segment may be already
2953 * 0. We don't drop as writer lock so our
2954 * number of retries without sleeping should
2955 * be very small. See segvn_reclaim() for
2956 * more comments.
2957 */
2958 AS_CLRNOUNMAPWAIT(as);
2959 mutex_exit(&as->a_contents);
2960 goto retry;
2961 }
2962 mutex_exit(&as->a_contents);
2963 goto setpgsz_top;
2964 } else if (error != 0) {
2965 break;
2966 }
2967 }
2968 as_setwatch(as);
2969 AS_LOCK_EXIT(as);
2970 return (error);
2971 }
2972
2973 /*
2974 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
2975 * in its chunk where s_szc is less than the szc we want to set.
2976 */
2977 static int
2978 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
2979 int *retry)
2980 {
2981 struct seg *seg;
2982 size_t ssize;
2983 int error;
2984
2985 ASSERT(AS_WRITE_HELD(as));
2986
2987 seg = as_segat(as, raddr);
2988 if (seg == NULL) {
2989 panic("as_iset3_default_lpsize: no seg");
2990 }
2991
2992 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2993 if (raddr >= seg->s_base + seg->s_size) {
2994 seg = AS_SEGNEXT(as, seg);
2995 if (seg == NULL || raddr != seg->s_base) {
2996 panic("as_iset3_default_lpsize: as changed");
2997 }
2998 }
2999 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3000 ssize = seg->s_base + seg->s_size - raddr;
3001 } else {
3002 ssize = rsize;
3003 }
3004
3005 if (szc > seg->s_szc) {
3006 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3007 /* Only retry on EINVAL segments that have no vnode. */
3008 if (error == EINVAL) {
3009 vnode_t *vp = NULL;
3010 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3011 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3012 vp == NULL)) {
3013 *retry = 1;
3014 } else {
3015 *retry = 0;
3016 }
3017 }
3018 if (error) {
3019 return (error);
3020 }
3021 }
3022 }
3023 return (0);
3024 }
3025
3026 /*
3027 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3028 * pagesize on each segment in its range, but if any fails with EINVAL,
3029 * then it reduces the pagesizes to the next size in the bitmap and
3030 * retries as_iset3_default_lpsize(). The code retries smaller allowed
3031 * sizes on EINVAL because (a) the anon offset may not match the bigger
3032 * sizes, and (b) it's hard to get this offset (to begin with) to pass
3033 * to map_pgszcvec().
3034 */
3035 static int
3036 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3037 uint_t szcvec)
3038 {
3039 int error;
3040 int retry;
3041
3042 ASSERT(AS_WRITE_HELD(as));
3043
3044 for (;;) {
3045 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3046 if (error == EINVAL && retry) {
3047 szcvec &= ~(1 << szc);
3048 if (szcvec <= 1) {
3049 return (EINVAL);
3050 }
3051 szc = highbit(szcvec) - 1;
3052 } else {
3053 return (error);
3054 }
3055 }
3056 }
3057
3058 /*
3059 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3060 * segments have a smaller szc than we want to set. For each such area,
3061 * it calls as_iset2_default_lpsize().
3062 */
3063 static int
3064 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3065 uint_t szcvec)
3066 {
3067 struct seg *seg;
3068 size_t ssize;
3069 caddr_t setaddr = raddr;
3070 size_t setsize = 0;
3071 int set;
3072 int error;
3073
3074 ASSERT(AS_WRITE_HELD(as));
3075
3076 seg = as_segat(as, raddr);
3077 if (seg == NULL) {
3078 panic("as_iset1_default_lpsize: no seg");
3079 }
3080 if (seg->s_szc < szc) {
3081 set = 1;
3082 } else {
3083 set = 0;
3084 }
3085
3086 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3087 if (raddr >= seg->s_base + seg->s_size) {
3088 seg = AS_SEGNEXT(as, seg);
3089 if (seg == NULL || raddr != seg->s_base) {
3090 panic("as_iset1_default_lpsize: as changed");
3091 }
3092 if (seg->s_szc >= szc && set) {
3093 ASSERT(setsize != 0);
3094 error = as_iset2_default_lpsize(as,
3095 setaddr, setsize, szc, szcvec);
3096 if (error) {
3097 return (error);
3098 }
3099 set = 0;
3100 } else if (seg->s_szc < szc && !set) {
3101 setaddr = raddr;
3102 setsize = 0;
3103 set = 1;
3104 }
3105 }
3106 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3107 ssize = seg->s_base + seg->s_size - raddr;
3108 } else {
3109 ssize = rsize;
3110 }
3111 }
3112 error = 0;
3113 if (set) {
3114 ASSERT(setsize != 0);
3115 error = as_iset2_default_lpsize(as, setaddr, setsize,
3116 szc, szcvec);
3117 }
3118 return (error);
3119 }
3120
3121 /*
3122 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3123 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3124 * chunk to as_iset1_default_lpsize().
3125 */
3126 static int
3127 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3128 int type)
3129 {
3130 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3131 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3132 flags, rtype, 1);
3133 uint_t szc;
3134 uint_t nszc;
3135 int error;
3136 caddr_t a;
3137 caddr_t eaddr;
3138 size_t segsize;
3139 size_t pgsz;
3140 uint_t save_szcvec;
3141
3142 ASSERT(AS_WRITE_HELD(as));
3143 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3144 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3145
3146 szcvec &= ~1;
3147 if (szcvec <= 1) { /* skip if base page size */
3148 return (0);
3149 }
3150
3151 /* Get the pagesize of the first larger page size. */
3152 szc = lowbit(szcvec) - 1;
3153 pgsz = page_get_pagesize(szc);
3154 eaddr = addr + size;
3155 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3156 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3157
3158 save_szcvec = szcvec;
3159 szcvec >>= (szc + 1);
3160 nszc = szc;
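	/*
	 * Walk up through the successively larger page sizes in szcvec,
	 * carving off a leading chunk at each step so that the remaining
	 * range is aligned for the next larger size.
	 */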
3161 while (szcvec) {
3162 if ((szcvec & 0x1) == 0) {
3163 nszc++;
3164 szcvec >>= 1;
3165 continue;
3166 }
3167 nszc++;
3168 pgsz = page_get_pagesize(nszc);
3169 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3170 if (a != addr) {
3171 ASSERT(szc > 0);
3172 ASSERT(a < eaddr);
3173 segsize = a - addr;
3174 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3175 save_szcvec);
3176 if (error) {
3177 return (error);
3178 }
3179 addr = a;
3180 }
3181 szc = nszc;
3182 szcvec >>= 1;
3183 }
3184
3185 ASSERT(addr < eaddr);
3186 szcvec = save_szcvec;
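	/*
	 * Now work back down from eaddr: handle the middle of the range at
	 * the largest remaining size, then the tail at successively smaller
	 * sizes.
	 */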
3187 while (szcvec) {
3188 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3189 ASSERT(a >= addr);
3190 if (a != addr) {
3191 ASSERT(szc > 0);
3192 segsize = a - addr;
3193 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3194 save_szcvec);
3195 if (error) {
3196 return (error);
3197 }
3198 addr = a;
3199 }
3200 szcvec &= ~(1 << szc);
3201 if (szcvec) {
3202 szc = highbit(szcvec) - 1;
3203 pgsz = page_get_pagesize(szc);
3204 }
3205 }
3206 ASSERT(addr == eaddr);
3207
3208 return (0);
3209 }
3210
3211 /*
3212 * Set the default large page size for the range. Called via memcntl with
3213 * page size set to 0. as_set_default_lpsize breaks the range down into
3214 * chunks with the same type/flags, ignores non-segvn segments, and passes
3215 * each chunk to as_iset_default_lpsize().
3216 */
3217 int
3218 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3219 {
3220 struct seg *seg;
3221 caddr_t raddr;
3222 size_t rsize;
3223 size_t ssize;
3224 int rtype, rflags;
3225 int stype, sflags;
3226 int error;
3227 caddr_t setaddr;
3228 size_t setsize;
3229 int segvn;
3230
3231 if (size == 0)
3232 return (0);
3233
3234 AS_LOCK_ENTER(as, RW_WRITER);
3235 again:
3236 error = 0;
3237
3238 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3239 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3240 (size_t)raddr;
3241
3242 if (raddr + rsize < raddr) { /* check for wraparound */
3243 AS_LOCK_EXIT(as);
3244 return (ENOMEM);
3245 }
3246 as_clearwatchprot(as, raddr, rsize);
3247 seg = as_segat(as, raddr);
3248 if (seg == NULL) {
3249 as_setwatch(as);
3250 AS_LOCK_EXIT(as);
3251 return (ENOMEM);
3252 }
3253 if (seg->s_ops == &segvn_ops) {
3254 rtype = SEGOP_GETTYPE(seg, addr);
3255 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3256 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3257 segvn = 1;
3258 } else {
3259 segvn = 0;
3260 }
3261 setaddr = raddr;
3262 setsize = 0;
3263
3264 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3265 if (raddr >= (seg->s_base + seg->s_size)) {
3266 seg = AS_SEGNEXT(as, seg);
3267 if (seg == NULL || raddr != seg->s_base) {
3268 error = ENOMEM;
3269 break;
3270 }
3271 if (seg->s_ops == &segvn_ops) {
3272 stype = SEGOP_GETTYPE(seg, raddr);
3273 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3274 stype &= (MAP_SHARED | MAP_PRIVATE);
3275 if (segvn && (rflags != sflags ||
3276 rtype != stype)) {
3277 /*
3278 * The next segment is also segvn but
3279 * has different flags and/or type.
3280 */
3281 ASSERT(setsize != 0);
3282 error = as_iset_default_lpsize(as,
3283 setaddr, setsize, rflags, rtype);
3284 if (error) {
3285 break;
3286 }
3287 rflags = sflags;
3288 rtype = stype;
3289 setaddr = raddr;
3290 setsize = 0;
3291 } else if (!segvn) {
3292 rflags = sflags;
3293 rtype = stype;
3294 setaddr = raddr;
3295 setsize = 0;
3296 segvn = 1;
3297 }
3298 } else if (segvn) {
3299 /* The next segment is not segvn. */
3300 ASSERT(setsize != 0);
3301 error = as_iset_default_lpsize(as,
3302 setaddr, setsize, rflags, rtype);
3303 if (error) {
3304 break;
3305 }
3306 segvn = 0;
3307 }
3308 }
3309 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3310 ssize = seg->s_base + seg->s_size - raddr;
3311 } else {
3312 ssize = rsize;
3313 }
3314 }
3315 if (error == 0 && segvn) {
3316 /* The last chunk when rsize == 0. */
3317 ASSERT(setsize != 0);
3318 error = as_iset_default_lpsize(as, setaddr, setsize,
3319 rflags, rtype);
3320 }
3321
3322 if (error == IE_RETRY) {
3323 goto again;
3324 } else if (error == IE_NOMEM) {
3325 error = EAGAIN;
3326 } else if (error == ENOTSUP) {
3327 error = EINVAL;
3328 } else if (error == EAGAIN) {
3329 mutex_enter(&as->a_contents);
3330 if (!AS_ISNOUNMAPWAIT(as)) {
3331 if (AS_ISUNMAPWAIT(as) == 0) {
3332 cv_broadcast(&as->a_cv);
3333 }
3334 AS_SETUNMAPWAIT(as);
3335 AS_LOCK_EXIT(as);
3336 while (AS_ISUNMAPWAIT(as)) {
3337 cv_wait(&as->a_cv, &as->a_contents);
3338 }
3339 mutex_exit(&as->a_contents);
3340 AS_LOCK_ENTER(as, RW_WRITER);
3341 } else {
3342 /*
3343 * We may have raced with
3344 * segvn_reclaim()/segspt_reclaim(). In this case
3345 * clean nounmapwait flag and retry since softlockcnt
3346 * in this segment may be already 0. We don't drop as
3347 * writer lock so our number of retries without
3348 * sleeping should be very small. See segvn_reclaim()
3349 * for more comments.
3350 */
3351 AS_CLRNOUNMAPWAIT(as);
3352 mutex_exit(&as->a_contents);
3353 }
3354 goto again;
3355 }
3356
3357 as_setwatch(as);
3358 AS_LOCK_EXIT(as);
3359 return (error);
3360 }
3361
3362 /*
3363 * Set up all of the uninitialized watched pages that we can.
3364 */
3365 void
3366 as_setwatch(struct as *as)
3367 {
3368 struct watched_page *pwp;
3369 struct seg *seg;
3370 caddr_t vaddr;
3371 uint_t prot;
3372 int err, retrycnt;
3373
3374 if (avl_numnodes(&as->a_wpage) == 0)
3375 return;
3376
3377 ASSERT(AS_WRITE_HELD(as));
3378
3379 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3380 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3381 retrycnt = 0;
3382 retry:
3383 vaddr = pwp->wp_vaddr;
3384 if (pwp->wp_oprot != 0 || /* already set up */
3385 (seg = as_segat(as, vaddr)) == NULL ||
3386 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3387 continue;
3388
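		/*
		 * Remember the original protections, then compute the reduced
		 * protections needed to trigger a fault on the watched
		 * accesses.
		 */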
3389 pwp->wp_oprot = prot;
3390 if (pwp->wp_read)
3391 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3392 if (pwp->wp_write)
3393 prot &= ~PROT_WRITE;
3394 if (pwp->wp_exec)
3395 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3396 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3397 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3398 if (err == IE_RETRY) {
3399 pwp->wp_oprot = 0;
3400 ASSERT(retrycnt == 0);
3401 retrycnt++;
3402 goto retry;
3403 }
3404 }
3405 pwp->wp_prot = prot;
3406 }
3407 }
3408
3409 /*
3410 * Clear all of the watched pages in the address space.
3411 */
3412 void
3413 as_clearwatch(struct as *as)
3414 {
3415 struct watched_page *pwp;
3416 struct seg *seg;
3417 caddr_t vaddr;
3418 uint_t prot;
3419 int err, retrycnt;
3420
3421 if (avl_numnodes(&as->a_wpage) == 0)
3422 return;
3423
3424 ASSERT(AS_WRITE_HELD(as));
3425
3426 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3427 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3428 retrycnt = 0;
3429 retry:
3430 vaddr = pwp->wp_vaddr;
3431 if (pwp->wp_oprot == 0 || /* not set up */
3432 (seg = as_segat(as, vaddr)) == NULL)
3433 continue;
3434
3435 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3436 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3437 if (err == IE_RETRY) {
3438 ASSERT(retrycnt == 0);
3439 retrycnt++;
3440 goto retry;
3441 }
3442 }
3443 pwp->wp_oprot = 0;
3444 pwp->wp_prot = 0;
3445 }
3446 }
3447
3448 /*
3449 * Force a new setup for all the watched pages in the range.
3450 */
3451 static void
3452 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3453 {
3454 struct watched_page *pwp;
3455 struct watched_page tpw;
3456 caddr_t eaddr = addr + size;
3457 caddr_t vaddr;
3458 struct seg *seg;
3459 int err, retrycnt;
3460 uint_t wprot;
3461 avl_index_t where;
3462
3463 if (avl_numnodes(&as->a_wpage) == 0)
3464 return;
3465
3466 ASSERT(AS_WRITE_HELD(as));
3467
3468 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3469 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3470 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3471
3472 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3473 retrycnt = 0;
3474 vaddr = pwp->wp_vaddr;
3475
3476 wprot = prot;
3477 if (pwp->wp_read)
3478 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3479 if (pwp->wp_write)
3480 wprot &= ~PROT_WRITE;
3481 if (pwp->wp_exec)
3482 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3483 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3484 retry:
3485 seg = as_segat(as, vaddr);
3486 if (seg == NULL) {
3487 panic("as_setwatchprot: no seg");
3488 /*NOTREACHED*/
3489 }
3490 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3491 if (err == IE_RETRY) {
3492 ASSERT(retrycnt == 0);
3493 retrycnt++;
3494 goto retry;
3495 }
3496 }
3497 pwp->wp_oprot = prot;
3498 pwp->wp_prot = wprot;
3499
3500 pwp = AVL_NEXT(&as->a_wpage, pwp);
3501 }
3502 }
3503
3504 /*
3505 * Clear all of the watched pages in the range.
3506 */
3507 static void
3508 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3509 {
3510 caddr_t eaddr = addr + size;
3511 struct watched_page *pwp;
3512 struct watched_page tpw;
3513 uint_t prot;
3514 struct seg *seg;
3515 int err, retrycnt;
3516 avl_index_t where;
3517
3518 if (avl_numnodes(&as->a_wpage) == 0)
3519 return;
3520
3521 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3522 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3523 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3524
3525 ASSERT(AS_WRITE_HELD(as));
3526
3527 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3528
3529 if ((prot = pwp->wp_oprot) != 0) {
3530 retrycnt = 0;
3531
3532 if (prot != pwp->wp_prot) {
3533 retry:
3534 seg = as_segat(as, pwp->wp_vaddr);
3535 if (seg == NULL)
3536 continue;
3537 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3538 PAGESIZE, prot);
3539 if (err == IE_RETRY) {
3540 ASSERT(retrycnt == 0);
3541 retrycnt++;
3542 goto retry;
3543
3544 }
3545 }
3546 pwp->wp_oprot = 0;
3547 pwp->wp_prot = 0;
3548 }
3549
3550 pwp = AVL_NEXT(&as->a_wpage, pwp);
3551 }
3552 }
3553
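/*
 * Post siginfo to every process whose address space is "as".  The p_as
 * check is repeated under p_lock to close the race with a process
 * switching address spaces.
 */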
3554 void
3555 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3556 {
3557 struct proc *p;
3558
3559 mutex_enter(&pidlock);
3560 for (p = practive; p; p = p->p_next) {
3561 if (p->p_as == as) {
3562 mutex_enter(&p->p_lock);
3563 if (p->p_as == as)
3564 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3565 mutex_exit(&p->p_lock);
3566 }
3567 }
3568 mutex_exit(&pidlock);
3569 }
3570
3571 /*
3572 * return memory object ID
3573 */
3574 int
3575 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3576 {
3577 struct seg *seg;
3578 int sts;
3579
3580 AS_LOCK_ENTER(as, RW_READER);
3581 seg = as_segat(as, addr);
3582 if (seg == NULL) {
3583 AS_LOCK_EXIT(as);
3584 return (EFAULT);
3585 }
3586 /*
3587 * catch old drivers which may not support getmemid
3588 */
3589 if (seg->s_ops->getmemid == NULL) {
3590 AS_LOCK_EXIT(as);
3591 return (ENODEV);
3592 }
3593
3594 sts = SEGOP_GETMEMID(seg, addr, memidp);
3595
3596 AS_LOCK_EXIT(as);
3597 return (sts);
3598 }