1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2016 Joyent, Inc.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /* All Rights Reserved */
30
31 /*
32 * University Copyright- Copyright (c) 1982, 1986, 1988
33 * The Regents of the University of California
34 * All Rights Reserved
35 *
36 * University Acknowledgment- Portions of this document are derived from
37 * software developed by the University of California, Berkeley, and its
38 * contributors.
39 */
40
41 /*
42 * VM - address spaces.
43 */
44
45 #include <sys/types.h>
46 #include <sys/t_lock.h>
47 #include <sys/param.h>
48 #include <sys/errno.h>
49 #include <sys/systm.h>
50 #include <sys/mman.h>
51 #include <sys/sysmacros.h>
52 #include <sys/cpuvar.h>
53 #include <sys/sysinfo.h>
54 #include <sys/kmem.h>
55 #include <sys/vnode.h>
56 #include <sys/vmsystm.h>
57 #include <sys/cmn_err.h>
58 #include <sys/debug.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/vtrace.h>
61 #include <sys/ddi.h>
62
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_vn.h>
67 #include <vm/seg_dev.h>
68 #include <vm/seg_kmem.h>
69 #include <vm/seg_map.h>
70 #include <vm/seg_spt.h>
71 #include <vm/page.h>
72
73 clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */
74
75 ulong_t as_user_seg_limit = 0xffff; /* max segments in an (non-kas) AS */
76
77 static struct kmem_cache *as_cache;
78
79 static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t);
80 static void as_clearwatchprot(struct as *, caddr_t, size_t);
81 int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
82
83
84 /*
85 * Verifying the segment lists is very time-consuming; it may not always be
86 * desirable to define VERIFY_SEGLIST when DEBUG is set.
87 */
88 #ifdef DEBUG
89 #define VERIFY_SEGLIST
90 int do_as_verify = 0;
91 #endif
92
93 /*
94 * Allocate a new callback data structure entry and fill in the events of
95 * interest, the address range of interest, and the callback argument.
96 * Link the entry on the as->a_callbacks list. A callback entry for the
97 * entire address space may be specified with vaddr = 0 and size = -1.
98 *
99 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
100 * the specified as, the caller must guarantee persistence of the specified as
101 * for the duration of this function (e.g. pages being locked within the as
102 * will guarantee persistence).
103 */
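/*
 * Illustrative usage of the routine below (a sketch, not from any particular
 * client): a driver holding pages locked long term might register for unmap
 * events roughly as follows, where mydrv_unlock_pages() and mydrv_arg are
 * hypothetical:
 *
 *	error = as_add_callback(as, mydrv_unlock_pages, mydrv_arg,
 *	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
 *
 * mydrv_unlock_pages() would later release the locked pages and call
 * as_delete_callback() once it is OK for the blocked thread to continue.
 */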
104 int
105 as_add_callback(struct as *as, void (*cb_func)(), void *arg, uint_t events,
106 caddr_t vaddr, size_t size, int sleepflag)
107 {
108 struct as_callback *current_head, *cb;
109 caddr_t saddr;
110 size_t rsize;
111
112 /* callback function and an event are mandatory */
113 if ((cb_func == NULL) || ((events & AS_ALL_EVENT) == 0))
114 return (EINVAL);
115
116 /* Adding a callback after as_free has been called is not allowed */
117 if (as == &kas)
118 return (ENOMEM);
119
120 /*
121 * vaddr = 0 and size = -1 is used to indicate that the callback range
122 * is the entire address space so no rounding is done in that case.
123 */
124 if (size != -1) {
125 saddr = (caddr_t)((uintptr_t)vaddr & (uintptr_t)PAGEMASK);
126 rsize = (((size_t)(vaddr + size) + PAGEOFFSET) & PAGEMASK) -
127 (size_t)saddr;
128 /* check for wraparound */
129 if (saddr + rsize < saddr)
130 return (ENOMEM);
131 } else {
132 if (vaddr != 0)
133 return (EINVAL);
134 saddr = vaddr;
135 rsize = size;
136 }
137
138 /* Allocate and initialize a callback entry */
139 cb = kmem_zalloc(sizeof (struct as_callback), sleepflag);
140 if (cb == NULL)
141 return (EAGAIN);
142
143 cb->ascb_func = cb_func;
144 cb->ascb_arg = arg;
145 cb->ascb_events = events;
146 cb->ascb_saddr = saddr;
147 cb->ascb_len = rsize;
148
149 /* Add the entry to the list */
150 mutex_enter(&as->a_contents);
151 current_head = as->a_callbacks;
152 as->a_callbacks = cb;
153 cb->ascb_next = current_head;
154
155 /*
156 * The call to this function may lose a race with
157 * a pertinent event - e.g. a thread does long-term memory locking,
158 * but before the callback is added another thread executes as_unmap.
159 * A broadcast here resolves that.
160 */
161 if ((cb->ascb_events & AS_UNMAPWAIT_EVENT) && AS_ISUNMAPWAIT(as)) {
162 AS_CLRUNMAPWAIT(as);
163 cv_broadcast(&as->a_cv);
164 }
165
166 mutex_exit(&as->a_contents);
167 return (0);
168 }
169
170 /*
171 * Search the callback list for an entry which pertains to arg.
172 *
173 * This is called from within the client upon completion of the callback.
174 * RETURN VALUES:
175 * AS_CALLBACK_DELETED (callback entry found and deleted)
176 * AS_CALLBACK_NOTFOUND (no callback entry found - this is ok)
177 * AS_CALLBACK_DELETE_DEFERRED (callback is in process, delete of this
178 * entry will be made in as_do_callbacks)
179 *
180 * If as_delete_callback encounters a matching entry with AS_CALLBACK_CALLED
181 * set, it indicates that as_do_callbacks is processing this entry. The
182 * AS_ALL_EVENT events are cleared in the entry, and a broadcast is made
183 * to unblock as_do_callbacks, in case it is blocked.
184 *
185 * CALLER'S RESPONSIBILITY: If not calling from within the process context for
186 * the specified as, the caller must guarantee persistence of the specified as
187 * for the duration of this function (e.g. pages being locked within the as
188 * will guarantee persistence).
189 */
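/*
 * Sketch of hypothetical client code using the routine below once its
 * callback work is finished (or at teardown):
 *
 *	rc = as_delete_callback(as, mydrv_arg);
 *
 * If rc is AS_CALLBACK_DELETE_DEFERRED, the entry is freed later by
 * as_do_callbacks(); in every case the client must not assume the entry
 * still exists afterwards.
 */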
190 uint_t
191 as_delete_callback(struct as *as, void *arg)
192 {
193 struct as_callback **prevcb = &as->a_callbacks;
194 struct as_callback *cb;
195 uint_t rc = AS_CALLBACK_NOTFOUND;
196
197 mutex_enter(&as->a_contents);
198 for (cb = as->a_callbacks; cb; prevcb = &cb->ascb_next, cb = *prevcb) {
199 if (cb->ascb_arg != arg)
200 continue;
201
202 /*
203 * If the events indicate AS_CALLBACK_CALLED, just clear
204 * AS_ALL_EVENT in the events field and wakeup the thread
205 * that may be waiting in as_do_callbacks. as_do_callbacks
206 * will take care of removing this entry from the list. In
207 * that case, return AS_CALLBACK_DELETE_DEFERRED. Otherwise
208 * (AS_CALLBACK_CALLED not set), just remove it from the
209 * list, return the memory and return AS_CALLBACK_DELETED.
210 */
211 if ((cb->ascb_events & AS_CALLBACK_CALLED) != 0) {
212 /* leave AS_CALLBACK_CALLED */
213 cb->ascb_events &= ~AS_ALL_EVENT;
214 rc = AS_CALLBACK_DELETE_DEFERRED;
215 cv_broadcast(&as->a_cv);
216 } else {
217 *prevcb = cb->ascb_next;
218 kmem_free(cb, sizeof (struct as_callback));
219 rc = AS_CALLBACK_DELETED;
220 }
221 break;
222 }
223 mutex_exit(&as->a_contents);
224 return (rc);
225 }
226
227 /*
228 * Searches the as callback list for a matching entry.
229 * Returns a pointer to the first matching callback, or NULL if
230 * nothing is found.
231 * This function never sleeps, so it is ok to call it with locks
232 * held beyond the (required) a_contents mutex.
233 *
234 * See also comment on as_do_callbacks below.
235 */
236 static struct as_callback *
237 as_find_callback(struct as *as, uint_t events, caddr_t event_addr,
238 size_t event_len)
239 {
240 struct as_callback *cb;
241
242 ASSERT(MUTEX_HELD(&as->a_contents));
243 for (cb = as->a_callbacks; cb != NULL; cb = cb->ascb_next) {
244 /*
245 * If the callback has not already been called, then
246 * check if events or address range pertains. An event_len
247 * of zero means do an unconditional callback.
248 */
249 if (((cb->ascb_events & AS_CALLBACK_CALLED) != 0) ||
250 ((event_len != 0) && (((cb->ascb_events & events) == 0) ||
251 (event_addr + event_len < cb->ascb_saddr) ||
252 (event_addr > (cb->ascb_saddr + cb->ascb_len))))) {
253 continue;
254 }
255 break;
256 }
257 return (cb);
258 }
259
260 /*
261 * Executes a given callback and removes it from the callback list for
262 * this address space.
263 * This function may sleep so the caller must drop all locks except
264 * a_contents before calling this func.
265 *
266 * See also comments on as_do_callbacks below.
267 */
268 static void
269 as_execute_callback(struct as *as, struct as_callback *cb,
270 uint_t events)
271 {
272 struct as_callback **prevcb;
273 void *cb_arg;
274
275 ASSERT(MUTEX_HELD(&as->a_contents) && (cb->ascb_events & events));
276 cb->ascb_events |= AS_CALLBACK_CALLED;
277 mutex_exit(&as->a_contents);
278 (*cb->ascb_func)(as, cb->ascb_arg, events);
279 mutex_enter(&as->a_contents);
280 /*
281 * The callback function is required to delete the callback
282 * when it determines it is OK for
283 * this thread to continue. as_delete_callback will clear
284 * the AS_ALL_EVENT bits in the events field when the entry is deleted.
285 * If the callback function called as_delete_callback,
286 * events will already be cleared and there will be no blocking.
287 */
288 while ((cb->ascb_events & events) != 0) {
289 cv_wait(&as->a_cv, &as->a_contents);
290 }
291 /*
292 * This entry needs to be taken off the list. Normally, the
293 * callback func itself does that, but unfortunately the list
294 * may have changed while the callback was running because the
295 * a_contents mutex was dropped and someone else other than the
296 * callback func itself could have called as_delete_callback,
297 * so we have to search to find this entry again. The entry
298 * must have AS_CALLBACK_CALLED, and have the same 'arg'.
299 */
300 cb_arg = cb->ascb_arg;
301 prevcb = &as->a_callbacks;
302 for (cb = as->a_callbacks; cb != NULL;
303 prevcb = &cb->ascb_next, cb = *prevcb) {
304 if (((cb->ascb_events & AS_CALLBACK_CALLED) == 0) ||
305 (cb_arg != cb->ascb_arg)) {
306 continue;
307 }
308 *prevcb = cb->ascb_next;
309 kmem_free(cb, sizeof (struct as_callback));
310 break;
311 }
312 }
313
314 /*
315 * Check the callback list for a matching event and intersection of
316 * address range. If there is a match, invoke the callback. Skip an entry if:
317 * - a callback is already in progress for this entry (AS_CALLBACK_CALLED)
318 * - the event is not of interest
319 * - the address range is not of interest
320 *
321 * An event_len of zero indicates a request for an unconditional callback
322 * (regardless of event), only the AS_CALLBACK_CALLED is checked. The
323 * a_contents lock must be dropped before a callback, so only one callback
324 * can be done before returning. Return -1 (true) if a callback was
325 * executed and removed from the list, else return 0 (false).
326 *
327 * The logically separate parts, i.e. finding a matching callback and
328 * executing a given callback have been separated into two functions
329 * so that they can be called with different sets of locks held beyond
330 * the always-required a_contents. as_find_callback does not sleep so
331 * it is ok to call it if more locks than a_contents (i.e. the a_lock
332 * rwlock) are held. as_execute_callback, on the other hand, may sleep,
333 * so all locks beyond a_contents must be dropped by the caller if one
334 * does not want to end up comatose.
335 */
336 static int
337 as_do_callbacks(struct as *as, uint_t events, caddr_t event_addr,
338 size_t event_len)
339 {
340 struct as_callback *cb;
341
342 if ((cb = as_find_callback(as, events, event_addr, event_len))) {
343 as_execute_callback(as, cb, events);
344 return (-1);
345 }
346 return (0);
347 }
348
349 /*
350 * Search for the segment containing addr. If a segment containing addr
351 * exists, that segment is returned. If no such segment exists, and
352 * the list spans addresses greater than addr, then the first segment
353 * whose base is greater than addr is returned; otherwise, NULL is
354 * returned unless tail is true, in which case the last element of the
355 * list is returned.
356 *
357 * a_seglast is used to cache the last found segment for repeated
358 * searches to the same addr (which happens frequently).
359 */
360 struct seg *
361 as_findseg(struct as *as, caddr_t addr, int tail)
362 {
363 struct seg *seg = as->a_seglast;
364 avl_index_t where;
365
366 ASSERT(AS_LOCK_HELD(as));
367
368 if (seg != NULL &&
369 seg->s_base <= addr &&
370 addr < seg->s_base + seg->s_size)
371 return (seg);
372
373 seg = avl_find(&as->a_segtree, &addr, &where);
374 if (seg != NULL)
375 return (as->a_seglast = seg);
376
377 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
378 if (seg == NULL && tail)
379 seg = avl_last(&as->a_segtree);
380 return (as->a_seglast = seg);
381 }
382
383 #ifdef VERIFY_SEGLIST
384 /*
385 * verify that the linked list is coherent
386 */
387 static void
388 as_verify(struct as *as)
389 {
390 struct seg *seg, *seglast, *p, *n;
391 uint_t nsegs = 0;
392
393 if (do_as_verify == 0)
394 return;
395
396 seglast = as->a_seglast;
397
398 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
399 ASSERT(seg->s_as == as);
400 p = AS_SEGPREV(as, seg);
401 n = AS_SEGNEXT(as, seg);
402 ASSERT(p == NULL || p->s_as == as);
403 ASSERT(p == NULL || p->s_base < seg->s_base);
404 ASSERT(n == NULL || n->s_base > seg->s_base);
405 ASSERT(n != NULL || seg == avl_last(&as->a_segtree));
406 if (seg == seglast)
407 seglast = NULL;
408 nsegs++;
409 }
410 ASSERT(seglast == NULL);
411 ASSERT(avl_numnodes(&as->a_segtree) == nsegs);
412 }
413 #endif /* VERIFY_SEGLIST */
414
415 /*
416 * Add a new segment to the address space. The avl_find()
417 * may be expensive so we attempt to use the last segment accessed
418 * in as_gap() as an insertion point.
419 */
420 int
421 as_addseg(struct as *as, struct seg *newseg)
422 {
423 struct seg *seg;
424 caddr_t addr;
425 caddr_t eaddr;
426 avl_index_t where;
427
428 ASSERT(AS_WRITE_HELD(as));
429
430 as->a_updatedir = 1; /* inform /proc */
431 gethrestime(&as->a_updatetime);
432
433 if (as->a_lastgaphl != NULL) {
434 struct seg *hseg = NULL;
435 struct seg *lseg = NULL;
436
437 if (as->a_lastgaphl->s_base > newseg->s_base) {
438 hseg = as->a_lastgaphl;
439 lseg = AVL_PREV(&as->a_segtree, hseg);
440 } else {
441 lseg = as->a_lastgaphl;
442 hseg = AVL_NEXT(&as->a_segtree, lseg);
443 }
444
445 if (hseg && lseg && lseg->s_base < newseg->s_base &&
446 hseg->s_base > newseg->s_base) {
447 avl_insert_here(&as->a_segtree, newseg, lseg,
448 AVL_AFTER);
449 as->a_lastgaphl = NULL;
450 as->a_seglast = newseg;
451 return (0);
452 }
453 as->a_lastgaphl = NULL;
454 }
455
456 addr = newseg->s_base;
457 eaddr = addr + newseg->s_size;
458 again:
459
460 seg = avl_find(&as->a_segtree, &addr, &where);
461
462 if (seg == NULL)
463 seg = avl_nearest(&as->a_segtree, where, AVL_AFTER);
464
465 if (seg == NULL)
466 seg = avl_last(&as->a_segtree);
467
468 if (seg != NULL) {
469 caddr_t base = seg->s_base;
470
471 /*
472 * If top of seg is below the requested address, then
473 * the insertion point is at the end of the linked list,
474 * and seg points to the tail of the list. Otherwise,
475 * the insertion point is immediately before seg.
476 */
477 if (base + seg->s_size > addr) {
478 if (addr >= base || eaddr > base) {
479 #ifdef __sparc
480 extern struct seg_ops segnf_ops;
481
482 /*
483 * no-fault segs must disappear if overlaid.
484 * XXX need new segment type so
485 * we don't have to check s_ops
486 */
487 if (seg->s_ops == &segnf_ops) {
488 seg_unmap(seg);
489 goto again;
490 }
491 #endif
492 return (-1); /* overlapping segment */
493 }
494 }
495 }
496 as->a_seglast = newseg;
497 avl_insert(&as->a_segtree, newseg, where);
498
499 #ifdef VERIFY_SEGLIST
500 as_verify(as);
501 #endif
502 return (0);
503 }
504
505 struct seg *
506 as_removeseg(struct as *as, struct seg *seg)
507 {
508 avl_tree_t *t;
509
510 ASSERT(AS_WRITE_HELD(as));
511
512 as->a_updatedir = 1; /* inform /proc */
513 gethrestime(&as->a_updatetime);
514
515 if (seg == NULL)
516 return (NULL);
517
518 t = &as->a_segtree;
519 if (as->a_seglast == seg)
520 as->a_seglast = NULL;
521 as->a_lastgaphl = NULL;
522
523 /*
524 * if this segment is at an address higher than
525 * a_lastgap, set a_lastgap to the next segment (NULL if last segment)
526 */
527 if (as->a_lastgap &&
528 (seg == as->a_lastgap || seg->s_base > as->a_lastgap->s_base))
529 as->a_lastgap = AVL_NEXT(t, seg);
530
531 /*
532 * remove the segment from the seg tree
533 */
534 avl_remove(t, seg);
535
536 #ifdef VERIFY_SEGLIST
537 as_verify(as);
538 #endif
539 return (seg);
540 }
541
542 /*
543 * Find a segment containing addr.
544 */
545 struct seg *
546 as_segat(struct as *as, caddr_t addr)
547 {
548 struct seg *seg = as->a_seglast;
549
550 ASSERT(AS_LOCK_HELD(as));
551
552 if (seg != NULL && seg->s_base <= addr &&
553 addr < seg->s_base + seg->s_size)
554 return (seg);
555
556 seg = avl_find(&as->a_segtree, &addr, NULL);
557 return (seg);
558 }
559
560 /*
561 * Serialize all searches for holes in an address space to
562 * prevent two or more threads from allocating the same virtual
563 * address range. The address space must not be "read/write"
564 * locked by the caller since we may block.
565 */
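/*
 * Sketch of the typical caller pattern around the two routines below
 * (details vary by caller):
 *
 *	as_rangelock(as);
 *	... choose addr, e.g. via as_gap()/map_addr() ...
 *	error = as_map(as, addr, len, crfp, argsp);
 *	as_rangeunlock(as);
 */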
566 void
567 as_rangelock(struct as *as)
568 {
569 mutex_enter(&as->a_contents);
570 while (AS_ISCLAIMGAP(as))
571 cv_wait(&as->a_cv, &as->a_contents);
572 AS_SETCLAIMGAP(as);
573 mutex_exit(&as->a_contents);
574 }
575
576 /*
577 * Release hold on a_state & AS_CLAIMGAP and signal any other blocked threads.
578 */
579 void
580 as_rangeunlock(struct as *as)
581 {
582 mutex_enter(&as->a_contents);
583 AS_CLRCLAIMGAP(as);
584 cv_signal(&as->a_cv);
585 mutex_exit(&as->a_contents);
586 }
587
588 /*
589 * Compare segments (or just an address) by segment address range.
590 */
591 static int
592 as_segcompar(const void *x, const void *y)
593 {
594 struct seg *a = (struct seg *)x;
595 struct seg *b = (struct seg *)y;
596
597 if (a->s_base < b->s_base)
598 return (-1);
599 if (a->s_base >= b->s_base + b->s_size)
600 return (1);
601 return (0);
602 }
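/*
 * Because the comparator above treats any address within
 * [s_base, s_base + s_size) as equal to that segment, avl_find() on
 * a_segtree given a bare address locates the segment containing that
 * address (see as_segat() and as_findseg()).
 */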
603
604
605 void
606 as_avlinit(struct as *as)
607 {
608 avl_create(&as->a_segtree, as_segcompar, sizeof (struct seg),
609 offsetof(struct seg, s_tree));
610 avl_create(&as->a_wpage, wp_compare, sizeof (struct watched_page),
611 offsetof(struct watched_page, wp_link));
612 }
613
614 /*ARGSUSED*/
615 static int
616 as_constructor(void *buf, void *cdrarg, int kmflags)
617 {
618 struct as *as = buf;
619
620 mutex_init(&as->a_contents, NULL, MUTEX_DEFAULT, NULL);
621 cv_init(&as->a_cv, NULL, CV_DEFAULT, NULL);
622 rw_init(&as->a_lock, NULL, RW_DEFAULT, NULL);
623 as_avlinit(as);
624 return (0);
625 }
626
627 /*ARGSUSED1*/
628 static void
629 as_destructor(void *buf, void *cdrarg)
630 {
631 struct as *as = buf;
632
633 avl_destroy(&as->a_segtree);
634 mutex_destroy(&as->a_contents);
635 cv_destroy(&as->a_cv);
636 rw_destroy(&as->a_lock);
637 }
638
639 void
640 as_init(void)
641 {
642 as_cache = kmem_cache_create("as_cache", sizeof (struct as), 0,
643 as_constructor, as_destructor, NULL, NULL, NULL, 0);
644 }
645
646 /*
647 * Allocate and initialize an address space data structure.
648 * We call hat_alloc to allow any machine dependent
649 * information in the hat structure to be initialized.
650 */
651 struct as *
652 as_alloc(void)
653 {
654 struct as *as;
655
656 as = kmem_cache_alloc(as_cache, KM_SLEEP);
657
658 as->a_flags = 0;
659 as->a_vbits = 0;
660 as->a_hrm = NULL;
661 as->a_seglast = NULL;
662 as->a_size = 0;
663 as->a_resvsize = 0;
664 as->a_updatedir = 0;
665 gethrestime(&as->a_updatetime);
666 as->a_objectdir = NULL;
667 as->a_sizedir = 0;
668 as->a_userlimit = (caddr_t)USERLIMIT;
669 as->a_lastgap = NULL;
670 as->a_lastgaphl = NULL;
671 as->a_callbacks = NULL;
672 as->a_proc = NULL;
673
674 AS_LOCK_ENTER(as, RW_WRITER);
675 as->a_hat = hat_alloc(as); /* create hat for default system mmu */
676 AS_LOCK_EXIT(as);
677
678 return (as);
679 }
680
681 /*
682 * Free an address space data structure.
683 * Need to free the hat first and then
684 * all the segments on this as and finally
685 * the space for the as struct itself.
686 */
687 void
688 as_free(struct as *as)
689 {
690 struct hat *hat = as->a_hat;
691 struct seg *seg, *next;
692 boolean_t free_started = B_FALSE;
693
694 top:
695 /*
696 * Invoke ALL callbacks. as_do_callbacks will do one callback
697 * per call, and not return (-1) until the callback has completed.
698 * When as_do_callbacks returns zero, all callbacks have completed.
699 */
700 mutex_enter(&as->a_contents);
701 while (as->a_callbacks && as_do_callbacks(as, AS_ALL_EVENT, 0, 0))
702 ;
703
704 mutex_exit(&as->a_contents);
705 AS_LOCK_ENTER(as, RW_WRITER);
706
707 if (!free_started) {
708 free_started = B_TRUE;
709 hat_free_start(hat);
710 }
711 for (seg = AS_SEGFIRST(as); seg != NULL; seg = next) {
712 int err;
713
714 next = AS_SEGNEXT(as, seg);
715 retry:
716 err = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
717 if (err == EAGAIN) {
718 mutex_enter(&as->a_contents);
719 if (as->a_callbacks) {
720 AS_LOCK_EXIT(as);
721 } else if (!AS_ISNOUNMAPWAIT(as)) {
722 /*
723 * Memory is currently locked. Wait for a
724 * cv_signal that it has been unlocked, then
725 * try the operation again.
726 */
727 if (AS_ISUNMAPWAIT(as) == 0)
728 cv_broadcast(&as->a_cv);
729 AS_SETUNMAPWAIT(as);
730 AS_LOCK_EXIT(as);
731 while (AS_ISUNMAPWAIT(as))
732 cv_wait(&as->a_cv, &as->a_contents);
733 } else {
734 /*
735 * We may have raced with
736 * segvn_reclaim()/segspt_reclaim(). In this
737 * case clean nounmapwait flag and retry since
738 * softlockcnt in this segment may be already
739 * 0. We don't drop as writer lock so our
740 * number of retries without sleeping should
741 * be very small. See segvn_reclaim() for
742 * more comments.
743 */
744 AS_CLRNOUNMAPWAIT(as);
745 mutex_exit(&as->a_contents);
746 goto retry;
747 }
748 mutex_exit(&as->a_contents);
749 goto top;
750 } else {
751 /*
752 * We do not expect any other error return at this
753 * time. This is similar to an ASSERT in seg_unmap()
754 */
755 ASSERT(err == 0);
756 }
757 }
758 hat_free_end(hat);
759 AS_LOCK_EXIT(as);
760
761 /* /proc stuff */
762 ASSERT(avl_numnodes(&as->a_wpage) == 0);
763 if (as->a_objectdir) {
764 kmem_free(as->a_objectdir, as->a_sizedir * sizeof (vnode_t *));
765 as->a_objectdir = NULL;
766 as->a_sizedir = 0;
767 }
768
769 /*
770 * Free the struct as back to kmem. Assert it has no segments.
771 */
772 ASSERT(avl_numnodes(&as->a_segtree) == 0);
773 kmem_cache_free(as_cache, as);
774 }
775
776 int
777 as_dup(struct as *as, struct proc *forkedproc)
778 {
779 struct as *newas;
780 struct seg *seg, *newseg;
781 size_t purgesize = 0;
782 int error;
783
784 AS_LOCK_ENTER(as, RW_WRITER);
785 as_clearwatch(as);
786 newas = as_alloc();
787 newas->a_userlimit = as->a_userlimit;
788 newas->a_proc = forkedproc;
789
790 AS_LOCK_ENTER(newas, RW_WRITER);
791
792 (void) hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_SRD);
793
794 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
795
796 if (seg->s_flags & S_PURGE) {
797 purgesize += seg->s_size;
798 continue;
799 }
800
801 newseg = seg_alloc(newas, seg->s_base, seg->s_size);
802 if (newseg == NULL) {
803 AS_LOCK_EXIT(newas);
804 as_setwatch(as);
805 AS_LOCK_EXIT(as);
806 as_free(newas);
807 return (-1);
808 }
809 if ((error = SEGOP_DUP(seg, newseg)) != 0) {
810 /*
811 * We call seg_free() on the new seg
812 * because the segment is not set up
813 * completely; i.e. it has no ops.
814 */
815 as_setwatch(as);
816 AS_LOCK_EXIT(as);
817 seg_free(newseg);
818 AS_LOCK_EXIT(newas);
819 as_free(newas);
820 return (error);
821 }
822 newas->a_size += seg->s_size;
823 }
824 newas->a_resvsize = as->a_resvsize - purgesize;
825
826 error = hat_dup(as->a_hat, newas->a_hat, NULL, 0, HAT_DUP_ALL);
827
828 AS_LOCK_EXIT(newas);
829
830 as_setwatch(as);
831 AS_LOCK_EXIT(as);
832 if (error != 0) {
833 as_free(newas);
834 return (error);
835 }
836 forkedproc->p_as = newas;
837 return (0);
838 }
839
840 /*
841 * Handle a ``fault'' at addr for size bytes.
842 */
843 faultcode_t
844 as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size,
845 enum fault_type type, enum seg_rw rw)
846 {
847 struct seg *seg;
848 caddr_t raddr; /* rounded down addr */
849 size_t rsize; /* rounded up size */
850 size_t ssize;
851 faultcode_t res = 0;
852 caddr_t addrsav;
853 struct seg *segsav;
854 int as_lock_held;
855 klwp_t *lwp = ttolwp(curthread);
856 zone_t *zonep = curzone;
857
858 retry:
859 /*
860 * Indicate that the lwp is not to be stopped while waiting for a
861 * pagefault. This is to avoid deadlock while debugging a process
862 * via /proc over NFS (in particular).
863 */
864 if (lwp != NULL)
865 lwp->lwp_nostop++;
866
867 /*
868 * same length must be used when we softlock and softunlock. We
869 * don't support softunlocking lengths less than the original length
870 * when there is largepage support. See seg_dev.c for more
871 * comments.
872 */
873 switch (type) {
874
875 case F_SOFTLOCK:
876 CPU_STATS_ADD_K(vm, softlock, 1);
877 break;
878
879 case F_SOFTUNLOCK:
880 break;
881
882 case F_PROT:
883 CPU_STATS_ADD_K(vm, prot_fault, 1);
884 break;
885
886 case F_INVAL:
887 CPU_STATS_ENTER_K();
888 CPU_STATS_ADDQ(CPU, vm, as_fault, 1);
889 if (as == &kas)
890 CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1);
891 CPU_STATS_EXIT_K();
892 if (zonep->zone_pg_flt_delay != 0) {
893 /*
894 * The zone in which this process is running is
895 * currently over it's physical memory cap. Throttle
896 * page faults to help the user-land memory capper
897 * catch up. Note that drv_usectohz() rounds up.
898 */
899 atomic_add_64(&zonep->zone_pf_throttle, 1);
900 atomic_add_64(&zonep->zone_pf_throttle_usec,
901 zonep->zone_pg_flt_delay);
902 if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) {
903 drv_usecwait(zonep->zone_pg_flt_delay);
904 } else {
905 delay(drv_usectohz(zonep->zone_pg_flt_delay));
906 }
907 }
908 break;
909 }
910
911 /* Kernel probe */
912 TNF_PROBE_3(address_fault, "vm pagefault", /* CSTYLED */,
913 tnf_opaque, address, addr,
914 tnf_fault_type, fault_type, type,
915 tnf_seg_access, access, rw);
916
917 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
918 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
919 (size_t)raddr;
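	/*
	 * Rounding example (assuming a 4K PAGESIZE): addr = 0x12345 and
	 * size = 0x100 give raddr = 0x12000 and rsize = 0x1000, i.e. the
	 * smallest page-aligned range covering [addr, addr + size).
	 */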
920
921 /*
922 * XXX -- Don't grab the as lock for segkmap. We should grab it for
923 * correctness, but then we could be stuck holding this lock for
924 * a LONG time if the fault needs to be resolved on a slow
925 * filesystem, and then no-one will be able to exec new commands,
926 * as exec'ing requires the write lock on the as.
927 */
928 if (as == &kas && segkmap && segkmap->s_base <= raddr &&
929 raddr + size < segkmap->s_base + segkmap->s_size) {
930 seg = segkmap;
931 as_lock_held = 0;
932 } else {
933 AS_LOCK_ENTER(as, RW_READER);
934
935 seg = as_segat(as, raddr);
936 if (seg == NULL) {
937 AS_LOCK_EXIT(as);
938 if (lwp != NULL)
939 lwp->lwp_nostop--;
940 return (FC_NOMAP);
941 }
942
943 as_lock_held = 1;
944 }
945
946 addrsav = raddr;
947 segsav = seg;
948
949 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
950 if (raddr >= seg->s_base + seg->s_size) {
951 seg = AS_SEGNEXT(as, seg);
952 if (seg == NULL || raddr != seg->s_base) {
953 res = FC_NOMAP;
954 break;
955 }
956 }
957 if (raddr + rsize > seg->s_base + seg->s_size)
958 ssize = seg->s_base + seg->s_size - raddr;
959 else
960 ssize = rsize;
961
962 res = SEGOP_FAULT(hat, seg, raddr, ssize, type, rw);
963 if (res != 0)
964 break;
965 }
966
967 /*
968 * If we were SOFTLOCKing and encountered a failure,
969 * we must SOFTUNLOCK the range we already did. (Maybe we
970 * should just panic if we are SOFTLOCKing or even SOFTUNLOCKing
971 * right here...)
972 */
973 if (res != 0 && type == F_SOFTLOCK) {
974 for (seg = segsav; addrsav < raddr; addrsav += ssize) {
975 if (addrsav >= seg->s_base + seg->s_size)
976 seg = AS_SEGNEXT(as, seg);
977 ASSERT(seg != NULL);
978 /*
979 * Now call the fault routine again to perform the
980 * unlock using S_OTHER instead of the rw variable
981 * since we never got a chance to touch the pages.
982 */
983 if (raddr > seg->s_base + seg->s_size)
984 ssize = seg->s_base + seg->s_size - addrsav;
985 else
986 ssize = raddr - addrsav;
987 (void) SEGOP_FAULT(hat, seg, addrsav, ssize,
988 F_SOFTUNLOCK, S_OTHER);
989 }
990 }
991 if (as_lock_held)
992 AS_LOCK_EXIT(as);
993 if (lwp != NULL)
994 lwp->lwp_nostop--;
995
996 /*
997 * If the lower levels returned EDEADLK for a fault,
998 * it means that we should retry the fault. Let's also wait
999 * a bit to let the deadlock-causing condition clear.
1000 * This is part of a gross hack to work around a design flaw
1001 * in the ufs/sds logging code and should go away when the
1002 * logging code is re-designed to fix the problem. See bug
1003 * 4125102 for details of the problem.
1004 */
1005 if (FC_ERRNO(res) == EDEADLK) {
1006 delay(deadlk_wait);
1007 res = 0;
1008 goto retry;
1009 }
1010 return (res);
1011 }
1012
1013
1014
1015 /*
1016 * Asynchronous ``fault'' at addr for size bytes.
1017 */
1018 faultcode_t
1019 as_faulta(struct as *as, caddr_t addr, size_t size)
1020 {
1021 struct seg *seg;
1022 caddr_t raddr; /* rounded down addr */
1023 size_t rsize; /* rounded up size */
1024 faultcode_t res = 0;
1025 klwp_t *lwp = ttolwp(curthread);
1026
1027 retry:
1028 /*
1029 * Indicate that the lwp is not to be stopped while waiting
1030 * for a pagefault. This is to avoid deadlock while debugging
1031 * a process via /proc over NFS (in particular).
1032 */
1033 if (lwp != NULL)
1034 lwp->lwp_nostop++;
1035
1036 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1037 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1038 (size_t)raddr;
1039
1040 AS_LOCK_ENTER(as, RW_READER);
1041 seg = as_segat(as, raddr);
1042 if (seg == NULL) {
1043 AS_LOCK_EXIT(as);
1044 if (lwp != NULL)
1045 lwp->lwp_nostop--;
1046 return (FC_NOMAP);
1047 }
1048
1049 for (; rsize != 0; rsize -= PAGESIZE, raddr += PAGESIZE) {
1050 if (raddr >= seg->s_base + seg->s_size) {
1051 seg = AS_SEGNEXT(as, seg);
1052 if (seg == NULL || raddr != seg->s_base) {
1053 res = FC_NOMAP;
1054 break;
1055 }
1056 }
1057 res = SEGOP_FAULTA(seg, raddr);
1058 if (res != 0)
1059 break;
1060 }
1061 AS_LOCK_EXIT(as);
1062 if (lwp != NULL)
1063 lwp->lwp_nostop--;
1064 /*
1065 * If the lower levels returned EDEADLK for a fault,
1066 * it means that we should retry the fault. Let's also wait
1067 * a bit to let the deadlock-causing condition clear.
1068 * This is part of a gross hack to work around a design flaw
1069 * in the ufs/sds logging code and should go away when the
1070 * logging code is re-designed to fix the problem. See bug
1071 * 4125102 for details of the problem.
1072 */
1073 if (FC_ERRNO(res) == EDEADLK) {
1074 delay(deadlk_wait);
1075 res = 0;
1076 goto retry;
1077 }
1078 return (res);
1079 }
1080
1081 /*
1082 * Set the virtual mapping for the interval from [addr : addr + size)
1083 * in address space `as' to have the specified protection.
1084 * It is ok for the range to cross over several segments,
1085 * as long as they are contiguous.
1086 */
1087 int
1088 as_setprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1089 {
1090 struct seg *seg;
1091 struct as_callback *cb;
1092 size_t ssize;
1093 caddr_t raddr; /* rounded down addr */
1094 size_t rsize; /* rounded up size */
1095 int error = 0, writer = 0;
1096 caddr_t saveraddr;
1097 size_t saversize;
1098
1099 setprot_top:
1100 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1101 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1102 (size_t)raddr;
1103
1104 if (raddr + rsize < raddr) /* check for wraparound */
1105 return (ENOMEM);
1106
1107 saveraddr = raddr;
1108 saversize = rsize;
1109
1110 /*
1111 * Normally we only lock the as as a reader. But
1112 * if due to setprot the segment driver needs to split
1113 * a segment it will return IE_RETRY. Therefore we re-acquire
1114 * the as lock as a writer so the segment driver can change
1115 * the seg list. Also the segment driver will return IE_RETRY
1116 * after it has changed the segment list so we therefore keep
1117 * locking as a writer. Since these operations should be rare, we
1118 * want to only lock as a writer when necessary.
1119 */
1120 if (writer || avl_numnodes(&as->a_wpage) != 0) {
1121 AS_LOCK_ENTER(as, RW_WRITER);
1122 } else {
1123 AS_LOCK_ENTER(as, RW_READER);
1124 }
1125
1126 as_clearwatchprot(as, raddr, rsize);
1127 seg = as_segat(as, raddr);
1128 if (seg == NULL) {
1129 as_setwatch(as);
1130 AS_LOCK_EXIT(as);
1131 return (ENOMEM);
1132 }
1133
1134 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1135 if (raddr >= seg->s_base + seg->s_size) {
1136 seg = AS_SEGNEXT(as, seg);
1137 if (seg == NULL || raddr != seg->s_base) {
1138 error = ENOMEM;
1139 break;
1140 }
1141 }
1142 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1143 ssize = seg->s_base + seg->s_size - raddr;
1144 else
1145 ssize = rsize;
1146 retry:
1147 error = SEGOP_SETPROT(seg, raddr, ssize, prot);
1148
1149 if (error == IE_NOMEM) {
1150 error = EAGAIN;
1151 break;
1152 }
1153
1154 if (error == IE_RETRY) {
1155 AS_LOCK_EXIT(as);
1156 writer = 1;
1157 goto setprot_top;
1158 }
1159
1160 if (error == EAGAIN) {
1161 /*
1162 * Make sure we have a_lock as writer.
1163 */
1164 if (writer == 0) {
1165 AS_LOCK_EXIT(as);
1166 writer = 1;
1167 goto setprot_top;
1168 }
1169
1170 /*
1171 * Memory is currently locked. It must be unlocked
1172 * before this operation can succeed through a retry.
1173 * The possible reasons for locked memory and
1174 * corresponding strategies for unlocking are:
1175 * (1) Normal I/O
1176 * wait for a signal that the I/O operation
1177 * has completed and the memory is unlocked.
1178 * (2) Asynchronous I/O
1179 * The aio subsystem does not unlock pages when
1180 * the I/O is completed. Those pages are unlocked
1181 * when the application calls aiowait/aioerror.
1182 * So, to prevent blocking forever, cv_broadcast()
1183 * is done to wake up aio_cleanup_thread.
1184 * Subsequently, segvn_reclaim will be called, and
1185 * that will do AS_CLRUNMAPWAIT() and wake us up.
1186 * (3) Long term page locking:
1187 * Drivers intending to have pages locked for a
1188 * period considerably longer than for normal I/O
1189 * (essentially forever) may have registered for a
1190 * callback so they may unlock these pages on
1191 * request. This is needed to allow this operation
1192 * to succeed. Each entry on the callback list is
1193 * examined. If the event or address range pertains
1194 * the callback is invoked (unless it already is in
1195 * progress). The a_contents lock must be dropped
1196 * before the callback, so only one callback can
1197 * be done at a time. Go to the top and do more
1198 * until zero is returned. If zero is returned,
1199 * either there were no callbacks for this event
1200 * or they were already in progress.
1201 */
1202 mutex_enter(&as->a_contents);
1203 if (as->a_callbacks &&
1204 (cb = as_find_callback(as, AS_SETPROT_EVENT,
1205 seg->s_base, seg->s_size))) {
1206 AS_LOCK_EXIT(as);
1207 as_execute_callback(as, cb, AS_SETPROT_EVENT);
1208 } else if (!AS_ISNOUNMAPWAIT(as)) {
1209 if (AS_ISUNMAPWAIT(as) == 0)
1210 cv_broadcast(&as->a_cv);
1211 AS_SETUNMAPWAIT(as);
1212 AS_LOCK_EXIT(as);
1213 while (AS_ISUNMAPWAIT(as))
1214 cv_wait(&as->a_cv, &as->a_contents);
1215 } else {
1216 /*
1217 * We may have raced with
1218 * segvn_reclaim()/segspt_reclaim(). In this
1219 * case clean nounmapwait flag and retry since
1220 * softlockcnt in this segment may be already
1221 * 0. We don't drop as writer lock so our
1222 * number of retries without sleeping should
1223 * be very small. See segvn_reclaim() for
1224 * more comments.
1225 */
1226 AS_CLRNOUNMAPWAIT(as);
1227 mutex_exit(&as->a_contents);
1228 goto retry;
1229 }
1230 mutex_exit(&as->a_contents);
1231 goto setprot_top;
1232 } else if (error != 0)
1233 break;
1234 }
1235 if (error != 0) {
1236 as_setwatch(as);
1237 } else {
1238 as_setwatchprot(as, saveraddr, saversize, prot);
1239 }
1240 AS_LOCK_EXIT(as);
1241 return (error);
1242 }
1243
1244 /*
1245 * Check to make sure that the interval [addr, addr + size)
1246 * in address space `as' has at least the specified protection.
1247 * It is ok for the range to cross over several segments, as long
1248 * as they are contiguous.
1249 */
1250 int
1251 as_checkprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
1252 {
1253 struct seg *seg;
1254 size_t ssize;
1255 caddr_t raddr; /* rounded down addr */
1256 size_t rsize; /* rounded up size */
1257 int error = 0;
1258
1259 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1260 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1261 (size_t)raddr;
1262
1263 if (raddr + rsize < raddr) /* check for wraparound */
1264 return (ENOMEM);
1265
1266 /*
1267 * This is ugly as sin...
1268 * Normally, we only acquire the address space readers lock.
1269 * However, if the address space has watchpoints present,
1270 * we must acquire the writer lock on the address space for
1271 * the benefit of as_clearwatchprot() and as_setwatchprot().
1272 */
1273 if (avl_numnodes(&as->a_wpage) != 0)
1274 AS_LOCK_ENTER(as, RW_WRITER);
1275 else
1276 AS_LOCK_ENTER(as, RW_READER);
1277 as_clearwatchprot(as, raddr, rsize);
1278 seg = as_segat(as, raddr);
1279 if (seg == NULL) {
1280 as_setwatch(as);
1281 AS_LOCK_EXIT(as);
1282 return (ENOMEM);
1283 }
1284
1285 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
1286 if (raddr >= seg->s_base + seg->s_size) {
1287 seg = AS_SEGNEXT(as, seg);
1288 if (seg == NULL || raddr != seg->s_base) {
1289 error = ENOMEM;
1290 break;
1291 }
1292 }
1293 if ((raddr + rsize) > (seg->s_base + seg->s_size))
1294 ssize = seg->s_base + seg->s_size - raddr;
1295 else
1296 ssize = rsize;
1297
1298 error = SEGOP_CHECKPROT(seg, raddr, ssize, prot);
1299 if (error != 0)
1300 break;
1301 }
1302 as_setwatch(as);
1303 AS_LOCK_EXIT(as);
1304 return (error);
1305 }
1306
1307 int
1308 as_unmap(struct as *as, caddr_t addr, size_t size)
1309 {
1310 struct seg *seg, *seg_next;
1311 struct as_callback *cb;
1312 caddr_t raddr, eaddr;
1313 size_t ssize, rsize = 0;
1314 int err;
1315
1316 top:
1317 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1318 eaddr = (caddr_t)(((uintptr_t)(addr + size) + PAGEOFFSET) &
1319 (uintptr_t)PAGEMASK);
1320
1321 AS_LOCK_ENTER(as, RW_WRITER);
1322
1323 as->a_updatedir = 1; /* inform /proc */
1324 gethrestime(&as->a_updatetime);
1325
1326 /*
1327 * Use as_findseg to find the first segment in the range, then
1328 * step through the segments in order, following s_next.
1329 */
1330 as_clearwatchprot(as, raddr, eaddr - raddr);
1331
1332 for (seg = as_findseg(as, raddr, 0); seg != NULL; seg = seg_next) {
1333 if (eaddr <= seg->s_base)
1334 break; /* eaddr was in a gap; all done */
1335
1336 /* this is implied by the test above */
1337 ASSERT(raddr < eaddr);
1338
1339 if (raddr < seg->s_base)
1340 raddr = seg->s_base; /* raddr was in a gap */
1341
1342 if (eaddr > (seg->s_base + seg->s_size))
1343 ssize = seg->s_base + seg->s_size - raddr;
1344 else
1345 ssize = eaddr - raddr;
1346
1347 /*
1348 * Save next segment pointer since seg can be
1349 * destroyed during the segment unmap operation.
1350 */
1351 seg_next = AS_SEGNEXT(as, seg);
1352
1353 /*
1354 * We didn't count /dev/null mappings, so ignore them here.
1355 * We'll handle MAP_NORESERVE cases in segvn_unmap(). (Again,
1356 * we have to do this check here while we have seg.)
1357 */
1358 rsize = 0;
1359 if (!SEG_IS_DEVNULL_MAPPING(seg) &&
1360 !SEG_IS_PARTIAL_RESV(seg))
1361 rsize = ssize;
1362
1363 retry:
1364 err = SEGOP_UNMAP(seg, raddr, ssize);
1365 if (err == EAGAIN) {
1366 /*
1367 * Memory is currently locked. It must be unlocked
1368 * before this operation can succeed through a retry.
1369 * The possible reasons for locked memory and
1370 * corresponding strategies for unlocking are:
1371 * (1) Normal I/O
1372 * wait for a signal that the I/O operation
1373 * has completed and the memory is unlocked.
1374 * (2) Asynchronous I/O
1375 * The aio subsystem does not unlock pages when
1376 * the I/O is completed. Those pages are unlocked
1377 * when the application calls aiowait/aioerror.
1378 * So, to prevent blocking forever, cv_broadcast()
1379 * is done to wake up aio_cleanup_thread.
1380 * Subsequently, segvn_reclaim will be called, and
1381 * that will do AS_CLRUNMAPWAIT() and wake us up.
1382 * (3) Long term page locking:
1383 * Drivers intending to have pages locked for a
1384 * period considerably longer than for normal I/O
1385 * (essentially forever) may have registered for a
1386 * callback so they may unlock these pages on
1387 * request. This is needed to allow this operation
1388 * to succeed. Each entry on the callback list is
1389 * examined. If the event or address range pertains
1390 * the callback is invoked (unless it already is in
1391 * progress). The a_contents lock must be dropped
1392 * before the callback, so only one callback can
1393 * be done at a time. Go to the top and do more
1394 * until zero is returned. If zero is returned,
1395 * either there were no callbacks for this event
1396 * or they were already in progress.
1397 */
1398 mutex_enter(&as->a_contents);
1399 if (as->a_callbacks &&
1400 (cb = as_find_callback(as, AS_UNMAP_EVENT,
1401 seg->s_base, seg->s_size))) {
1402 AS_LOCK_EXIT(as);
1403 as_execute_callback(as, cb, AS_UNMAP_EVENT);
1404 } else if (!AS_ISNOUNMAPWAIT(as)) {
1405 if (AS_ISUNMAPWAIT(as) == 0)
1406 cv_broadcast(&as->a_cv);
1407 AS_SETUNMAPWAIT(as);
1408 AS_LOCK_EXIT(as);
1409 while (AS_ISUNMAPWAIT(as))
1410 cv_wait(&as->a_cv, &as->a_contents);
1411 } else {
1412 /*
1413 * We may have raced with
1414 * segvn_reclaim()/segspt_reclaim(). In this
1415 * case clean nounmapwait flag and retry since
1416 * softlockcnt in this segment may be already
1417 * 0. We don't drop as writer lock so our
1418 * number of retries without sleeping should
1419 * be very small. See segvn_reclaim() for
1420 * more comments.
1421 */
1422 AS_CLRNOUNMAPWAIT(as);
1423 mutex_exit(&as->a_contents);
1424 goto retry;
1425 }
1426 mutex_exit(&as->a_contents);
1427 goto top;
1428 } else if (err == IE_RETRY) {
1429 AS_LOCK_EXIT(as);
1430 goto top;
1431 } else if (err) {
1432 as_setwatch(as);
1433 AS_LOCK_EXIT(as);
1434 return (-1);
1435 }
1436
1437 as->a_size -= ssize;
1438 if (rsize)
1439 as->a_resvsize -= rsize;
1440 raddr += ssize;
1441 }
1442 AS_LOCK_EXIT(as);
1443 return (0);
1444 }
1445
1446 static int
1447 as_map_segvn_segs(struct as *as, caddr_t addr, size_t size, uint_t szcvec,
1448 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1449 {
1450 uint_t szc;
1451 uint_t nszc;
1452 int error;
1453 caddr_t a;
1454 caddr_t eaddr;
1455 size_t segsize;
1456 struct seg *seg;
1457 size_t pgsz;
1458 int do_off = (vn_a->vp != NULL || vn_a->amp != NULL);
1459 uint_t save_szcvec;
1460
1461 ASSERT(AS_WRITE_HELD(as));
1462 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1463 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1464 ASSERT(vn_a->vp == NULL || vn_a->amp == NULL);
1465 if (!do_off) {
1466 vn_a->offset = 0;
1467 }
1468
1469 if (szcvec <= 1) {
1470 seg = seg_alloc(as, addr, size);
1471 if (seg == NULL) {
1472 return (ENOMEM);
1473 }
1474 vn_a->szc = 0;
1475 error = (*crfp)(seg, vn_a);
1476 if (error != 0) {
1477 seg_free(seg);
1478 } else {
1479 as->a_size += size;
1480 as->a_resvsize += size;
1481 }
1482 return (error);
1483 }
1484
1485 eaddr = addr + size;
1486 save_szcvec = szcvec;
1487 szcvec >>= 1;
1488 szc = 0;
1489 nszc = 0;
1490 while (szcvec) {
1491 if ((szcvec & 0x1) == 0) {
1492 nszc++;
1493 szcvec >>= 1;
1494 continue;
1495 }
1496 nszc++;
1497 pgsz = page_get_pagesize(nszc);
1498 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
1499 if (a != addr) {
1500 ASSERT(a < eaddr);
1501 segsize = a - addr;
1502 seg = seg_alloc(as, addr, segsize);
1503 if (seg == NULL) {
1504 return (ENOMEM);
1505 }
1506 vn_a->szc = szc;
1507 error = (*crfp)(seg, vn_a);
1508 if (error != 0) {
1509 seg_free(seg);
1510 return (error);
1511 }
1512 as->a_size += segsize;
1513 as->a_resvsize += segsize;
1514 *segcreated = 1;
1515 if (do_off) {
1516 vn_a->offset += segsize;
1517 }
1518 addr = a;
1519 }
1520 szc = nszc;
1521 szcvec >>= 1;
1522 }
1523
1524 ASSERT(addr < eaddr);
1525 szcvec = save_szcvec | 1; /* add 8K pages */
1526 while (szcvec) {
1527 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
1528 ASSERT(a >= addr);
1529 if (a != addr) {
1530 segsize = a - addr;
1531 seg = seg_alloc(as, addr, segsize);
1532 if (seg == NULL) {
1533 return (ENOMEM);
1534 }
1535 vn_a->szc = szc;
1536 error = (*crfp)(seg, vn_a);
1537 if (error != 0) {
1538 seg_free(seg);
1539 return (error);
1540 }
1541 as->a_size += segsize;
1542 as->a_resvsize += segsize;
1543 *segcreated = 1;
1544 if (do_off) {
1545 vn_a->offset += segsize;
1546 }
1547 addr = a;
1548 }
1549 szcvec &= ~(1 << szc);
1550 if (szcvec) {
1551 szc = highbit(szcvec) - 1;
1552 pgsz = page_get_pagesize(szc);
1553 }
1554 }
1555 ASSERT(addr == eaddr);
1556
1557 return (0);
1558 }
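/*
 * Rough illustration of the decomposition above: if szcvec selects two page
 * sizes, the first loop creates leading segments of the smaller size up to
 * the first boundary of the larger size, and the second loop creates
 * trailing segments working back from eaddr, so that the middle of the range
 * is mapped with the largest page size and the edges with smaller pages.
 */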
1559
1560 static int
1561 as_map_vnsegs(struct as *as, caddr_t addr, size_t size,
1562 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1563 {
1564 uint_t mapflags = vn_a->flags & (MAP_TEXT | MAP_INITDATA);
1565 int type = (vn_a->type == MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
1566 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1567 type, 0);
1568 int error;
1569 struct seg *seg;
1570 struct vattr va;
1571 u_offset_t eoff;
1572 size_t save_size = 0;
1573 extern size_t textrepl_size_thresh;
1574
1575 ASSERT(AS_WRITE_HELD(as));
1576 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1577 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1578 ASSERT(vn_a->vp != NULL);
1579 ASSERT(vn_a->amp == NULL);
1580
1581 again:
1582 if (szcvec <= 1) {
1583 seg = seg_alloc(as, addr, size);
1584 if (seg == NULL) {
1585 return (ENOMEM);
1586 }
1587 vn_a->szc = 0;
1588 error = (*crfp)(seg, vn_a);
1589 if (error != 0) {
1590 seg_free(seg);
1591 } else {
1592 as->a_size += size;
1593 as->a_resvsize += size;
1594 }
1595 return (error);
1596 }
1597
1598 va.va_mask = AT_SIZE;
1599 if (VOP_GETATTR(vn_a->vp, &va, ATTR_HINT, vn_a->cred, NULL) != 0) {
1600 szcvec = 0;
1601 goto again;
1602 }
1603 eoff = vn_a->offset & PAGEMASK;
1604 if (eoff >= va.va_size) {
1605 szcvec = 0;
1606 goto again;
1607 }
1608 eoff += size;
1609 if (btopr(va.va_size) < btopr(eoff)) {
1610 save_size = size;
1611 size = va.va_size - (vn_a->offset & PAGEMASK);
1612 size = P2ROUNDUP_TYPED(size, PAGESIZE, size_t);
1613 szcvec = map_pgszcvec(addr, size, (uintptr_t)addr, mapflags,
1614 type, 0);
1615 if (szcvec <= 1) {
1616 size = save_size;
1617 goto again;
1618 }
1619 }
1620
1621 if (size > textrepl_size_thresh) {
1622 vn_a->flags |= _MAP_TEXTREPL;
1623 }
1624 error = as_map_segvn_segs(as, addr, size, szcvec, crfp, vn_a,
1625 segcreated);
1626 if (error != 0) {
1627 return (error);
1628 }
1629 if (save_size) {
1630 addr += size;
1631 size = save_size - size;
1632 szcvec = 0;
1633 goto again;
1634 }
1635 return (0);
1636 }
1637
1638 /*
1639 * as_map_ansegs: shared or private anonymous memory. Note that the flags
1640 * passed to map_pgszcvec cannot be MAP_INITDATA, for anon.
1641 */
1642 static int
1643 as_map_ansegs(struct as *as, caddr_t addr, size_t size,
1644 int (*crfp)(), struct segvn_crargs *vn_a, int *segcreated)
1645 {
1646 uint_t szcvec;
1647 uchar_t type;
1648
1649 ASSERT(vn_a->type == MAP_SHARED || vn_a->type == MAP_PRIVATE);
1650 if (vn_a->type == MAP_SHARED) {
1651 type = MAPPGSZC_SHM;
1652 } else if (vn_a->type == MAP_PRIVATE) {
1653 if (vn_a->szc == AS_MAP_HEAP) {
1654 type = MAPPGSZC_HEAP;
1655 } else if (vn_a->szc == AS_MAP_STACK) {
1656 type = MAPPGSZC_STACK;
1657 } else {
1658 type = MAPPGSZC_PRIVM;
1659 }
1660 }
1661 szcvec = map_pgszcvec(addr, size, vn_a->amp == NULL ?
1662 (uintptr_t)addr : (uintptr_t)P2ROUNDUP(vn_a->offset, PAGESIZE),
1663 (vn_a->flags & MAP_TEXT), type, 0);
1664 ASSERT(AS_WRITE_HELD(as));
1665 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
1666 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
1667 ASSERT(vn_a->vp == NULL);
1668
1669 return (as_map_segvn_segs(as, addr, size, szcvec,
1670 crfp, vn_a, segcreated));
1671 }
1672
1673 int
1674 as_map(struct as *as, caddr_t addr, size_t size, int (*crfp)(), void *argsp)
1675 {
1676 AS_LOCK_ENTER(as, RW_WRITER);
1677 return (as_map_locked(as, addr, size, crfp, argsp));
1678 }
1679
1680 int
1681 as_map_locked(struct as *as, caddr_t addr, size_t size, int (*crfp)(),
1682 void *argsp)
1683 {
1684 struct seg *seg = NULL;
1685 caddr_t raddr; /* rounded down addr */
1686 size_t rsize; /* rounded up size */
1687 int error;
1688 int unmap = 0;
1689 /*
1690 * The use of a_proc is preferred to handle the case where curproc is
1691 * a door_call server and is allocating memory in the client's (a_proc)
1692 * address space.
1693 * When creating a shared memory segment, a_proc will be NULL so we
1694 * fall back to curproc in that case.
1695 */
1696 struct proc *p = (as->a_proc == NULL) ? curproc : as->a_proc;
1697 struct segvn_crargs crargs;
1698
1699 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1700 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
1701 (size_t)raddr;
1702
1703 /*
1704 * check for wrap around
1705 */
1706 if ((raddr + rsize < raddr) || (as->a_size > (ULONG_MAX - size))) {
1707 AS_LOCK_EXIT(as);
1708 return (ENOMEM);
1709 }
1710
1711 as->a_updatedir = 1; /* inform /proc */
1712 gethrestime(&as->a_updatetime);
1713
1714 if (as != &kas) {
1715 if (as->a_size + rsize > (size_t)p->p_vmem_ctl) {
1716 AS_LOCK_EXIT(as);
1717
1718 (void) rctl_action(rctlproc_legacy[RLIMIT_VMEM],
1719 p->p_rctls, p, RCA_UNSAFE_ALL);
1720 return (ENOMEM);
1721 }
1722
1723 /*
1724 * Keep the number of segments in a userspace AS constrained to
1725 * a reasonable limit. Linux enforces a value slightly less
1726 * than 64k in order to avoid ELF limits if/when a process
1727 * dumps core. While SunOS avoids that specific problem with
1728 * other tricks, the limit is still valuable to keep kernel
1729 * memory consumption in check.
1730 */
1731 if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) {
1732 AS_LOCK_EXIT(as);
1733 atomic_inc_32(&p->p_zone->zone_mfseglim);
1734 return (ENOMEM);
1735 }
1736 }
1737
1738 if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) {
1739 crargs = *(struct segvn_crargs *)argsp;
1740 error = as_map_vnsegs(as, raddr, rsize, crfp, &crargs, &unmap);
1741 if (error != 0) {
1742 AS_LOCK_EXIT(as);
1743 if (unmap) {
1744 (void) as_unmap(as, addr, size);
1745 }
1746 return (error);
1747 }
1748 } else if (AS_MAP_CHECK_ANON_LPOOB(crfp, argsp)) {
1749 crargs = *(struct segvn_crargs *)argsp;
1750 error = as_map_ansegs(as, raddr, rsize, crfp, &crargs, &unmap);
1751 if (error != 0) {
1752 AS_LOCK_EXIT(as);
1753 if (unmap) {
1754 (void) as_unmap(as, addr, size);
1755 }
1756 return (error);
1757 }
1758 } else {
1759 seg = seg_alloc(as, addr, size);
1760 if (seg == NULL) {
1761 AS_LOCK_EXIT(as);
1762 return (ENOMEM);
1763 }
1764
1765 error = (*crfp)(seg, argsp);
1766 if (error != 0) {
1767 seg_free(seg);
1768 AS_LOCK_EXIT(as);
1769 return (error);
1770 }
1771 /*
1772 * Add size now so as_unmap will work if as_ctl fails.
1773 */
1774 as->a_size += rsize;
1775 as->a_resvsize += rsize;
1776 }
1777
1778 as_setwatch(as);
1779
1780 /*
1781 * If the address space is locked,
1782 * establish memory locks for the new segment.
1783 */
1784 mutex_enter(&as->a_contents);
1785 if (AS_ISPGLCK(as)) {
1786 mutex_exit(&as->a_contents);
1787 AS_LOCK_EXIT(as);
1788 error = as_ctl(as, addr, size, MC_LOCK, 0, 0, NULL, 0);
1789 if (error != 0)
1790 (void) as_unmap(as, addr, size);
1791 } else {
1792 mutex_exit(&as->a_contents);
1793 AS_LOCK_EXIT(as);
1794 }
1795 return (error);
1796 }
1797
1798
1799 /*
1800 * Delete all segments in the address space marked with S_PURGE.
1801 * This is currently used for Sparc V9 nofault ASI segments (seg_nf.c).
1802 * These segments are deleted as a first step before calls to as_gap(), so
1803 * that they don't affect mmap() or shmat().
1804 */
1805 void
1806 as_purge(struct as *as)
1807 {
1808 struct seg *seg;
1809 struct seg *next_seg;
1810
1811 /*
1812 * The setting of AS_NEEDSPURGE is protected by as_rangelock(), so
1813 * there is no need to grab the a_contents mutex for this check.
1814 */
1815 if ((as->a_flags & AS_NEEDSPURGE) == 0)
1816 return;
1817
1818 AS_LOCK_ENTER(as, RW_WRITER);
1819 next_seg = NULL;
1820 seg = AS_SEGFIRST(as);
1821 while (seg != NULL) {
1822 next_seg = AS_SEGNEXT(as, seg);
1823 if (seg->s_flags & S_PURGE)
1824 SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1825 seg = next_seg;
1826 }
1827 AS_LOCK_EXIT(as);
1828
1829 mutex_enter(&as->a_contents);
1830 as->a_flags &= ~AS_NEEDSPURGE;
1831 mutex_exit(&as->a_contents);
1832 }
1833
1834 /*
1835 * Find a hole within [*basep, *basep + *lenp), which contains a mappable
1836 * range of addresses at least "minlen" long, where the base of the range is
1837 * at "off" phase from an "align" boundary and there is space for a
1838 * "redzone"-sized redzone on either side of the range. Thus,
1839 * if align was 4M and off was 16k, the user wants a hole which will start
1840 * 16k into a 4M page.
1841 *
1842 * If flags specifies AH_HI, the hole will have the highest possible address
1843 * in the range. We use the as->a_lastgap field to figure out where to
1844 * start looking for a gap.
1845 *
1846 * Otherwise, the gap will have the lowest possible address.
1847 *
1848 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
1849 *
1850 * If an adequate hole is found, *basep and *lenp are set to reflect the part of
1851 * the hole that is within range, and 0 is returned. On failure, -1 is returned.
1852 *
1853 * NOTE: This routine is not correct when base+len overflows caddr_t.
1854 */
1855 int
1856 as_gap_aligned(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp,
1857 uint_t flags, caddr_t addr, size_t align, size_t redzone, size_t off)
1858 {
1859 caddr_t lobound = *basep;
1860 caddr_t hibound = lobound + *lenp;
1861 struct seg *lseg, *hseg;
1862 caddr_t lo, hi;
1863 int forward;
1864 caddr_t save_base;
1865 size_t save_len;
1866 size_t save_minlen;
1867 size_t save_redzone;
1868 int fast_path = 1;
1869
1870 save_base = *basep;
1871 save_len = *lenp;
1872 save_minlen = minlen;
1873 save_redzone = redzone;
1874
1875 /*
1876 * For the first pass/fast_path, just add align and redzone into
1877 * minlen since if we get an allocation, we can guarantee that it
1878 * will fit the alignment and redzone requested.
1879 * This increases the chance that hibound will be adjusted to
1880 * a_lastgap->s_base which will likely allow us to find an
1881 * acceptable hole in the address space quicker.
1882 * If we can't find a hole with this fast_path, then we look for
1883 * smaller holes in which the alignment and offset may allow
1884 * the allocation to fit.
1885 */
1886 minlen += align;
1887 minlen += 2 * redzone;
1888 redzone = 0;
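	/*
	 * For example (a sketch): with minlen = 8K, align = 4M and
	 * redzone = 8K, the fast path looks for a plain hole of at least
	 * 4M + 8K + 2 * 8K; any such hole can then be trimmed to satisfy
	 * the alignment, offset and redzone requirements.
	 */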
1889
1890 AS_LOCK_ENTER(as, RW_READER);
1891 if (AS_SEGFIRST(as) == NULL) {
1892 if (valid_va_range_aligned(basep, lenp, minlen, flags & AH_DIR,
1893 align, redzone, off)) {
1894 AS_LOCK_EXIT(as);
1895 return (0);
1896 } else {
1897 AS_LOCK_EXIT(as);
1898 *basep = save_base;
1899 *lenp = save_len;
1900 return (-1);
1901 }
1902 }
1903
1904 retry:
1905 /*
1906 * Set up to iterate over all the inter-segment holes in the given
1907 * direction. lseg is NULL for the lowest-addressed hole and hseg is
1908 * NULL for the highest-addressed hole. If moving backwards, we reset
1909 * hseg to denote the highest-addressed segment.
1910 */
1911 forward = (flags & AH_DIR) == AH_LO;
1912 if (forward) {
1913 hseg = as_findseg(as, lobound, 1);
1914 lseg = AS_SEGPREV(as, hseg);
1915 } else {
1916
1917 /*
1918 * If allocating at least as much as the last allocation,
1919 * use a_lastgap's base as a better estimate of hibound.
1920 */
1921 if (as->a_lastgap &&
1922 minlen >= as->a_lastgap->s_size &&
1923 hibound >= as->a_lastgap->s_base)
1924 hibound = as->a_lastgap->s_base;
1925
1926 hseg = as_findseg(as, hibound, 1);
1927 if (hseg->s_base + hseg->s_size < hibound) {
1928 lseg = hseg;
1929 hseg = NULL;
1930 } else {
1931 lseg = AS_SEGPREV(as, hseg);
1932 }
1933 }
1934
1935 for (;;) {
1936 /*
1937 * Set lo and hi to the hole's boundaries. (We should really
1938 * use MAXADDR in place of hibound in the expression below,
1939 * but can't express it easily; using hibound in its place is
1940 * harmless.)
1941 */
1942 lo = (lseg == NULL) ? 0 : lseg->s_base + lseg->s_size;
1943 hi = (hseg == NULL) ? hibound : hseg->s_base;
1944 /*
1945 * If the iteration has moved past the interval from lobound
1946 * to hibound it's pointless to continue.
1947 */
1948 if ((forward && lo > hibound) || (!forward && hi < lobound))
1949 break;
1950 else if (lo > hibound || hi < lobound)
1951 goto cont;
1952 /*
1953 * Candidate hole lies at least partially within the allowable
1954 * range. Restrict it to fall completely within that range,
1955 * i.e., to [max(lo, lobound), min(hi, hibound)].
1956 */
1957 if (lo < lobound)
1958 lo = lobound;
1959 if (hi > hibound)
1960 hi = hibound;
1961 /*
1962 * Verify that the candidate hole is big enough and meets
1963 * hardware constraints. If the hole is too small, no need
1964 * to do the further checks since they will fail.
1965 */
1966 *basep = lo;
1967 *lenp = hi - lo;
1968 if (*lenp >= minlen && valid_va_range_aligned(basep, lenp,
1969 minlen, forward ? AH_LO : AH_HI, align, redzone, off) &&
1970 ((flags & AH_CONTAIN) == 0 ||
1971 (*basep <= addr && *basep + *lenp > addr))) {
1972 if (!forward)
1973 as->a_lastgap = hseg;
1974 if (hseg != NULL)
1975 as->a_lastgaphl = hseg;
1976 else
1977 as->a_lastgaphl = lseg;
1978 AS_LOCK_EXIT(as);
1979 return (0);
1980 }
1981 cont:
1982 /*
1983 * Move to the next hole.
1984 */
1985 if (forward) {
1986 lseg = hseg;
1987 if (lseg == NULL)
1988 break;
1989 hseg = AS_SEGNEXT(as, hseg);
1990 } else {
1991 hseg = lseg;
1992 if (hseg == NULL)
1993 break;
1994 lseg = AS_SEGPREV(as, lseg);
1995 }
1996 }
1997 if (fast_path && (align != 0 || save_redzone != 0)) {
1998 fast_path = 0;
1999 minlen = save_minlen;
2000 redzone = save_redzone;
2001 goto retry;
2002 }
2003 *basep = save_base;
2004 *lenp = save_len;
2005 AS_LOCK_EXIT(as);
2006 return (-1);
2007 }
2008
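/*
 * Usage sketch (illustrative only, not from the original source): a caller
 * looking for a hole whose base sits 16k into a 4M-aligned region, with no
 * redzone, anywhere within [lo, lo + range), might do roughly the following.
 * The variables as, lo, range and minlen are assumptions for the example.
 *
 *	caddr_t base = lo;
 *	size_t len = range;
 *
 *	if (as_gap_aligned(as, minlen, &base, &len, AH_LO, NULL,
 *	    4 * 1024 * 1024, 0, 16 * 1024) == 0) {
 *		use [base, base + len): it lies within a hole that can
 *		hold minlen bytes at the requested alignment and offset
 *	}
 */
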
2009 /*
2010 * Find a hole of at least size minlen within [*basep, *basep + *lenp).
2011 *
2012 * If flags specifies AH_HI, the hole will have the highest possible address
2013 * in the range. We use the as->a_lastgap field to figure out where to
2014 * start looking for a gap.
2015 *
2016 * Otherwise, the gap will have the lowest possible address.
2017 *
2018 * If flags specifies AH_CONTAIN, the hole will contain the address addr.
2019 *
2020 * If an adequate hole is found, base and len are set to reflect the part of
2021 * the hole that is within range, and 0 is returned; otherwise,
2022 * -1 is returned.
2023 *
2024 * NOTE: This routine is not correct when base+len overflows caddr_t.
2025 */
2026 int
2027 as_gap(struct as *as, size_t minlen, caddr_t *basep, size_t *lenp, uint_t flags,
2028 caddr_t addr)
2029 {
2030
2031 return (as_gap_aligned(as, minlen, basep, lenp, flags, addr, 0, 0, 0));
2032 }
2033
2034 /*
2035 * Return the next range within [base, base + len) that is backed
2036 * with "real memory". Skip holes and non-seg_vn segments.
2037 * We're lazy and only return one segment at a time.
2038 */
2039 int
2040 as_memory(struct as *as, caddr_t *basep, size_t *lenp)
2041 {
2042 extern struct seg_ops segspt_shmops; /* needs a header file */
2043 struct seg *seg;
2044 caddr_t addr, eaddr;
2045 caddr_t segend;
2046
2047 AS_LOCK_ENTER(as, RW_READER);
2048
2049 addr = *basep;
2050 eaddr = addr + *lenp;
2051
2052 seg = as_findseg(as, addr, 0);
2053 if (seg != NULL)
2054 addr = MAX(seg->s_base, addr);
2055
2056 for (;;) {
2057 if (seg == NULL || addr >= eaddr || eaddr <= seg->s_base) {
2058 AS_LOCK_EXIT(as);
2059 return (EINVAL);
2060 }
2061
2062 if (seg->s_ops == &segvn_ops) {
2063 segend = seg->s_base + seg->s_size;
2064 break;
2065 }
2066
2067 /*
2068 * We do ISM by looking into the private data
2069 * to determine the real size of the segment.
2070 */
2071 if (seg->s_ops == &segspt_shmops) {
2072 segend = seg->s_base + spt_realsize(seg);
2073 if (addr < segend)
2074 break;
2075 }
2076
2077 seg = AS_SEGNEXT(as, seg);
2078
2079 if (seg != NULL)
2080 addr = seg->s_base;
2081 }
2082
2083 *basep = addr;
2084
2085 if (segend > eaddr)
2086 *lenp = eaddr - addr;
2087 else
2088 *lenp = segend - addr;
2089
2090 AS_LOCK_EXIT(as);
2091 return (0);
2092 }
2093
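/*
 * Usage sketch (illustrative only, not from the original source): since
 * as_memory() returns at most one segment's worth of backed memory per call,
 * a caller typically loops over the range, advancing past each returned
 * piece.  Here as, addr and len are assumptions for the example and
 * process() stands in for whatever the caller does with each range.
 *
 *	caddr_t base = addr;
 *	size_t remain = len;
 *
 *	while (remain != 0 && as_memory(as, &base, &remain) == 0) {
 *		process(base, remain);
 *		base += remain;
 *		remain = (addr + len) - base;
 *	}
 */
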
2094 /*
2095 * Swap the pages associated with the address space as out to
2096 * secondary storage, returning the number of bytes actually
2097 * swapped.
2098 *
2099 * The value returned is intended to correlate well with the process's
2100 * memory requirements. Its usefulness for this purpose depends on
2101 * how well the segment-level routines do at returning accurate
2102 * information.
2103 */
2104 size_t
2105 as_swapout(struct as *as)
2106 {
2107 struct seg *seg;
2108 size_t swpcnt = 0;
2109
2110 /*
2111 * Kernel-only processes have given up their address
2112 * spaces. Of course, we shouldn't be attempting to
2113 * swap out such processes in the first place...
2114 */
2115 if (as == NULL)
2116 return (0);
2117
2118 AS_LOCK_ENTER(as, RW_READER);
2119
2120 /*
2121 * Free all mapping resources associated with the address
2122 * space. The segment-level swapout routines capitalize
2123 * on this unmapping by scavenging pages that have become
2124 * unmapped here.
2125 */
2126 hat_swapout(as->a_hat);
2127
2128 /*
2129 * Call the swapout routines of all segments in the address
2130 * space to do the actual work, accumulating the amount of
2131 * space reclaimed.
2132 */
2133 for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) {
2134 struct seg_ops *ov = seg->s_ops;
2135
2136 /*
2137 * We have to check to see if the seg has
2138 * an ops vector because the seg may have
2139 * been in the middle of being set up when
2140 * the process was picked for swapout.
2141 */
2142 if ((ov != NULL) && (ov->swapout != NULL))
2143 swpcnt += SEGOP_SWAPOUT(seg);
2144 }
2145 AS_LOCK_EXIT(as);
2146 return (swpcnt);
2147 }
2148
2149 /*
2150 * Determine whether data from the mappings in interval [addr, addr + size)
2151 * are in the primary memory (core) cache.
2152 */
2153 int
2154 as_incore(struct as *as, caddr_t addr,
2155 size_t size, char *vec, size_t *sizep)
2156 {
2157 struct seg *seg;
2158 size_t ssize;
2159 caddr_t raddr; /* rounded down addr */
2160 size_t rsize; /* rounded up size */
2161 size_t isize; /* iteration size */
2162 int error = 0; /* result, assume success */
2163
2164 *sizep = 0;
2165 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2166 rsize = ((((size_t)addr + size) + PAGEOFFSET) & PAGEMASK) -
2167 (size_t)raddr;
2168
2169 if (raddr + rsize < raddr) /* check for wraparound */
2170 return (ENOMEM);
2171
2172 AS_LOCK_ENTER(as, RW_READER);
2173 seg = as_segat(as, raddr);
2174 if (seg == NULL) {
2175 AS_LOCK_EXIT(as);
2176 return (-1);
2177 }
2178
2179 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2180 if (raddr >= seg->s_base + seg->s_size) {
2181 seg = AS_SEGNEXT(as, seg);
2182 if (seg == NULL || raddr != seg->s_base) {
2183 error = -1;
2184 break;
2185 }
2186 }
2187 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2188 ssize = seg->s_base + seg->s_size - raddr;
2189 else
2190 ssize = rsize;
2191 *sizep += isize = SEGOP_INCORE(seg, raddr, ssize, vec);
2192 if (isize != ssize) {
2193 error = -1;
2194 break;
2195 }
2196 vec += btopr(ssize);
2197 }
2198 AS_LOCK_EXIT(as);
2199 return (error);
2200 }
2201
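/*
 * Unlock the runs of pages within [addr, addr + ptob(npages)) that are
 * marked as locked in the given bitmap.  "position" is the bit index in
 * the bitmap corresponding to addr; bt_range() walks the runs of set bits
 * and each run is handed to the segment driver with MC_UNLOCK.
 */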
2202 static void
2203 as_segunlock(struct seg *seg, caddr_t addr, int attr,
2204 ulong_t *bitmap, size_t position, size_t npages)
2205 {
2206 caddr_t range_start;
2207 size_t pos1 = position;
2208 size_t pos2;
2209 size_t size;
2210 size_t end_pos = npages + position;
2211
2212 while (bt_range(bitmap, &pos1, &pos2, end_pos)) {
2213 size = ptob((pos2 - pos1));
2214 range_start = (caddr_t)((uintptr_t)addr +
2215 ptob(pos1 - position));
2216
2217 (void) SEGOP_LOCKOP(seg, range_start, size, attr, MC_UNLOCK,
2218 (ulong_t *)NULL, (size_t)NULL);
2219 pos1 = pos2;
2220 }
2221 }
2222
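/*
 * Error-path helper for MC_LOCK: undo the locking already done over
 * [raddr, raddr + rsize) by walking the segments covering that range and
 * unlocking, via as_segunlock(), the pages recorded in mlock_map.
 */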
2223 static void
2224 as_unlockerr(struct as *as, int attr, ulong_t *mlock_map,
2225 caddr_t raddr, size_t rsize)
2226 {
2227 struct seg *seg = as_segat(as, raddr);
2228 size_t ssize;
2229
2230 while (rsize != 0) {
2231 if (raddr >= seg->s_base + seg->s_size)
2232 seg = AS_SEGNEXT(as, seg);
2233
2234 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2235 ssize = seg->s_base + seg->s_size - raddr;
2236 else
2237 ssize = rsize;
2238
2239 as_segunlock(seg, raddr, attr, mlock_map, 0, btopr(ssize));
2240
2241 rsize -= ssize;
2242 raddr += ssize;
2243 }
2244 }
2245
2246 /*
2247 * Cache control operations over the interval [addr, addr + size) in
2248 * address space "as".
2249 */
2250 /*ARGSUSED*/
2251 int
2252 as_ctl(struct as *as, caddr_t addr, size_t size, int func, int attr,
2253 uintptr_t arg, ulong_t *lock_map, size_t pos)
2254 {
2255 struct seg *seg; /* working segment */
2256 caddr_t raddr; /* rounded down addr */
2257 caddr_t initraddr; /* saved initial rounded down addr */
2258 size_t rsize; /* rounded up size */
2259 size_t initrsize; /* saved initial rounded up size */
2260 size_t ssize; /* size of seg */
2261 int error = 0; /* result */
2262 size_t mlock_size; /* size of bitmap */
2263 ulong_t *mlock_map; /* pointer to bitmap used */
2264 /* to represent the locked */
2265 /* pages. */
2266 retry:
2267 if (error == IE_RETRY)
2268 AS_LOCK_ENTER(as, RW_WRITER);
2269 else
2270 AS_LOCK_ENTER(as, RW_READER);
2271
2272 /*
2273 * If these are address space lock/unlock operations, loop over
2274 * all segments in the address space, as appropriate.
2275 */
2276 if (func == MC_LOCKAS) {
2277 size_t npages, idx;
2278 size_t rlen = 0; /* rounded as length */
2279
2280 idx = pos;
2281
2282 if (arg & MCL_FUTURE) {
2283 mutex_enter(&as->a_contents);
2284 AS_SETPGLCK(as);
2285 mutex_exit(&as->a_contents);
2286 }
2287 if ((arg & MCL_CURRENT) == 0) {
2288 AS_LOCK_EXIT(as);
2289 return (0);
2290 }
2291
2292 seg = AS_SEGFIRST(as);
2293 if (seg == NULL) {
2294 AS_LOCK_EXIT(as);
2295 return (0);
2296 }
2297
2298 do {
2299 raddr = (caddr_t)((uintptr_t)seg->s_base &
2300 (uintptr_t)PAGEMASK);
2301 rlen += (((uintptr_t)(seg->s_base + seg->s_size) +
2302 PAGEOFFSET) & PAGEMASK) - (uintptr_t)raddr;
2303 } while ((seg = AS_SEGNEXT(as, seg)) != NULL);
2304
2305 mlock_size = BT_BITOUL(btopr(rlen));
2306 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2307 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2308 AS_LOCK_EXIT(as);
2309 return (EAGAIN);
2310 }
2311
2312 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2313 error = SEGOP_LOCKOP(seg, seg->s_base,
2314 seg->s_size, attr, MC_LOCK, mlock_map, pos);
2315 if (error != 0)
2316 break;
2317 pos += seg_pages(seg);
2318 }
2319
2320 if (error) {
2321 for (seg = AS_SEGFIRST(as); seg != NULL;
2322 seg = AS_SEGNEXT(as, seg)) {
2323
2324 raddr = (caddr_t)((uintptr_t)seg->s_base &
2325 (uintptr_t)PAGEMASK);
2326 npages = seg_pages(seg);
2327 as_segunlock(seg, raddr, attr, mlock_map,
2328 idx, npages);
2329 idx += npages;
2330 }
2331 }
2332
2333 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2334 AS_LOCK_EXIT(as);
2335 goto lockerr;
2336 } else if (func == MC_UNLOCKAS) {
2337 mutex_enter(&as->a_contents);
2338 AS_CLRPGLCK(as);
2339 mutex_exit(&as->a_contents);
2340
2341 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
2342 error = SEGOP_LOCKOP(seg, seg->s_base,
2343 seg->s_size, attr, MC_UNLOCK, NULL, 0);
2344 if (error != 0)
2345 break;
2346 }
2347
2348 AS_LOCK_EXIT(as);
2349 goto lockerr;
2350 }
2351
2352 /*
2353 * Normalize addresses and sizes.
2354 */
2355 initraddr = raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2356 initrsize = rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2357 (size_t)raddr;
2358
2359 if (raddr + rsize < raddr) { /* check for wraparound */
2360 AS_LOCK_EXIT(as);
2361 return (ENOMEM);
2362 }
2363
2364 /*
2365 * Get initial segment.
2366 */
2367 if ((seg = as_segat(as, raddr)) == NULL) {
2368 AS_LOCK_EXIT(as);
2369 return (ENOMEM);
2370 }
2371
2372 if (func == MC_LOCK) {
2373 mlock_size = BT_BITOUL(btopr(rsize));
2374 if ((mlock_map = (ulong_t *)kmem_zalloc(mlock_size *
2375 sizeof (ulong_t), KM_NOSLEEP)) == NULL) {
2376 AS_LOCK_EXIT(as);
2377 return (EAGAIN);
2378 }
2379 }
2380
2381 /*
2382 * Loop over all segments. If a hole in the address range is
2383 * discovered, then fail. For each segment, perform the appropriate
2384 * control operation.
2385 */
2386 while (rsize != 0) {
2387
2388 /*
2389 * Make sure there's no hole; calculate the portion
2390 * of the next segment to be operated over.
2391 */
2392 if (raddr >= seg->s_base + seg->s_size) {
2393 seg = AS_SEGNEXT(as, seg);
2394 if (seg == NULL || raddr != seg->s_base) {
2395 if (func == MC_LOCK) {
2396 as_unlockerr(as, attr, mlock_map,
2397 initraddr, initrsize - rsize);
2398 kmem_free(mlock_map,
2399 mlock_size * sizeof (ulong_t));
2400 }
2401 AS_LOCK_EXIT(as);
2402 return (ENOMEM);
2403 }
2404 }
2405 if ((raddr + rsize) > (seg->s_base + seg->s_size))
2406 ssize = seg->s_base + seg->s_size - raddr;
2407 else
2408 ssize = rsize;
2409
2410 /*
2411 * Dispatch on specific function.
2412 */
2413 switch (func) {
2414
2415 /*
2416 * Synchronize cached data from mappings with backing
2417 * objects.
2418 */
2419 case MC_SYNC:
2420 if (error = SEGOP_SYNC(seg, raddr, ssize,
2421 attr, (uint_t)arg)) {
2422 AS_LOCK_EXIT(as);
2423 return (error);
2424 }
2425 break;
2426
2427 /*
2428 * Lock pages in memory.
2429 */
2430 case MC_LOCK:
2431 if (error = SEGOP_LOCKOP(seg, raddr, ssize,
2432 attr, func, mlock_map, pos)) {
2433 as_unlockerr(as, attr, mlock_map, initraddr,
2434 initrsize - rsize + ssize);
2435 kmem_free(mlock_map, mlock_size *
2436 sizeof (ulong_t));
2437 AS_LOCK_EXIT(as);
2438 goto lockerr;
2439 }
2440 break;
2441
2442 /*
2443 * Unlock mapped pages.
2444 */
2445 case MC_UNLOCK:
2446 (void) SEGOP_LOCKOP(seg, raddr, ssize, attr, func,
2447 (ulong_t *)NULL, (size_t)NULL);
2448 break;
2449
2450 /*
2451 * Store VM advise for mapped pages in segment layer.
2452 */
2453 case MC_ADVISE:
2454 error = SEGOP_ADVISE(seg, raddr, ssize, (uint_t)arg);
2455
2456 /*
2457 * Check for regular errors and special retry error
2458 */
2459 if (error) {
2460 if (error == IE_RETRY) {
2461 /*
2462 * Need to acquire writers lock, so
2463 * have to drop readers lock and start
2464 * all over again
2465 */
2466 AS_LOCK_EXIT(as);
2467 goto retry;
2468 } else if (error == IE_REATTACH) {
2469 /*
2470 * Find segment for current address
2471 * because current segment just got
2472 * split or concatenated
2473 */
2474 seg = as_segat(as, raddr);
2475 if (seg == NULL) {
2476 AS_LOCK_EXIT(as);
2477 return (ENOMEM);
2478 }
2479 } else {
2480 /*
2481 * Regular error
2482 */
2483 AS_LOCK_EXIT(as);
2484 return (error);
2485 }
2486 }
2487 break;
2488
2489 case MC_INHERIT_ZERO:
2490 if (seg->s_ops->inherit == NULL) {
2491 error = ENOTSUP;
2492 } else {
2493 error = SEGOP_INHERIT(seg, raddr, ssize,
2494 SEGP_INH_ZERO);
2495 }
2496 if (error != 0) {
2497 AS_LOCK_EXIT(as);
2498 return (error);
2499 }
2500 break;
2501
2502 /*
2503 * Can't happen.
2504 */
2505 default:
2506 panic("as_ctl: bad operation %d", func);
2507 /*NOTREACHED*/
2508 }
2509
2510 rsize -= ssize;
2511 raddr += ssize;
2512 }
2513
2514 if (func == MC_LOCK)
2515 kmem_free(mlock_map, mlock_size * sizeof (ulong_t));
2516 AS_LOCK_EXIT(as);
2517 return (0);
2518 lockerr:
2519
2520 /*
2521 * If the lower levels returned EDEADLK for a segment lockop,
2522 * it means that we should retry the operation. Let's wait
2523 * a bit also to let the deadlock causing condition clear.
2524 * This is part of a gross hack to work around a design flaw
2525 * in the ufs/sds logging code and should go away when the
2526 * logging code is re-designed to fix the problem. See bug
2527 * 4125102 for details of the problem.
2528 */
2529 if (error == EDEADLK) {
2530 delay(deadlk_wait);
2531 error = 0;
2532 goto retry;
2533 }
2534 return (error);
2535 }
2536
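/*
 * Usage sketch (illustrative only, not from the original source): callers
 * such as memcntl() drive as_ctl() with a user-supplied range; locking a
 * range into memory and later releasing it might look roughly like the
 * following, where as, addr and len are assumptions for the example.
 *
 *	int err;
 *
 *	err = as_ctl(as, addr, len, MC_LOCK, 0, 0, NULL, 0);
 *	if (err == 0) {
 *		work on the locked range, then:
 *		(void) as_ctl(as, addr, len, MC_UNLOCK, 0, 0, NULL, 0);
 *	}
 */
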
2537 int
2538 fc_decode(faultcode_t fault_err)
2539 {
2540 int error = 0;
2541
2542 switch (FC_CODE(fault_err)) {
2543 case FC_OBJERR:
2544 error = FC_ERRNO(fault_err);
2545 break;
2546 case FC_PROT:
2547 error = EACCES;
2548 break;
2549 default:
2550 error = EFAULT;
2551 break;
2552 }
2553 return (error);
2554 }
2555
2556 /*
2557 * Pagelock pages from a range that spans more than 1 segment. Obtain shadow
2558 * lists from each segment and copy them to one contiguous shadow list (plist)
2559 * as expected by the caller. Save pointers to per segment shadow lists at
2560 * the tail of plist so that they can be used during as_pageunlock().
2561 */
2562 static int
2563 as_pagelock_segs(struct as *as, struct seg *seg, struct page ***ppp,
2564 caddr_t addr, size_t size, enum seg_rw rw)
2565 {
2566 caddr_t sv_addr = addr;
2567 size_t sv_size = size;
2568 struct seg *sv_seg = seg;
2569 ulong_t segcnt = 1;
2570 ulong_t cnt;
2571 size_t ssize;
2572 pgcnt_t npages = btop(size);
2573 page_t **plist;
2574 page_t **pl;
2575 int error;
2576 caddr_t eaddr;
2577 faultcode_t fault_err = 0;
2578 pgcnt_t pl_off;
2579 extern struct seg_ops segspt_shmops;
2580
2581 ASSERT(AS_LOCK_HELD(as));
2582 ASSERT(seg != NULL);
2583 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2584 ASSERT(addr + size > seg->s_base + seg->s_size);
2585 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2586 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2587
2588 /*
2589 * Count the number of segments covered by the range we are about to
2590 * lock. The segment count is used to size the shadow list we return
2591 * back to the caller.
2592 */
2593 for (; size != 0; size -= ssize, addr += ssize) {
2594 if (addr >= seg->s_base + seg->s_size) {
2595
2596 seg = AS_SEGNEXT(as, seg);
2597 if (seg == NULL || addr != seg->s_base) {
2598 AS_LOCK_EXIT(as);
2599 return (EFAULT);
2600 }
2601 /*
2602 * Do a quick check if subsequent segments
2603 * will most likely support pagelock.
2604 */
2605 if (seg->s_ops == &segvn_ops) {
2606 vnode_t *vp;
2607
2608 if (SEGOP_GETVP(seg, addr, &vp) != 0 ||
2609 vp != NULL) {
2610 AS_LOCK_EXIT(as);
2611 goto slow;
2612 }
2613 } else if (seg->s_ops != &segspt_shmops) {
2614 AS_LOCK_EXIT(as);
2615 goto slow;
2616 }
2617 segcnt++;
2618 }
2619 if (addr + size > seg->s_base + seg->s_size) {
2620 ssize = seg->s_base + seg->s_size - addr;
2621 } else {
2622 ssize = size;
2623 }
2624 }
2625 ASSERT(segcnt > 1);
2626
2627 plist = kmem_zalloc((npages + segcnt) * sizeof (page_t *), KM_SLEEP);
2628
2629 addr = sv_addr;
2630 size = sv_size;
2631 seg = sv_seg;
2632
2633 for (cnt = 0, pl_off = 0; size != 0; size -= ssize, addr += ssize) {
2634 if (addr >= seg->s_base + seg->s_size) {
2635 seg = AS_SEGNEXT(as, seg);
2636 ASSERT(seg != NULL && addr == seg->s_base);
2637 cnt++;
2638 ASSERT(cnt < segcnt);
2639 }
2640 if (addr + size > seg->s_base + seg->s_size) {
2641 ssize = seg->s_base + seg->s_size - addr;
2642 } else {
2643 ssize = size;
2644 }
2645 pl = &plist[npages + cnt];
2646 error = SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2647 L_PAGELOCK, rw);
2648 if (error) {
2649 break;
2650 }
2651 ASSERT(plist[npages + cnt] != NULL);
2652 ASSERT(pl_off + btop(ssize) <= npages);
2653 bcopy(plist[npages + cnt], &plist[pl_off],
2654 btop(ssize) * sizeof (page_t *));
2655 pl_off += btop(ssize);
2656 }
2657
2658 if (size == 0) {
2659 AS_LOCK_EXIT(as);
2660 ASSERT(cnt == segcnt - 1);
2661 *ppp = plist;
2662 return (0);
2663 }
2664
2665 /*
2666 * One of the pagelock calls failed. The error type is in the error
2667 * variable. Unlock what we've locked so far and retry with F_SOFTLOCK if
2668 * the error type is either EFAULT or ENOTSUP. Otherwise just return the
2669 * error back to the caller.
2670 */
2671
2672 eaddr = addr;
2673 seg = sv_seg;
2674
2675 for (cnt = 0, addr = sv_addr; addr < eaddr; addr += ssize) {
2676 if (addr >= seg->s_base + seg->s_size) {
2677 seg = AS_SEGNEXT(as, seg);
2678 ASSERT(seg != NULL && addr == seg->s_base);
2679 cnt++;
2680 ASSERT(cnt < segcnt);
2681 }
2682 if (eaddr > seg->s_base + seg->s_size) {
2683 ssize = seg->s_base + seg->s_size - addr;
2684 } else {
2685 ssize = eaddr - addr;
2686 }
2687 pl = &plist[npages + cnt];
2688 ASSERT(*pl != NULL);
2689 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2690 L_PAGEUNLOCK, rw);
2691 }
2692
2693 AS_LOCK_EXIT(as);
2694
2695 kmem_free(plist, (npages + segcnt) * sizeof (page_t *));
2696
2697 if (error != ENOTSUP && error != EFAULT) {
2698 return (error);
2699 }
2700
2701 slow:
2702 /*
2703 * If we are here because pagelock failed due to the need to cow-fault
2704 * in the pages we want to lock, F_SOFTLOCK will do this job, and in the
2705 * next as_pagelock() call for this address range pagelock will
2706 * hopefully succeed.
2707 */
2708 fault_err = as_fault(as->a_hat, as, sv_addr, sv_size, F_SOFTLOCK, rw);
2709 if (fault_err != 0) {
2710 return (fc_decode(fault_err));
2711 }
2712 *ppp = NULL;
2713
2714 return (0);
2715 }
2716
2717 /*
2718 * Lock pages in a given address space and return the shadow list. If
2719 * the list is NULL, the MMU mapping is also locked.
2720 */
2721 int
2722 as_pagelock(struct as *as, struct page ***ppp, caddr_t addr,
2723 size_t size, enum seg_rw rw)
2724 {
2725 size_t rsize;
2726 caddr_t raddr;
2727 faultcode_t fault_err;
2728 struct seg *seg;
2729 int err;
2730
2731 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_START,
2732 "as_pagelock_start: addr %p size %ld", addr, size);
2733
2734 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2735 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2736 (size_t)raddr;
2737
2738 /*
2739 * If the request crosses more than one segment,
2740 * as_pagelock_segs() handles it below.
2741 */
2742 AS_LOCK_ENTER(as, RW_READER);
2743
2744 seg = as_segat(as, raddr);
2745 if (seg == NULL) {
2746 AS_LOCK_EXIT(as);
2747 return (EFAULT);
2748 }
2749 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2750 if (raddr + rsize > seg->s_base + seg->s_size) {
2751 return (as_pagelock_segs(as, seg, ppp, raddr, rsize, rw));
2752 }
2753 if (raddr + rsize <= raddr) {
2754 AS_LOCK_EXIT(as);
2755 return (EFAULT);
2756 }
2757
2758 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_START,
2759 "seg_lock_1_start: raddr %p rsize %ld", raddr, rsize);
2760
2761 /*
2762 * try to lock pages and pass back shadow list
2763 */
2764 err = SEGOP_PAGELOCK(seg, raddr, rsize, ppp, L_PAGELOCK, rw);
2765
2766 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_SEG_LOCK_END, "seg_lock_1_end");
2767
2768 AS_LOCK_EXIT(as);
2769
2770 if (err == 0 || (err != ENOTSUP && err != EFAULT)) {
2771 return (err);
2772 }
2773
2774 /*
2775 * Use F_SOFTLOCK to lock the pages, because pagelock failed either due
2776 * to no pagelock support for this segment or because pages need to be
2777 * cow-faulted in. If a fault is needed, F_SOFTLOCK will do this job for
2778 * this as_pagelock() call, and in the next as_pagelock() call for the
2779 * same address range the pagelock call will hopefully succeed.
2780 */
2781 fault_err = as_fault(as->a_hat, as, addr, size, F_SOFTLOCK, rw);
2782 if (fault_err != 0) {
2783 return (fc_decode(fault_err));
2784 }
2785 *ppp = NULL;
2786
2787 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_LOCK_END, "as_pagelock_end");
2788 return (0);
2789 }
2790
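/*
 * Usage sketch (illustrative only, not from the original source):
 * as_pagelock() and as_pageunlock() are used as a pair around direct access
 * to user memory, e.g. for physio-style I/O.  Here as, addr, len and rw are
 * assumptions for the example; the shadow list may come back NULL when the
 * F_SOFTLOCK fallback was used and must be passed back to as_pageunlock()
 * unchanged.
 *
 *	int err;
 *	struct page **pplist;
 *
 *	if ((err = as_pagelock(as, &pplist, addr, len, rw)) != 0)
 *		return (err);
 *	do the I/O against the locked range, then:
 *	as_pageunlock(as, pplist, addr, len, rw);
 */
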
2791 /*
2792 * Unlock pages locked by as_pagelock_segs(). Retrieve the per-segment shadow
2793 * lists from the end of plist and call the pageunlock interface for each
2794 * segment. Drop the as lock and free plist.
2795 */
2796 static void
2797 as_pageunlock_segs(struct as *as, struct seg *seg, caddr_t addr, size_t size,
2798 struct page **plist, enum seg_rw rw)
2799 {
2800 ulong_t cnt;
2801 caddr_t eaddr = addr + size;
2802 pgcnt_t npages = btop(size);
2803 size_t ssize;
2804 page_t **pl;
2805
2806 ASSERT(AS_LOCK_HELD(as));
2807 ASSERT(seg != NULL);
2808 ASSERT(addr >= seg->s_base && addr < seg->s_base + seg->s_size);
2809 ASSERT(addr + size > seg->s_base + seg->s_size);
2810 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
2811 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
2812 ASSERT(plist != NULL);
2813
2814 for (cnt = 0; addr < eaddr; addr += ssize) {
2815 if (addr >= seg->s_base + seg->s_size) {
2816 seg = AS_SEGNEXT(as, seg);
2817 ASSERT(seg != NULL && addr == seg->s_base);
2818 cnt++;
2819 }
2820 if (eaddr > seg->s_base + seg->s_size) {
2821 ssize = seg->s_base + seg->s_size - addr;
2822 } else {
2823 ssize = eaddr - addr;
2824 }
2825 pl = &plist[npages + cnt];
2826 ASSERT(*pl != NULL);
2827 (void) SEGOP_PAGELOCK(seg, addr, ssize, (page_t ***)pl,
2828 L_PAGEUNLOCK, rw);
2829 }
2830 ASSERT(cnt > 0);
2831 AS_LOCK_EXIT(as);
2832
2833 cnt++;
2834 kmem_free(plist, (npages + cnt) * sizeof (page_t *));
2835 }
2836
2837 /*
2838 * unlock pages in a given address range
2839 */
2840 void
2841 as_pageunlock(struct as *as, struct page **pp, caddr_t addr, size_t size,
2842 enum seg_rw rw)
2843 {
2844 struct seg *seg;
2845 size_t rsize;
2846 caddr_t raddr;
2847
2848 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_START,
2849 "as_pageunlock_start: addr %p size %ld", addr, size);
2850
2851 /*
2852 * If the shadow list is NULL, as_pagelock() fell back
2853 * to as_fault().
2854 */
2855 if (pp == NULL) {
2856 (void) as_fault(as->a_hat, as, addr, size, F_SOFTUNLOCK, rw);
2857 return;
2858 }
2859
2860 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2861 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
2862 (size_t)raddr;
2863
2864 AS_LOCK_ENTER(as, RW_READER);
2865 seg = as_segat(as, raddr);
2866 ASSERT(seg != NULL);
2867
2868 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEG_UNLOCK_START,
2869 "seg_unlock_start: raddr %p rsize %ld", raddr, rsize);
2870
2871 ASSERT(raddr >= seg->s_base && raddr < seg->s_base + seg->s_size);
2872 if (raddr + rsize <= seg->s_base + seg->s_size) {
2873 SEGOP_PAGELOCK(seg, raddr, rsize, &pp, L_PAGEUNLOCK, rw);
2874 } else {
2875 as_pageunlock_segs(as, seg, raddr, rsize, pp, rw);
2876 return;
2877 }
2878 AS_LOCK_EXIT(as);
2879 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_AS_UNLOCK_END, "as_pageunlock_end");
2880 }
2881
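/*
 * Attempt to set the preferred page size, given by size code szc, for the
 * range [addr, addr + size).  Both addr and size must be aligned to the
 * page size that szc implies.  If wait is B_TRUE and the operation fails
 * because pages in the range are locked, wait for them to be unlocked and
 * retry; otherwise the error is returned to the caller.
 */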
2882 int
2883 as_setpagesize(struct as *as, caddr_t addr, size_t size, uint_t szc,
2884 boolean_t wait)
2885 {
2886 struct seg *seg;
2887 size_t ssize;
2888 caddr_t raddr; /* rounded down addr */
2889 size_t rsize; /* rounded up size */
2890 int error = 0;
2891 size_t pgsz = page_get_pagesize(szc);
2892
2893 setpgsz_top:
2894 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(size, pgsz)) {
2895 return (EINVAL);
2896 }
2897
2898 raddr = addr;
2899 rsize = size;
2900
2901 if (raddr + rsize < raddr) /* check for wraparound */
2902 return (ENOMEM);
2903
2904 AS_LOCK_ENTER(as, RW_WRITER);
2905 as_clearwatchprot(as, raddr, rsize);
2906 seg = as_segat(as, raddr);
2907 if (seg == NULL) {
2908 as_setwatch(as);
2909 AS_LOCK_EXIT(as);
2910 return (ENOMEM);
2911 }
2912
2913 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
2914 if (raddr >= seg->s_base + seg->s_size) {
2915 seg = AS_SEGNEXT(as, seg);
2916 if (seg == NULL || raddr != seg->s_base) {
2917 error = ENOMEM;
2918 break;
2919 }
2920 }
2921 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
2922 ssize = seg->s_base + seg->s_size - raddr;
2923 } else {
2924 ssize = rsize;
2925 }
2926
2927 retry:
2928 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
2929
2930 if (error == IE_NOMEM) {
2931 error = EAGAIN;
2932 break;
2933 }
2934
2935 if (error == IE_RETRY) {
2936 AS_LOCK_EXIT(as);
2937 goto setpgsz_top;
2938 }
2939
2940 if (error == ENOTSUP) {
2941 error = EINVAL;
2942 break;
2943 }
2944
2945 if (wait && (error == EAGAIN)) {
2946 /*
2947 * Memory is currently locked. It must be unlocked
2948 * before this operation can succeed through a retry.
2949 * The possible reasons for locked memory and
2950 * corresponding strategies for unlocking are:
2951 * (1) Normal I/O
2952 * wait for a signal that the I/O operation
2953 * has completed and the memory is unlocked.
2954 * (2) Asynchronous I/O
2955 * The aio subsystem does not unlock pages when
2956 * the I/O is completed. Those pages are unlocked
2957 * when the application calls aiowait/aioerror.
2958 * So, to prevent blocking forever, cv_broadcast()
2959 * is done to wake up aio_cleanup_thread.
2960 * Subsequently, segvn_reclaim will be called, and
2961 * that will do AS_CLRUNMAPWAIT() and wake us up.
2962 * (3) Long term page locking:
2963 * This is not relevant for as_setpagesize()
2964 * because we cannot change the page size for
2965 * driver memory. The attempt to do so will
2966 * fail with a different error than EAGAIN so
2967 * there's no need to trigger as callbacks like
2968 * as_unmap, as_setprot or as_free would do.
2969 */
2970 mutex_enter(&as->a_contents);
2971 if (!AS_ISNOUNMAPWAIT(as)) {
2972 if (AS_ISUNMAPWAIT(as) == 0) {
2973 cv_broadcast(&as->a_cv);
2974 }
2975 AS_SETUNMAPWAIT(as);
2976 AS_LOCK_EXIT(as);
2977 while (AS_ISUNMAPWAIT(as)) {
2978 cv_wait(&as->a_cv, &as->a_contents);
2979 }
2980 } else {
2981 /*
2982 * We may have raced with
2983 * segvn_reclaim()/segspt_reclaim(). In this
2984 * case clean nounmapwait flag and retry since
2985 * softlockcnt in this segment may be already
2986 * 0. We don't drop as writer lock so our
2987 * number of retries without sleeping should
2988 * be very small. See segvn_reclaim() for
2989 * more comments.
2990 */
2991 AS_CLRNOUNMAPWAIT(as);
2992 mutex_exit(&as->a_contents);
2993 goto retry;
2994 }
2995 mutex_exit(&as->a_contents);
2996 goto setpgsz_top;
2997 } else if (error != 0) {
2998 break;
2999 }
3000 }
3001 as_setwatch(as);
3002 AS_LOCK_EXIT(as);
3003 return (error);
3004 }
3005
3006 /*
3007 * as_iset3_default_lpsize() just calls SEGOP_SETPAGESIZE() on all segments
3008 * in its chunk where s_szc is less than the szc we want to set.
3009 */
3010 static int
3011 as_iset3_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3012 int *retry)
3013 {
3014 struct seg *seg;
3015 size_t ssize;
3016 int error;
3017
3018 ASSERT(AS_WRITE_HELD(as));
3019
3020 seg = as_segat(as, raddr);
3021 if (seg == NULL) {
3022 panic("as_iset3_default_lpsize: no seg");
3023 }
3024
3025 for (; rsize != 0; rsize -= ssize, raddr += ssize) {
3026 if (raddr >= seg->s_base + seg->s_size) {
3027 seg = AS_SEGNEXT(as, seg);
3028 if (seg == NULL || raddr != seg->s_base) {
3029 panic("as_iset3_default_lpsize: as changed");
3030 }
3031 }
3032 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3033 ssize = seg->s_base + seg->s_size - raddr;
3034 } else {
3035 ssize = rsize;
3036 }
3037
3038 if (szc > seg->s_szc) {
3039 error = SEGOP_SETPAGESIZE(seg, raddr, ssize, szc);
3040 /* Only retry on EINVAL segments that have no vnode. */
3041 if (error == EINVAL) {
3042 vnode_t *vp = NULL;
3043 if ((SEGOP_GETTYPE(seg, raddr) & MAP_SHARED) &&
3044 (SEGOP_GETVP(seg, raddr, &vp) != 0 ||
3045 vp == NULL)) {
3046 *retry = 1;
3047 } else {
3048 *retry = 0;
3049 }
3050 }
3051 if (error) {
3052 return (error);
3053 }
3054 }
3055 }
3056 return (0);
3057 }
3058
3059 /*
3060 * as_iset2_default_lpsize() calls as_iset3_default_lpsize() to set the
3061 * pagesize on each segment in its range, but if any fails with EINVAL,
3062 * then it reduces the pagesizes to the next size in the bitmap and
3063 * retries as_iset3_default_lpsize(). The code retries smaller allowed
3064 * sizes on EINVAL because (a) the anon offset may not match the bigger
3065 * sizes, and (b) it's hard to get this offset (to begin with) to pass
3066 * to map_pgszcvec().
3067 */
3068 static int
3069 as_iset2_default_lpsize(struct as *as, caddr_t addr, size_t size, uint_t szc,
3070 uint_t szcvec)
3071 {
3072 int error;
3073 int retry;
3074
3075 ASSERT(AS_WRITE_HELD(as));
3076
3077 for (;;) {
3078 error = as_iset3_default_lpsize(as, addr, size, szc, &retry);
3079 if (error == EINVAL && retry) {
3080 szcvec &= ~(1 << szc);
3081 if (szcvec <= 1) {
3082 return (EINVAL);
3083 }
3084 szc = highbit(szcvec) - 1;
3085 } else {
3086 return (error);
3087 }
3088 }
3089 }
3090
3091 /*
3092 * as_iset1_default_lpsize() breaks its chunk into areas where existing
3093 * segments have a smaller szc than we want to set. For each such area,
3094 * it calls as_iset2_default_lpsize().
3095 */
3096 static int
3097 as_iset1_default_lpsize(struct as *as, caddr_t raddr, size_t rsize, uint_t szc,
3098 uint_t szcvec)
3099 {
3100 struct seg *seg;
3101 size_t ssize;
3102 caddr_t setaddr = raddr;
3103 size_t setsize = 0;
3104 int set;
3105 int error;
3106
3107 ASSERT(AS_WRITE_HELD(as));
3108
3109 seg = as_segat(as, raddr);
3110 if (seg == NULL) {
3111 panic("as_iset1_default_lpsize: no seg");
3112 }
3113 if (seg->s_szc < szc) {
3114 set = 1;
3115 } else {
3116 set = 0;
3117 }
3118
3119 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3120 if (raddr >= seg->s_base + seg->s_size) {
3121 seg = AS_SEGNEXT(as, seg);
3122 if (seg == NULL || raddr != seg->s_base) {
3123 panic("as_iset1_default_lpsize: as changed");
3124 }
3125 if (seg->s_szc >= szc && set) {
3126 ASSERT(setsize != 0);
3127 error = as_iset2_default_lpsize(as,
3128 setaddr, setsize, szc, szcvec);
3129 if (error) {
3130 return (error);
3131 }
3132 set = 0;
3133 } else if (seg->s_szc < szc && !set) {
3134 setaddr = raddr;
3135 setsize = 0;
3136 set = 1;
3137 }
3138 }
3139 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3140 ssize = seg->s_base + seg->s_size - raddr;
3141 } else {
3142 ssize = rsize;
3143 }
3144 }
3145 error = 0;
3146 if (set) {
3147 ASSERT(setsize != 0);
3148 error = as_iset2_default_lpsize(as, setaddr, setsize,
3149 szc, szcvec);
3150 }
3151 return (error);
3152 }
3153
3154 /*
3155 * as_iset_default_lpsize() breaks its chunk according to the size code bitmap
3156 * returned by map_pgszcvec() (similar to as_map_segvn_segs()), and passes each
3157 * chunk to as_iset1_default_lpsize().
3158 */
3159 static int
3160 as_iset_default_lpsize(struct as *as, caddr_t addr, size_t size, int flags,
3161 int type)
3162 {
3163 int rtype = (type & MAP_SHARED) ? MAPPGSZC_SHM : MAPPGSZC_PRIVM;
3164 uint_t szcvec = map_pgszcvec(addr, size, (uintptr_t)addr,
3165 flags, rtype, 1);
3166 uint_t szc;
3167 uint_t nszc;
3168 int error;
3169 caddr_t a;
3170 caddr_t eaddr;
3171 size_t segsize;
3172 size_t pgsz;
3173 uint_t save_szcvec;
3174
3175 ASSERT(AS_WRITE_HELD(as));
3176 ASSERT(IS_P2ALIGNED(addr, PAGESIZE));
3177 ASSERT(IS_P2ALIGNED(size, PAGESIZE));
3178
3179 szcvec &= ~1;
3180 if (szcvec <= 1) { /* skip if base page size */
3181 return (0);
3182 }
3183
3184 /* Get the pagesize of the first larger page size. */
3185 szc = lowbit(szcvec) - 1;
3186 pgsz = page_get_pagesize(szc);
3187 eaddr = addr + size;
3188 addr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3189 eaddr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3190
3191 save_szcvec = szcvec;
3192 szcvec >>= (szc + 1);
3193 nszc = szc;
3194 while (szcvec) {
3195 if ((szcvec & 0x1) == 0) {
3196 nszc++;
3197 szcvec >>= 1;
3198 continue;
3199 }
3200 nszc++;
3201 pgsz = page_get_pagesize(nszc);
3202 a = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
3203 if (a != addr) {
3204 ASSERT(szc > 0);
3205 ASSERT(a < eaddr);
3206 segsize = a - addr;
3207 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3208 save_szcvec);
3209 if (error) {
3210 return (error);
3211 }
3212 addr = a;
3213 }
3214 szc = nszc;
3215 szcvec >>= 1;
3216 }
3217
3218 ASSERT(addr < eaddr);
3219 szcvec = save_szcvec;
3220 while (szcvec) {
3221 a = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
3222 ASSERT(a >= addr);
3223 if (a != addr) {
3224 ASSERT(szc > 0);
3225 segsize = a - addr;
3226 error = as_iset1_default_lpsize(as, addr, segsize, szc,
3227 save_szcvec);
3228 if (error) {
3229 return (error);
3230 }
3231 addr = a;
3232 }
3233 szcvec &= ~(1 << szc);
3234 if (szcvec) {
3235 szc = highbit(szcvec) - 1;
3236 pgsz = page_get_pagesize(szc);
3237 }
3238 }
3239 ASSERT(addr == eaddr);
3240
3241 return (0);
3242 }
3243
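/*
 * Worked example (illustrative, assuming a hypothetical platform with size
 * codes 0 = 4k, 1 = 2M and 2 = 1G, and both large sizes set in szcvec):
 * as_iset_default_lpsize() first trims addr up and eaddr down to 2M
 * boundaries.  The first loop then covers the leading piece: the region
 * from the 2M-aligned addr up to the first 1G boundary is passed to
 * as_iset1_default_lpsize() with szc 1 (2M).  The second loop works from
 * the largest size down: the 1G-aligned middle gets szc 2, and the
 * remaining 2M-aligned tail up to eaddr gets szc 1.  Chunks that turn out
 * to be empty (e.g. when the range never crosses a 1G boundary) are simply
 * skipped.
 */
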
3244 /*
3245 * Set the default large page size for the range. Called via memcntl with
3246 * page size set to 0. as_set_default_lpsize breaks the range down into
3247 * chunks with the same type/flags, ignores-non segvn segments, and passes
3248 * each chunk to as_iset_default_lpsize().
3249 */
3250 int
3251 as_set_default_lpsize(struct as *as, caddr_t addr, size_t size)
3252 {
3253 struct seg *seg;
3254 caddr_t raddr;
3255 size_t rsize;
3256 size_t ssize;
3257 int rtype, rflags;
3258 int stype, sflags;
3259 int error;
3260 caddr_t setaddr;
3261 size_t setsize;
3262 int segvn;
3263
3264 if (size == 0)
3265 return (0);
3266
3267 AS_LOCK_ENTER(as, RW_WRITER);
3268 again:
3269 error = 0;
3270
3271 raddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3272 rsize = (((size_t)(addr + size) + PAGEOFFSET) & PAGEMASK) -
3273 (size_t)raddr;
3274
3275 if (raddr + rsize < raddr) { /* check for wraparound */
3276 AS_LOCK_EXIT(as);
3277 return (ENOMEM);
3278 }
3279 as_clearwatchprot(as, raddr, rsize);
3280 seg = as_segat(as, raddr);
3281 if (seg == NULL) {
3282 as_setwatch(as);
3283 AS_LOCK_EXIT(as);
3284 return (ENOMEM);
3285 }
3286 if (seg->s_ops == &segvn_ops) {
3287 rtype = SEGOP_GETTYPE(seg, addr);
3288 rflags = rtype & (MAP_TEXT | MAP_INITDATA);
3289 rtype = rtype & (MAP_SHARED | MAP_PRIVATE);
3290 segvn = 1;
3291 } else {
3292 segvn = 0;
3293 }
3294 setaddr = raddr;
3295 setsize = 0;
3296
3297 for (; rsize != 0; rsize -= ssize, raddr += ssize, setsize += ssize) {
3298 if (raddr >= (seg->s_base + seg->s_size)) {
3299 seg = AS_SEGNEXT(as, seg);
3300 if (seg == NULL || raddr != seg->s_base) {
3301 error = ENOMEM;
3302 break;
3303 }
3304 if (seg->s_ops == &segvn_ops) {
3305 stype = SEGOP_GETTYPE(seg, raddr);
3306 sflags = stype & (MAP_TEXT | MAP_INITDATA);
3307 stype &= (MAP_SHARED | MAP_PRIVATE);
3308 if (segvn && (rflags != sflags ||
3309 rtype != stype)) {
3310 /*
3311 * The next segment is also segvn but
3312 * has different flags and/or type.
3313 */
3314 ASSERT(setsize != 0);
3315 error = as_iset_default_lpsize(as,
3316 setaddr, setsize, rflags, rtype);
3317 if (error) {
3318 break;
3319 }
3320 rflags = sflags;
3321 rtype = stype;
3322 setaddr = raddr;
3323 setsize = 0;
3324 } else if (!segvn) {
3325 rflags = sflags;
3326 rtype = stype;
3327 setaddr = raddr;
3328 setsize = 0;
3329 segvn = 1;
3330 }
3331 } else if (segvn) {
3332 /* The next segment is not segvn. */
3333 ASSERT(setsize != 0);
3334 error = as_iset_default_lpsize(as,
3335 setaddr, setsize, rflags, rtype);
3336 if (error) {
3337 break;
3338 }
3339 segvn = 0;
3340 }
3341 }
3342 if ((raddr + rsize) > (seg->s_base + seg->s_size)) {
3343 ssize = seg->s_base + seg->s_size - raddr;
3344 } else {
3345 ssize = rsize;
3346 }
3347 }
3348 if (error == 0 && segvn) {
3349 /* The last chunk when rsize == 0. */
3350 ASSERT(setsize != 0);
3351 error = as_iset_default_lpsize(as, setaddr, setsize,
3352 rflags, rtype);
3353 }
3354
3355 if (error == IE_RETRY) {
3356 goto again;
3357 } else if (error == IE_NOMEM) {
3358 error = EAGAIN;
3359 } else if (error == ENOTSUP) {
3360 error = EINVAL;
3361 } else if (error == EAGAIN) {
3362 mutex_enter(&as->a_contents);
3363 if (!AS_ISNOUNMAPWAIT(as)) {
3364 if (AS_ISUNMAPWAIT(as) == 0) {
3365 cv_broadcast(&as->a_cv);
3366 }
3367 AS_SETUNMAPWAIT(as);
3368 AS_LOCK_EXIT(as);
3369 while (AS_ISUNMAPWAIT(as)) {
3370 cv_wait(&as->a_cv, &as->a_contents);
3371 }
3372 mutex_exit(&as->a_contents);
3373 AS_LOCK_ENTER(as, RW_WRITER);
3374 } else {
3375 /*
3376 * We may have raced with
3377 * segvn_reclaim()/segspt_reclaim(). In this case
3378 * clean nounmapwait flag and retry since softlockcnt
3379 * in this segment may be already 0. We don't drop as
3380 * writer lock so our number of retries without
3381 * sleeping should be very small. See segvn_reclaim()
3382 * for more comments.
3383 */
3384 AS_CLRNOUNMAPWAIT(as);
3385 mutex_exit(&as->a_contents);
3386 }
3387 goto again;
3388 }
3389
3390 as_setwatch(as);
3391 AS_LOCK_EXIT(as);
3392 return (error);
3393 }
3394
3395 /*
3396 * Set up all of the uninitialized watched pages that we can.
3397 */
3398 void
3399 as_setwatch(struct as *as)
3400 {
3401 struct watched_page *pwp;
3402 struct seg *seg;
3403 caddr_t vaddr;
3404 uint_t prot;
3405 int err, retrycnt;
3406
3407 if (avl_numnodes(&as->a_wpage) == 0)
3408 return;
3409
3410 ASSERT(AS_WRITE_HELD(as));
3411
3412 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3413 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3414 retrycnt = 0;
3415 retry:
3416 vaddr = pwp->wp_vaddr;
3417 if (pwp->wp_oprot != 0 || /* already set up */
3418 (seg = as_segat(as, vaddr)) == NULL ||
3419 SEGOP_GETPROT(seg, vaddr, 0, &prot) != 0)
3420 continue;
3421
3422 pwp->wp_oprot = prot;
3423 if (pwp->wp_read)
3424 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3425 if (pwp->wp_write)
3426 prot &= ~PROT_WRITE;
3427 if (pwp->wp_exec)
3428 prot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3429 if (!(pwp->wp_flags & WP_NOWATCH) && prot != pwp->wp_oprot) {
3430 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3431 if (err == IE_RETRY) {
3432 pwp->wp_oprot = 0;
3433 ASSERT(retrycnt == 0);
3434 retrycnt++;
3435 goto retry;
3436 }
3437 }
3438 pwp->wp_prot = prot;
3439 }
3440 }
3441
3442 /*
3443 * Clear all of the watched pages in the address space.
3444 */
3445 void
3446 as_clearwatch(struct as *as)
3447 {
3448 struct watched_page *pwp;
3449 struct seg *seg;
3450 caddr_t vaddr;
3451 uint_t prot;
3452 int err, retrycnt;
3453
3454 if (avl_numnodes(&as->a_wpage) == 0)
3455 return;
3456
3457 ASSERT(AS_WRITE_HELD(as));
3458
3459 for (pwp = avl_first(&as->a_wpage); pwp != NULL;
3460 pwp = AVL_NEXT(&as->a_wpage, pwp)) {
3461 retrycnt = 0;
3462 retry:
3463 vaddr = pwp->wp_vaddr;
3464 if (pwp->wp_oprot == 0 || /* not set up */
3465 (seg = as_segat(as, vaddr)) == NULL)
3466 continue;
3467
3468 if ((prot = pwp->wp_oprot) != pwp->wp_prot) {
3469 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, prot);
3470 if (err == IE_RETRY) {
3471 ASSERT(retrycnt == 0);
3472 retrycnt++;
3473 goto retry;
3474 }
3475 }
3476 pwp->wp_oprot = 0;
3477 pwp->wp_prot = 0;
3478 }
3479 }
3480
3481 /*
3482 * Force a new setup for all the watched pages in the range.
3483 */
3484 static void
3485 as_setwatchprot(struct as *as, caddr_t addr, size_t size, uint_t prot)
3486 {
3487 struct watched_page *pwp;
3488 struct watched_page tpw;
3489 caddr_t eaddr = addr + size;
3490 caddr_t vaddr;
3491 struct seg *seg;
3492 int err, retrycnt;
3493 uint_t wprot;
3494 avl_index_t where;
3495
3496 if (avl_numnodes(&as->a_wpage) == 0)
3497 return;
3498
3499 ASSERT(AS_WRITE_HELD(as));
3500
3501 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3502 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3503 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3504
3505 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3506 retrycnt = 0;
3507 vaddr = pwp->wp_vaddr;
3508
3509 wprot = prot;
3510 if (pwp->wp_read)
3511 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3512 if (pwp->wp_write)
3513 wprot &= ~PROT_WRITE;
3514 if (pwp->wp_exec)
3515 wprot &= ~(PROT_READ|PROT_WRITE|PROT_EXEC);
3516 if (!(pwp->wp_flags & WP_NOWATCH) && wprot != pwp->wp_oprot) {
3517 retry:
3518 seg = as_segat(as, vaddr);
3519 if (seg == NULL) {
3520 panic("as_setwatchprot: no seg");
3521 /*NOTREACHED*/
3522 }
3523 err = SEGOP_SETPROT(seg, vaddr, PAGESIZE, wprot);
3524 if (err == IE_RETRY) {
3525 ASSERT(retrycnt == 0);
3526 retrycnt++;
3527 goto retry;
3528 }
3529 }
3530 pwp->wp_oprot = prot;
3531 pwp->wp_prot = wprot;
3532
3533 pwp = AVL_NEXT(&as->a_wpage, pwp);
3534 }
3535 }
3536
3537 /*
3538 * Clear all of the watched pages in the range.
3539 */
3540 static void
3541 as_clearwatchprot(struct as *as, caddr_t addr, size_t size)
3542 {
3543 caddr_t eaddr = addr + size;
3544 struct watched_page *pwp;
3545 struct watched_page tpw;
3546 uint_t prot;
3547 struct seg *seg;
3548 int err, retrycnt;
3549 avl_index_t where;
3550
3551 if (avl_numnodes(&as->a_wpage) == 0)
3552 return;
3553
3554 tpw.wp_vaddr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
3555 if ((pwp = avl_find(&as->a_wpage, &tpw, &where)) == NULL)
3556 pwp = avl_nearest(&as->a_wpage, where, AVL_AFTER);
3557
3558 ASSERT(AS_WRITE_HELD(as));
3559
3560 while (pwp != NULL && pwp->wp_vaddr < eaddr) {
3561
3562 if ((prot = pwp->wp_oprot) != 0) {
3563 retrycnt = 0;
3564
3565 if (prot != pwp->wp_prot) {
3566 retry:
3567 seg = as_segat(as, pwp->wp_vaddr);
3568 if (seg == NULL)
3569 continue;
3570 err = SEGOP_SETPROT(seg, pwp->wp_vaddr,
3571 PAGESIZE, prot);
3572 if (err == IE_RETRY) {
3573 ASSERT(retrycnt == 0);
3574 retrycnt++;
3575 goto retry;
3577 }
3578 }
3579 pwp->wp_oprot = 0;
3580 pwp->wp_prot = 0;
3581 }
3582
3583 pwp = AVL_NEXT(&as->a_wpage, pwp);
3584 }
3585 }
3586
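/*
 * Post the given signal information to every process whose address space
 * is "as".  The p_as check is repeated under p_lock so that a process that
 * is switching address spaces is not signalled by mistake.
 */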
3587 void
3588 as_signal_proc(struct as *as, k_siginfo_t *siginfo)
3589 {
3590 struct proc *p;
3591
3592 mutex_enter(&pidlock);
3593 for (p = practive; p; p = p->p_next) {
3594 if (p->p_as == as) {
3595 mutex_enter(&p->p_lock);
3596 if (p->p_as == as)
3597 sigaddq(p, NULL, siginfo, KM_NOSLEEP);
3598 mutex_exit(&p->p_lock);
3599 }
3600 }
3601 mutex_exit(&pidlock);
3602 }
3603
3604 /*
3605 * Return the memory object ID for the mapping at the given address.
3606 */
3607 int
3608 as_getmemid(struct as *as, caddr_t addr, memid_t *memidp)
3609 {
3610 struct seg *seg;
3611 int sts;
3612
3613 AS_LOCK_ENTER(as, RW_READER);
3614 seg = as_segat(as, addr);
3615 if (seg == NULL) {
3616 AS_LOCK_EXIT(as);
3617 return (EFAULT);
3618 }
3619 /*
3620 * catch old drivers which may not support getmemid
3621 */
3622 if (seg->s_ops->getmemid == NULL) {
3623 AS_LOCK_EXIT(as);
3624 return (ENODEV);
3625 }
3626
3627 sts = SEGOP_GETMEMID(seg, addr, memidp);
3628
3629 AS_LOCK_EXIT(as);
3630 return (sts);
3631 }