1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 */
27
28 #include <sys/param.h>
29 #include <sys/user.h>
30 #include <sys/mman.h>
31 #include <sys/kmem.h>
32 #include <sys/sysmacros.h>
33 #include <sys/cmn_err.h>
34 #include <sys/systm.h>
35 #include <sys/tuneable.h>
36 #include <vm/hat.h>
37 #include <vm/seg.h>
38 #include <vm/as.h>
39 #include <vm/anon.h>
40 #include <vm/page.h>
41 #include <sys/buf.h>
42 #include <sys/swap.h>
43 #include <sys/atomic.h>
44 #include <vm/seg_spt.h>
45 #include <sys/debug.h>
46 #include <sys/vtrace.h>
47 #include <sys/shm.h>
48 #include <sys/shm_impl.h>
49 #include <sys/lgrp.h>
50 #include <sys/vmsystm.h>
51 #include <sys/policy.h>
52 #include <sys/project.h>
53 #include <sys/tnf_probe.h>
54 #include <sys/zone.h>
55
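/*
 * Base address at which the single spt segment is mapped in the dummy
 * address space created by sptcreate(); the dummy as is never addressed
 * directly by user code, so address zero is fine.
 */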
56 #define SEGSPTADDR (caddr_t)0x0
57
58 /*
59 * # pages used for spt
60 */
61 size_t spt_used;
62
63 /*
64  * segspt_minfree is the memory left for the system after ISM
65  * locks its pages; it is set to 5% of availrmem in sptcreate()
66  * when ISM is created. ISM should not use more than ~90% of
67  * availrmem; if it does, the performance of the system may
68  * decrease. Machines with large memories may be able to use
69  * more memory for ISM, so we set the default segspt_minfree
70  * to 5% (which gives ISM at most 95% of availrmem).
71  * If somebody wants even more memory for ISM (risking hanging
72  * the system) they can patch segspt_minfree to a smaller number.
73 */
74 volatile pgcnt_t segspt_minfree = 0;
75
76 static int segspt_create(struct seg *seg, caddr_t argsp);
77 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
78 static void segspt_free(struct seg *seg);
79 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
80 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
81
82 static void
83 segspt_badop()
84 {
85 panic("segspt_badop called");
86 /*NOTREACHED*/
87 }
88
89 #define SEGSPT_BADOP(t) (t(*)())segspt_badop
90
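/*
 * Ops vector for the spt segment that lives in the dummy address space.
 * Only unmap, free, getpolicy and inherit have non-panicking entries;
 * all user-visible operations go through segspt_shmops below, so hitting
 * any other entry here is a bug and panics via segspt_badop().
 */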
91 struct seg_ops segspt_ops = {
92 SEGSPT_BADOP(int), /* dup */
93 segspt_unmap,
94 segspt_free,
95 SEGSPT_BADOP(int), /* fault */
96 SEGSPT_BADOP(faultcode_t), /* faulta */
97 SEGSPT_BADOP(int), /* setprot */
98 SEGSPT_BADOP(int), /* checkprot */
99 SEGSPT_BADOP(int), /* kluster */
100 SEGSPT_BADOP(size_t), /* swapout */
101 SEGSPT_BADOP(int), /* sync */
102 SEGSPT_BADOP(size_t), /* incore */
103 SEGSPT_BADOP(int), /* lockop */
104 SEGSPT_BADOP(int), /* getprot */
105 SEGSPT_BADOP(u_offset_t), /* getoffset */
106 SEGSPT_BADOP(int), /* gettype */
107 SEGSPT_BADOP(int), /* getvp */
108 SEGSPT_BADOP(int), /* advise */
109 SEGSPT_BADOP(void), /* dump */
110 SEGSPT_BADOP(int), /* pagelock */
111 SEGSPT_BADOP(int), /* setpgsz */
112 SEGSPT_BADOP(int), /* getmemid */
113 segspt_getpolicy, /* getpolicy */
114 SEGSPT_BADOP(int), /* capable */
115 seg_inherit_notsup /* inherit */
116 };
117
118 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
119 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
120 static void segspt_shmfree(struct seg *seg);
121 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
122 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
123 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
124 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
125 register size_t len, register uint_t prot);
126 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
127 uint_t prot);
128 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
129 static size_t segspt_shmswapout(struct seg *seg);
130 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
131 register char *vec);
132 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
133 int attr, uint_t flags);
134 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
135 int attr, int op, ulong_t *lockmap, size_t pos);
136 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
137 uint_t *protv);
138 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
139 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
140 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
141 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
142 uint_t behav);
143 static void segspt_shmdump(struct seg *seg);
144 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
145 struct page ***, enum lock_type, enum seg_rw);
146 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
147 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
148 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
149 static int segspt_shmcapable(struct seg *, segcapability_t);
150
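/*
 * Ops vector for the per-process shm segments that attach to (and share
 * the page tables of) the underlying spt segment.
 */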
151 struct seg_ops segspt_shmops = {
152 segspt_shmdup,
153 segspt_shmunmap,
154 segspt_shmfree,
155 segspt_shmfault,
156 segspt_shmfaulta,
157 segspt_shmsetprot,
158 segspt_shmcheckprot,
159 segspt_shmkluster,
160 segspt_shmswapout,
161 segspt_shmsync,
162 segspt_shmincore,
163 segspt_shmlockop,
164 segspt_shmgetprot,
165 segspt_shmgetoffset,
166 segspt_shmgettype,
167 segspt_shmgetvp,
168 segspt_shmadvise, /* advise */
169 segspt_shmdump,
170 segspt_shmpagelock,
171 segspt_shmsetpgsz,
172 segspt_shmgetmemid,
173 segspt_shmgetpolicy,
174 segspt_shmcapable,
175 seg_inherit_notsup
176 };
177
178 static void segspt_purge(struct seg *seg);
179 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
180 enum seg_rw, int);
181 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
182 page_t **ppa);
183
184
185
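/*
 * Create the dummy address space and the spt segment that holds the
 * shared page tables for an ISM/DISM segment of 'size' bytes backed by
 * 'amp'.  On success *sptseg points at the new segment.  A rough usage
 * sketch (the surrounding shmat()-level bookkeeping is illustrative only,
 * not taken from this file):
 *
 *	struct seg *sptseg;
 *
 *	if (sptcreate(size, &sptseg, amp, prot, flags, share_szc) == 0) {
 *		... processes attach with segspt_shmattach() ...
 *		sptdestroy(sptseg->s_as, amp);
 *	}
 */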
186 /*ARGSUSED*/
187 int
188 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
189 uint_t prot, uint_t flags, uint_t share_szc)
190 {
191 int err;
192 struct as *newas;
193 struct segspt_crargs sptcargs;
194
195 #ifdef DEBUG
196 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
197 tnf_ulong, size, size );
198 #endif
199	if (segspt_minfree == 0)	/* leave min 5% of availrmem */
200		segspt_minfree = availrmem/20; /* for the system */
201
202 if (!hat_supported(HAT_SHARED_PT, (void *)0))
203 return (EINVAL);
204
205 /*
206 * get a new as for this shared memory segment
207 */
208 newas = as_alloc();
209 newas->a_proc = NULL;
210 sptcargs.amp = amp;
211 sptcargs.prot = prot;
212 sptcargs.flags = flags;
213 sptcargs.szc = share_szc;
214 /*
215 * create a shared page table (spt) segment
216 */
217
218 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
219 as_free(newas);
220 return (err);
221 }
222 *sptseg = sptcargs.seg_spt;
223 return (0);
224 }
225
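/*
 * Tear down the dummy address space created by sptcreate(): unmap the
 * spt segment (releasing all of its pages) and free the as.
 */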
226 void
227 sptdestroy(struct as *as, struct anon_map *amp)
228 {
229
230 #ifdef DEBUG
231 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
232 #endif
233 (void) as_unmap(as, SEGSPTADDR, amp->size);
234 as_free(as);
235 }
236
237 /*
238 * called from seg_free().
239 * free (i.e., unlock, unmap, return to free list)
240 * all the pages in the given seg.
241 */
242 void
243 segspt_free(struct seg *seg)
244 {
245 struct spt_data *sptd = (struct spt_data *)seg->s_data;
246
247 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
248
249 if (sptd != NULL) {
250 if (sptd->spt_realsize)
251 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
252
253 if (sptd->spt_ppa_lckcnt) {
254 kmem_free(sptd->spt_ppa_lckcnt,
255 sizeof (*sptd->spt_ppa_lckcnt)
256 * btopr(sptd->spt_amp->size));
257 }
258 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
259 cv_destroy(&sptd->spt_cv);
260 mutex_destroy(&sptd->spt_lock);
261 kmem_free(sptd, sizeof (*sptd));
262 }
263 }
264
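/*
 * ISM/DISM pages are anonymous and are never written back to a file,
 * so there is nothing for sync to do here.
 */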
265 /*ARGSUSED*/
266 static int
267 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
268 uint_t flags)
269 {
270 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
271
272 return (0);
273 }
274
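/*
 * Report residency for [addr, addr + len).  For ISM every page is
 * resident and locked by definition; for DISM we look up each anon
 * slot and consult the per-page DISM lock map (shm_vpage).
 */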
275 /*ARGSUSED*/
276 static size_t
277 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
278 {
279 caddr_t eo_seg;
280 pgcnt_t npages;
281 struct shm_data *shmd = (struct shm_data *)seg->s_data;
282 struct seg *sptseg;
283 struct spt_data *sptd;
284
285 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
286 #ifdef lint
287 seg = seg;
288 #endif
289 sptseg = shmd->shm_sptseg;
290 sptd = sptseg->s_data;
291
292 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
293 eo_seg = addr + len;
294 while (addr < eo_seg) {
295 /* page exists, and it's locked. */
296 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
297 SEG_PAGE_ANON;
298 addr += PAGESIZE;
299 }
300 return (len);
301 } else {
302 struct anon_map *amp = shmd->shm_amp;
303 struct anon *ap;
304 page_t *pp;
305 pgcnt_t anon_index;
306 struct vnode *vp;
307 u_offset_t off;
308 ulong_t i;
309 int ret;
310 anon_sync_obj_t cookie;
311
312 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
313 anon_index = seg_page(seg, addr);
314 npages = btopr(len);
315 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
316 return (EINVAL);
317 }
318		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
319 for (i = 0; i < npages; i++, anon_index++) {
320 ret = 0;
321 anon_array_enter(amp, anon_index, &cookie);
322 ap = anon_get_ptr(amp->ahp, anon_index);
323 if (ap != NULL) {
324 swap_xlate(ap, &vp, &off);
325 anon_array_exit(&cookie);
326 pp = page_lookup_nowait(vp, off, SE_SHARED);
327 if (pp != NULL) {
328 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
329 page_unlock(pp);
330 }
331 } else {
332 anon_array_exit(&cookie);
333 }
334 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
335 ret |= SEG_PAGE_LOCKED;
336 }
337 *vec++ = (char)ret;
338 }
339		ANON_LOCK_EXIT(&amp->a_rwlock);
340 return (len);
341 }
342 }
343
344 static int
345 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
346 {
347 size_t share_size;
348
349 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
350
351 /*
352 * seg.s_size may have been rounded up to the largest page size
353 * in shmat().
354	 * XXX This should be cleaned up. sptdestroy should take a length
355 * argument which should be the same as sptcreate. Then
356 * this rounding would not be needed (or is done in shm.c)
357 * Only the check for full segment will be needed.
358 *
359 * XXX -- shouldn't raddr == 0 always? These tests don't seem
360 * to be useful at all.
361 */
362 share_size = page_get_pagesize(seg->s_szc);
363 ssize = P2ROUNDUP(ssize, share_size);
364
365 if (raddr == seg->s_base && ssize == seg->s_size) {
366 seg_free(seg);
367 return (0);
368 } else
369 return (EINVAL);
370 }
371
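/*
 * Create the spt segment itself; called back from as_map() in sptcreate().
 * For ISM (!SHM_PAGEABLE) all pages are created, locked in memory and
 * loaded into the HAT up front.  For DISM (SHM_PAGEABLE) only the anon
 * array and the per-page lock-count array are set up here; pages are
 * created and locked later, on demand.
 */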
372 int
373 segspt_create(struct seg *seg, caddr_t argsp)
374 {
375 int err;
376 caddr_t addr = seg->s_base;
377 struct spt_data *sptd;
378 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
379 struct anon_map *amp = sptcargs->amp;
380 struct kshmid *sp = amp->a_sp;
381 struct cred *cred = CRED();
382 ulong_t i, j, anon_index = 0;
383 pgcnt_t npages = btopr(amp->size);
384 struct vnode *vp;
385 page_t **ppa;
386 uint_t hat_flags;
387 size_t pgsz;
388 pgcnt_t pgcnt;
389 caddr_t a;
390 pgcnt_t pidx;
391 size_t sz;
392 proc_t *procp = curproc;
393 rctl_qty_t lockedbytes = 0;
394 kproject_t *proj;
395
396 /*
397 * We are holding the a_lock on the underlying dummy as,
398 * so we can make calls to the HAT layer.
399 */
400 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
401 ASSERT(sp != NULL);
402
403 #ifdef DEBUG
404 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
405 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
406 #endif
407 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
408 if (err = anon_swap_adjust(npages))
409 return (err);
410 }
411 err = ENOMEM;
412
413 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
414 goto out1;
415
416 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
417 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
418 KM_NOSLEEP)) == NULL)
419 goto out2;
420 }
421
422 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
423
424 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
425 goto out3;
426
427 seg->s_ops = &segspt_ops;
428 sptd->spt_vp = vp;
429 sptd->spt_amp = amp;
430 sptd->spt_prot = sptcargs->prot;
431 sptd->spt_flags = sptcargs->flags;
432 seg->s_data = (caddr_t)sptd;
433 sptd->spt_ppa = NULL;
434 sptd->spt_ppa_lckcnt = NULL;
435 seg->s_szc = sptcargs->szc;
436 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
437 sptd->spt_gen = 0;
438
439	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
440 if (seg->s_szc > amp->a_szc) {
441 amp->a_szc = seg->s_szc;
442 }
443	ANON_LOCK_EXIT(&amp->a_rwlock);
444
445 /*
446 * Set policy to affect initial allocation of pages in
447 * anon_map_createpages()
448 */
449 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
450 NULL, 0, ptob(npages));
451
452 if (sptcargs->flags & SHM_PAGEABLE) {
453 size_t share_sz;
454 pgcnt_t new_npgs, more_pgs;
455 struct anon_hdr *nahp;
456 zone_t *zone;
457
458 share_sz = page_get_pagesize(seg->s_szc);
459 if (!IS_P2ALIGNED(amp->size, share_sz)) {
460 /*
461			 * We are rounding the size of the anon array up to a
462			 * share_sz (e.g. 4M) boundary because we always create
463			 * share_sz worth of pages when locking and faulting, so
464			 * we don't have to check for corner cases such as
465			 * whether there is enough space left to allocate a
466			 * full large page.
467 */
468 new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
469 more_pgs = new_npgs - npages;
470
471 /*
472 * The zone will never be NULL, as a fully created
473 * shm always has an owning zone.
474 */
475 zone = sp->shm_perm.ipc_zone_ref.zref_zone;
476 ASSERT(zone != NULL);
477 if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
478 err = ENOMEM;
479 goto out4;
480 }
481
482 nahp = anon_create(new_npgs, ANON_SLEEP);
483			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
484 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
485 ANON_SLEEP);
486 anon_release(amp->ahp, npages);
487 amp->ahp = nahp;
488 ASSERT(amp->swresv == ptob(npages));
489 amp->swresv = amp->size = ptob(new_npgs);
490			ANON_LOCK_EXIT(&amp->a_rwlock);
491 npages = new_npgs;
492 }
493
494 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
495 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
496 sptd->spt_pcachecnt = 0;
497 sptd->spt_realsize = ptob(npages);
498 sptcargs->seg_spt = seg;
499 return (0);
500 }
501
502 /*
503 * get array of pages for each anon slot in amp
504 */
505 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
506 seg, addr, S_CREATE, cred)) != 0)
507 goto out4;
508
509 mutex_enter(&sp->shm_mlock);
510
511	/* May be partially locked, so count bytes to charge for locking */
512 for (i = 0; i < npages; i++)
513 if (ppa[i]->p_lckcnt == 0)
514 lockedbytes += PAGESIZE;
515
516 proj = sp->shm_perm.ipc_proj;
517
518 if (lockedbytes > 0) {
519 mutex_enter(&procp->p_lock);
520 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
521 mutex_exit(&procp->p_lock);
522 mutex_exit(&sp->shm_mlock);
523 for (i = 0; i < npages; i++)
524 page_unlock(ppa[i]);
525 err = ENOMEM;
526 goto out4;
527 }
528 mutex_exit(&procp->p_lock);
529 }
530
531 /*
532 * addr is initial address corresponding to the first page on ppa list
533 */
534 for (i = 0; i < npages; i++) {
535 /* attempt to lock all pages */
536 if (page_pp_lock(ppa[i], 0, 1) == 0) {
537 /*
538 * if unable to lock any page, unlock all
539 * of them and return error
540 */
541 for (j = 0; j < i; j++)
542 page_pp_unlock(ppa[j], 0, 1);
543 for (i = 0; i < npages; i++)
544 page_unlock(ppa[i]);
545 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
546 mutex_exit(&sp->shm_mlock);
547 err = ENOMEM;
548 goto out4;
549 }
550 }
551 mutex_exit(&sp->shm_mlock);
552
553 /*
554 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
555	 * for the entire life of the segment, for example platforms
556 * that do not support Dynamic Reconfiguration.
557 */
558 hat_flags = HAT_LOAD_SHARE;
559 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
560 hat_flags |= HAT_LOAD_LOCK;
561
562 /*
563	 * Load translations one large page at a time
564 * to make sure we don't create mappings bigger than
565 * segment's size code in case underlying pages
566 * are shared with segvn's segment that uses bigger
567 * size code than we do.
568 */
569 pgsz = page_get_pagesize(seg->s_szc);
570 pgcnt = page_get_pagecnt(seg->s_szc);
571 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
572 sz = MIN(pgsz, ptob(npages - pidx));
573 hat_memload_array(seg->s_as->a_hat, a, sz,
574 &ppa[pidx], sptd->spt_prot, hat_flags);
575 }
576
577 /*
578 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
579 * we will leave the pages locked SE_SHARED for the life
580 * of the ISM segment. This will prevent any calls to
581 * hat_pageunload() on this ISM segment for those platforms.
582 */
583 if (!(hat_flags & HAT_LOAD_LOCK)) {
584 /*
585 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
586 * we no longer need to hold the SE_SHARED lock on the pages,
587 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
588 * SE_SHARED lock on the pages as necessary.
589 */
590 for (i = 0; i < npages; i++)
591 page_unlock(ppa[i]);
592 }
593 sptd->spt_pcachecnt = 0;
594 kmem_free(ppa, ((sizeof (page_t *)) * npages));
595 sptd->spt_realsize = ptob(npages);
596 atomic_add_long(&spt_used, npages);
597 sptcargs->seg_spt = seg;
598 return (0);
599
600 out4:
601 seg->s_data = NULL;
602 kmem_free(vp, sizeof (*vp));
603 cv_destroy(&sptd->spt_cv);
604 out3:
605 mutex_destroy(&sptd->spt_lock);
606 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
607 kmem_free(ppa, (sizeof (*ppa) * npages));
608 out2:
609 kmem_free(sptd, sizeof (*sptd));
610 out1:
611 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
612 anon_swap_restore(npages);
613 return (err);
614 }
615
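/*
 * Unload, unlock and free all pages backing the spt segment.  For ISM
 * this also gives back the locked-memory rctl accounting and the swap
 * reserved by anon_swap_adjust() at create time.
 */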
616 /*ARGSUSED*/
617 void
618 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
619 {
620 struct page *pp;
621 struct spt_data *sptd = (struct spt_data *)seg->s_data;
622 pgcnt_t npages;
623 ulong_t anon_idx;
624 struct anon_map *amp;
625 struct anon *ap;
626 struct vnode *vp;
627 u_offset_t off;
628 uint_t hat_flags;
629 int root = 0;
630 pgcnt_t pgs, curnpgs = 0;
631 page_t *rootpp;
632 rctl_qty_t unlocked_bytes = 0;
633 kproject_t *proj;
634 kshmid_t *sp;
635
636 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
637
638 len = P2ROUNDUP(len, PAGESIZE);
639
640 npages = btop(len);
641
642 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
643 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
644 (sptd->spt_flags & SHM_PAGEABLE)) {
645 hat_flags = HAT_UNLOAD_UNMAP;
646 }
647
648 hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
649
650 amp = sptd->spt_amp;
651 if (sptd->spt_flags & SHM_PAGEABLE)
652 npages = btop(amp->size);
653
654 ASSERT(amp != NULL);
655
656 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
657 sp = amp->a_sp;
658 proj = sp->shm_perm.ipc_proj;
659 mutex_enter(&sp->shm_mlock);
660 }
661 for (anon_idx = 0; anon_idx < npages; anon_idx++) {
662 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
663 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
664 panic("segspt_free_pages: null app");
665 /*NOTREACHED*/
666 }
667 } else {
668 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
669 == NULL)
670 continue;
671 }
672 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
673 swap_xlate(ap, &vp, &off);
674
675 /*
676 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
677		 * the pages won't be holding the SE_SHARED lock at this
678 * point.
679 *
680 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
681 * the pages are still held SE_SHARED locked from the
682 * original segspt_create()
683 *
684 * Our goal is to get SE_EXCL lock on each page, remove
685 * permanent lock on it and invalidate the page.
686 */
687 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
688 if (hat_flags == HAT_UNLOAD_UNMAP)
689 pp = page_lookup(vp, off, SE_EXCL);
690 else {
691 if ((pp = page_find(vp, off)) == NULL) {
692 panic("segspt_free_pages: "
693 "page not locked");
694 /*NOTREACHED*/
695 }
696 if (!page_tryupgrade(pp)) {
697 page_unlock(pp);
698 pp = page_lookup(vp, off, SE_EXCL);
699 }
700 }
701 if (pp == NULL) {
702 panic("segspt_free_pages: "
703 "page not in the system");
704 /*NOTREACHED*/
705 }
706 ASSERT(pp->p_lckcnt > 0);
707 page_pp_unlock(pp, 0, 1);
708 if (pp->p_lckcnt == 0)
709 unlocked_bytes += PAGESIZE;
710 } else {
711 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
712 continue;
713 }
714 /*
715 * It's logical to invalidate the pages here as in most cases
716 * these were created by segspt.
717 */
718 if (pp->p_szc != 0) {
719 if (root == 0) {
720 ASSERT(curnpgs == 0);
721 root = 1;
722 rootpp = pp;
723 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
724 ASSERT(pgs > 1);
725 ASSERT(IS_P2ALIGNED(pgs, pgs));
726 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
727 curnpgs--;
728 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
729 ASSERT(curnpgs == 1);
730 ASSERT(page_pptonum(pp) ==
731 page_pptonum(rootpp) + (pgs - 1));
732 page_destroy_pages(rootpp);
733 root = 0;
734 curnpgs = 0;
735 } else {
736 ASSERT(curnpgs > 1);
737 ASSERT(page_pptonum(pp) ==
738 page_pptonum(rootpp) + (pgs - curnpgs));
739 curnpgs--;
740 }
741 } else {
742 if (root != 0 || curnpgs != 0) {
743 panic("segspt_free_pages: bad large page");
744 /*NOTREACHED*/
745 }
746 /*
747 * Before destroying the pages, we need to take care
748 * of the rctl locked memory accounting. For that
749			 * we need to calculate the unlocked_bytes.
750 */
751 if (pp->p_lckcnt > 0)
752 unlocked_bytes += PAGESIZE;
753 /*LINTED: constant in conditional context */
754 VN_DISPOSE(pp, B_INVAL, 0, kcred);
755 }
756 }
757 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
758 if (unlocked_bytes > 0)
759 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
760 mutex_exit(&sp->shm_mlock);
761 }
762 if (root != 0 || curnpgs != 0) {
763 panic("segspt_free_pages: bad large page");
764 /*NOTREACHED*/
765 }
766
767 /*
768 * mark that pages have been released
769 */
770 sptd->spt_realsize = 0;
771
772 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
773 atomic_add_long(&spt_used, -npages);
774 anon_swap_restore(npages);
775 }
776 }
777
778 /*
779 * Get memory allocation policy info for specified address in given segment
780 */
781 static lgrp_mem_policy_info_t *
782 segspt_getpolicy(struct seg *seg, caddr_t addr)
783 {
784 struct anon_map *amp;
785 ulong_t anon_index;
786 lgrp_mem_policy_info_t *policy_info;
787 struct spt_data *spt_data;
788
789 ASSERT(seg != NULL);
790
791 /*
792 * Get anon_map from segspt
793 *
794	 * Assume that no lock needs to be held on the anon_map, since
795	 * it should be protected by its reference count, which must be
796	 * nonzero for an existing segment.
797	 * We do need to grab the readers lock on the policy tree, though.
798 */
799 spt_data = (struct spt_data *)seg->s_data;
800 if (spt_data == NULL)
801 return (NULL);
802 amp = spt_data->spt_amp;
803 ASSERT(amp->refcnt != 0);
804
805 /*
806 * Get policy info
807 *
808 * Assume starting anon index of 0
809 */
810 anon_index = seg_page(seg, addr);
811 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
812
813 return (policy_info);
814 }
815
816 /*
817 * DISM only.
818 * Return locked pages over a given range.
819 *
820 * We will cache all DISM locked pages and save the pplist for the
821 * entire segment in the ppa field of the underlying DISM segment structure.
822 * Later, during a call to segspt_reclaim() we will use this ppa array
823 * to page_unlock() all of the pages and then we will free this ppa list.
824 */
825 /*ARGSUSED*/
826 static int
827 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
828 struct page ***ppp, enum lock_type type, enum seg_rw rw)
829 {
830 struct shm_data *shmd = (struct shm_data *)seg->s_data;
831 struct seg *sptseg = shmd->shm_sptseg;
832 struct spt_data *sptd = sptseg->s_data;
833 pgcnt_t pg_idx, npages, tot_npages, npgs;
834 struct page **pplist, **pl, **ppa, *pp;
835 struct anon_map *amp;
836 spgcnt_t an_idx;
837 int ret = ENOTSUP;
838 uint_t pl_built = 0;
839 struct anon *ap;
840 struct vnode *vp;
841 u_offset_t off;
842 pgcnt_t claim_availrmem = 0;
843 uint_t szc;
844
845 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
846 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
847
848 /*
849 * We want to lock/unlock the entire ISM segment. Therefore,
850	 * we will be using the underlying sptseg and its base address
851 * and length for the caching arguments.
852 */
853 ASSERT(sptseg);
854 ASSERT(sptd);
855
856 pg_idx = seg_page(seg, addr);
857 npages = btopr(len);
858
859 /*
860 * check if the request is larger than number of pages covered
861 * by amp
862 */
863 if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
864 *ppp = NULL;
865 return (ENOTSUP);
866 }
867
868 if (type == L_PAGEUNLOCK) {
869 ASSERT(sptd->spt_ppa != NULL);
870
871 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
872 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
873
874 /*
875 * If someone is blocked while unmapping, we purge
876 * segment page cache and thus reclaim pplist synchronously
877 * without waiting for seg_pasync_thread. This speeds up
878 * unmapping in cases where munmap(2) is called, while
879 * raw async i/o is still in progress or where a thread
880 * exits on data fault in a multithreaded application.
881 */
882 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
883 (AS_ISUNMAPWAIT(seg->s_as) &&
884 shmd->shm_softlockcnt > 0)) {
885 segspt_purge(seg);
886 }
887 return (0);
888 }
889
890 /* The L_PAGELOCK case ... */
891
892 if (sptd->spt_flags & DISM_PPA_CHANGED) {
893 segspt_purge(seg);
894 /*
895		 * For DISM the ppa array needs to be rebuilt, since
896		 * the number of locked pages could have changed.
897 */
898 *ppp = NULL;
899 return (ENOTSUP);
900 }
901
902 /*
903 * First try to find pages in segment page cache, without
904 * holding the segment lock.
905 */
906 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
907 S_WRITE, SEGP_FORCE_WIRED);
908 if (pplist != NULL) {
909 ASSERT(sptd->spt_ppa != NULL);
910 ASSERT(sptd->spt_ppa == pplist);
911 ppa = sptd->spt_ppa;
912 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
913 if (ppa[an_idx] == NULL) {
914 seg_pinactive(seg, NULL, seg->s_base,
915 sptd->spt_amp->size, ppa,
916 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
917 *ppp = NULL;
918 return (ENOTSUP);
919 }
920 if ((szc = ppa[an_idx]->p_szc) != 0) {
921 npgs = page_get_pagecnt(szc);
922 an_idx = P2ROUNDUP(an_idx + 1, npgs);
923 } else {
924 an_idx++;
925 }
926 }
927 /*
928 * Since we cache the entire DISM segment, we want to
929 * set ppp to point to the first slot that corresponds
930 * to the requested addr, i.e. pg_idx.
931 */
932 *ppp = &(sptd->spt_ppa[pg_idx]);
933 return (0);
934 }
935
936 mutex_enter(&sptd->spt_lock);
937 /*
938 * try to find pages in segment page cache with mutex
939 */
940 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
941 S_WRITE, SEGP_FORCE_WIRED);
942 if (pplist != NULL) {
943 ASSERT(sptd->spt_ppa != NULL);
944 ASSERT(sptd->spt_ppa == pplist);
945 ppa = sptd->spt_ppa;
946 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
947 if (ppa[an_idx] == NULL) {
948 mutex_exit(&sptd->spt_lock);
949 seg_pinactive(seg, NULL, seg->s_base,
950 sptd->spt_amp->size, ppa,
951 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
952 *ppp = NULL;
953 return (ENOTSUP);
954 }
955 if ((szc = ppa[an_idx]->p_szc) != 0) {
956 npgs = page_get_pagecnt(szc);
957 an_idx = P2ROUNDUP(an_idx + 1, npgs);
958 } else {
959 an_idx++;
960 }
961 }
962 /*
963 * Since we cache the entire DISM segment, we want to
964 * set ppp to point to the first slot that corresponds
965 * to the requested addr, i.e. pg_idx.
966 */
967 mutex_exit(&sptd->spt_lock);
968 *ppp = &(sptd->spt_ppa[pg_idx]);
969 return (0);
970 }
971 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
972 SEGP_FORCE_WIRED) == SEGP_FAIL) {
973 mutex_exit(&sptd->spt_lock);
974 *ppp = NULL;
975 return (ENOTSUP);
976 }
977
978 /*
979 * No need to worry about protections because DISM pages are always rw.
980 */
981 pl = pplist = NULL;
982 amp = sptd->spt_amp;
983
984 /*
985 * Do we need to build the ppa array?
986 */
987 if (sptd->spt_ppa == NULL) {
988 pgcnt_t lpg_cnt = 0;
989
990 pl_built = 1;
991 tot_npages = btopr(sptd->spt_amp->size);
992
993 ASSERT(sptd->spt_pcachecnt == 0);
994 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
995 pl = pplist;
996
997		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
998 for (an_idx = 0; an_idx < tot_npages; ) {
999 ap = anon_get_ptr(amp->ahp, an_idx);
1000 /*
1001			 * Cache only mlocked pages. For large pages,
1002			 * if one (constituent) page is mlocked then
1003			 * all pages of that large page are cached
1004			 * as well. This allows for quick lookups
1005			 * in the ppa array.
1006 */
1007 if ((ap != NULL) && (lpg_cnt != 0 ||
1008 (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1009
1010 swap_xlate(ap, &vp, &off);
1011 pp = page_lookup(vp, off, SE_SHARED);
1012 ASSERT(pp != NULL);
1013 if (lpg_cnt == 0) {
1014 lpg_cnt++;
1015 /*
1016 * For a small page, we are done --
1017					 * lpg_cnt is reset to 0 below.
1018 *
1019 * For a large page, we are guaranteed
1020 * to find the anon structures of all
1021 * constituent pages and a non-zero
1022 * lpg_cnt ensures that we don't test
1023 * for mlock for these. We are done
1024					 * when lpg_cnt reaches (npgs + 1).
1025 * If we are not the first constituent
1026 * page, restart at the first one.
1027 */
1028 npgs = page_get_pagecnt(pp->p_szc);
1029 if (!IS_P2ALIGNED(an_idx, npgs)) {
1030 an_idx = P2ALIGN(an_idx, npgs);
1031 page_unlock(pp);
1032 continue;
1033 }
1034 }
1035 if (++lpg_cnt > npgs)
1036 lpg_cnt = 0;
1037
1038 /*
1039 * availrmem is decremented only
1040 * for unlocked pages
1041 */
1042 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1043 claim_availrmem++;
1044 pplist[an_idx] = pp;
1045 }
1046 an_idx++;
1047 }
1048		ANON_LOCK_EXIT(&amp->a_rwlock);
1049
1050 if (claim_availrmem) {
1051 mutex_enter(&freemem_lock);
1052 if (availrmem < tune.t_minarmem + claim_availrmem) {
1053 mutex_exit(&freemem_lock);
1054 ret = ENOTSUP;
1055 claim_availrmem = 0;
1056 goto insert_fail;
1057 } else {
1058 availrmem -= claim_availrmem;
1059 }
1060 mutex_exit(&freemem_lock);
1061 }
1062
1063 sptd->spt_ppa = pl;
1064 } else {
1065 /*
1066 * We already have a valid ppa[].
1067 */
1068 pl = sptd->spt_ppa;
1069 }
1070
1071 ASSERT(pl != NULL);
1072
1073 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1074 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1075 segspt_reclaim);
1076 if (ret == SEGP_FAIL) {
1077 /*
1078 * seg_pinsert failed. We return
1079 * ENOTSUP, so that the as_pagelock() code will
1080 * then try the slower F_SOFTLOCK path.
1081 */
1082 if (pl_built) {
1083 /*
1084 * No one else has referenced the ppa[].
1085 * We created it and we need to destroy it.
1086 */
1087 sptd->spt_ppa = NULL;
1088 }
1089 ret = ENOTSUP;
1090 goto insert_fail;
1091 }
1092
1093 /*
1094 * In either case, we increment softlockcnt on the 'real' segment.
1095 */
1096 sptd->spt_pcachecnt++;
1097 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1098
1099 ppa = sptd->spt_ppa;
1100 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1101 if (ppa[an_idx] == NULL) {
1102 mutex_exit(&sptd->spt_lock);
1103 seg_pinactive(seg, NULL, seg->s_base,
1104 sptd->spt_amp->size,
1105 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1106 *ppp = NULL;
1107 return (ENOTSUP);
1108 }
1109 if ((szc = ppa[an_idx]->p_szc) != 0) {
1110 npgs = page_get_pagecnt(szc);
1111 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1112 } else {
1113 an_idx++;
1114 }
1115 }
1116 /*
1117 * We can now drop the sptd->spt_lock since the ppa[]
1118	 * exists and we have incremented pcachecnt.
1119 */
1120 mutex_exit(&sptd->spt_lock);
1121
1122 /*
1123 * Since we cache the entire segment, we want to
1124 * set ppp to point to the first slot that corresponds
1125 * to the requested addr, i.e. pg_idx.
1126 */
1127 *ppp = &(sptd->spt_ppa[pg_idx]);
1128 return (0);
1129
1130 insert_fail:
1131 /*
1132 * We will only reach this code if we tried and failed.
1133 *
1134 * And we can drop the lock on the dummy seg, once we've failed
1135 * to set up a new ppa[].
1136 */
1137 mutex_exit(&sptd->spt_lock);
1138
1139 if (pl_built) {
1140 if (claim_availrmem) {
1141 mutex_enter(&freemem_lock);
1142 availrmem += claim_availrmem;
1143 mutex_exit(&freemem_lock);
1144 }
1145
1146 /*
1147 * We created pl and we need to destroy it.
1148 */
1149 pplist = pl;
1150 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1151 if (pplist[an_idx] != NULL)
1152 page_unlock(pplist[an_idx]);
1153 }
1154 kmem_free(pl, sizeof (page_t *) * tot_npages);
1155 }
1156
1157 if (shmd->shm_softlockcnt <= 0) {
1158 if (AS_ISUNMAPWAIT(seg->s_as)) {
1159 mutex_enter(&seg->s_as->a_contents);
1160 if (AS_ISUNMAPWAIT(seg->s_as)) {
1161 AS_CLRUNMAPWAIT(seg->s_as);
1162 cv_broadcast(&seg->s_as->a_cv);
1163 }
1164 mutex_exit(&seg->s_as->a_contents);
1165 }
1166 }
1167 *ppp = NULL;
1168 return (ret);
1169 }
1170
1171
1172
1173 /*
1174 * return locked pages over a given range.
1175 *
1176 * We will cache the entire ISM segment and save the pplist for the
1177 * entire segment in the ppa field of the underlying ISM segment structure.
1178 * Later, during a call to segspt_reclaim() we will use this ppa array
1179 * to page_unlock() all of the pages and then we will free this ppa list.
1180 */
1181 /*ARGSUSED*/
1182 static int
1183 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1184 struct page ***ppp, enum lock_type type, enum seg_rw rw)
1185 {
1186 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1187 struct seg *sptseg = shmd->shm_sptseg;
1188 struct spt_data *sptd = sptseg->s_data;
1189 pgcnt_t np, page_index, npages;
1190 caddr_t a, spt_base;
1191 struct page **pplist, **pl, *pp;
1192 struct anon_map *amp;
1193 ulong_t anon_index;
1194 int ret = ENOTSUP;
1195 uint_t pl_built = 0;
1196 struct anon *ap;
1197 struct vnode *vp;
1198 u_offset_t off;
1199
1200 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1201 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1202
1203
1204 /*
1205 * We want to lock/unlock the entire ISM segment. Therefore,
1206	 * we will be using the underlying sptseg and its base address
1207 * and length for the caching arguments.
1208 */
1209 ASSERT(sptseg);
1210 ASSERT(sptd);
1211
1212 if (sptd->spt_flags & SHM_PAGEABLE) {
1213 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1214 }
1215
1216 page_index = seg_page(seg, addr);
1217 npages = btopr(len);
1218
1219 /*
1220 * check if the request is larger than number of pages covered
1221 * by amp
1222 */
1223 if (page_index + npages > btopr(sptd->spt_amp->size)) {
1224 *ppp = NULL;
1225 return (ENOTSUP);
1226 }
1227
1228 if (type == L_PAGEUNLOCK) {
1229
1230 ASSERT(sptd->spt_ppa != NULL);
1231
1232 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1233 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1234
1235 /*
1236 * If someone is blocked while unmapping, we purge
1237 * segment page cache and thus reclaim pplist synchronously
1238 * without waiting for seg_pasync_thread. This speeds up
1239 * unmapping in cases where munmap(2) is called, while
1240 * raw async i/o is still in progress or where a thread
1241 * exits on data fault in a multithreaded application.
1242 */
1243 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1244 segspt_purge(seg);
1245 }
1246 return (0);
1247 }
1248
1249 /* The L_PAGELOCK case... */
1250
1251 /*
1252 * First try to find pages in segment page cache, without
1253 * holding the segment lock.
1254 */
1255 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1256 S_WRITE, SEGP_FORCE_WIRED);
1257 if (pplist != NULL) {
1258 ASSERT(sptd->spt_ppa == pplist);
1259 ASSERT(sptd->spt_ppa[page_index]);
1260 /*
1261 * Since we cache the entire ISM segment, we want to
1262 * set ppp to point to the first slot that corresponds
1263 * to the requested addr, i.e. page_index.
1264 */
1265 *ppp = &(sptd->spt_ppa[page_index]);
1266 return (0);
1267 }
1268
1269 mutex_enter(&sptd->spt_lock);
1270
1271 /*
1272 * try to find pages in segment page cache
1273 */
1274 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1275 S_WRITE, SEGP_FORCE_WIRED);
1276 if (pplist != NULL) {
1277 ASSERT(sptd->spt_ppa == pplist);
1278 /*
1279 * Since we cache the entire segment, we want to
1280 * set ppp to point to the first slot that corresponds
1281 * to the requested addr, i.e. page_index.
1282 */
1283 mutex_exit(&sptd->spt_lock);
1284 *ppp = &(sptd->spt_ppa[page_index]);
1285 return (0);
1286 }
1287
1288 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1289 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1290 mutex_exit(&sptd->spt_lock);
1291 *ppp = NULL;
1292 return (ENOTSUP);
1293 }
1294
1295 /*
1296 * No need to worry about protections because ISM pages
1297 * are always rw.
1298 */
1299 pl = pplist = NULL;
1300
1301 /*
1302 * Do we need to build the ppa array?
1303 */
1304 if (sptd->spt_ppa == NULL) {
1305 ASSERT(sptd->spt_ppa == pplist);
1306
1307 spt_base = sptseg->s_base;
1308 pl_built = 1;
1309
1310 /*
1311 * availrmem is decremented once during anon_swap_adjust()
1312 * and is incremented during the anon_unresv(), which is
1313 * called from shm_rm_amp() when the segment is destroyed.
1314 */
1315 amp = sptd->spt_amp;
1316 ASSERT(amp != NULL);
1317
1318 /* pcachecnt is protected by sptd->spt_lock */
1319 ASSERT(sptd->spt_pcachecnt == 0);
1320 pplist = kmem_zalloc(sizeof (page_t *)
1321 * btopr(sptd->spt_amp->size), KM_SLEEP);
1322 pl = pplist;
1323
1324 anon_index = seg_page(sptseg, spt_base);
1325
1326		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1327 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1328 a += PAGESIZE, anon_index++, pplist++) {
1329 ap = anon_get_ptr(amp->ahp, anon_index);
1330 ASSERT(ap != NULL);
1331 swap_xlate(ap, &vp, &off);
1332 pp = page_lookup(vp, off, SE_SHARED);
1333 ASSERT(pp != NULL);
1334 *pplist = pp;
1335 }
1336		ANON_LOCK_EXIT(&amp->a_rwlock);
1337
1338 if (a < (spt_base + sptd->spt_amp->size)) {
1339 ret = ENOTSUP;
1340 goto insert_fail;
1341 }
1342 sptd->spt_ppa = pl;
1343 } else {
1344 /*
1345 * We already have a valid ppa[].
1346 */
1347 pl = sptd->spt_ppa;
1348 }
1349
1350 ASSERT(pl != NULL);
1351
1352 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1353 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1354 segspt_reclaim);
1355 if (ret == SEGP_FAIL) {
1356 /*
1357 * seg_pinsert failed. We return
1358 * ENOTSUP, so that the as_pagelock() code will
1359 * then try the slower F_SOFTLOCK path.
1360 */
1361 if (pl_built) {
1362 /*
1363 * No one else has referenced the ppa[].
1364 * We created it and we need to destroy it.
1365 */
1366 sptd->spt_ppa = NULL;
1367 }
1368 ret = ENOTSUP;
1369 goto insert_fail;
1370 }
1371
1372 /*
1373 * In either case, we increment softlockcnt on the 'real' segment.
1374 */
1375 sptd->spt_pcachecnt++;
1376 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1377
1378 /*
1379 * We can now drop the sptd->spt_lock since the ppa[]
1380	 * exists and we have incremented pcachecnt.
1381 */
1382 mutex_exit(&sptd->spt_lock);
1383
1384 /*
1385 * Since we cache the entire segment, we want to
1386 * set ppp to point to the first slot that corresponds
1387 * to the requested addr, i.e. page_index.
1388 */
1389 *ppp = &(sptd->spt_ppa[page_index]);
1390 return (0);
1391
1392 insert_fail:
1393 /*
1394 * We will only reach this code if we tried and failed.
1395 *
1396 * And we can drop the lock on the dummy seg, once we've failed
1397 * to set up a new ppa[].
1398 */
1399 mutex_exit(&sptd->spt_lock);
1400
1401 if (pl_built) {
1402 /*
1403 * We created pl and we need to destroy it.
1404 */
1405 pplist = pl;
1406 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1407 while (np) {
1408 page_unlock(*pplist);
1409 np--;
1410 pplist++;
1411 }
1412 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1413 }
1414 if (shmd->shm_softlockcnt <= 0) {
1415 if (AS_ISUNMAPWAIT(seg->s_as)) {
1416 mutex_enter(&seg->s_as->a_contents);
1417 if (AS_ISUNMAPWAIT(seg->s_as)) {
1418 AS_CLRUNMAPWAIT(seg->s_as);
1419 cv_broadcast(&seg->s_as->a_cv);
1420 }
1421 mutex_exit(&seg->s_as->a_contents);
1422 }
1423 }
1424 *ppp = NULL;
1425 return (ret);
1426 }
1427
1428 /*
1429 * purge any cached pages in the I/O page cache
1430 */
1431 static void
1432 segspt_purge(struct seg *seg)
1433 {
1434 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1435 }
1436
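/*
 * seg_pcache reclaim callback for the cached ppa[] list.  When the last
 * pcache reference goes away we unlock every cached page, return the
 * availrmem claimed for unlocked DISM pages, free the ppa[] array and
 * broadcast on spt_cv.  In all cases the caller's softlockcnt is dropped
 * and any unmap waiters are woken up.
 */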
1437 static int
1438 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1439 enum seg_rw rw, int async)
1440 {
1441 struct seg *seg = (struct seg *)ptag;
1442 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1443 struct seg *sptseg;
1444 struct spt_data *sptd;
1445 pgcnt_t npages, i, free_availrmem = 0;
1446 int done = 0;
1447
1448 #ifdef lint
1449 addr = addr;
1450 #endif
1451 sptseg = shmd->shm_sptseg;
1452 sptd = sptseg->s_data;
1453 npages = (len >> PAGESHIFT);
1454 ASSERT(npages);
1455 ASSERT(sptd->spt_pcachecnt != 0);
1456 ASSERT(sptd->spt_ppa == pplist);
1457 ASSERT(npages == btopr(sptd->spt_amp->size));
1458 ASSERT(async || AS_LOCK_HELD(seg->s_as));
1459
1460 /*
1461 * Acquire the lock on the dummy seg and destroy the
1462 * ppa array IF this is the last pcachecnt.
1463 */
1464 mutex_enter(&sptd->spt_lock);
1465 if (--sptd->spt_pcachecnt == 0) {
1466 for (i = 0; i < npages; i++) {
1467 if (pplist[i] == NULL) {
1468 continue;
1469 }
1470 if (rw == S_WRITE) {
1471 hat_setrefmod(pplist[i]);
1472 } else {
1473 hat_setref(pplist[i]);
1474 }
1475 if ((sptd->spt_flags & SHM_PAGEABLE) &&
1476 (sptd->spt_ppa_lckcnt[i] == 0))
1477 free_availrmem++;
1478 page_unlock(pplist[i]);
1479 }
1480 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1481 mutex_enter(&freemem_lock);
1482 availrmem += free_availrmem;
1483 mutex_exit(&freemem_lock);
1484 }
1485 /*
1486		 * Since we want to cache/uncache the entire ISM segment,
1487 * we will track the pplist in a segspt specific field
1488 * ppa, that is initialized at the time we add an entry to
1489 * the cache.
1490 */
1491 ASSERT(sptd->spt_pcachecnt == 0);
1492 kmem_free(pplist, sizeof (page_t *) * npages);
1493 sptd->spt_ppa = NULL;
1494 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1495 sptd->spt_gen++;
1496 cv_broadcast(&sptd->spt_cv);
1497 done = 1;
1498 }
1499 mutex_exit(&sptd->spt_lock);
1500
1501 /*
1502 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1503 * may not hold AS lock (in this case async argument is not 0). This
1504	 * means that if softlockcnt drops to 0 after the decrement below, the
1505	 * address space may get freed. We can't allow that since after the
1506	 * softlock decrement to 0 we still need to access the as for possible
1507 * wakeup of unmap waiters. To prevent the disappearance of as we take
1508 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1509 * this mutex as a barrier to make sure this routine completes before
1510 * segment is freed.
1511 *
1512 * The second complication we have to deal with in async case is a
1513 * possibility of missed wake up of unmap wait thread. When we don't
1514 * hold as lock here we may take a_contents lock before unmap wait
1515 * thread that was first to see softlockcnt was still not 0. As a
1516 * result we'll fail to wake up an unmap wait thread. To avoid this
1517 * race we set nounmapwait flag in as structure if we drop softlockcnt
1518	 * to 0 and async is not 0. The unmapwait thread
1519 * will not block if this flag is set.
1520 */
1521 if (async)
1522 mutex_enter(&shmd->shm_segfree_syncmtx);
1523
1524 /*
1525 * Now decrement softlockcnt.
1526 */
1527 ASSERT(shmd->shm_softlockcnt > 0);
1528 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1529
1530 if (shmd->shm_softlockcnt <= 0) {
1531 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1532 mutex_enter(&seg->s_as->a_contents);
1533 if (async)
1534 AS_SETNOUNMAPWAIT(seg->s_as);
1535 if (AS_ISUNMAPWAIT(seg->s_as)) {
1536 AS_CLRUNMAPWAIT(seg->s_as);
1537 cv_broadcast(&seg->s_as->a_cv);
1538 }
1539 mutex_exit(&seg->s_as->a_contents);
1540 }
1541 }
1542
1543 if (async)
1544 mutex_exit(&shmd->shm_segfree_syncmtx);
1545
1546 return (done);
1547 }
1548
1549 /*
1550 * Do a F_SOFTUNLOCK call over the range requested.
1551 * The range must have already been F_SOFTLOCK'ed.
1552 *
1553 * The calls to acquire and release the anon map lock mutex were
1554 * removed in order to avoid a deadly embrace during a DR
1555  * memory delete operation. (E.g. DR blocks while waiting for an
1556 * exclusive lock on a page that is being used for kaio; the
1557 * thread that will complete the kaio and call segspt_softunlock
1558 * blocks on the anon map lock; another thread holding the anon
1559 * map lock blocks on another page lock via the segspt_shmfault
1560 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1561 *
1562 * The appropriateness of the removal is based upon the following:
1563 * 1. If we are holding a segment's reader lock and the page is held
1564 * shared, then the corresponding element in anonmap which points to
1565 * anon struct cannot change and there is no need to acquire the
1566 * anonymous map lock.
1567 * 2. Threads in segspt_softunlock have a reader lock on the segment
1568 * and already have the shared page lock, so we are guaranteed that
1569 * the anon map slot cannot change and therefore can call anon_get_ptr()
1570 * without grabbing the anonymous map lock.
1571 * 3. Threads that softlock a shared page break copy-on-write, even if
1572  *    it's a read. Thus cow faults can be ignored with respect to soft
1573 * unlocking, since the breaking of cow means that the anon slot(s) will
1574 * not be shared.
1575 */
1576 static void
1577 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1578 size_t len, enum seg_rw rw)
1579 {
1580 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1581 struct seg *sptseg;
1582 struct spt_data *sptd;
1583 page_t *pp;
1584 caddr_t adr;
1585 struct vnode *vp;
1586 u_offset_t offset;
1587 ulong_t anon_index;
1588 struct anon_map *amp; /* XXX - for locknest */
1589 struct anon *ap = NULL;
1590 pgcnt_t npages;
1591
1592 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1593
1594 sptseg = shmd->shm_sptseg;
1595 sptd = sptseg->s_data;
1596
1597 /*
1598 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1599 * and therefore their pages are SE_SHARED locked
1600 * for the entire life of the segment.
1601 */
1602 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1603 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1604 goto softlock_decrement;
1605 }
1606
1607 /*
1608 * Any thread is free to do a page_find and
1609 * page_unlock() on the pages within this seg.
1610 *
1611 * We are already holding the as->a_lock on the user's
1612 * real segment, but we need to hold the a_lock on the
1613 * underlying dummy as. This is mostly to satisfy the
1614 * underlying HAT layer.
1615 */
1616 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1617 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1618 AS_LOCK_EXIT(sptseg->s_as);
1619
1620 amp = sptd->spt_amp;
1621 ASSERT(amp != NULL);
1622 anon_index = seg_page(sptseg, sptseg_addr);
1623
1624 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1625 ap = anon_get_ptr(amp->ahp, anon_index++);
1626 ASSERT(ap != NULL);
1627 swap_xlate(ap, &vp, &offset);
1628
1629 /*
1630 * Use page_find() instead of page_lookup() to
1631 * find the page since we know that it has a
1632 * "shared" lock.
1633 */
1634 pp = page_find(vp, offset);
1635 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1636 if (pp == NULL) {
1637 panic("segspt_softunlock: "
1638 "addr %p, ap %p, vp %p, off %llx",
1639 (void *)adr, (void *)ap, (void *)vp, offset);
1640 /*NOTREACHED*/
1641 }
1642
1643 if (rw == S_WRITE) {
1644 hat_setrefmod(pp);
1645 } else if (rw != S_OTHER) {
1646 hat_setref(pp);
1647 }
1648 page_unlock(pp);
1649 }
1650
1651 softlock_decrement:
1652 npages = btopr(len);
1653 ASSERT(shmd->shm_softlockcnt >= npages);
1654 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1655 if (shmd->shm_softlockcnt == 0) {
1656 /*
1657 * All SOFTLOCKS are gone. Wakeup any waiting
1658 * unmappers so they can try again to unmap.
1659 * Check for waiters first without the mutex
1660 * held so we don't always grab the mutex on
1661 * softunlocks.
1662 */
1663 if (AS_ISUNMAPWAIT(seg->s_as)) {
1664 mutex_enter(&seg->s_as->a_contents);
1665 if (AS_ISUNMAPWAIT(seg->s_as)) {
1666 AS_CLRUNMAPWAIT(seg->s_as);
1667 cv_broadcast(&seg->s_as->a_cv);
1668 }
1669 mutex_exit(&seg->s_as->a_contents);
1670 }
1671 }
1672 }
1673
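/*
 * Attach a process to an existing ISM/DISM segment: allocate the
 * per-process shm_data (and, for DISM, the per-page lock map), share the
 * spt segment's page tables via hat_share() where supported, and take a
 * reference on the shared anon_map.
 */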
1674 int
1675 segspt_shmattach(struct seg *seg, caddr_t *argsp)
1676 {
1677 struct shm_data *shmd_arg = (struct shm_data *)argsp;
1678 struct shm_data *shmd;
1679 struct anon_map *shm_amp = shmd_arg->shm_amp;
1680 struct spt_data *sptd;
1681 int error = 0;
1682
1683 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1684
1685 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1686 if (shmd == NULL)
1687 return (ENOMEM);
1688
1689 shmd->shm_sptas = shmd_arg->shm_sptas;
1690 shmd->shm_amp = shm_amp;
1691 shmd->shm_sptseg = shmd_arg->shm_sptseg;
1692
1693 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1694 NULL, 0, seg->s_size);
1695
1696 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1697
1698 seg->s_data = (void *)shmd;
1699 seg->s_ops = &segspt_shmops;
1700 seg->s_szc = shmd->shm_sptseg->s_szc;
1701 sptd = shmd->shm_sptseg->s_data;
1702
1703 if (sptd->spt_flags & SHM_PAGEABLE) {
1704 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1705 KM_NOSLEEP)) == NULL) {
1706 seg->s_data = (void *)NULL;
1707 kmem_free(shmd, (sizeof (*shmd)));
1708 return (ENOMEM);
1709 }
1710 shmd->shm_lckpgs = 0;
1711 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1712 if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1713 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1714 seg->s_size, seg->s_szc)) != 0) {
1715 kmem_free(shmd->shm_vpage,
1716 btopr(shm_amp->size));
1717 }
1718 }
1719 } else {
1720 error = hat_share(seg->s_as->a_hat, seg->s_base,
1721 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1722 seg->s_size, seg->s_szc);
1723 }
1724 if (error) {
1725 seg->s_szc = 0;
1726 seg->s_data = (void *)NULL;
1727 kmem_free(shmd, (sizeof (*shmd)));
1728 } else {
1729 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1730 shm_amp->refcnt++;
1731 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1732 }
1733 return (error);
1734 }
1735
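/*
 * Detach a process from an ISM/DISM segment.  Only whole-segment unmaps
 * are supported, and the unmap fails with EAGAIN if softlocked
 * translations are still outstanding after the pcache has been purged.
 */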
1736 int
1737 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1738 {
1739 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1740 int reclaim = 1;
1741
1742 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1743 retry:
1744 if (shmd->shm_softlockcnt > 0) {
1745 if (reclaim == 1) {
1746 segspt_purge(seg);
1747 reclaim = 0;
1748 goto retry;
1749 }
1750 return (EAGAIN);
1751 }
1752
1753 if (ssize != seg->s_size) {
1754 #ifdef DEBUG
1755 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1756 ssize, seg->s_size);
1757 #endif
1758 return (EINVAL);
1759 }
1760
1761 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1762 NULL, 0);
1763 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1764
1765 seg_free(seg);
1766
1767 return (0);
1768 }
1769
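/*
 * Free the per-process side of an ISM/DISM attach: drop any DISM page
 * locks, release our reference on the shared anon_map and free the
 * shm_data.  The spt segment itself is torn down separately via
 * sptdestroy().
 */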
1770 void
1771 segspt_shmfree(struct seg *seg)
1772 {
1773 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1774 struct anon_map *shm_amp = shmd->shm_amp;
1775
1776 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1777
1778 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1779 MC_UNLOCK, NULL, 0);
1780
1781 /*
1782 * Need to increment refcnt when attaching
1783 * and decrement when detaching because of dup().
1784 */
1785 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1786 shm_amp->refcnt--;
1787 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1788
1789 if (shmd->shm_vpage) { /* only for DISM */
1790 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1791 shmd->shm_vpage = NULL;
1792 }
1793
1794 /*
1795 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1796 * still working with this segment without holding as lock.
1797 */
1798 ASSERT(shmd->shm_softlockcnt == 0);
1799 mutex_enter(&shmd->shm_segfree_syncmtx);
1800 mutex_destroy(&shmd->shm_segfree_syncmtx);
1801
1802 kmem_free(shmd, sizeof (*shmd));
1803 }
1804
1805 /*ARGSUSED*/
1806 int
1807 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1808 {
1809 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1810
1811 /*
1812	 * A shared page table is more than a shared mapping.
1813	 *  Individual processes sharing the page tables can't change prot
1814	 *  because there is only one set of page tables.
1815	 *  This will be allowed once private page tables are
1816	 *  supported.
1817 */
1818 /* need to return correct status error? */
1819 return (0);
1820 }
1821
1822
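/*
 * Fault handler for DISM segments.  F_INVAL and F_SOFTLOCK gather the
 * required anon pages and load the HAT in large-page-sized chunks;
 * F_SOFTUNLOCK undoes a prior F_SOFTLOCK; F_PROT is a no-op since these
 * mappings are always read/write.
 */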
1823 faultcode_t
1824 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1825 size_t len, enum fault_type type, enum seg_rw rw)
1826 {
1827 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1828 struct seg *sptseg = shmd->shm_sptseg;
1829 struct as *curspt = shmd->shm_sptas;
1830 struct spt_data *sptd = sptseg->s_data;
1831 pgcnt_t npages;
1832 size_t size;
1833 caddr_t segspt_addr, shm_addr;
1834 page_t **ppa;
1835 int i;
1836 ulong_t an_idx = 0;
1837 int err = 0;
1838 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1839 size_t pgsz;
1840 pgcnt_t pgcnt;
1841 caddr_t a;
1842 pgcnt_t pidx;
1843
1844 #ifdef lint
1845 hat = hat;
1846 #endif
1847 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1848
1849 /*
1850 * Because of the way spt is implemented
1851 * the realsize of the segment does not have to be
1852 * equal to the segment size itself. The segment size is
1853 * often in multiples of a page size larger than PAGESIZE.
1854 * The realsize is rounded up to the nearest PAGESIZE
1855 * based on what the user requested. This is a bit of
1856	 * ugliness that is historical but not easily fixed
1857 * without re-designing the higher levels of ISM.
1858 */
1859 ASSERT(addr >= seg->s_base);
1860 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1861 return (FC_NOMAP);
1862 /*
1863 * For all of the following cases except F_PROT, we need to
1864 * make any necessary adjustments to addr and len
1865 * and get all of the necessary page_t's into an array called ppa[].
1866 *
1867 * The code in shmat() forces base addr and len of ISM segment
1868 * to be aligned to largest page size supported. Therefore,
1869 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1870 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1871 * in large pagesize chunks, or else we will screw up the HAT
1872 * layer by calling hat_memload_array() with differing page sizes
1873 * over a given virtual range.
1874 */
1875 pgsz = page_get_pagesize(sptseg->s_szc);
1876 pgcnt = page_get_pagecnt(sptseg->s_szc);
1877 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1878 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1879 npages = btopr(size);
1880
1881 /*
1882 * Now we need to convert from addr in segshm to addr in segspt.
1883 */
1884 an_idx = seg_page(seg, shm_addr);
1885 segspt_addr = sptseg->s_base + ptob(an_idx);
1886
1887 ASSERT((segspt_addr + ptob(npages)) <=
1888 (sptseg->s_base + sptd->spt_realsize));
1889 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
1890
1891 switch (type) {
1892
1893 case F_SOFTLOCK:
1894
1895 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
1896 /*
1897 * Fall through to the F_INVAL case to load up the hat layer
1898 * entries with the HAT_LOAD_LOCK flag.
1899 */
1900 /* FALLTHRU */
1901 case F_INVAL:
1902
1903 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
1904 return (FC_NOMAP);
1905
1906 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1907
1908 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
1909 if (err != 0) {
1910 if (type == F_SOFTLOCK) {
1911 atomic_add_long((ulong_t *)(
1912 &(shmd->shm_softlockcnt)), -npages);
1913 }
1914 goto dism_err;
1915 }
1916 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1917 a = segspt_addr;
1918 pidx = 0;
1919 if (type == F_SOFTLOCK) {
1920
1921 /*
1922 * Load up the translation keeping it
1923 * locked and don't unlock the page.
1924 */
1925 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1926 hat_memload_array(sptseg->s_as->a_hat,
1927 a, pgsz, &ppa[pidx], sptd->spt_prot,
1928 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
1929 }
1930 } else {
1931 /*
1932 * Migrate pages marked for migration
1933 */
1934 if (lgrp_optimizations())
1935 page_migrate(seg, shm_addr, ppa, npages);
1936
1937 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1938 hat_memload_array(sptseg->s_as->a_hat,
1939 a, pgsz, &ppa[pidx],
1940 sptd->spt_prot,
1941 HAT_LOAD_SHARE);
1942 }
1943
1944 /*
1945 * And now drop the SE_SHARED lock(s).
1946 */
1947 if (dyn_ism_unmap) {
1948 for (i = 0; i < npages; i++) {
1949 page_unlock(ppa[i]);
1950 }
1951 }
1952 }
1953
1954 if (!dyn_ism_unmap) {
1955 if (hat_share(seg->s_as->a_hat, shm_addr,
1956 curspt->a_hat, segspt_addr, ptob(npages),
1957 seg->s_szc) != 0) {
1958 panic("hat_share err in DISM fault");
1959 /* NOTREACHED */
1960 }
1961 if (type == F_INVAL) {
1962 for (i = 0; i < npages; i++) {
1963 page_unlock(ppa[i]);
1964 }
1965 }
1966 }
1967 AS_LOCK_EXIT(sptseg->s_as);
1968 dism_err:
1969 kmem_free(ppa, npages * sizeof (page_t *));
1970 return (err);
1971
1972 case F_SOFTUNLOCK:
1973
1974 /*
1975 * This is a bit ugly, we pass in the real seg pointer,
1976 * but the segspt_addr is the virtual address within the
1977 * dummy seg.
1978 */
1979 segspt_softunlock(seg, segspt_addr, size, rw);
1980 return (0);
1981
1982 case F_PROT:
1983
1984 /*
1985 * This takes care of the unusual case where a user
1986 * allocates a stack in shared memory and a register
1987 * window overflow is written to that stack page before
1988 * it is otherwise modified.
1989 *
1990 * We can get away with this because ISM segments are
1991 * always rw. Other than this unusual case, there
1992 * should be no instances of protection violations.
1993 */
1994 return (0);
1995
1996 default:
1997 #ifdef DEBUG
1998 panic("segspt_dismfault default type?");
1999 #else
2000 return (FC_NOMAP);
2001 #endif
2002 }
2003 }
2004
2005
2006 faultcode_t
2007 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2008 size_t len, enum fault_type type, enum seg_rw rw)
2009 {
2010 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2011 struct seg *sptseg = shmd->shm_sptseg;
2012 struct as *curspt = shmd->shm_sptas;
2013 struct spt_data *sptd = sptseg->s_data;
2014 pgcnt_t npages;
2015 size_t size;
2016 caddr_t sptseg_addr, shm_addr;
2017 page_t *pp, **ppa;
2018 int i;
2019 u_offset_t offset;
2020 ulong_t anon_index = 0;
2021 struct vnode *vp;
2022 struct anon_map *amp; /* XXX - for locknest */
2023 struct anon *ap = NULL;
2024 size_t pgsz;
2025 pgcnt_t pgcnt;
2026 caddr_t a;
2027 pgcnt_t pidx;
2028 size_t sz;
2029
2030 #ifdef lint
2031 hat = hat;
2032 #endif
2033
2034 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2035
2036 if (sptd->spt_flags & SHM_PAGEABLE) {
2037 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2038 }
2039
2040 /*
2041 * Because of the way spt is implemented
2042 * the realsize of the segment does not have to be
2043 * equal to the segment size itself. The segment size is
2044 * often in multiples of a page size larger than PAGESIZE.
2045 * The realsize is rounded up to the nearest PAGESIZE
2046 * based on what the user requested. This is a bit of
2047	 * ugliness that is historical but not easily fixed
2048 * without re-designing the higher levels of ISM.
2049 */
2050 ASSERT(addr >= seg->s_base);
2051 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2052 return (FC_NOMAP);
2053 /*
2054 * For all of the following cases except F_PROT, we need to
2055 * make any necessary adjustments to addr and len
2056 * and get all of the necessary page_t's into an array called ppa[].
2057 *
2058 * The code in shmat() forces base addr and len of ISM segment
2059 * to be aligned to largest page size supported. Therefore,
2060 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2061 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2062 * in large pagesize chunks, or else we will screw up the HAT
2063 * layer by calling hat_memload_array() with differing page sizes
2064 * over a given virtual range.
2065 */
2066 pgsz = page_get_pagesize(sptseg->s_szc);
2067 pgcnt = page_get_pagecnt(sptseg->s_szc);
2068 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2069 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2070 npages = btopr(size);
2071
2072 /*
2073 * Now we need to convert from addr in segshm to addr in segspt.
2074 */
2075 anon_index = seg_page(seg, shm_addr);
2076 sptseg_addr = sptseg->s_base + ptob(anon_index);
2077
2078 /*
2079 * And now we may have to adjust npages downward if we have
2080 * exceeded the realsize of the segment or initial anon
2081 * allocations.
2082 */
2083 if ((sptseg_addr + ptob(npages)) >
2084 (sptseg->s_base + sptd->spt_realsize))
2085 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2086
2087 npages = btopr(size);
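	/*
	 * Illustrative example (hypothetical sizes, assuming a 4 MB shared
	 * page size and 4 KB PAGESIZE): if spt_realsize ends 1 MB into the
	 * large page we just rounded up to, size is clipped from 4 MB down
	 * to 1 MB and the btopr() above recomputes npages as 256, so we
	 * never look up anon slots or load translations past the real end
	 * of the ISM segment.
	 */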
2088
2089 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2090 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2091
2092 switch (type) {
2093
2094 case F_SOFTLOCK:
2095
2096 /*
2097 * availrmem is decremented once during anon_swap_adjust()
2098 * and is incremented during the anon_unresv(), which is
2099 * called from shm_rm_amp() when the segment is destroyed.
2100 */
2101 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2102 /*
2103 * Some platforms assume that ISM pages are SE_SHARED
2104 * locked for the entire life of the segment.
2105 */
2106 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2107 return (0);
2108 /*
2109 * Fall through to the F_INVAL case to load up the hat layer
2110 * entries with the HAT_LOAD_LOCK flag.
2111 */
2112
2113 /* FALLTHRU */
2114 case F_INVAL:
2115
2116 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2117 return (FC_NOMAP);
2118
2119 /*
2120 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2121 * may still rely on this call to hat_share(). That
2122		 * would imply that those HATs can fault on a
2123 * HAT_LOAD_LOCK translation, which would seem
2124 * contradictory.
2125 */
2126 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2127 if (hat_share(seg->s_as->a_hat, seg->s_base,
2128 curspt->a_hat, sptseg->s_base,
2129 sptseg->s_size, sptseg->s_szc) != 0) {
2130 panic("hat_share error in ISM fault");
2131 /*NOTREACHED*/
2132 }
2133 return (0);
2134 }
2135 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2136
2137 /*
2138		 * There is no need to lock the real seg here, because
2139		 * all of our work will be on the underlying dummy
2140		 * seg.
2141 *
2142 * sptseg_addr and npages now account for large pages.
2143 */
2144 amp = sptd->spt_amp;
2145 ASSERT(amp != NULL);
2146 anon_index = seg_page(sptseg, sptseg_addr);
2147
2148		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2149 for (i = 0; i < npages; i++) {
2150 ap = anon_get_ptr(amp->ahp, anon_index++);
2151 ASSERT(ap != NULL);
2152 swap_xlate(ap, &vp, &offset);
2153 pp = page_lookup(vp, offset, SE_SHARED);
2154 ASSERT(pp != NULL);
2155 ppa[i] = pp;
2156 }
2157		ANON_LOCK_EXIT(&amp->a_rwlock);
2158 ASSERT(i == npages);
2159
2160 /*
2161 * We are already holding the as->a_lock on the user's
2162 * real segment, but we need to hold the a_lock on the
2163 * underlying dummy as. This is mostly to satisfy the
2164 * underlying HAT layer.
2165 */
2166 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2167 a = sptseg_addr;
2168 pidx = 0;
2169 if (type == F_SOFTLOCK) {
2170 /*
2171 * Load up the translation keeping it
2172 * locked and don't unlock the page.
2173 */
2174 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2175 sz = MIN(pgsz, ptob(npages - pidx));
2176 hat_memload_array(sptseg->s_as->a_hat, a,
2177 sz, &ppa[pidx], sptd->spt_prot,
2178 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2179 }
2180 } else {
2181 /*
2182 * Migrate pages marked for migration.
2183 */
2184 if (lgrp_optimizations())
2185 page_migrate(seg, shm_addr, ppa, npages);
2186
2187 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2188 sz = MIN(pgsz, ptob(npages - pidx));
2189 hat_memload_array(sptseg->s_as->a_hat,
2190 a, sz, &ppa[pidx],
2191 sptd->spt_prot, HAT_LOAD_SHARE);
2192 }
2193
2194 /*
2195 * And now drop the SE_SHARED lock(s).
2196 */
2197 for (i = 0; i < npages; i++)
2198 page_unlock(ppa[i]);
2199 }
2200 AS_LOCK_EXIT(sptseg->s_as);
2201
2202 kmem_free(ppa, sizeof (page_t *) * npages);
2203 return (0);
2204 case F_SOFTUNLOCK:
2205
2206 /*
2207 * This is a bit ugly, we pass in the real seg pointer,
2208 * but the sptseg_addr is the virtual address within the
2209 * dummy seg.
2210 */
2211 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2212 return (0);
2213
2214 case F_PROT:
2215
2216 /*
2217 * This takes care of the unusual case where a user
2218 * allocates a stack in shared memory and a register
2219 * window overflow is written to that stack page before
2220 * it is otherwise modified.
2221 *
2222 * We can get away with this because ISM segments are
2223 * always rw. Other than this unusual case, there
2224 * should be no instances of protection violations.
2225 */
2226 return (0);
2227
2228 default:
2229 #ifdef DEBUG
2230 cmn_err(CE_WARN, "segspt_shmfault default type?");
2231 #endif
2232 return (FC_NOMAP);
2233 }
2234 }
2235
2236 /*ARGSUSED*/
2237 static faultcode_t
2238 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2239 {
2240 return (0);
2241 }
2242
2243 /*ARGSUSED*/
2244 static int
2245 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2246 {
2247 return (0);
2248 }
2249
2250 /*ARGSUSED*/
2251 static size_t
2252 segspt_shmswapout(struct seg *seg)
2253 {
2254 return (0);
2255 }
2256
2257 /*
2258 * duplicate the shared page tables
2259 */
2260 int
2261 segspt_shmdup(struct seg *seg, struct seg *newseg)
2262 {
2263 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2264 struct anon_map *amp = shmd->shm_amp;
2265 struct shm_data *shmd_new;
2266 struct seg *spt_seg = shmd->shm_sptseg;
2267 struct spt_data *sptd = spt_seg->s_data;
2268 int error = 0;
2269
2270 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2271
2272 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2273 newseg->s_data = (void *)shmd_new;
2274 shmd_new->shm_sptas = shmd->shm_sptas;
2275 shmd_new->shm_amp = amp;
2276 shmd_new->shm_sptseg = shmd->shm_sptseg;
2277 newseg->s_ops = &segspt_shmops;
2278 newseg->s_szc = seg->s_szc;
2279 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2280
2281	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2282 amp->refcnt++;
2283	ANON_LOCK_EXIT(&amp->a_rwlock);
2284
2285 if (sptd->spt_flags & SHM_PAGEABLE) {
2286 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2287 shmd_new->shm_lckpgs = 0;
2288 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2289 if ((error = hat_share(newseg->s_as->a_hat,
2290 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2291 seg->s_size, seg->s_szc)) != 0) {
2292 kmem_free(shmd_new->shm_vpage,
2293 btopr(amp->size));
2294 }
2295 }
2296 return (error);
2297 } else {
2298 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2299 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2300 seg->s_szc));
2301
2302 }
2303 }
2304
2305 /*ARGSUSED*/
2306 int
2307 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2308 {
2309 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2310 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2311
2312 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2313
2314 /*
2315 * ISM segment is always rw.
2316 */
2317 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2318 }
2319
2320 /*
2321 * Return an array of locked large pages; for empty slots,
2322 * allocate private zero-filled anon pages.
2323 */
2324 static int
2325 spt_anon_getpages(
2326 struct seg *sptseg,
2327 caddr_t sptaddr,
2328 size_t len,
2329 page_t *ppa[])
2330 {
2331 struct spt_data *sptd = sptseg->s_data;
2332 struct anon_map *amp = sptd->spt_amp;
2333 enum seg_rw rw = sptd->spt_prot;
2334 uint_t szc = sptseg->s_szc;
2335 size_t pg_sz, share_sz = page_get_pagesize(szc);
2336 pgcnt_t lp_npgs;
2337 caddr_t lp_addr, e_sptaddr;
2338 uint_t vpprot, ppa_szc = 0;
2339 struct vpage *vpage = NULL;
2340 ulong_t j, ppa_idx;
2341 int err, ierr = 0;
2342 pgcnt_t an_idx;
2343 anon_sync_obj_t cookie;
2344 int anon_locked = 0;
2345 pgcnt_t amp_pgs;
2346
2347
2348 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2349 ASSERT(len != 0);
2350
2351 pg_sz = share_sz;
2352 lp_npgs = btop(pg_sz);
2353 lp_addr = sptaddr;
2354 e_sptaddr = sptaddr + len;
2355 an_idx = seg_page(sptseg, sptaddr);
2356 ppa_idx = 0;
2357
2358	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2359
2360 amp_pgs = page_get_pagecnt(amp->a_szc);
2361
2362 /*CONSTCOND*/
2363 while (1) {
2364 for (; lp_addr < e_sptaddr;
2365 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2366
2367 /*
2368 * If we're currently locked, and we get to a new
2369 * page, unlock our current anon chunk.
2370 */
2371 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2372 anon_array_exit(&cookie);
2373 anon_locked = 0;
2374 }
2375 if (!anon_locked) {
2376 anon_array_enter(amp, an_idx, &cookie);
2377 anon_locked = 1;
2378 }
2379 ppa_szc = (uint_t)-1;
2380 ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2381 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2382 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2383
2384 if (ierr != 0) {
2385 if (ierr > 0) {
2386 err = FC_MAKE_ERR(ierr);
2387 goto lpgs_err;
2388 }
2389 break;
2390 }
2391 }
2392 if (lp_addr == e_sptaddr) {
2393 break;
2394 }
2395 ASSERT(lp_addr < e_sptaddr);
2396
2397 /*
2398		 * ierr == -1 means we failed to allocate a large page,
2399		 * so do a size down operation.
2400 *
2401 * ierr == -2 means some other process that privately shares
2402 * pages with this process has allocated a larger page and we
2403 * need to retry with larger pages. So do a size up
2404 * operation. This relies on the fact that large pages are
2405 * never partially shared i.e. if we share any constituent
2406 * page of a large page with another process we must share the
2407 * entire large page. Note this cannot happen for SOFTLOCK
2408		 * case, unless the current address (lp_addr) is at the beginning
2409 * of the next page size boundary because the other process
2410 * couldn't have relocated locked pages.
2411 */
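		/*
		 * Sketch of the retry walk, assuming a hypothetical system
		 * with page size codes 0..3: with segvn_anypgsz set,
		 * ierr == -1 steps szc down one level at a time
		 * (3 -> 2 -> 1 -> 0) and ierr == -2 steps it back up; with
		 * segvn_anypgsz clear, we jump straight to the existing
		 * page's szc (or to 0 / s_szc when none was reported) so
		 * that we cannot loop forever.
		 */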
2412 ASSERT(ierr == -1 || ierr == -2);
2413 if (segvn_anypgsz) {
2414 ASSERT(ierr == -2 || szc != 0);
2415 ASSERT(ierr == -1 || szc < sptseg->s_szc);
2416 szc = (ierr == -1) ? szc - 1 : szc + 1;
2417 } else {
2418 /*
2419 * For faults and segvn_anypgsz == 0
2420 * we need to be careful not to loop forever
2421 * if existing page is found with szc other
2422 * than 0 or seg->s_szc. This could be due
2423 * to page relocations on behalf of DR or
2424 * more likely large page creation. For this
2425 * case simply re-size to existing page's szc
2426 * if returned by anon_map_getpages().
2427 */
2428 if (ppa_szc == (uint_t)-1) {
2429 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2430 } else {
2431 ASSERT(ppa_szc <= sptseg->s_szc);
2432 ASSERT(ierr == -2 || ppa_szc < szc);
2433 ASSERT(ierr == -1 || ppa_szc > szc);
2434 szc = ppa_szc;
2435 }
2436 }
2437 pg_sz = page_get_pagesize(szc);
2438 lp_npgs = btop(pg_sz);
2439 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2440 }
2441 if (anon_locked) {
2442 anon_array_exit(&cookie);
2443 }
2444	ANON_LOCK_EXIT(&amp->a_rwlock);
2445 return (0);
2446
2447 lpgs_err:
2448 if (anon_locked) {
2449 anon_array_exit(&cookie);
2450 }
2451	ANON_LOCK_EXIT(&amp->a_rwlock);
2452 for (j = 0; j < ppa_idx; j++)
2453 page_unlock(ppa[j]);
2454 return (err);
2455 }
2456
2457 /*
2458 * count the number of bytes in a set of spt pages that are currently not
2459 * locked
2460 */
2461 static rctl_qty_t
2462 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2463 {
2464 ulong_t i;
2465 rctl_qty_t unlocked = 0;
2466
2467 for (i = 0; i < npages; i++) {
2468 if (ppa[i]->p_lckcnt == 0)
2469 unlocked += PAGESIZE;
2470 }
2471 return (unlocked);
2472 }
2473
2474 extern u_longlong_t randtick(void);
2475 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2476 #define NLCK (NCPU_P2)
2477 /* Random number with a range [0, n-1], n must be power of two */
2478 #define RAND_P2(n) \
2479 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
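/*
 * Example (for illustration): RAND_P2(n) masks a cheap per-thread hash
 * (curthread pointer xor randtick()) down to [0, n-1]; this works because
 * NCPU_P2 is a power of two. NLCK + RAND_P2(NLCK) therefore yields a batch
 * size in [NLCK, 2*NLCK - 1]. The value only needs to decorrelate threads;
 * it is not a strong random number.
 */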
2480
2481 int
2482 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2483 page_t **ppa, ulong_t *lockmap, size_t pos,
2484 rctl_qty_t *locked)
2485 {
2486 struct shm_data *shmd = seg->s_data;
2487 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2488 ulong_t i;
2489 int kernel;
2490 pgcnt_t nlck = 0;
2491 int rv = 0;
2492 int use_reserved = 1;
2493
2494 /* return the number of bytes actually locked */
2495 *locked = 0;
2496
2497 /*
2498 * To avoid contention on freemem_lock, availrmem and pages_locked
2499 * global counters are updated only every nlck locked pages instead of
2500 * every time. Reserve nlck locks up front and deduct from this
2501 * reservation for each page that requires a lock. When the reservation
2502 * is consumed, reserve again. nlck is randomized, so the competing
2503 * threads do not fall into a cyclic lock contention pattern. When
2504 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2505 * is used to lock pages.
2506 */
2507 for (i = 0; i < npages; anon_index++, pos++, i++) {
2508 if (nlck == 0 && use_reserved == 1) {
2509 nlck = NLCK + RAND_P2(NLCK);
2510 /* if fewer loops left, decrease nlck */
2511 nlck = MIN(nlck, npages - i);
2512 /*
2513 * Reserve nlck locks up front and deduct from this
2514 * reservation for each page that requires a lock. When
2515 * the reservation is consumed, reserve again.
2516 */
2517 mutex_enter(&freemem_lock);
2518 if ((availrmem - nlck) < pages_pp_maximum) {
2519 /* Do not do advance memory reserves */
2520 use_reserved = 0;
2521 } else {
2522 availrmem -= nlck;
2523 pages_locked += nlck;
2524 }
2525 mutex_exit(&freemem_lock);
2526 }
2527 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2528 if (sptd->spt_ppa_lckcnt[anon_index] <
2529 (ushort_t)DISM_LOCK_MAX) {
2530 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2531 (ushort_t)DISM_LOCK_MAX) {
2532 cmn_err(CE_WARN,
2533 "DISM page lock limit "
2534 "reached on DISM offset 0x%lx\n",
2535 anon_index << PAGESHIFT);
2536 }
2537 kernel = (sptd->spt_ppa &&
2538 sptd->spt_ppa[anon_index]);
2539 if (!page_pp_lock(ppa[i], 0, kernel ||
2540 use_reserved)) {
2541 sptd->spt_ppa_lckcnt[anon_index]--;
2542 rv = EAGAIN;
2543 break;
2544 }
2545 /* if this is a newly locked page, count it */
2546 if (ppa[i]->p_lckcnt == 1) {
2547 if (kernel == 0 && use_reserved == 1)
2548 nlck--;
2549 *locked += PAGESIZE;
2550 }
2551 shmd->shm_lckpgs++;
2552 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2553 if (lockmap != NULL)
2554 BT_SET(lockmap, pos);
2555 }
2556 }
2557 }
2558 /* Return unused lock reservation */
2559 if (nlck != 0 && use_reserved == 1) {
2560 mutex_enter(&freemem_lock);
2561 availrmem += nlck;
2562 pages_locked -= nlck;
2563 mutex_exit(&freemem_lock);
2564 }
2565
2566 return (rv);
2567 }
2568
2569 int
2570 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2571 rctl_qty_t *unlocked)
2572 {
2573 struct shm_data *shmd = seg->s_data;
2574 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2575 struct anon_map *amp = sptd->spt_amp;
2576 struct anon *ap;
2577 struct vnode *vp;
2578 u_offset_t off;
2579 struct page *pp;
2580 int kernel;
2581 anon_sync_obj_t cookie;
2582 ulong_t i;
2583 pgcnt_t nlck = 0;
2584 pgcnt_t nlck_limit = NLCK;
2585
2586	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2587 for (i = 0; i < npages; i++, anon_index++) {
2588 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2589 anon_array_enter(amp, anon_index, &cookie);
2590 ap = anon_get_ptr(amp->ahp, anon_index);
2591 ASSERT(ap);
2592
2593 swap_xlate(ap, &vp, &off);
2594 anon_array_exit(&cookie);
2595 pp = page_lookup(vp, off, SE_SHARED);
2596 ASSERT(pp);
2597 /*
2598 * availrmem is decremented only for pages which are not
2599			 * in the seg pcache; for pages in the seg pcache,
2600			 * availrmem was already decremented in _dismpagelock().
2601 */
2602 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2603 ASSERT(pp->p_lckcnt > 0);
2604
2605 /*
2606			 * Unlock the page, but do not change availrmem here;
2607			 * we update it ourselves every nlck loops.
2608 */
2609 page_pp_unlock(pp, 0, 1);
2610 if (pp->p_lckcnt == 0) {
2611 if (kernel == 0)
2612 nlck++;
2613 *unlocked += PAGESIZE;
2614 }
2615 page_unlock(pp);
2616 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2617 sptd->spt_ppa_lckcnt[anon_index]--;
2618 shmd->shm_lckpgs--;
2619 }
2620
2621 /*
2622 * To reduce freemem_lock contention, do not update availrmem
2623 * until at least NLCK pages have been unlocked.
2624 * 1. No need to update if nlck is zero
2625 * 2. Always update if the last iteration
2626		 * 2. Always update on the last iteration
2627 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2628 mutex_enter(&freemem_lock);
2629 availrmem += nlck;
2630 pages_locked -= nlck;
2631 mutex_exit(&freemem_lock);
2632 nlck = 0;
2633 nlck_limit = NLCK + RAND_P2(NLCK);
2634 }
2635 }
2636	ANON_LOCK_EXIT(&amp->a_rwlock);
2637
2638 return (0);
2639 }
2640
2641 /*ARGSUSED*/
2642 static int
2643 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2644 int attr, int op, ulong_t *lockmap, size_t pos)
2645 {
2646 struct shm_data *shmd = seg->s_data;
2647 struct seg *sptseg = shmd->shm_sptseg;
2648 struct spt_data *sptd = sptseg->s_data;
2649 struct kshmid *sp = sptd->spt_amp->a_sp;
2650 pgcnt_t npages, a_npages;
2651 page_t **ppa;
2652 pgcnt_t an_idx, a_an_idx, ppa_idx;
2653 caddr_t spt_addr, a_addr; /* spt and aligned address */
2654 size_t a_len; /* aligned len */
2655 size_t share_sz;
2656 ulong_t i;
2657 int sts = 0;
2658 rctl_qty_t unlocked = 0;
2659 rctl_qty_t locked = 0;
2660 struct proc *p = curproc;
2661 kproject_t *proj;
2662
2663 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2664 ASSERT(sp != NULL);
2665
2666 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2667 return (0);
2668 }
2669
2670 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2671 an_idx = seg_page(seg, addr);
2672 npages = btopr(len);
2673
2674 if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2675 return (ENOMEM);
2676 }
2677
2678 /*
2679 * A shm's project never changes, so no lock needed.
2680 * The shm has a hold on the project, so it will not go away.
2681 * Since we have a mapping to shm within this zone, we know
2682 * that the zone will not go away.
2683 */
2684 proj = sp->shm_perm.ipc_proj;
2685
2686 if (op == MC_LOCK) {
2687
2688 /*
2689		 * Need to align the addr and size of the request if they are
2690		 * not already aligned, so we can always allocate large page(s);
2691		 * however, we only lock what was requested in the initial request.
2692 */
2693 share_sz = page_get_pagesize(sptseg->s_szc);
2694 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2695 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2696 share_sz);
2697 a_npages = btop(a_len);
2698 a_an_idx = seg_page(seg, a_addr);
2699 spt_addr = sptseg->s_base + ptob(a_an_idx);
2700 ppa_idx = an_idx - a_an_idx;
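		/*
		 * Worked example (hypothetical request, assuming a 4 MB
		 * share_sz): for a lock request starting 3 pages into a
		 * large page, a_addr backs up to the large-page boundary,
		 * a_npages covers the whole large page(s), and ppa_idx is
		 * 3, so only the pages the caller actually asked for are
		 * passed to spt_lockpages() below.
		 */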
2701
2702 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2703 KM_NOSLEEP)) == NULL) {
2704 return (ENOMEM);
2705 }
2706
2707 /*
2708 * Don't cache any new pages for IO and
2709 * flush any cached pages.
2710 */
2711 mutex_enter(&sptd->spt_lock);
2712 if (sptd->spt_ppa != NULL)
2713 sptd->spt_flags |= DISM_PPA_CHANGED;
2714
2715 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2716 if (sts != 0) {
2717 mutex_exit(&sptd->spt_lock);
2718 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2719 return (sts);
2720 }
2721
2722 mutex_enter(&sp->shm_mlock);
2723 /* enforce locked memory rctl */
2724 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2725
2726 mutex_enter(&p->p_lock);
2727 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2728 mutex_exit(&p->p_lock);
2729 sts = EAGAIN;
2730 } else {
2731 mutex_exit(&p->p_lock);
2732 sts = spt_lockpages(seg, an_idx, npages,
2733 &ppa[ppa_idx], lockmap, pos, &locked);
2734
2735 /*
2736 * correct locked count if not all pages could be
2737 * locked
2738 */
2739 if ((unlocked - locked) > 0) {
2740 rctl_decr_locked_mem(NULL, proj,
2741 (unlocked - locked), 0);
2742 }
2743 }
2744 /*
2745 * unlock pages
2746 */
2747 for (i = 0; i < a_npages; i++)
2748 page_unlock(ppa[i]);
2749 if (sptd->spt_ppa != NULL)
2750 sptd->spt_flags |= DISM_PPA_CHANGED;
2751 mutex_exit(&sp->shm_mlock);
2752 mutex_exit(&sptd->spt_lock);
2753
2754 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2755
2756 } else if (op == MC_UNLOCK) { /* unlock */
2757 page_t **ppa;
2758
2759 mutex_enter(&sptd->spt_lock);
2760 if (shmd->shm_lckpgs == 0) {
2761 mutex_exit(&sptd->spt_lock);
2762 return (0);
2763 }
2764 /*
2765 * Don't cache new IO pages.
2766 */
2767 if (sptd->spt_ppa != NULL)
2768 sptd->spt_flags |= DISM_PPA_CHANGED;
2769
2770 mutex_enter(&sp->shm_mlock);
2771 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2772 if ((ppa = sptd->spt_ppa) != NULL)
2773 sptd->spt_flags |= DISM_PPA_CHANGED;
2774 mutex_exit(&sptd->spt_lock);
2775
2776 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2777 mutex_exit(&sp->shm_mlock);
2778
2779 if (ppa != NULL)
2780 seg_ppurge_wiredpp(ppa);
2781 }
2782 return (sts);
2783 }
2784
2785 /*ARGSUSED*/
2786 int
2787 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2788 {
2789 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2790 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2791 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2792
2793 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2794
2795 /*
2796 * ISM segment is always rw.
2797 */
2798 while (--pgno >= 0)
2799 *protv++ = sptd->spt_prot;
2800 return (0);
2801 }
2802
2803 /*ARGSUSED*/
2804 u_offset_t
2805 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2806 {
2807 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2808
2809 /* Offset does not matter in ISM memory */
2810
2811 return ((u_offset_t)0);
2812 }
2813
2814 /* ARGSUSED */
2815 int
2816 segspt_shmgettype(struct seg *seg, caddr_t addr)
2817 {
2818 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2819 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2820
2821 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2822
2823 /*
2824	 * The shared memory mapping is always MAP_SHARED; swap is
2825	 * only reserved for DISM.
2826 */
2827 return (MAP_SHARED |
2828 ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2829 }
2830
2831 /*ARGSUSED*/
2832 int
2833 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2834 {
2835 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2836 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2837
2838 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2839
2840 *vpp = sptd->spt_vp;
2841 return (0);
2842 }
2843
2844 /*
2845 * We need to wait for pending IO to complete to a DISM segment in order for
2846 * pages to get kicked out of the seg_pcache. 120 seconds should be more
2847 * than enough time to wait.
2848 */
2849 static clock_t spt_pcache_wait = 120;
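/*
 * The timeout below is applied as an lbolt deadline, ddi_get_lbolt() +
 * hz * spt_pcache_wait, i.e. 120 seconds expressed in clock ticks
 * (e.g. 12000 ticks when hz is 100).
 */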
2850
2851 /*ARGSUSED*/
2852 static int
2853 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2854 {
2855 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2856 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2857 struct anon_map *amp;
2858 pgcnt_t pg_idx;
2859 ushort_t gen;
2860 clock_t end_lbolt;
2861 int writer;
2862 page_t **ppa;
2863
2864 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2865
2866 if (behav == MADV_FREE || behav == MADV_PURGE) {
2867 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
2868 return (0);
2869
2870 amp = sptd->spt_amp;
2871 pg_idx = seg_page(seg, addr);
2872
2873 mutex_enter(&sptd->spt_lock);
2874 if ((ppa = sptd->spt_ppa) == NULL) {
2875 mutex_exit(&sptd->spt_lock);
2876			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2877 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
2878			ANON_LOCK_EXIT(&amp->a_rwlock);
2879 return (0);
2880 }
2881
2882 sptd->spt_flags |= DISM_PPA_CHANGED;
2883 gen = sptd->spt_gen;
2884
2885 mutex_exit(&sptd->spt_lock);
2886
2887 /*
2888 * Purge all DISM cached pages
2889 */
2890 seg_ppurge_wiredpp(ppa);
2891
2892 /*
2893 * Drop the AS_LOCK so that other threads can grab it
2894 * in the as_pageunlock path and hopefully get the segment
2895 * kicked out of the seg_pcache. We bump the shm_softlockcnt
2896 * to keep this segment resident.
2897 */
2898 writer = AS_WRITE_HELD(seg->s_as);
2899 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2900 AS_LOCK_EXIT(seg->s_as);
2901
2902 mutex_enter(&sptd->spt_lock);
2903
2904 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
2905
2906 /*
2907 * Try to wait for pages to get kicked out of the seg_pcache.
2908 */
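		/*
		 * The loop below ends when spt_gen advances or
		 * DISM_PPA_CHANGED is cleared (i.e. the purge has been
		 * processed), when the deadline passes, or when
		 * cv_timedwait_sig() returns 0 because the wait was
		 * interrupted by a signal.
		 */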
2909 while (sptd->spt_gen == gen &&
2910 (sptd->spt_flags & DISM_PPA_CHANGED) &&
2911 ddi_get_lbolt() < end_lbolt) {
2912 if (!cv_timedwait_sig(&sptd->spt_cv,
2913 &sptd->spt_lock, end_lbolt)) {
2914 break;
2915 }
2916 }
2917
2918 mutex_exit(&sptd->spt_lock);
2919
2920 /* Regrab the AS_LOCK and release our hold on the segment */
2921 AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
2922 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
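		/*
		 * The softlock hold taken before dropping the AS lock is
		 * released above; if it was the last hold and an unmap is
		 * waiting on this address space, wake the waiter. The
		 * AS_ISUNMAPWAIT check is repeated under a_contents to
		 * close the race with the waiter setting the flag.
		 */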
2923 if (shmd->shm_softlockcnt <= 0) {
2924 if (AS_ISUNMAPWAIT(seg->s_as)) {
2925 mutex_enter(&seg->s_as->a_contents);
2926 if (AS_ISUNMAPWAIT(seg->s_as)) {
2927 AS_CLRUNMAPWAIT(seg->s_as);
2928 cv_broadcast(&seg->s_as->a_cv);
2929 }
2930 mutex_exit(&seg->s_as->a_contents);
2931 }
2932 }
2933
2934		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2935 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
2936		ANON_LOCK_EXIT(&amp->a_rwlock);
2937 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
2938 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
2939 int already_set;
2940 ulong_t anon_index;
2941 lgrp_mem_policy_t policy;
2942 caddr_t shm_addr;
2943 size_t share_size;
2944 size_t size;
2945 struct seg *sptseg = shmd->shm_sptseg;
2946 caddr_t sptseg_addr;
2947
2948 /*
2949 * Align address and length to page size of underlying segment
2950 */
2951 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
2952 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
2953 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
2954 share_size);
2955
2956 amp = shmd->shm_amp;
2957 anon_index = seg_page(seg, shm_addr);
2958
2959 /*
2960 * And now we may have to adjust size downward if we have
2961 * exceeded the realsize of the segment or initial anon
2962 * allocations.
2963 */
2964 sptseg_addr = sptseg->s_base + ptob(anon_index);
2965 if ((sptseg_addr + size) >
2966 (sptseg->s_base + sptd->spt_realsize))
2967 size = (sptseg->s_base + sptd->spt_realsize) -
2968 sptseg_addr;
2969
2970 /*
2971 * Set memory allocation policy for this segment
2972 */
2973 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
2974 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
2975 NULL, 0, len);
2976
2977 /*
2978 * If random memory allocation policy set already,
2979 * don't bother reapplying it.
2980 */
2981 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
2982 return (0);
2983
2984 /*
2985		 * Mark any existing pages in the given range for
2986		 * migration, flushing the I/O page cache and using the
2987		 * underlying segment to calculate the anon index and to
2988		 * obtain the anon map and vnode pointer.
2989 */
2990 if (shmd->shm_softlockcnt > 0)
2991 segspt_purge(seg);
2992
2993 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
2994 }
2995
2996 return (0);
2997 }
2998
2999 /*ARGSUSED*/
3000 void
3001 segspt_shmdump(struct seg *seg)
3002 {
3003 /* no-op for ISM segment */
3004 }
3005
3006 /*ARGSUSED*/
3007 static faultcode_t
3008 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3009 {
3010 return (ENOTSUP);
3011 }
3012
3013 /*
3014 * get a memory ID for an addr in a given segment
3015 */
3016 static int
3017 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3018 {
3019 struct shm_data *shmd = (struct shm_data *)seg->s_data;
3020 struct anon *ap;
3021 size_t anon_index;
3022 struct anon_map *amp = shmd->shm_amp;
3023 struct spt_data *sptd = shmd->shm_sptseg->s_data;
3024 struct seg *sptseg = shmd->shm_sptseg;
3025 anon_sync_obj_t cookie;
3026
3027 anon_index = seg_page(seg, addr);
3028
3029 if (addr > (seg->s_base + sptd->spt_realsize)) {
3030 return (EFAULT);
3031 }
3032
3033	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3034 anon_array_enter(amp, anon_index, &cookie);
3035 ap = anon_get_ptr(amp->ahp, anon_index);
3036 if (ap == NULL) {
3037 struct page *pp;
3038 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3039
3040 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3041 if (pp == NULL) {
3042 anon_array_exit(&cookie);
3043			ANON_LOCK_EXIT(&amp->a_rwlock);
3044 return (ENOMEM);
3045 }
3046 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3047 page_unlock(pp);
3048 }
3049 anon_array_exit(&cookie);
3050	ANON_LOCK_EXIT(&amp->a_rwlock);
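	/*
	 * The memory ID is the anon slot pointer plus the byte offset
	 * within the page, which together identify the backing anonymous
	 * page for this address (used, e.g., by the meminfo(2) path).
	 */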
3051 memidp->val[0] = (uintptr_t)ap;
3052 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3053 return (0);
3054 }
3055
3056 /*
3057 * Get memory allocation policy info for specified address in given segment
3058 */
3059 static lgrp_mem_policy_info_t *
3060 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3061 {
3062 struct anon_map *amp;
3063 ulong_t anon_index;
3064 lgrp_mem_policy_info_t *policy_info;
3065 struct shm_data *shm_data;
3066
3067 ASSERT(seg != NULL);
3068
3069 /*
3070 * Get anon_map from segshm
3071 *
3072	 * Assume that no lock needs to be held on the anon_map, since
3073	 * it should be protected by its reference count, which must be
3074	 * nonzero for an existing segment.
3075	 * We do, however, need to grab the readers lock on the policy tree.
3076 */
3077 shm_data = (struct shm_data *)seg->s_data;
3078 if (shm_data == NULL)
3079 return (NULL);
3080 amp = shm_data->shm_amp;
3081 ASSERT(amp->refcnt != 0);
3082
3083 /*
3084 * Get policy info
3085 *
3086 * Assume starting anon index of 0
3087 */
3088 anon_index = seg_page(seg, addr);
3089 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3090
3091 return (policy_info);
3092 }
3093
3094 /*ARGSUSED*/
3095 static int
3096 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3097 {
3098 return (0);
3099 }