NEX-5177 backport illumos 6345 remove xhat support
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
6345 remove xhat support
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
NEX-5164 backport illumos 6514 AS_* lock macros simplification
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
6514 AS_* lock macros simplification
Reviewed by: Piotr Jasiukajtis <estibi@me.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Albert Lee <trisk@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
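Context for 6514: the AS_* macros previously took the address-space rwlock as an explicit argument; the simplification lets them derive it from the as pointer, which is the form used throughout the diff below. A minimal before/after sketch; the pre-6514 signatures are recalled from the older headers and may differ in detail:

	/* before 6514 (older form, shown only for comparison) */
	AS_LOCK_ENTER(seg->s_as, &seg->s_as->a_lock, RW_READER);
	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
	AS_LOCK_EXIT(seg->s_as, &seg->s_as->a_lock);

	/* after 6514 (as used in seg_spt.c below): the macros find a_lock
	 * from the as pointer, so callers name the address space only once */
	AS_LOCK_ENTER(seg->s_as, RW_READER);
	ASSERT(AS_WRITE_HELD(seg->s_as));
	AS_LOCK_EXIT(seg->s_as);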
re #13613 rb4516 Tunables needs volatile keyword
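The re #13613 change below (old line 73) adds volatile to segspt_minfree: a tunable that can be patched in a live kernel (e.g. with mdb -kw) or set from /etc/system must be re-read on every use rather than cached in a register by the compiler. A hypothetical sketch of the failure mode volatile prevents; wait_until_raised() is illustrative only and not part of this file:

	#include <sys/types.h>
	#include <sys/systm.h>	/* delay() */

	extern volatile pgcnt_t segspt_minfree;	/* tunable, may change at any time */

	/*
	 * Without the volatile qualifier the compiler could load
	 * segspt_minfree once, keep it in a register for the whole loop,
	 * and never observe a value patched into the running kernel.
	 */
	static void
	wait_until_raised(pgcnt_t want)
	{
		while (segspt_minfree < want)
			delay(1);	/* volatile forces a fresh read each pass */
	}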
--- old/usr/src/uts/common/vm/seg_spt.c
+++ new/usr/src/uts/common/vm/seg_spt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
24 25 * Copyright (c) 2016 by Delphix. All rights reserved.
25 26 */
26 27
27 28 #include <sys/param.h>
28 29 #include <sys/user.h>
29 30 #include <sys/mman.h>
30 31 #include <sys/kmem.h>
31 32 #include <sys/sysmacros.h>
32 33 #include <sys/cmn_err.h>
33 34 #include <sys/systm.h>
34 35 #include <sys/tuneable.h>
35 36 #include <vm/hat.h>
36 37 #include <vm/seg.h>
37 38 #include <vm/as.h>
38 39 #include <vm/anon.h>
39 40 #include <vm/page.h>
40 41 #include <sys/buf.h>
41 42 #include <sys/swap.h>
42 43 #include <sys/atomic.h>
43 44 #include <vm/seg_spt.h>
44 45 #include <sys/debug.h>
45 46 #include <sys/vtrace.h>
46 47 #include <sys/shm.h>
47 48 #include <sys/shm_impl.h>
48 49 #include <sys/lgrp.h>
49 50 #include <sys/vmsystm.h>
50 51 #include <sys/policy.h>
51 52 #include <sys/project.h>
52 53 #include <sys/tnf_probe.h>
53 54 #include <sys/zone.h>
54 55
55 56 #define SEGSPTADDR (caddr_t)0x0
56 57
57 58 /*
58 59 * # pages used for spt
59 60 */
60 61 size_t spt_used;
61 62
62 63 /*
63 64 * segspt_minfree is the memory left for system after ISM
64 65 * locked its pages; it is set up to 5% of availrmem in
65 66 * sptcreate when ISM is created. ISM should not use more
66 67 * than ~90% of availrmem; if it does, then the performance
67 68 * of the system may decrease. Machines with large memories may
68 69 * be able to use up more memory for ISM so we set the default
69 70 * segspt_minfree to 5% (which gives ISM max 95% of availrmem).
70 71 * If somebody wants even more memory for ISM (risking hanging
71 72 * the system) they can patch segspt_minfree to a smaller number.
72 73 */
73 -pgcnt_t segspt_minfree = 0;
74 +volatile pgcnt_t segspt_minfree = 0;
74 75
75 76 static int segspt_create(struct seg *seg, caddr_t argsp);
76 77 static int segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize);
77 78 static void segspt_free(struct seg *seg);
78 79 static void segspt_free_pages(struct seg *seg, caddr_t addr, size_t len);
79 80 static lgrp_mem_policy_info_t *segspt_getpolicy(struct seg *seg, caddr_t addr);
80 81
81 82 static void
82 83 segspt_badop()
83 84 {
84 85 panic("segspt_badop called");
85 86 /*NOTREACHED*/
86 87 }
87 88
88 89 #define SEGSPT_BADOP(t) (t(*)())segspt_badop
89 90
90 91 struct seg_ops segspt_ops = {
91 92 SEGSPT_BADOP(int), /* dup */
92 93 segspt_unmap,
93 94 segspt_free,
94 95 SEGSPT_BADOP(int), /* fault */
95 96 SEGSPT_BADOP(faultcode_t), /* faulta */
96 97 SEGSPT_BADOP(int), /* setprot */
97 98 SEGSPT_BADOP(int), /* checkprot */
98 99 SEGSPT_BADOP(int), /* kluster */
99 100 SEGSPT_BADOP(size_t), /* swapout */
100 101 SEGSPT_BADOP(int), /* sync */
101 102 SEGSPT_BADOP(size_t), /* incore */
102 103 SEGSPT_BADOP(int), /* lockop */
103 104 SEGSPT_BADOP(int), /* getprot */
104 105 SEGSPT_BADOP(u_offset_t), /* getoffset */
105 106 SEGSPT_BADOP(int), /* gettype */
106 107 SEGSPT_BADOP(int), /* getvp */
107 108 SEGSPT_BADOP(int), /* advise */
108 109 SEGSPT_BADOP(void), /* dump */
109 110 SEGSPT_BADOP(int), /* pagelock */
110 111 SEGSPT_BADOP(int), /* setpgsz */
111 112 SEGSPT_BADOP(int), /* getmemid */
112 113 segspt_getpolicy, /* getpolicy */
113 114 SEGSPT_BADOP(int), /* capable */
114 115 seg_inherit_notsup /* inherit */
115 116 };
116 117
117 118 static int segspt_shmdup(struct seg *seg, struct seg *newseg);
118 119 static int segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize);
119 120 static void segspt_shmfree(struct seg *seg);
120 121 static faultcode_t segspt_shmfault(struct hat *hat, struct seg *seg,
121 122 caddr_t addr, size_t len, enum fault_type type, enum seg_rw rw);
122 123 static faultcode_t segspt_shmfaulta(struct seg *seg, caddr_t addr);
123 124 static int segspt_shmsetprot(register struct seg *seg, register caddr_t addr,
124 125 register size_t len, register uint_t prot);
125 126 static int segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size,
126 127 uint_t prot);
127 128 static int segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta);
128 129 static size_t segspt_shmswapout(struct seg *seg);
129 130 static size_t segspt_shmincore(struct seg *seg, caddr_t addr, size_t len,
130 131 register char *vec);
131 132 static int segspt_shmsync(struct seg *seg, register caddr_t addr, size_t len,
132 133 int attr, uint_t flags);
133 134 static int segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
134 135 int attr, int op, ulong_t *lockmap, size_t pos);
135 136 static int segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len,
136 137 uint_t *protv);
137 138 static u_offset_t segspt_shmgetoffset(struct seg *seg, caddr_t addr);
138 139 static int segspt_shmgettype(struct seg *seg, caddr_t addr);
139 140 static int segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp);
140 141 static int segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len,
141 142 uint_t behav);
142 143 static void segspt_shmdump(struct seg *seg);
143 144 static int segspt_shmpagelock(struct seg *, caddr_t, size_t,
144 145 struct page ***, enum lock_type, enum seg_rw);
145 146 static int segspt_shmsetpgsz(struct seg *, caddr_t, size_t, uint_t);
146 147 static int segspt_shmgetmemid(struct seg *, caddr_t, memid_t *);
147 148 static lgrp_mem_policy_info_t *segspt_shmgetpolicy(struct seg *, caddr_t);
148 149 static int segspt_shmcapable(struct seg *, segcapability_t);
149 150
150 151 struct seg_ops segspt_shmops = {
151 152 segspt_shmdup,
152 153 segspt_shmunmap,
153 154 segspt_shmfree,
154 155 segspt_shmfault,
155 156 segspt_shmfaulta,
156 157 segspt_shmsetprot,
157 158 segspt_shmcheckprot,
158 159 segspt_shmkluster,
159 160 segspt_shmswapout,
160 161 segspt_shmsync,
161 162 segspt_shmincore,
162 163 segspt_shmlockop,
163 164 segspt_shmgetprot,
164 165 segspt_shmgetoffset,
165 166 segspt_shmgettype,
166 167 segspt_shmgetvp,
167 168 segspt_shmadvise, /* advise */
168 169 segspt_shmdump,
169 170 segspt_shmpagelock,
170 171 segspt_shmsetpgsz,
171 172 segspt_shmgetmemid,
172 173 segspt_shmgetpolicy,
173 174 segspt_shmcapable,
174 175 seg_inherit_notsup
175 176 };
176 177
177 178 static void segspt_purge(struct seg *seg);
178 179 static int segspt_reclaim(void *, caddr_t, size_t, struct page **,
179 180 enum seg_rw, int);
180 181 static int spt_anon_getpages(struct seg *seg, caddr_t addr, size_t len,
181 182 page_t **ppa);
182 183
183 184
184 185
185 186 /*ARGSUSED*/
186 187 int
187 188 sptcreate(size_t size, struct seg **sptseg, struct anon_map *amp,
188 189 uint_t prot, uint_t flags, uint_t share_szc)
189 190 {
190 191 int err;
191 192 struct as *newas;
192 193 struct segspt_crargs sptcargs;
193 194
194 195 #ifdef DEBUG
195 196 TNF_PROBE_1(sptcreate, "spt", /* CSTYLED */,
196 197 tnf_ulong, size, size );
197 198 #endif
198 199 if (segspt_minfree == 0) /* leave min 5% of availrmem */
199 200 segspt_minfree = availrmem/20; /* for the system */
200 201
201 202 if (!hat_supported(HAT_SHARED_PT, (void *)0))
202 203 return (EINVAL);
203 204
204 205 /*
205 206 * get a new as for this shared memory segment
206 207 */
207 208 newas = as_alloc();
208 209 newas->a_proc = NULL;
209 210 sptcargs.amp = amp;
210 211 sptcargs.prot = prot;
211 212 sptcargs.flags = flags;
212 213 sptcargs.szc = share_szc;
213 214 /*
214 215 * create a shared page table (spt) segment
215 216 */
216 217
217 218 if (err = as_map(newas, SEGSPTADDR, size, segspt_create, &sptcargs)) {
218 219 as_free(newas);
219 220 return (err);
220 221 }
221 222 *sptseg = sptcargs.seg_spt;
222 223 return (0);
223 224 }
224 225
225 226 void
226 227 sptdestroy(struct as *as, struct anon_map *amp)
227 228 {
228 229
229 230 #ifdef DEBUG
230 231 TNF_PROBE_0(sptdestroy, "spt", /* CSTYLED */);
231 232 #endif
232 233 (void) as_unmap(as, SEGSPTADDR, amp->size);
233 234 as_free(as);
234 235 }
235 236
236 237 /*
237 238 * called from seg_free().
238 239 * free (i.e., unlock, unmap, return to free list)
239 240 * all the pages in the given seg.
240 241 */
241 242 void
242 243 segspt_free(struct seg *seg)
243 244 {
244 245 struct spt_data *sptd = (struct spt_data *)seg->s_data;
245 246
246 247 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
247 248
248 249 if (sptd != NULL) {
249 250 if (sptd->spt_realsize)
250 251 segspt_free_pages(seg, seg->s_base, sptd->spt_realsize);
251 252
252 253 if (sptd->spt_ppa_lckcnt) {
253 254 kmem_free(sptd->spt_ppa_lckcnt,
254 255 sizeof (*sptd->spt_ppa_lckcnt)
255 256 * btopr(sptd->spt_amp->size));
256 257 }
257 258 kmem_free(sptd->spt_vp, sizeof (*sptd->spt_vp));
258 259 cv_destroy(&sptd->spt_cv);
259 260 mutex_destroy(&sptd->spt_lock);
260 261 kmem_free(sptd, sizeof (*sptd));
261 262 }
262 263 }
263 264
264 265 /*ARGSUSED*/
265 266 static int
266 267 segspt_shmsync(struct seg *seg, caddr_t addr, size_t len, int attr,
267 268 uint_t flags)
268 269 {
269 270 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
270 271
271 272 return (0);
272 273 }
273 274
274 275 /*ARGSUSED*/
275 276 static size_t
276 277 segspt_shmincore(struct seg *seg, caddr_t addr, size_t len, char *vec)
277 278 {
278 279 caddr_t eo_seg;
279 280 pgcnt_t npages;
280 281 struct shm_data *shmd = (struct shm_data *)seg->s_data;
281 282 struct seg *sptseg;
282 283 struct spt_data *sptd;
283 284
284 285 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
285 286 #ifdef lint
286 287 seg = seg;
287 288 #endif
288 289 sptseg = shmd->shm_sptseg;
289 290 sptd = sptseg->s_data;
290 291
291 292 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
292 293 eo_seg = addr + len;
293 294 while (addr < eo_seg) {
294 295 /* page exists, and it's locked. */
295 296 *vec++ = SEG_PAGE_INCORE | SEG_PAGE_LOCKED |
296 297 SEG_PAGE_ANON;
297 298 addr += PAGESIZE;
298 299 }
299 300 return (len);
300 301 } else {
301 302 struct anon_map *amp = shmd->shm_amp;
302 303 struct anon *ap;
303 304 page_t *pp;
304 305 pgcnt_t anon_index;
305 306 struct vnode *vp;
306 307 u_offset_t off;
307 308 ulong_t i;
308 309 int ret;
309 310 anon_sync_obj_t cookie;
310 311
311 312 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
312 313 anon_index = seg_page(seg, addr);
313 314 npages = btopr(len);
314 315 if (anon_index + npages > btopr(shmd->shm_amp->size)) {
315 316 return (EINVAL);
316 317 }
317 318 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
318 319 for (i = 0; i < npages; i++, anon_index++) {
319 320 ret = 0;
320 321 anon_array_enter(amp, anon_index, &cookie);
321 322 ap = anon_get_ptr(amp->ahp, anon_index);
322 323 if (ap != NULL) {
323 324 swap_xlate(ap, &vp, &off);
324 325 anon_array_exit(&cookie);
325 326 pp = page_lookup_nowait(vp, off, SE_SHARED);
326 327 if (pp != NULL) {
327 328 ret |= SEG_PAGE_INCORE | SEG_PAGE_ANON;
328 329 page_unlock(pp);
329 330 }
330 331 } else {
331 332 anon_array_exit(&cookie);
332 333 }
333 334 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
334 335 ret |= SEG_PAGE_LOCKED;
335 336 }
336 337 *vec++ = (char)ret;
337 338 }
338 339 ANON_LOCK_EXIT(&amp->a_rwlock);
339 340 return (len);
340 341 }
341 342 }
342 343
343 344 static int
344 345 segspt_unmap(struct seg *seg, caddr_t raddr, size_t ssize)
345 346 {
346 347 size_t share_size;
347 348
348 349 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
349 350
350 351 /*
351 352 * seg.s_size may have been rounded up to the largest page size
352 353 * in shmat().
353 354 * XXX This should be cleaned up. sptdestroy should take a length
354 355 * argument which should be the same as sptcreate. Then
355 356 * this rounding would not be needed (or is done in shm.c)
356 357 * Only the check for full segment will be needed.
357 358 *
358 359 * XXX -- shouldn't raddr == 0 always? These tests don't seem
359 360 * to be useful at all.
360 361 */
361 362 share_size = page_get_pagesize(seg->s_szc);
362 363 ssize = P2ROUNDUP(ssize, share_size);
363 364
364 365 if (raddr == seg->s_base && ssize == seg->s_size) {
365 366 seg_free(seg);
366 367 return (0);
367 368 } else
368 369 return (EINVAL);
369 370 }
370 371
371 372 int
372 373 segspt_create(struct seg *seg, caddr_t argsp)
373 374 {
374 375 int err;
375 376 caddr_t addr = seg->s_base;
376 377 struct spt_data *sptd;
377 378 struct segspt_crargs *sptcargs = (struct segspt_crargs *)argsp;
378 379 struct anon_map *amp = sptcargs->amp;
379 380 struct kshmid *sp = amp->a_sp;
380 381 struct cred *cred = CRED();
381 382 ulong_t i, j, anon_index = 0;
382 383 pgcnt_t npages = btopr(amp->size);
383 384 struct vnode *vp;
384 385 page_t **ppa;
385 386 uint_t hat_flags;
386 387 size_t pgsz;
387 388 pgcnt_t pgcnt;
388 389 caddr_t a;
389 390 pgcnt_t pidx;
390 391 size_t sz;
391 392 proc_t *procp = curproc;
392 393 rctl_qty_t lockedbytes = 0;
393 394 kproject_t *proj;
394 395
395 396 /*
396 397 * We are holding the a_lock on the underlying dummy as,
397 398 * so we can make calls to the HAT layer.
398 399 */
399 400 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
400 401 ASSERT(sp != NULL);
401 402
402 403 #ifdef DEBUG
403 404 TNF_PROBE_2(segspt_create, "spt", /* CSTYLED */,
404 405 tnf_opaque, addr, addr, tnf_ulong, len, seg->s_size);
405 406 #endif
406 407 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
407 408 if (err = anon_swap_adjust(npages))
408 409 return (err);
409 410 }
410 411 err = ENOMEM;
411 412
412 413 if ((sptd = kmem_zalloc(sizeof (*sptd), KM_NOSLEEP)) == NULL)
413 414 goto out1;
414 415
415 416 if ((sptcargs->flags & SHM_PAGEABLE) == 0) {
416 417 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * npages),
417 418 KM_NOSLEEP)) == NULL)
418 419 goto out2;
419 420 }
420 421
421 422 mutex_init(&sptd->spt_lock, NULL, MUTEX_DEFAULT, NULL);
422 423
423 424 if ((vp = kmem_zalloc(sizeof (*vp), KM_NOSLEEP)) == NULL)
424 425 goto out3;
425 426
426 427 seg->s_ops = &segspt_ops;
427 428 sptd->spt_vp = vp;
428 429 sptd->spt_amp = amp;
429 430 sptd->spt_prot = sptcargs->prot;
430 431 sptd->spt_flags = sptcargs->flags;
431 432 seg->s_data = (caddr_t)sptd;
432 433 sptd->spt_ppa = NULL;
433 434 sptd->spt_ppa_lckcnt = NULL;
434 435 seg->s_szc = sptcargs->szc;
435 436 cv_init(&sptd->spt_cv, NULL, CV_DEFAULT, NULL);
436 437 sptd->spt_gen = 0;
437 438
438 439 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
439 440 if (seg->s_szc > amp->a_szc) {
440 441 amp->a_szc = seg->s_szc;
441 442 }
442 443 ANON_LOCK_EXIT(&amp->a_rwlock);
443 444
444 445 /*
445 446 * Set policy to affect initial allocation of pages in
446 447 * anon_map_createpages()
447 448 */
448 449 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, amp, anon_index,
449 450 NULL, 0, ptob(npages));
450 451
451 452 if (sptcargs->flags & SHM_PAGEABLE) {
452 453 size_t share_sz;
453 454 pgcnt_t new_npgs, more_pgs;
454 455 struct anon_hdr *nahp;
455 456 zone_t *zone;
456 457
457 458 share_sz = page_get_pagesize(seg->s_szc);
458 459 if (!IS_P2ALIGNED(amp->size, share_sz)) {
459 460 /*
460 461 * We are rounding up the size of the anon array
461 462 * on 4 M boundary because we always create 4 M
462 463 * of page(s) when locking, faulting pages and we
463 464 * don't have to check for all corner cases e.g.
464 465 * if there is enough space to allocate 4 M
465 466 * page.
466 467 */
467 468 new_npgs = btop(P2ROUNDUP(amp->size, share_sz));
468 469 more_pgs = new_npgs - npages;
469 470
470 471 /*
471 472 * The zone will never be NULL, as a fully created
472 473 * shm always has an owning zone.
473 474 */
474 475 zone = sp->shm_perm.ipc_zone_ref.zref_zone;
475 476 ASSERT(zone != NULL);
476 477 if (anon_resv_zone(ptob(more_pgs), zone) == 0) {
477 478 err = ENOMEM;
478 479 goto out4;
479 480 }
480 481
481 482 nahp = anon_create(new_npgs, ANON_SLEEP);
482 483 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
483 484 (void) anon_copy_ptr(amp->ahp, 0, nahp, 0, npages,
484 485 ANON_SLEEP);
485 486 anon_release(amp->ahp, npages);
486 487 amp->ahp = nahp;
487 488 ASSERT(amp->swresv == ptob(npages));
488 489 amp->swresv = amp->size = ptob(new_npgs);
489 490 ANON_LOCK_EXIT(&amp->a_rwlock);
490 491 npages = new_npgs;
491 492 }
492 493
493 494 sptd->spt_ppa_lckcnt = kmem_zalloc(npages *
494 495 sizeof (*sptd->spt_ppa_lckcnt), KM_SLEEP);
495 496 sptd->spt_pcachecnt = 0;
496 497 sptd->spt_realsize = ptob(npages);
497 498 sptcargs->seg_spt = seg;
498 499 return (0);
499 500 }
500 501
501 502 /*
502 503 * get array of pages for each anon slot in amp
503 504 */
504 505 if ((err = anon_map_createpages(amp, anon_index, ptob(npages), ppa,
505 506 seg, addr, S_CREATE, cred)) != 0)
506 507 goto out4;
507 508
508 509 mutex_enter(&sp->shm_mlock);
509 510
510 511 /* May be partially locked, so, count bytes to charge for locking */
511 512 for (i = 0; i < npages; i++)
512 513 if (ppa[i]->p_lckcnt == 0)
513 514 lockedbytes += PAGESIZE;
514 515
515 516 proj = sp->shm_perm.ipc_proj;
516 517
517 518 if (lockedbytes > 0) {
518 519 mutex_enter(&procp->p_lock);
519 520 if (rctl_incr_locked_mem(procp, proj, lockedbytes, 0)) {
520 521 mutex_exit(&procp->p_lock);
521 522 mutex_exit(&sp->shm_mlock);
522 523 for (i = 0; i < npages; i++)
523 524 page_unlock(ppa[i]);
524 525 err = ENOMEM;
525 526 goto out4;
526 527 }
527 528 mutex_exit(&procp->p_lock);
528 529 }
529 530
530 531 /*
531 532 * addr is initial address corresponding to the first page on ppa list
532 533 */
533 534 for (i = 0; i < npages; i++) {
534 535 /* attempt to lock all pages */
535 536 if (page_pp_lock(ppa[i], 0, 1) == 0) {
536 537 /*
537 538 * if unable to lock any page, unlock all
538 539 * of them and return error
539 540 */
540 541 for (j = 0; j < i; j++)
541 542 page_pp_unlock(ppa[j], 0, 1);
542 543 for (i = 0; i < npages; i++)
543 544 page_unlock(ppa[i]);
544 545 rctl_decr_locked_mem(NULL, proj, lockedbytes, 0);
545 546 mutex_exit(&sp->shm_mlock);
546 547 err = ENOMEM;
547 548 goto out4;
548 549 }
549 550 }
550 551 mutex_exit(&sp->shm_mlock);
551 552
552 553 /*
553 554 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
554 555 * for the entire life of the segment. For example platforms
555 556 * that do not support Dynamic Reconfiguration.
556 557 */
557 558 hat_flags = HAT_LOAD_SHARE;
558 559 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, NULL))
559 560 hat_flags |= HAT_LOAD_LOCK;
560 561
561 562 /*
562 563 * Load translations one large page at a time
563 564 * to make sure we don't create mappings bigger than
564 565 * segment's size code in case underlying pages
565 566 * are shared with segvn's segment that uses bigger
566 567 * size code than we do.
567 568 */
568 569 pgsz = page_get_pagesize(seg->s_szc);
569 570 pgcnt = page_get_pagecnt(seg->s_szc);
570 571 for (a = addr, pidx = 0; pidx < npages; a += pgsz, pidx += pgcnt) {
571 572 sz = MIN(pgsz, ptob(npages - pidx));
572 573 hat_memload_array(seg->s_as->a_hat, a, sz,
573 574 &ppa[pidx], sptd->spt_prot, hat_flags);
574 575 }
575 576
576 577 /*
577 578 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
578 579 * we will leave the pages locked SE_SHARED for the life
579 580 * of the ISM segment. This will prevent any calls to
580 581 * hat_pageunload() on this ISM segment for those platforms.
581 582 */
582 583 if (!(hat_flags & HAT_LOAD_LOCK)) {
583 584 /*
584 585 * On platforms that support HAT_DYNAMIC_ISM_UNMAP,
585 586 * we no longer need to hold the SE_SHARED lock on the pages,
586 587 * since L_PAGELOCK and F_SOFTLOCK calls will grab the
587 588 * SE_SHARED lock on the pages as necessary.
588 589 */
589 590 for (i = 0; i < npages; i++)
590 591 page_unlock(ppa[i]);
591 592 }
592 593 sptd->spt_pcachecnt = 0;
593 594 kmem_free(ppa, ((sizeof (page_t *)) * npages));
594 595 sptd->spt_realsize = ptob(npages);
595 596 atomic_add_long(&spt_used, npages);
596 597 sptcargs->seg_spt = seg;
597 598 return (0);
598 599
599 600 out4:
600 601 seg->s_data = NULL;
601 602 kmem_free(vp, sizeof (*vp));
602 603 cv_destroy(&sptd->spt_cv);
603 604 out3:
604 605 mutex_destroy(&sptd->spt_lock);
605 606 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
606 607 kmem_free(ppa, (sizeof (*ppa) * npages));
607 608 out2:
608 609 kmem_free(sptd, sizeof (*sptd));
609 610 out1:
610 611 if ((sptcargs->flags & SHM_PAGEABLE) == 0)
611 612 anon_swap_restore(npages);
612 613 return (err);
613 614 }
614 615
615 616 /*ARGSUSED*/
616 617 void
617 618 segspt_free_pages(struct seg *seg, caddr_t addr, size_t len)
618 619 {
619 620 struct page *pp;
620 621 struct spt_data *sptd = (struct spt_data *)seg->s_data;
621 622 pgcnt_t npages;
622 623 ulong_t anon_idx;
623 624 struct anon_map *amp;
624 625 struct anon *ap;
625 626 struct vnode *vp;
626 627 u_offset_t off;
627 628 uint_t hat_flags;
628 629 int root = 0;
629 630 pgcnt_t pgs, curnpgs = 0;
630 631 page_t *rootpp;
631 632 rctl_qty_t unlocked_bytes = 0;
632 633 kproject_t *proj;
633 634 kshmid_t *sp;
634 635
635 636 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
636 637
637 638 len = P2ROUNDUP(len, PAGESIZE);
638 639
639 640 npages = btop(len);
640 641
641 642 hat_flags = HAT_UNLOAD_UNLOCK | HAT_UNLOAD_UNMAP;
642 643 if ((hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) ||
643 644 (sptd->spt_flags & SHM_PAGEABLE)) {
644 645 hat_flags = HAT_UNLOAD_UNMAP;
645 646 }
646 647
647 648 hat_unload(seg->s_as->a_hat, addr, len, hat_flags);
648 649
649 650 amp = sptd->spt_amp;
650 651 if (sptd->spt_flags & SHM_PAGEABLE)
651 652 npages = btop(amp->size);
652 653
653 654 ASSERT(amp != NULL);
654 655
655 656 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
656 657 sp = amp->a_sp;
657 658 proj = sp->shm_perm.ipc_proj;
658 659 mutex_enter(&sp->shm_mlock);
659 660 }
660 661 for (anon_idx = 0; anon_idx < npages; anon_idx++) {
661 662 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
662 663 if ((ap = anon_get_ptr(amp->ahp, anon_idx)) == NULL) {
663 664 panic("segspt_free_pages: null app");
664 665 /*NOTREACHED*/
665 666 }
666 667 } else {
667 668 if ((ap = anon_get_next_ptr(amp->ahp, &anon_idx))
668 669 == NULL)
669 670 continue;
670 671 }
671 672 ASSERT(ANON_ISBUSY(anon_get_slot(amp->ahp, anon_idx)) == 0);
672 673 swap_xlate(ap, &vp, &off);
673 674
674 675 /*
675 676 * If this platform supports HAT_DYNAMIC_ISM_UNMAP,
676 677 * the pages won't be having SE_SHARED lock at this
677 678 * point.
678 679 *
679 680 * On platforms that do not support HAT_DYNAMIC_ISM_UNMAP,
680 681 * the pages are still held SE_SHARED locked from the
681 682 * original segspt_create()
682 683 *
683 684 * Our goal is to get SE_EXCL lock on each page, remove
684 685 * permanent lock on it and invalidate the page.
685 686 */
686 687 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
687 688 if (hat_flags == HAT_UNLOAD_UNMAP)
688 689 pp = page_lookup(vp, off, SE_EXCL);
689 690 else {
690 691 if ((pp = page_find(vp, off)) == NULL) {
691 692 panic("segspt_free_pages: "
692 693 "page not locked");
693 694 /*NOTREACHED*/
694 695 }
695 696 if (!page_tryupgrade(pp)) {
696 697 page_unlock(pp);
697 698 pp = page_lookup(vp, off, SE_EXCL);
698 699 }
699 700 }
700 701 if (pp == NULL) {
701 702 panic("segspt_free_pages: "
702 703 "page not in the system");
703 704 /*NOTREACHED*/
704 705 }
705 706 ASSERT(pp->p_lckcnt > 0);
706 707 page_pp_unlock(pp, 0, 1);
707 708 if (pp->p_lckcnt == 0)
708 709 unlocked_bytes += PAGESIZE;
709 710 } else {
710 711 if ((pp = page_lookup(vp, off, SE_EXCL)) == NULL)
711 712 continue;
712 713 }
713 714 /*
714 715 * It's logical to invalidate the pages here as in most cases
715 716 * these were created by segspt.
716 717 */
717 718 if (pp->p_szc != 0) {
718 719 if (root == 0) {
719 720 ASSERT(curnpgs == 0);
720 721 root = 1;
721 722 rootpp = pp;
722 723 pgs = curnpgs = page_get_pagecnt(pp->p_szc);
723 724 ASSERT(pgs > 1);
724 725 ASSERT(IS_P2ALIGNED(pgs, pgs));
725 726 ASSERT(!(page_pptonum(pp) & (pgs - 1)));
726 727 curnpgs--;
727 728 } else if ((page_pptonum(pp) & (pgs - 1)) == pgs - 1) {
728 729 ASSERT(curnpgs == 1);
729 730 ASSERT(page_pptonum(pp) ==
730 731 page_pptonum(rootpp) + (pgs - 1));
731 732 page_destroy_pages(rootpp);
732 733 root = 0;
733 734 curnpgs = 0;
734 735 } else {
735 736 ASSERT(curnpgs > 1);
736 737 ASSERT(page_pptonum(pp) ==
737 738 page_pptonum(rootpp) + (pgs - curnpgs));
738 739 curnpgs--;
739 740 }
740 741 } else {
741 742 if (root != 0 || curnpgs != 0) {
742 743 panic("segspt_free_pages: bad large page");
743 744 /*NOTREACHED*/
744 745 }
745 746 /*
746 747 * Before destroying the pages, we need to take care
747 748 * of the rctl locked memory accounting. For that
748 749 * we need to calculate the unlocked_bytes.
749 750 */
750 751 if (pp->p_lckcnt > 0)
751 752 unlocked_bytes += PAGESIZE;
752 753 /*LINTED: constant in conditional context */
753 754 VN_DISPOSE(pp, B_INVAL, 0, kcred);
754 755 }
755 756 }
756 757 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
757 758 if (unlocked_bytes > 0)
758 759 rctl_decr_locked_mem(NULL, proj, unlocked_bytes, 0);
759 760 mutex_exit(&sp->shm_mlock);
760 761 }
761 762 if (root != 0 || curnpgs != 0) {
762 763 panic("segspt_free_pages: bad large page");
763 764 /*NOTREACHED*/
764 765 }
765 766
766 767 /*
767 768 * mark that pages have been released
768 769 */
769 770 sptd->spt_realsize = 0;
770 771
771 772 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
772 773 atomic_add_long(&spt_used, -npages);
773 774 anon_swap_restore(npages);
774 775 }
775 776 }
776 777
777 778 /*
778 779 * Get memory allocation policy info for specified address in given segment
779 780 */
780 781 static lgrp_mem_policy_info_t *
781 782 segspt_getpolicy(struct seg *seg, caddr_t addr)
782 783 {
783 784 struct anon_map *amp;
784 785 ulong_t anon_index;
785 786 lgrp_mem_policy_info_t *policy_info;
786 787 struct spt_data *spt_data;
787 788
788 789 ASSERT(seg != NULL);
789 790
790 791 /*
791 792 * Get anon_map from segspt
792 793 *
793 794 * Assume that no lock needs to be held on anon_map, since
794 795 * it should be protected by its reference count which must be
795 796 * nonzero for an existing segment
796 797 * Need to grab readers lock on policy tree though
797 798 */
798 799 spt_data = (struct spt_data *)seg->s_data;
799 800 if (spt_data == NULL)
800 801 return (NULL);
801 802 amp = spt_data->spt_amp;
802 803 ASSERT(amp->refcnt != 0);
803 804
804 805 /*
805 806 * Get policy info
806 807 *
807 808 * Assume starting anon index of 0
808 809 */
809 810 anon_index = seg_page(seg, addr);
810 811 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
811 812
812 813 return (policy_info);
813 814 }
814 815
815 816 /*
816 817 * DISM only.
817 818 * Return locked pages over a given range.
818 819 *
819 820 * We will cache all DISM locked pages and save the pplist for the
820 821 * entire segment in the ppa field of the underlying DISM segment structure.
821 822 * Later, during a call to segspt_reclaim() we will use this ppa array
822 823 * to page_unlock() all of the pages and then we will free this ppa list.
823 824 */
824 825 /*ARGSUSED*/
825 826 static int
826 827 segspt_dismpagelock(struct seg *seg, caddr_t addr, size_t len,
827 828 struct page ***ppp, enum lock_type type, enum seg_rw rw)
828 829 {
829 830 struct shm_data *shmd = (struct shm_data *)seg->s_data;
830 831 struct seg *sptseg = shmd->shm_sptseg;
831 832 struct spt_data *sptd = sptseg->s_data;
832 833 pgcnt_t pg_idx, npages, tot_npages, npgs;
833 834 struct page **pplist, **pl, **ppa, *pp;
834 835 struct anon_map *amp;
835 836 spgcnt_t an_idx;
836 837 int ret = ENOTSUP;
837 838 uint_t pl_built = 0;
838 839 struct anon *ap;
839 840 struct vnode *vp;
840 841 u_offset_t off;
841 842 pgcnt_t claim_availrmem = 0;
842 843 uint_t szc;
843 844
844 845 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
845 846 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
846 847
847 848 /*
848 849 * We want to lock/unlock the entire ISM segment. Therefore,
849 850 * we will be using the underlying sptseg and its base address
850 851 * and length for the caching arguments.
851 852 */
852 853 ASSERT(sptseg);
853 854 ASSERT(sptd);
854 855
855 856 pg_idx = seg_page(seg, addr);
856 857 npages = btopr(len);
857 858
858 859 /*
859 860 * check if the request is larger than number of pages covered
860 861 * by amp
861 862 */
862 863 if (pg_idx + npages > btopr(sptd->spt_amp->size)) {
863 864 *ppp = NULL;
864 865 return (ENOTSUP);
865 866 }
866 867
867 868 if (type == L_PAGEUNLOCK) {
868 869 ASSERT(sptd->spt_ppa != NULL);
869 870
870 871 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
871 872 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
872 873
873 874 /*
874 875 * If someone is blocked while unmapping, we purge
875 876 * segment page cache and thus reclaim pplist synchronously
876 877 * without waiting for seg_pasync_thread. This speeds up
877 878 * unmapping in cases where munmap(2) is called, while
878 879 * raw async i/o is still in progress or where a thread
879 880 * exits on data fault in a multithreaded application.
880 881 */
881 882 if ((sptd->spt_flags & DISM_PPA_CHANGED) ||
882 883 (AS_ISUNMAPWAIT(seg->s_as) &&
883 884 shmd->shm_softlockcnt > 0)) {
884 885 segspt_purge(seg);
885 886 }
886 887 return (0);
887 888 }
888 889
889 890 /* The L_PAGELOCK case ... */
890 891
891 892 if (sptd->spt_flags & DISM_PPA_CHANGED) {
892 893 segspt_purge(seg);
893 894 /*
894 895 * for DISM ppa needs to be rebuild since
895 896 * number of locked pages could be changed
896 897 */
897 898 *ppp = NULL;
898 899 return (ENOTSUP);
899 900 }
900 901
901 902 /*
902 903 * First try to find pages in segment page cache, without
903 904 * holding the segment lock.
904 905 */
905 906 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
906 907 S_WRITE, SEGP_FORCE_WIRED);
907 908 if (pplist != NULL) {
908 909 ASSERT(sptd->spt_ppa != NULL);
909 910 ASSERT(sptd->spt_ppa == pplist);
910 911 ppa = sptd->spt_ppa;
911 912 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
912 913 if (ppa[an_idx] == NULL) {
913 914 seg_pinactive(seg, NULL, seg->s_base,
914 915 sptd->spt_amp->size, ppa,
915 916 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
916 917 *ppp = NULL;
917 918 return (ENOTSUP);
918 919 }
919 920 if ((szc = ppa[an_idx]->p_szc) != 0) {
920 921 npgs = page_get_pagecnt(szc);
921 922 an_idx = P2ROUNDUP(an_idx + 1, npgs);
922 923 } else {
923 924 an_idx++;
924 925 }
925 926 }
926 927 /*
927 928 * Since we cache the entire DISM segment, we want to
928 929 * set ppp to point to the first slot that corresponds
929 930 * to the requested addr, i.e. pg_idx.
930 931 */
931 932 *ppp = &(sptd->spt_ppa[pg_idx]);
932 933 return (0);
933 934 }
934 935
935 936 mutex_enter(&sptd->spt_lock);
936 937 /*
937 938 * try to find pages in segment page cache with mutex
938 939 */
939 940 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
940 941 S_WRITE, SEGP_FORCE_WIRED);
941 942 if (pplist != NULL) {
942 943 ASSERT(sptd->spt_ppa != NULL);
943 944 ASSERT(sptd->spt_ppa == pplist);
944 945 ppa = sptd->spt_ppa;
945 946 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
946 947 if (ppa[an_idx] == NULL) {
947 948 mutex_exit(&sptd->spt_lock);
948 949 seg_pinactive(seg, NULL, seg->s_base,
949 950 sptd->spt_amp->size, ppa,
950 951 S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
951 952 *ppp = NULL;
952 953 return (ENOTSUP);
953 954 }
954 955 if ((szc = ppa[an_idx]->p_szc) != 0) {
955 956 npgs = page_get_pagecnt(szc);
956 957 an_idx = P2ROUNDUP(an_idx + 1, npgs);
957 958 } else {
958 959 an_idx++;
959 960 }
960 961 }
961 962 /*
962 963 * Since we cache the entire DISM segment, we want to
963 964 * set ppp to point to the first slot that corresponds
964 965 * to the requested addr, i.e. pg_idx.
965 966 */
966 967 mutex_exit(&sptd->spt_lock);
967 968 *ppp = &(sptd->spt_ppa[pg_idx]);
968 969 return (0);
969 970 }
970 971 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
971 972 SEGP_FORCE_WIRED) == SEGP_FAIL) {
972 973 mutex_exit(&sptd->spt_lock);
973 974 *ppp = NULL;
974 975 return (ENOTSUP);
975 976 }
976 977
977 978 /*
978 979 * No need to worry about protections because DISM pages are always rw.
979 980 */
980 981 pl = pplist = NULL;
981 982 amp = sptd->spt_amp;
982 983
983 984 /*
984 985 * Do we need to build the ppa array?
985 986 */
986 987 if (sptd->spt_ppa == NULL) {
987 988 pgcnt_t lpg_cnt = 0;
988 989
989 990 pl_built = 1;
990 991 tot_npages = btopr(sptd->spt_amp->size);
991 992
992 993 ASSERT(sptd->spt_pcachecnt == 0);
993 994 pplist = kmem_zalloc(sizeof (page_t *) * tot_npages, KM_SLEEP);
994 995 pl = pplist;
995 996
996 997 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
997 998 for (an_idx = 0; an_idx < tot_npages; ) {
998 999 ap = anon_get_ptr(amp->ahp, an_idx);
999 1000 /*
1000 1001 * Cache only mlocked pages. For large pages
1001 1002 * if one (constituent) page is mlocked
1002 1003 * all pages for that large page
1003 1004 * are cached also. This is for quick
1004 1005 * lookups of ppa array;
1005 1006 */
1006 1007 if ((ap != NULL) && (lpg_cnt != 0 ||
1007 1008 (sptd->spt_ppa_lckcnt[an_idx] != 0))) {
1008 1009
1009 1010 swap_xlate(ap, &vp, &off);
1010 1011 pp = page_lookup(vp, off, SE_SHARED);
1011 1012 ASSERT(pp != NULL);
1012 1013 if (lpg_cnt == 0) {
1013 1014 lpg_cnt++;
1014 1015 /*
1015 1016 * For a small page, we are done --
1016 1017 * lpg_count is reset to 0 below.
1017 1018 *
1018 1019 * For a large page, we are guaranteed
1019 1020 * to find the anon structures of all
1020 1021 * constituent pages and a non-zero
1021 1022 * lpg_cnt ensures that we don't test
1022 1023 * for mlock for these. We are done
1023 1024 * when lpg_count reaches (npgs + 1).
1024 1025 * If we are not the first constituent
1025 1026 * page, restart at the first one.
1026 1027 */
1027 1028 npgs = page_get_pagecnt(pp->p_szc);
1028 1029 if (!IS_P2ALIGNED(an_idx, npgs)) {
1029 1030 an_idx = P2ALIGN(an_idx, npgs);
1030 1031 page_unlock(pp);
1031 1032 continue;
1032 1033 }
1033 1034 }
1034 1035 if (++lpg_cnt > npgs)
1035 1036 lpg_cnt = 0;
1036 1037
1037 1038 /*
1038 1039 * availrmem is decremented only
1039 1040 * for unlocked pages
1040 1041 */
1041 1042 if (sptd->spt_ppa_lckcnt[an_idx] == 0)
1042 1043 claim_availrmem++;
1043 1044 pplist[an_idx] = pp;
1044 1045 }
1045 1046 an_idx++;
1046 1047 }
1047 1048 ANON_LOCK_EXIT(&amp->a_rwlock);
1048 1049
1049 1050 if (claim_availrmem) {
1050 1051 mutex_enter(&freemem_lock);
1051 1052 if (availrmem < tune.t_minarmem + claim_availrmem) {
1052 1053 mutex_exit(&freemem_lock);
1053 1054 ret = ENOTSUP;
1054 1055 claim_availrmem = 0;
1055 1056 goto insert_fail;
1056 1057 } else {
1057 1058 availrmem -= claim_availrmem;
1058 1059 }
1059 1060 mutex_exit(&freemem_lock);
1060 1061 }
1061 1062
1062 1063 sptd->spt_ppa = pl;
1063 1064 } else {
1064 1065 /*
1065 1066 * We already have a valid ppa[].
1066 1067 */
1067 1068 pl = sptd->spt_ppa;
1068 1069 }
1069 1070
1070 1071 ASSERT(pl != NULL);
1071 1072
1072 1073 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1073 1074 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1074 1075 segspt_reclaim);
1075 1076 if (ret == SEGP_FAIL) {
1076 1077 /*
1077 1078 * seg_pinsert failed. We return
1078 1079 * ENOTSUP, so that the as_pagelock() code will
1079 1080 * then try the slower F_SOFTLOCK path.
1080 1081 */
1081 1082 if (pl_built) {
1082 1083 /*
1083 1084 * No one else has referenced the ppa[].
1084 1085 * We created it and we need to destroy it.
1085 1086 */
1086 1087 sptd->spt_ppa = NULL;
1087 1088 }
1088 1089 ret = ENOTSUP;
1089 1090 goto insert_fail;
1090 1091 }
1091 1092
1092 1093 /*
1093 1094 * In either case, we increment softlockcnt on the 'real' segment.
1094 1095 */
1095 1096 sptd->spt_pcachecnt++;
1096 1097 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1097 1098
1098 1099 ppa = sptd->spt_ppa;
1099 1100 for (an_idx = pg_idx; an_idx < pg_idx + npages; ) {
1100 1101 if (ppa[an_idx] == NULL) {
1101 1102 mutex_exit(&sptd->spt_lock);
1102 1103 seg_pinactive(seg, NULL, seg->s_base,
1103 1104 sptd->spt_amp->size,
1104 1105 pl, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1105 1106 *ppp = NULL;
1106 1107 return (ENOTSUP);
1107 1108 }
1108 1109 if ((szc = ppa[an_idx]->p_szc) != 0) {
1109 1110 npgs = page_get_pagecnt(szc);
1110 1111 an_idx = P2ROUNDUP(an_idx + 1, npgs);
1111 1112 } else {
1112 1113 an_idx++;
1113 1114 }
1114 1115 }
1115 1116 /*
1116 1117 * We can now drop the sptd->spt_lock since the ppa[]
1117 1118 * exists and we have incremented pacachecnt.
1118 1119 */
1119 1120 mutex_exit(&sptd->spt_lock);
1120 1121
1121 1122 /*
1122 1123 * Since we cache the entire segment, we want to
1123 1124 * set ppp to point to the first slot that corresponds
1124 1125 * to the requested addr, i.e. pg_idx.
1125 1126 */
1126 1127 *ppp = &(sptd->spt_ppa[pg_idx]);
1127 1128 return (0);
1128 1129
1129 1130 insert_fail:
1130 1131 /*
1131 1132 * We will only reach this code if we tried and failed.
1132 1133 *
1133 1134 * And we can drop the lock on the dummy seg, once we've failed
1134 1135 * to set up a new ppa[].
1135 1136 */
1136 1137 mutex_exit(&sptd->spt_lock);
1137 1138
1138 1139 if (pl_built) {
1139 1140 if (claim_availrmem) {
1140 1141 mutex_enter(&freemem_lock);
1141 1142 availrmem += claim_availrmem;
1142 1143 mutex_exit(&freemem_lock);
1143 1144 }
1144 1145
1145 1146 /*
1146 1147 * We created pl and we need to destroy it.
1147 1148 */
1148 1149 pplist = pl;
1149 1150 for (an_idx = 0; an_idx < tot_npages; an_idx++) {
1150 1151 if (pplist[an_idx] != NULL)
1151 1152 page_unlock(pplist[an_idx]);
1152 1153 }
1153 1154 kmem_free(pl, sizeof (page_t *) * tot_npages);
1154 1155 }
1155 1156
1156 1157 if (shmd->shm_softlockcnt <= 0) {
1157 1158 if (AS_ISUNMAPWAIT(seg->s_as)) {
1158 1159 mutex_enter(&seg->s_as->a_contents);
1159 1160 if (AS_ISUNMAPWAIT(seg->s_as)) {
1160 1161 AS_CLRUNMAPWAIT(seg->s_as);
1161 1162 cv_broadcast(&seg->s_as->a_cv);
1162 1163 }
1163 1164 mutex_exit(&seg->s_as->a_contents);
1164 1165 }
1165 1166 }
1166 1167 *ppp = NULL;
1167 1168 return (ret);
1168 1169 }
1169 1170
1170 1171
1171 1172
1172 1173 /*
1173 1174 * return locked pages over a given range.
1174 1175 *
1175 1176 * We will cache the entire ISM segment and save the pplist for the
1176 1177 * entire segment in the ppa field of the underlying ISM segment structure.
1177 1178 * Later, during a call to segspt_reclaim() we will use this ppa array
1178 1179 * to page_unlock() all of the pages and then we will free this ppa list.
1179 1180 */
1180 1181 /*ARGSUSED*/
1181 1182 static int
1182 1183 segspt_shmpagelock(struct seg *seg, caddr_t addr, size_t len,
1183 1184 struct page ***ppp, enum lock_type type, enum seg_rw rw)
1184 1185 {
1185 1186 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1186 1187 struct seg *sptseg = shmd->shm_sptseg;
1187 1188 struct spt_data *sptd = sptseg->s_data;
1188 1189 pgcnt_t np, page_index, npages;
1189 1190 caddr_t a, spt_base;
1190 1191 struct page **pplist, **pl, *pp;
1191 1192 struct anon_map *amp;
1192 1193 ulong_t anon_index;
1193 1194 int ret = ENOTSUP;
1194 1195 uint_t pl_built = 0;
1195 1196 struct anon *ap;
1196 1197 struct vnode *vp;
1197 1198 u_offset_t off;
1198 1199
1199 1200 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1200 1201 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK);
1201 1202
1202 1203
1203 1204 /*
1204 1205 * We want to lock/unlock the entire ISM segment. Therefore,
1205 1206 * we will be using the underlying sptseg and its base address
1206 1207 * and length for the caching arguments.
1207 1208 */
1208 1209 ASSERT(sptseg);
1209 1210 ASSERT(sptd);
1210 1211
1211 1212 if (sptd->spt_flags & SHM_PAGEABLE) {
1212 1213 return (segspt_dismpagelock(seg, addr, len, ppp, type, rw));
1213 1214 }
1214 1215
1215 1216 page_index = seg_page(seg, addr);
1216 1217 npages = btopr(len);
1217 1218
1218 1219 /*
1219 1220 * check if the request is larger than number of pages covered
1220 1221 * by amp
1221 1222 */
1222 1223 if (page_index + npages > btopr(sptd->spt_amp->size)) {
1223 1224 *ppp = NULL;
1224 1225 return (ENOTSUP);
1225 1226 }
1226 1227
1227 1228 if (type == L_PAGEUNLOCK) {
1228 1229
1229 1230 ASSERT(sptd->spt_ppa != NULL);
1230 1231
1231 1232 seg_pinactive(seg, NULL, seg->s_base, sptd->spt_amp->size,
1232 1233 sptd->spt_ppa, S_WRITE, SEGP_FORCE_WIRED, segspt_reclaim);
1233 1234
1234 1235 /*
1235 1236 * If someone is blocked while unmapping, we purge
1236 1237 * segment page cache and thus reclaim pplist synchronously
1237 1238 * without waiting for seg_pasync_thread. This speeds up
1238 1239 * unmapping in cases where munmap(2) is called, while
1239 1240 * raw async i/o is still in progress or where a thread
1240 1241 * exits on data fault in a multithreaded application.
1241 1242 */
1242 1243 if (AS_ISUNMAPWAIT(seg->s_as) && (shmd->shm_softlockcnt > 0)) {
1243 1244 segspt_purge(seg);
1244 1245 }
1245 1246 return (0);
1246 1247 }
1247 1248
1248 1249 /* The L_PAGELOCK case... */
1249 1250
1250 1251 /*
1251 1252 * First try to find pages in segment page cache, without
1252 1253 * holding the segment lock.
1253 1254 */
1254 1255 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1255 1256 S_WRITE, SEGP_FORCE_WIRED);
1256 1257 if (pplist != NULL) {
1257 1258 ASSERT(sptd->spt_ppa == pplist);
1258 1259 ASSERT(sptd->spt_ppa[page_index]);
1259 1260 /*
1260 1261 * Since we cache the entire ISM segment, we want to
1261 1262 * set ppp to point to the first slot that corresponds
1262 1263 * to the requested addr, i.e. page_index.
1263 1264 */
1264 1265 *ppp = &(sptd->spt_ppa[page_index]);
1265 1266 return (0);
1266 1267 }
1267 1268
1268 1269 mutex_enter(&sptd->spt_lock);
1269 1270
1270 1271 /*
1271 1272 * try to find pages in segment page cache
1272 1273 */
1273 1274 pplist = seg_plookup(seg, NULL, seg->s_base, sptd->spt_amp->size,
1274 1275 S_WRITE, SEGP_FORCE_WIRED);
1275 1276 if (pplist != NULL) {
1276 1277 ASSERT(sptd->spt_ppa == pplist);
1277 1278 /*
1278 1279 * Since we cache the entire segment, we want to
1279 1280 * set ppp to point to the first slot that corresponds
1280 1281 * to the requested addr, i.e. page_index.
1281 1282 */
1282 1283 mutex_exit(&sptd->spt_lock);
1283 1284 *ppp = &(sptd->spt_ppa[page_index]);
1284 1285 return (0);
1285 1286 }
1286 1287
1287 1288 if (seg_pinsert_check(seg, NULL, seg->s_base, sptd->spt_amp->size,
1288 1289 SEGP_FORCE_WIRED) == SEGP_FAIL) {
1289 1290 mutex_exit(&sptd->spt_lock);
1290 1291 *ppp = NULL;
1291 1292 return (ENOTSUP);
1292 1293 }
1293 1294
1294 1295 /*
1295 1296 * No need to worry about protections because ISM pages
1296 1297 * are always rw.
1297 1298 */
1298 1299 pl = pplist = NULL;
1299 1300
1300 1301 /*
1301 1302 * Do we need to build the ppa array?
1302 1303 */
1303 1304 if (sptd->spt_ppa == NULL) {
1304 1305 ASSERT(sptd->spt_ppa == pplist);
1305 1306
1306 1307 spt_base = sptseg->s_base;
1307 1308 pl_built = 1;
1308 1309
1309 1310 /*
1310 1311 * availrmem is decremented once during anon_swap_adjust()
1311 1312 * and is incremented during the anon_unresv(), which is
1312 1313 * called from shm_rm_amp() when the segment is destroyed.
1313 1314 */
1314 1315 amp = sptd->spt_amp;
1315 1316 ASSERT(amp != NULL);
1316 1317
1317 1318 /* pcachecnt is protected by sptd->spt_lock */
1318 1319 ASSERT(sptd->spt_pcachecnt == 0);
1319 1320 pplist = kmem_zalloc(sizeof (page_t *)
1320 1321 * btopr(sptd->spt_amp->size), KM_SLEEP);
1321 1322 pl = pplist;
1322 1323
1323 1324 anon_index = seg_page(sptseg, spt_base);
1324 1325
1325 1326 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
1326 1327 for (a = spt_base; a < (spt_base + sptd->spt_amp->size);
1327 1328 a += PAGESIZE, anon_index++, pplist++) {
1328 1329 ap = anon_get_ptr(amp->ahp, anon_index);
1329 1330 ASSERT(ap != NULL);
1330 1331 swap_xlate(ap, &vp, &off);
1331 1332 pp = page_lookup(vp, off, SE_SHARED);
1332 1333 ASSERT(pp != NULL);
1333 1334 *pplist = pp;
1334 1335 }
1335 1336 ANON_LOCK_EXIT(&amp->a_rwlock);
1336 1337
1337 1338 if (a < (spt_base + sptd->spt_amp->size)) {
1338 1339 ret = ENOTSUP;
1339 1340 goto insert_fail;
1340 1341 }
1341 1342 sptd->spt_ppa = pl;
1342 1343 } else {
1343 1344 /*
1344 1345 * We already have a valid ppa[].
1345 1346 */
1346 1347 pl = sptd->spt_ppa;
1347 1348 }
1348 1349
1349 1350 ASSERT(pl != NULL);
1350 1351
1351 1352 ret = seg_pinsert(seg, NULL, seg->s_base, sptd->spt_amp->size,
1352 1353 sptd->spt_amp->size, pl, S_WRITE, SEGP_FORCE_WIRED,
1353 1354 segspt_reclaim);
1354 1355 if (ret == SEGP_FAIL) {
1355 1356 /*
1356 1357 * seg_pinsert failed. We return
1357 1358 * ENOTSUP, so that the as_pagelock() code will
1358 1359 * then try the slower F_SOFTLOCK path.
1359 1360 */
1360 1361 if (pl_built) {
1361 1362 /*
1362 1363 * No one else has referenced the ppa[].
1363 1364 * We created it and we need to destroy it.
1364 1365 */
1365 1366 sptd->spt_ppa = NULL;
1366 1367 }
1367 1368 ret = ENOTSUP;
1368 1369 goto insert_fail;
1369 1370 }
1370 1371
1371 1372 /*
1372 1373 * In either case, we increment softlockcnt on the 'real' segment.
1373 1374 */
1374 1375 sptd->spt_pcachecnt++;
1375 1376 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1376 1377
1377 1378 /*
1378 1379 * We can now drop the sptd->spt_lock since the ppa[]
1379 1380 * exists and we have incremented pacachecnt.
1380 1381 */
1381 1382 mutex_exit(&sptd->spt_lock);
1382 1383
1383 1384 /*
1384 1385 * Since we cache the entire segment, we want to
1385 1386 * set ppp to point to the first slot that corresponds
1386 1387 * to the requested addr, i.e. page_index.
1387 1388 */
1388 1389 *ppp = &(sptd->spt_ppa[page_index]);
1389 1390 return (0);
1390 1391
1391 1392 insert_fail:
1392 1393 /*
1393 1394 * We will only reach this code if we tried and failed.
1394 1395 *
1395 1396 * And we can drop the lock on the dummy seg, once we've failed
1396 1397 * to set up a new ppa[].
1397 1398 */
1398 1399 mutex_exit(&sptd->spt_lock);
1399 1400
1400 1401 if (pl_built) {
1401 1402 /*
1402 1403 * We created pl and we need to destroy it.
1403 1404 */
1404 1405 pplist = pl;
1405 1406 np = (((uintptr_t)(a - spt_base)) >> PAGESHIFT);
1406 1407 while (np) {
1407 1408 page_unlock(*pplist);
1408 1409 np--;
1409 1410 pplist++;
1410 1411 }
1411 1412 kmem_free(pl, sizeof (page_t *) * btopr(sptd->spt_amp->size));
1412 1413 }
1413 1414 if (shmd->shm_softlockcnt <= 0) {
1414 1415 if (AS_ISUNMAPWAIT(seg->s_as)) {
1415 1416 mutex_enter(&seg->s_as->a_contents);
1416 1417 if (AS_ISUNMAPWAIT(seg->s_as)) {
1417 1418 AS_CLRUNMAPWAIT(seg->s_as);
1418 1419 cv_broadcast(&seg->s_as->a_cv);
1419 1420 }
1420 1421 mutex_exit(&seg->s_as->a_contents);
1421 1422 }
1422 1423 }
1423 1424 *ppp = NULL;
1424 1425 return (ret);
1425 1426 }
1426 1427
1427 1428 /*
1428 1429 * purge any cached pages in the I/O page cache
1429 1430 */
1430 1431 static void
1431 1432 segspt_purge(struct seg *seg)
1432 1433 {
1433 1434 seg_ppurge(seg, NULL, SEGP_FORCE_WIRED);
1434 1435 }
1435 1436
1436 1437 static int
1437 1438 segspt_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist,
1438 1439 enum seg_rw rw, int async)
1439 1440 {
1440 1441 struct seg *seg = (struct seg *)ptag;
1441 1442 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1442 1443 struct seg *sptseg;
1443 1444 struct spt_data *sptd;
1444 1445 pgcnt_t npages, i, free_availrmem = 0;
1445 1446 int done = 0;
1446 1447
1447 1448 #ifdef lint
1448 1449 addr = addr;
1449 1450 #endif
1450 1451 sptseg = shmd->shm_sptseg;
1451 1452 sptd = sptseg->s_data;
1452 1453 npages = (len >> PAGESHIFT);
1453 1454 ASSERT(npages);
1454 1455 ASSERT(sptd->spt_pcachecnt != 0);
1455 1456 ASSERT(sptd->spt_ppa == pplist);
1456 1457 ASSERT(npages == btopr(sptd->spt_amp->size));
1457 1458 ASSERT(async || AS_LOCK_HELD(seg->s_as));
1458 1459
1459 1460 /*
1460 1461 * Acquire the lock on the dummy seg and destroy the
1461 1462 * ppa array IF this is the last pcachecnt.
1462 1463 */
1463 1464 mutex_enter(&sptd->spt_lock);
1464 1465 if (--sptd->spt_pcachecnt == 0) {
1465 1466 for (i = 0; i < npages; i++) {
1466 1467 if (pplist[i] == NULL) {
1467 1468 continue;
1468 1469 }
1469 1470 if (rw == S_WRITE) {
1470 1471 hat_setrefmod(pplist[i]);
1471 1472 } else {
1472 1473 hat_setref(pplist[i]);
1473 1474 }
1474 1475 if ((sptd->spt_flags & SHM_PAGEABLE) &&
1475 1476 (sptd->spt_ppa_lckcnt[i] == 0))
1476 1477 free_availrmem++;
1477 1478 page_unlock(pplist[i]);
1478 1479 }
1479 1480 if ((sptd->spt_flags & SHM_PAGEABLE) && free_availrmem) {
1480 1481 mutex_enter(&freemem_lock);
1481 1482 availrmem += free_availrmem;
1482 1483 mutex_exit(&freemem_lock);
1483 1484 }
1484 1485 /*
1485 1486 * Since we want to cache/uncache the entire ISM segment,
1486 1487 * we will track the pplist in a segspt specific field
1487 1488 * ppa, that is initialized at the time we add an entry to
1488 1489 * the cache.
1489 1490 */
1490 1491 ASSERT(sptd->spt_pcachecnt == 0);
1491 1492 kmem_free(pplist, sizeof (page_t *) * npages);
1492 1493 sptd->spt_ppa = NULL;
1493 1494 sptd->spt_flags &= ~DISM_PPA_CHANGED;
1494 1495 sptd->spt_gen++;
1495 1496 cv_broadcast(&sptd->spt_cv);
1496 1497 done = 1;
1497 1498 }
1498 1499 mutex_exit(&sptd->spt_lock);
1499 1500
1500 1501 /*
1501 1502 * If we are pcache async thread or called via seg_ppurge_wiredpp() we
1502 1503 * may not hold AS lock (in this case async argument is not 0). This
1503 1504 * means if softlockcnt drops to 0 after the decrement below address
1504 1505 * space may get freed. We can't allow it since after softlock
1505 1506 * decrement to 0 we still need to access as structure for possible
1506 1507 * wakeup of unmap waiters. To prevent the disappearance of as we take
1507 1508 * this segment's shm_segfree_syncmtx. segspt_shmfree() also takes
1508 1509 * this mutex as a barrier to make sure this routine completes before
1509 1510 * segment is freed.
1510 1511 *
1511 1512 * The second complication we have to deal with in async case is a
1512 1513 * possibility of missed wake up of unmap wait thread. When we don't
1513 1514 * hold as lock here we may take a_contents lock before unmap wait
1514 1515 * thread that was first to see softlockcnt was still not 0. As a
1515 1516 * result we'll fail to wake up an unmap wait thread. To avoid this
1516 1517 * race we set nounmapwait flag in as structure if we drop softlockcnt
1517 1518 * to 0 and async is not 0. The unmapwait thread
1518 1519 * will not block if this flag is set.
1519 1520 */
1520 1521 if (async)
1521 1522 mutex_enter(&shmd->shm_segfree_syncmtx);
1522 1523
1523 1524 /*
1524 1525 * Now decrement softlockcnt.
1525 1526 */
1526 1527 ASSERT(shmd->shm_softlockcnt > 0);
1527 1528 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
1528 1529
1529 1530 if (shmd->shm_softlockcnt <= 0) {
1530 1531 if (async || AS_ISUNMAPWAIT(seg->s_as)) {
1531 1532 mutex_enter(&seg->s_as->a_contents);
1532 1533 if (async)
1533 1534 AS_SETNOUNMAPWAIT(seg->s_as);
1534 1535 if (AS_ISUNMAPWAIT(seg->s_as)) {
1535 1536 AS_CLRUNMAPWAIT(seg->s_as);
1536 1537 cv_broadcast(&seg->s_as->a_cv);
1537 1538 }
1538 1539 mutex_exit(&seg->s_as->a_contents);
1539 1540 }
1540 1541 }
1541 1542
1542 1543 if (async)
1543 1544 mutex_exit(&shmd->shm_segfree_syncmtx);
1544 1545
1545 1546 return (done);
1546 1547 }
1547 1548
1548 1549 /*
1549 1550 * Do a F_SOFTUNLOCK call over the range requested.
1550 1551 * The range must have already been F_SOFTLOCK'ed.
1551 1552 *
1552 1553 * The calls to acquire and release the anon map lock mutex were
1553 1554 * removed in order to avoid a deadly embrace during a DR
1554 1555 * memory delete operation. (E.g. DR blocks while waiting for an
1555 1556 * exclusive lock on a page that is being used for kaio; the
1556 1557 * thread that will complete the kaio and call segspt_softunlock
1557 1558 * blocks on the anon map lock; another thread holding the anon
1558 1559 * map lock blocks on another page lock via the segspt_shmfault
1559 1560 * -> page_lookup -> page_lookup_create -> page_lock_es code flow.)
1560 1561 *
1561 1562 * The appropriateness of the removal is based upon the following:
1562 1563 * 1. If we are holding a segment's reader lock and the page is held
1563 1564 * shared, then the corresponding element in anonmap which points to
1564 1565 * anon struct cannot change and there is no need to acquire the
1565 1566 * anonymous map lock.
1566 1567 * 2. Threads in segspt_softunlock have a reader lock on the segment
1567 1568 * and already have the shared page lock, so we are guaranteed that
1568 1569 * the anon map slot cannot change and therefore can call anon_get_ptr()
1569 1570 * without grabbing the anonymous map lock.
1570 1571 * 3. Threads that softlock a shared page break copy-on-write, even if
1571 1572 * it's a read. Thus cow faults can be ignored with respect to soft
1572 1573 * unlocking, since the breaking of cow means that the anon slot(s) will
1573 1574 * not be shared.
1574 1575 */
1575 1576 static void
1576 1577 segspt_softunlock(struct seg *seg, caddr_t sptseg_addr,
1577 1578 size_t len, enum seg_rw rw)
1578 1579 {
1579 1580 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1580 1581 struct seg *sptseg;
1581 1582 struct spt_data *sptd;
1582 1583 page_t *pp;
1583 1584 caddr_t adr;
1584 1585 struct vnode *vp;
1585 1586 u_offset_t offset;
1586 1587 ulong_t anon_index;
1587 1588 struct anon_map *amp; /* XXX - for locknest */
1588 1589 struct anon *ap = NULL;
1589 1590 pgcnt_t npages;
1590 1591
1591 1592 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1592 1593
1593 1594 sptseg = shmd->shm_sptseg;
1594 1595 sptd = sptseg->s_data;
1595 1596
1596 1597 /*
1597 1598 * Some platforms assume that ISM mappings are HAT_LOAD_LOCK
1598 1599 * and therefore their pages are SE_SHARED locked
1599 1600 * for the entire life of the segment.
1600 1601 */
1601 1602 if ((!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) &&
1602 1603 ((sptd->spt_flags & SHM_PAGEABLE) == 0)) {
1603 1604 goto softlock_decrement;
1604 1605 }
1605 1606
1606 1607 /*
1607 1608 * Any thread is free to do a page_find and
1608 1609 * page_unlock() on the pages within this seg.
1609 1610 *
1610 1611 * We are already holding the as->a_lock on the user's
1611 1612 * real segment, but we need to hold the a_lock on the
1612 1613 * underlying dummy as. This is mostly to satisfy the
1613 1614 * underlying HAT layer.
1614 1615 */
1615 1616 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1616 1617 hat_unlock(sptseg->s_as->a_hat, sptseg_addr, len);
1617 1618 AS_LOCK_EXIT(sptseg->s_as);
1618 1619
1619 1620 amp = sptd->spt_amp;
1620 1621 ASSERT(amp != NULL);
1621 1622 anon_index = seg_page(sptseg, sptseg_addr);
1622 1623
1623 1624 for (adr = sptseg_addr; adr < sptseg_addr + len; adr += PAGESIZE) {
1624 1625 ap = anon_get_ptr(amp->ahp, anon_index++);
1625 1626 ASSERT(ap != NULL);
1626 1627 swap_xlate(ap, &vp, &offset);
1627 1628
1628 1629 /*
1629 1630 * Use page_find() instead of page_lookup() to
1630 1631 * find the page since we know that it has a
1631 1632 * "shared" lock.
1632 1633 */
1633 1634 pp = page_find(vp, offset);
1634 1635 ASSERT(ap == anon_get_ptr(amp->ahp, anon_index - 1));
1635 1636 if (pp == NULL) {
1636 1637 panic("segspt_softunlock: "
1637 1638 "addr %p, ap %p, vp %p, off %llx",
1638 1639 (void *)adr, (void *)ap, (void *)vp, offset);
1639 1640 /*NOTREACHED*/
1640 1641 }
1641 1642
1642 1643 if (rw == S_WRITE) {
1643 1644 hat_setrefmod(pp);
1644 1645 } else if (rw != S_OTHER) {
1645 1646 hat_setref(pp);
1646 1647 }
1647 1648 page_unlock(pp);
1648 1649 }
1649 1650
1650 1651 softlock_decrement:
1651 1652 npages = btopr(len);
1652 1653 ASSERT(shmd->shm_softlockcnt >= npages);
1653 1654 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), -npages);
1654 1655 if (shmd->shm_softlockcnt == 0) {
1655 1656 /*
1656 1657 * All SOFTLOCKS are gone. Wakeup any waiting
1657 1658 * unmappers so they can try again to unmap.
1658 1659 * Check for waiters first without the mutex
1659 1660 * held so we don't always grab the mutex on
1660 1661 * softunlocks.
1661 1662 */
1662 1663 if (AS_ISUNMAPWAIT(seg->s_as)) {
1663 1664 mutex_enter(&seg->s_as->a_contents);
1664 1665 if (AS_ISUNMAPWAIT(seg->s_as)) {
1665 1666 AS_CLRUNMAPWAIT(seg->s_as);
1666 1667 cv_broadcast(&seg->s_as->a_cv);
1667 1668 }
1668 1669 mutex_exit(&seg->s_as->a_contents);
1669 1670 }
1670 1671 }
1671 1672 }
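The softlock_decrement path above uses a double-checked wakeup: AS_ISUNMAPWAIT is tested once without a_contents held so the common no-waiter case stays cheap, then re-tested under the mutex before the broadcast so a waiter cannot be missed. The following is a minimal user-space sketch of the same pattern, using pthreads and a hypothetical unmap_waiting flag standing in for the AS_ISUNMAPWAIT bit; it is an illustration, not the kernel code.

    #include <pthread.h>
    #include <stdbool.h>

    pthread_mutex_t a_contents = PTHREAD_MUTEX_INITIALIZER;
    pthread_cond_t  a_cv = PTHREAD_COND_INITIALIZER;
    volatile bool   unmap_waiting;          /* stands in for AS_ISUNMAPWAIT */

    /* Called when the last softlock is released (shm_softlockcnt hits 0). */
    void
    wake_unmappers(void)
    {
            /* Cheap unlocked test first, so softunlock rarely takes the mutex. */
            if (unmap_waiting) {
                    pthread_mutex_lock(&a_contents);
                    /* Re-test under the lock; another thread may have woken them. */
                    if (unmap_waiting) {
                            unmap_waiting = false;          /* AS_CLRUNMAPWAIT */
                            pthread_cond_broadcast(&a_cv);
                    }
                    pthread_mutex_unlock(&a_contents);
            }
    }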
1672 1673
1673 1674 int
1674 1675 segspt_shmattach(struct seg *seg, caddr_t *argsp)
1675 1676 {
1676 1677 struct shm_data *shmd_arg = (struct shm_data *)argsp;
1677 1678 struct shm_data *shmd;
1678 1679 struct anon_map *shm_amp = shmd_arg->shm_amp;
1679 1680 struct spt_data *sptd;
1680 1681 int error = 0;
1681 1682
1682 1683 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1683 1684
1684 1685 shmd = kmem_zalloc((sizeof (*shmd)), KM_NOSLEEP);
1685 1686 if (shmd == NULL)
1686 1687 return (ENOMEM);
1687 1688
1688 1689 shmd->shm_sptas = shmd_arg->shm_sptas;
1689 1690 shmd->shm_amp = shm_amp;
1690 1691 shmd->shm_sptseg = shmd_arg->shm_sptseg;
1691 1692
1692 1693 (void) lgrp_shm_policy_set(LGRP_MEM_POLICY_DEFAULT, shm_amp, 0,
1693 1694 NULL, 0, seg->s_size);
1694 1695
1695 1696 mutex_init(&shmd->shm_segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL);
1696 1697
1697 1698 seg->s_data = (void *)shmd;
1698 1699 seg->s_ops = &segspt_shmops;
1699 1700 seg->s_szc = shmd->shm_sptseg->s_szc;
1700 1701 sptd = shmd->shm_sptseg->s_data;
1701 1702
1702 1703 if (sptd->spt_flags & SHM_PAGEABLE) {
1703 1704 if ((shmd->shm_vpage = kmem_zalloc(btopr(shm_amp->size),
1704 1705 KM_NOSLEEP)) == NULL) {
1705 1706 seg->s_data = (void *)NULL;
1706 1707 kmem_free(shmd, (sizeof (*shmd)));
1707 1708 return (ENOMEM);
1708 1709 }
1709 1710 shmd->shm_lckpgs = 0;
1710 1711 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
1711 1712 if ((error = hat_share(seg->s_as->a_hat, seg->s_base,
1712 1713 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1713 1714 seg->s_size, seg->s_szc)) != 0) {
1714 1715 kmem_free(shmd->shm_vpage,
1715 1716 btopr(shm_amp->size));
1716 1717 }
1717 1718 }
1718 1719 } else {
1719 1720 error = hat_share(seg->s_as->a_hat, seg->s_base,
1720 1721 shmd_arg->shm_sptas->a_hat, SEGSPTADDR,
1721 1722 seg->s_size, seg->s_szc);
1722 1723 }
1723 1724 if (error) {
1724 1725 seg->s_szc = 0;
1725 1726 seg->s_data = (void *)NULL;
1726 1727 kmem_free(shmd, (sizeof (*shmd)));
1727 1728 } else {
1728 1729 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1729 1730 shm_amp->refcnt++;
1730 1731 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1731 1732 }
1732 1733 return (error);
1733 1734 }
1734 1735
1735 1736 int
1736 1737 segspt_shmunmap(struct seg *seg, caddr_t raddr, size_t ssize)
1737 1738 {
1738 1739 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1739 1740 int reclaim = 1;
1740 1741
1741 1742 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1742 1743 retry:
1743 1744 if (shmd->shm_softlockcnt > 0) {
1744 1745 if (reclaim == 1) {
1745 1746 segspt_purge(seg);
1746 1747 reclaim = 0;
1747 1748 goto retry;
1748 1749 }
1749 1750 return (EAGAIN);
1750 1751 }
1751 1752
1752 1753 if (ssize != seg->s_size) {
1753 1754 #ifdef DEBUG
1754 1755 cmn_err(CE_WARN, "Incompatible ssize %lx s_size %lx\n",
1755 1756 ssize, seg->s_size);
1756 1757 #endif
1757 1758 return (EINVAL);
1758 1759 }
1759 1760
1760 1761 (void) segspt_shmlockop(seg, raddr, shmd->shm_amp->size, 0, MC_UNLOCK,
1761 1762 NULL, 0);
1762 1763 hat_unshare(seg->s_as->a_hat, raddr, ssize, seg->s_szc);
1763 1764
1764 1765 seg_free(seg);
1765 1766
1766 1767 return (0);
1767 1768 }
1768 1769
1769 1770 void
1770 1771 segspt_shmfree(struct seg *seg)
1771 1772 {
1772 1773 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1773 1774 struct anon_map *shm_amp = shmd->shm_amp;
1774 1775
1775 1776 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
1776 1777
1777 1778 (void) segspt_shmlockop(seg, seg->s_base, shm_amp->size, 0,
1778 1779 MC_UNLOCK, NULL, 0);
1779 1780
1780 1781 /*
1781 1782 * Need to increment refcnt when attaching
1782 1783 * and decrement when detaching because of dup().
1783 1784 */
1784 1785 ANON_LOCK_ENTER(&shm_amp->a_rwlock, RW_WRITER);
1785 1786 shm_amp->refcnt--;
1786 1787 ANON_LOCK_EXIT(&shm_amp->a_rwlock);
1787 1788
1788 1789 if (shmd->shm_vpage) { /* only for DISM */
1789 1790 kmem_free(shmd->shm_vpage, btopr(shm_amp->size));
1790 1791 shmd->shm_vpage = NULL;
1791 1792 }
1792 1793
1793 1794 /*
1794 1795 * Take shm_segfree_syncmtx lock to let segspt_reclaim() finish if it's
1795 1796 * still working with this segment without holding as lock.
1796 1797 */
1797 1798 ASSERT(shmd->shm_softlockcnt == 0);
1798 1799 mutex_enter(&shmd->shm_segfree_syncmtx);
1799 1800 mutex_destroy(&shmd->shm_segfree_syncmtx);
1800 1801
1801 1802 kmem_free(shmd, sizeof (*shmd));
1802 1803 }
1803 1804
1804 1805 /*ARGSUSED*/
1805 1806 int
1806 1807 segspt_shmsetprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot)
1807 1808 {
1808 1809 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1809 1810
1810 1811 /*
1811 1812 * Shared page table is more than shared mapping.
1812 1813 * Individual process sharing page tables can't change prot
1813 1814 * because there is only one set of page tables.
1814 1815 * This will be allowed after private page table is
1815 1816 * supported.
1816 1817 */
1817 1818 /* need to return correct status error? */
1818 1819 return (0);
1819 1820 }
1820 1821
1821 1822
1822 1823 faultcode_t
1823 1824 segspt_dismfault(struct hat *hat, struct seg *seg, caddr_t addr,
1824 1825 size_t len, enum fault_type type, enum seg_rw rw)
1825 1826 {
1826 1827 struct shm_data *shmd = (struct shm_data *)seg->s_data;
1827 1828 struct seg *sptseg = shmd->shm_sptseg;
1828 1829 struct as *curspt = shmd->shm_sptas;
1829 1830 struct spt_data *sptd = sptseg->s_data;
1830 1831 pgcnt_t npages;
1831 1832 size_t size;
1832 1833 caddr_t segspt_addr, shm_addr;
1833 1834 page_t **ppa;
1834 1835 int i;
1835 1836 ulong_t an_idx = 0;
1836 1837 int err = 0;
1837 1838 int dyn_ism_unmap = hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0);
1838 1839 size_t pgsz;
1839 1840 pgcnt_t pgcnt;
1840 1841 caddr_t a;
1841 1842 pgcnt_t pidx;
1842 1843
1843 1844 #ifdef lint
1844 1845 hat = hat;
1845 1846 #endif
1846 1847 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
1847 1848
1848 1849 /*
1849 1850 * Because of the way spt is implemented
1850 1851 * the realsize of the segment does not have to be
1851 1852 * equal to the segment size itself. The segment size is
1852 1853 * often in multiples of a page size larger than PAGESIZE.
1853 1854 * The realsize is rounded up to the nearest PAGESIZE
1854 1855 * based on what the user requested. This is a bit of
1855 1856 * ugliness that is historical but not easily fixed
1856 1857 * without re-designing the higher levels of ISM.
1857 1858 */
1858 1859 ASSERT(addr >= seg->s_base);
1859 1860 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
1860 1861 return (FC_NOMAP);
1861 1862 /*
1862 1863 * For all of the following cases except F_PROT, we need to
1863 1864 * make any necessary adjustments to addr and len
1864 1865 * and get all of the necessary page_t's into an array called ppa[].
1865 1866 *
1866 1867 * The code in shmat() forces base addr and len of ISM segment
1867 1868 * to be aligned to largest page size supported. Therefore,
1868 1869 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
1869 1870 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
1870 1871 * in large pagesize chunks, or else we will screw up the HAT
1871 1872 * layer by calling hat_memload_array() with differing page sizes
1872 1873 * over a given virtual range.
1873 1874 */
1874 1875 pgsz = page_get_pagesize(sptseg->s_szc);
1875 1876 pgcnt = page_get_pagecnt(sptseg->s_szc);
1876 1877 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
1877 1878 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
1878 1879 npages = btopr(size);
1879 1880
1880 1881 /*
1881 1882 * Now we need to convert from addr in segshm to addr in segspt.
1882 1883 */
1883 1884 an_idx = seg_page(seg, shm_addr);
1884 1885 segspt_addr = sptseg->s_base + ptob(an_idx);
1885 1886
1886 1887 ASSERT((segspt_addr + ptob(npages)) <=
1887 1888 (sptseg->s_base + sptd->spt_realsize));
1888 1889 ASSERT(segspt_addr < (sptseg->s_base + sptseg->s_size));
1889 1890
1890 1891 switch (type) {
1891 1892
1892 1893 case F_SOFTLOCK:
1893 1894
1894 1895 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
1895 1896 /*
1896 1897 * Fall through to the F_INVAL case to load up the hat layer
1897 1898 * entries with the HAT_LOAD_LOCK flag.
1898 1899 */
1899 1900 /* FALLTHRU */
1900 1901 case F_INVAL:
1901 1902
1902 1903 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
1903 1904 return (FC_NOMAP);
1904 1905
1905 1906 ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
1906 1907
1907 1908 err = spt_anon_getpages(sptseg, segspt_addr, size, ppa);
1908 1909 if (err != 0) {
1909 1910 if (type == F_SOFTLOCK) {
1910 1911 atomic_add_long((ulong_t *)(
1911 1912 &(shmd->shm_softlockcnt)), -npages);
1912 1913 }
1913 1914 goto dism_err;
1914 1915 }
1915 1916 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
1916 1917 a = segspt_addr;
1917 1918 pidx = 0;
1918 1919 if (type == F_SOFTLOCK) {
1919 1920
1920 1921 /*
1921 1922 * Load up the translation keeping it
1922 1923 * locked and don't unlock the page.
1923 1924 */
1924 1925 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1925 1926 hat_memload_array(sptseg->s_as->a_hat,
1926 1927 a, pgsz, &ppa[pidx], sptd->spt_prot,
1927 1928 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
1928 1929 }
1929 1930 } else {
1930 1931 /*
1931 1932 * Migrate pages marked for migration
1932 1933 */
1933 1934 if (lgrp_optimizations())
1934 1935 page_migrate(seg, shm_addr, ppa, npages);
1935 1936
1936 1937 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
1937 1938 hat_memload_array(sptseg->s_as->a_hat,
1938 1939 a, pgsz, &ppa[pidx],
1939 1940 sptd->spt_prot,
1940 1941 HAT_LOAD_SHARE);
1941 1942 }
1942 1943
1943 1944 /*
1944 1945 * And now drop the SE_SHARED lock(s).
1945 1946 */
1946 1947 if (dyn_ism_unmap) {
1947 1948 for (i = 0; i < npages; i++) {
1948 1949 page_unlock(ppa[i]);
1949 1950 }
1950 1951 }
1951 1952 }
1952 1953
1953 1954 if (!dyn_ism_unmap) {
1954 1955 if (hat_share(seg->s_as->a_hat, shm_addr,
1955 1956 curspt->a_hat, segspt_addr, ptob(npages),
1956 1957 seg->s_szc) != 0) {
1957 1958 panic("hat_share err in DISM fault");
1958 1959 /* NOTREACHED */
1959 1960 }
1960 1961 if (type == F_INVAL) {
1961 1962 for (i = 0; i < npages; i++) {
1962 1963 page_unlock(ppa[i]);
1963 1964 }
1964 1965 }
1965 1966 }
1966 1967 AS_LOCK_EXIT(sptseg->s_as);
1967 1968 dism_err:
1968 1969 kmem_free(ppa, npages * sizeof (page_t *));
1969 1970 return (err);
1970 1971
1971 1972 case F_SOFTUNLOCK:
1972 1973
1973 1974 /*
1974 1975 * This is a bit ugly, we pass in the real seg pointer,
1975 1976 * but the segspt_addr is the virtual address within the
1976 1977 * dummy seg.
1977 1978 */
1978 1979 segspt_softunlock(seg, segspt_addr, size, rw);
1979 1980 return (0);
1980 1981
1981 1982 case F_PROT:
1982 1983
1983 1984 /*
1984 1985 * This takes care of the unusual case where a user
1985 1986 * allocates a stack in shared memory and a register
1986 1987 * window overflow is written to that stack page before
1987 1988 * it is otherwise modified.
1988 1989 *
1989 1990 * We can get away with this because ISM segments are
1990 1991 * always rw. Other than this unusual case, there
1991 1992 * should be no instances of protection violations.
1992 1993 */
1993 1994 return (0);
1994 1995
1995 1996 default:
1996 1997 #ifdef DEBUG
1997 1998 panic("segspt_dismfault default type?");
1998 1999 #else
1999 2000 return (FC_NOMAP);
2000 2001 #endif
2001 2002 }
2002 2003 }
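Both fault handlers widen the faulting range outward to the underlying segment's large page size before loading translations (the P2ALIGN/P2ROUNDUP step above). The arithmetic can be seen in isolation in the sketch below, which uses local stand-ins for the sysmacros.h helpers and assumes a 4 MB large page and made-up fault address and length; in the kernel the page size comes from page_get_pagesize(sptseg->s_szc).

    #include <stdio.h>
    #include <stdint.h>

    /* Local equivalents of the sysmacros.h power-of-two helpers. */
    #define P2ALIGN(x, align)   ((x) & -(align))
    #define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)

    int
    main(void)
    {
            uintptr_t pgsz = 4UL * 1024 * 1024;     /* assumed large page size */
            uintptr_t addr = 0x10250000;            /* hypothetical fault address */
            uintptr_t len  = 0x3000;                /* hypothetical fault length */

            /* Round the start down and the end up to a large-page boundary. */
            uintptr_t shm_addr = P2ALIGN(addr, pgsz);
            uintptr_t size = P2ROUNDUP(addr + len - shm_addr, pgsz);

            /* Prints shm_addr=0x10000000 size=0x400000 for the values above. */
            printf("shm_addr=0x%lx size=0x%lx\n",
                (unsigned long)shm_addr, (unsigned long)size);
            return (0);
    }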
2003 2004
2004 2005
2005 2006 faultcode_t
2006 2007 segspt_shmfault(struct hat *hat, struct seg *seg, caddr_t addr,
2007 2008 size_t len, enum fault_type type, enum seg_rw rw)
2008 2009 {
2009 2010 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2010 2011 struct seg *sptseg = shmd->shm_sptseg;
2011 2012 struct as *curspt = shmd->shm_sptas;
2012 2013 struct spt_data *sptd = sptseg->s_data;
2013 2014 pgcnt_t npages;
2014 2015 size_t size;
2015 2016 caddr_t sptseg_addr, shm_addr;
2016 2017 page_t *pp, **ppa;
2017 2018 int i;
2018 2019 u_offset_t offset;
2019 2020 ulong_t anon_index = 0;
2020 2021 struct vnode *vp;
2021 2022 struct anon_map *amp; /* XXX - for locknest */
2022 2023 struct anon *ap = NULL;
2023 2024 size_t pgsz;
2024 2025 pgcnt_t pgcnt;
2025 2026 caddr_t a;
2026 2027 pgcnt_t pidx;
2027 2028 size_t sz;
2028 2029
2029 2030 #ifdef lint
2030 2031 hat = hat;
2031 2032 #endif
2032 2033
2033 2034 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2034 2035
2035 2036 if (sptd->spt_flags & SHM_PAGEABLE) {
2036 2037 return (segspt_dismfault(hat, seg, addr, len, type, rw));
2037 2038 }
2038 2039
2039 2040 /*
2040 2041 * Because of the way spt is implemented
2041 2042 * the realsize of the segment does not have to be
2042 2043 * equal to the segment size itself. The segment size is
2043 2044 * often in multiples of a page size larger than PAGESIZE.
2044 2045 * The realsize is rounded up to the nearest PAGESIZE
2045 2046 * based on what the user requested. This is a bit of
2046 2047 * ugliness that is historical but not easily fixed
2047 2048 * without re-designing the higher levels of ISM.
2048 2049 */
2049 2050 ASSERT(addr >= seg->s_base);
2050 2051 if (((addr + len) - seg->s_base) > sptd->spt_realsize)
2051 2052 return (FC_NOMAP);
2052 2053 /*
2053 2054 * For all of the following cases except F_PROT, we need to
2054 2055 * make any necessary adjustments to addr and len
2055 2056 * and get all of the necessary page_t's into an array called ppa[].
2056 2057 *
2057 2058 * The code in shmat() forces base addr and len of ISM segment
2058 2059 * to be aligned to largest page size supported. Therefore,
2059 2060 * we are able to handle F_SOFTLOCK and F_INVAL calls in "large
2060 2061 * pagesize" chunks. We want to make sure that we HAT_LOAD_LOCK
2061 2062 * in large pagesize chunks, or else we will screw up the HAT
2062 2063 * layer by calling hat_memload_array() with differing page sizes
2063 2064 * over a given virtual range.
2064 2065 */
2065 2066 pgsz = page_get_pagesize(sptseg->s_szc);
2066 2067 pgcnt = page_get_pagecnt(sptseg->s_szc);
2067 2068 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz);
2068 2069 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)), pgsz);
2069 2070 npages = btopr(size);
2070 2071
2071 2072 /*
2072 2073 * Now we need to convert from addr in segshm to addr in segspt.
2073 2074 */
2074 2075 anon_index = seg_page(seg, shm_addr);
2075 2076 sptseg_addr = sptseg->s_base + ptob(anon_index);
2076 2077
2077 2078 /*
2078 2079 * And now we may have to adjust npages downward if we have
2079 2080 * exceeded the realsize of the segment or initial anon
2080 2081 * allocations.
2081 2082 */
2082 2083 if ((sptseg_addr + ptob(npages)) >
2083 2084 (sptseg->s_base + sptd->spt_realsize))
2084 2085 size = (sptseg->s_base + sptd->spt_realsize) - sptseg_addr;
2085 2086
2086 2087 npages = btopr(size);
2087 2088
2088 2089 ASSERT(sptseg_addr < (sptseg->s_base + sptseg->s_size));
2089 2090 ASSERT((sptd->spt_flags & SHM_PAGEABLE) == 0);
2090 2091
2091 2092 switch (type) {
2092 2093
2093 2094 case F_SOFTLOCK:
2094 2095
2095 2096 /*
2096 2097 * availrmem is decremented once during anon_swap_adjust()
2097 2098 * and is incremented during the anon_unresv(), which is
2098 2099 * called from shm_rm_amp() when the segment is destroyed.
2099 2100 */
2100 2101 atomic_add_long((ulong_t *)(&(shmd->shm_softlockcnt)), npages);
2101 2102 /*
2102 2103 * Some platforms assume that ISM pages are SE_SHARED
2103 2104 * locked for the entire life of the segment.
2104 2105 */
2105 2106 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0))
2106 2107 return (0);
2107 2108 /*
2108 2109 * Fall through to the F_INVAL case to load up the hat layer
2109 2110 * entries with the HAT_LOAD_LOCK flag.
2110 2111 */
2111 2112
2112 2113 /* FALLTHRU */
2113 2114 case F_INVAL:
2114 2115
2115 2116 if ((rw == S_EXEC) && !(sptd->spt_prot & PROT_EXEC))
2116 2117 return (FC_NOMAP);
2117 2118
2118 2119 /*
2119 2120 * Some platforms that do NOT support DYNAMIC_ISM_UNMAP
2120 2121 * may still rely on this call to hat_share(). That
2121 2122 * would imply that those hat's can fault on a
2122 2123 * HAT_LOAD_LOCK translation, which would seem
2123 2124 * contradictory.
2124 2125 */
2125 2126 if (!hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2126 2127 if (hat_share(seg->s_as->a_hat, seg->s_base,
2127 2128 curspt->a_hat, sptseg->s_base,
2128 2129 sptseg->s_size, sptseg->s_szc) != 0) {
2129 2130 panic("hat_share error in ISM fault");
2130 2131 /*NOTREACHED*/
2131 2132 }
2132 2133 return (0);
2133 2134 }
2134 2135 ppa = kmem_zalloc(sizeof (page_t *) * npages, KM_SLEEP);
2135 2136
2136 2137 /*
2137 2138 * I see no need to lock the real seg,
2138 2139 * here, because all of our work will be on the underlying
2139 2140 * dummy seg.
2140 2141 *
2141 2142 * sptseg_addr and npages now account for large pages.
2142 2143 */
2143 2144 amp = sptd->spt_amp;
2144 2145 ASSERT(amp != NULL);
2145 2146 anon_index = seg_page(sptseg, sptseg_addr);
2146 2147
2147 2148 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2148 2149 for (i = 0; i < npages; i++) {
2149 2150 ap = anon_get_ptr(amp->ahp, anon_index++);
2150 2151 ASSERT(ap != NULL);
2151 2152 swap_xlate(ap, &vp, &offset);
2152 2153 pp = page_lookup(vp, offset, SE_SHARED);
2153 2154 ASSERT(pp != NULL);
2154 2155 ppa[i] = pp;
2155 2156 }
2156 2157 ANON_LOCK_EXIT(&amp->a_rwlock);
2157 2158 ASSERT(i == npages);
2158 2159
2159 2160 /*
2160 2161 * We are already holding the as->a_lock on the user's
2161 2162 * real segment, but we need to hold the a_lock on the
2162 2163 * underlying dummy as. This is mostly to satisfy the
2163 2164 * underlying HAT layer.
2164 2165 */
2165 2166 AS_LOCK_ENTER(sptseg->s_as, RW_READER);
2166 2167 a = sptseg_addr;
2167 2168 pidx = 0;
2168 2169 if (type == F_SOFTLOCK) {
2169 2170 /*
2170 2171 * Load up the translation keeping it
2171 2172 * locked and don't unlock the page.
2172 2173 */
2173 2174 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2174 2175 sz = MIN(pgsz, ptob(npages - pidx));
2175 2176 hat_memload_array(sptseg->s_as->a_hat, a,
2176 2177 sz, &ppa[pidx], sptd->spt_prot,
2177 2178 HAT_LOAD_LOCK | HAT_LOAD_SHARE);
2178 2179 }
2179 2180 } else {
2180 2181 /*
2181 2182 * Migrate pages marked for migration.
2182 2183 */
2183 2184 if (lgrp_optimizations())
2184 2185 page_migrate(seg, shm_addr, ppa, npages);
2185 2186
2186 2187 for (; pidx < npages; a += pgsz, pidx += pgcnt) {
2187 2188 sz = MIN(pgsz, ptob(npages - pidx));
2188 2189 hat_memload_array(sptseg->s_as->a_hat,
2189 2190 a, sz, &ppa[pidx],
2190 2191 sptd->spt_prot, HAT_LOAD_SHARE);
2191 2192 }
2192 2193
2193 2194 /*
2194 2195 * And now drop the SE_SHARED lock(s).
2195 2196 */
2196 2197 for (i = 0; i < npages; i++)
2197 2198 page_unlock(ppa[i]);
2198 2199 }
2199 2200 AS_LOCK_EXIT(sptseg->s_as);
2200 2201
2201 2202 kmem_free(ppa, sizeof (page_t *) * npages);
2202 2203 return (0);
2203 2204 case F_SOFTUNLOCK:
2204 2205
2205 2206 /*
2206 2207 * This is a bit ugly, we pass in the real seg pointer,
2207 2208 * but the sptseg_addr is the virtual address within the
2208 2209 * dummy seg.
2209 2210 */
2210 2211 segspt_softunlock(seg, sptseg_addr, ptob(npages), rw);
2211 2212 return (0);
2212 2213
2213 2214 case F_PROT:
2214 2215
2215 2216 /*
2216 2217 * This takes care of the unusual case where a user
2217 2218 * allocates a stack in shared memory and a register
2218 2219 * window overflow is written to that stack page before
2219 2220 * it is otherwise modified.
2220 2221 *
2221 2222 * We can get away with this because ISM segments are
2222 2223 * always rw. Other than this unusual case, there
2223 2224 * should be no instances of protection violations.
2224 2225 */
2225 2226 return (0);
2226 2227
2227 2228 default:
2228 2229 #ifdef DEBUG
2229 2230 cmn_err(CE_WARN, "segspt_shmfault default type?");
2230 2231 #endif
2231 2232 return (FC_NOMAP);
2232 2233 }
2233 2234 }
2234 2235
2235 2236 /*ARGSUSED*/
2236 2237 static faultcode_t
2237 2238 segspt_shmfaulta(struct seg *seg, caddr_t addr)
2238 2239 {
2239 2240 return (0);
2240 2241 }
2241 2242
2242 2243 /*ARGSUSED*/
2243 2244 static int
2244 2245 segspt_shmkluster(struct seg *seg, caddr_t addr, ssize_t delta)
2245 2246 {
2246 2247 return (0);
2247 2248 }
2248 2249
2249 2250 /*ARGSUSED*/
2250 2251 static size_t
2251 2252 segspt_shmswapout(struct seg *seg)
2252 2253 {
2253 2254 return (0);
2254 2255 }
2255 2256
2256 2257 /*
2257 2258 * duplicate the shared page tables
2258 2259 */
2259 2260 int
2260 2261 segspt_shmdup(struct seg *seg, struct seg *newseg)
2261 2262 {
2262 2263 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2263 2264 struct anon_map *amp = shmd->shm_amp;
2264 2265 struct shm_data *shmd_new;
2265 2266 struct seg *spt_seg = shmd->shm_sptseg;
2266 2267 struct spt_data *sptd = spt_seg->s_data;
2267 2268 int error = 0;
2268 2269
2269 2270 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as));
2270 2271
2271 2272 shmd_new = kmem_zalloc((sizeof (*shmd_new)), KM_SLEEP);
2272 2273 newseg->s_data = (void *)shmd_new;
2273 2274 shmd_new->shm_sptas = shmd->shm_sptas;
2274 2275 shmd_new->shm_amp = amp;
2275 2276 shmd_new->shm_sptseg = shmd->shm_sptseg;
2276 2277 newseg->s_ops = &segspt_shmops;
2277 2278 newseg->s_szc = seg->s_szc;
2278 2279 ASSERT(seg->s_szc == shmd->shm_sptseg->s_szc);
2279 2280
2280 2281 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2281 2282 amp->refcnt++;
2282 2283 ANON_LOCK_EXIT(&amp->a_rwlock);
2283 2284
2284 2285 if (sptd->spt_flags & SHM_PAGEABLE) {
2285 2286 shmd_new->shm_vpage = kmem_zalloc(btopr(amp->size), KM_SLEEP);
2286 2287 shmd_new->shm_lckpgs = 0;
2287 2288 if (hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
2288 2289 if ((error = hat_share(newseg->s_as->a_hat,
2289 2290 newseg->s_base, shmd->shm_sptas->a_hat, SEGSPTADDR,
2290 2291 seg->s_size, seg->s_szc)) != 0) {
2291 2292 kmem_free(shmd_new->shm_vpage,
2292 2293 btopr(amp->size));
2293 2294 }
2294 2295 }
2295 2296 return (error);
2296 2297 } else {
2297 2298 return (hat_share(newseg->s_as->a_hat, newseg->s_base,
2298 2299 shmd->shm_sptas->a_hat, SEGSPTADDR, seg->s_size,
2299 2300 seg->s_szc));
2300 2301
2301 2302 }
2302 2303 }
2303 2304
2304 2305 /*ARGSUSED*/
2305 2306 int
2306 2307 segspt_shmcheckprot(struct seg *seg, caddr_t addr, size_t size, uint_t prot)
2307 2308 {
2308 2309 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2309 2310 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2310 2311
2311 2312 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2312 2313
2313 2314 /*
2314 2315 * ISM segment is always rw.
2315 2316 */
2316 2317 return (((sptd->spt_prot & prot) != prot) ? EACCES : 0);
2317 2318 }
2318 2319
2319 2320 /*
2320 2321 * Return an array of locked large pages, for empty slots allocate
2321 2322 * private zero-filled anon pages.
2322 2323 */
2323 2324 static int
2324 2325 spt_anon_getpages(
2325 2326 struct seg *sptseg,
2326 2327 caddr_t sptaddr,
2327 2328 size_t len,
2328 2329 page_t *ppa[])
2329 2330 {
2330 2331 struct spt_data *sptd = sptseg->s_data;
2331 2332 struct anon_map *amp = sptd->spt_amp;
2332 2333 enum seg_rw rw = sptd->spt_prot;
2333 2334 uint_t szc = sptseg->s_szc;
2334 2335 size_t pg_sz, share_sz = page_get_pagesize(szc);
2335 2336 pgcnt_t lp_npgs;
2336 2337 caddr_t lp_addr, e_sptaddr;
2337 2338 uint_t vpprot, ppa_szc = 0;
2338 2339 struct vpage *vpage = NULL;
2339 2340 ulong_t j, ppa_idx;
2340 2341 int err, ierr = 0;
2341 2342 pgcnt_t an_idx;
2342 2343 anon_sync_obj_t cookie;
2343 2344 int anon_locked = 0;
2344 2345 pgcnt_t amp_pgs;
2345 2346
2346 2347
2347 2348 ASSERT(IS_P2ALIGNED(sptaddr, share_sz) && IS_P2ALIGNED(len, share_sz));
2348 2349 ASSERT(len != 0);
2349 2350
2350 2351 pg_sz = share_sz;
2351 2352 lp_npgs = btop(pg_sz);
2352 2353 lp_addr = sptaddr;
2353 2354 e_sptaddr = sptaddr + len;
2354 2355 an_idx = seg_page(sptseg, sptaddr);
2355 2356 ppa_idx = 0;
2356 2357
2357 2358 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2358 2359
2359 2360 amp_pgs = page_get_pagecnt(amp->a_szc);
2360 2361
2361 2362 /*CONSTCOND*/
2362 2363 while (1) {
2363 2364 for (; lp_addr < e_sptaddr;
2364 2365 an_idx += lp_npgs, lp_addr += pg_sz, ppa_idx += lp_npgs) {
2365 2366
2366 2367 /*
2367 2368 * If we're currently locked, and we get to a new
2368 2369 * page, unlock our current anon chunk.
2369 2370 */
2370 2371 if (anon_locked && P2PHASE(an_idx, amp_pgs) == 0) {
2371 2372 anon_array_exit(&cookie);
2372 2373 anon_locked = 0;
2373 2374 }
2374 2375 if (!anon_locked) {
2375 2376 anon_array_enter(amp, an_idx, &cookie);
2376 2377 anon_locked = 1;
2377 2378 }
2378 2379 ppa_szc = (uint_t)-1;
2379 2380 ierr = anon_map_getpages(amp, an_idx, szc, sptseg,
2380 2381 lp_addr, sptd->spt_prot, &vpprot, &ppa[ppa_idx],
2381 2382 &ppa_szc, vpage, rw, 0, segvn_anypgsz, 0, kcred);
2382 2383
2383 2384 if (ierr != 0) {
2384 2385 if (ierr > 0) {
2385 2386 err = FC_MAKE_ERR(ierr);
2386 2387 goto lpgs_err;
2387 2388 }
2388 2389 break;
2389 2390 }
2390 2391 }
2391 2392 if (lp_addr == e_sptaddr) {
2392 2393 break;
2393 2394 }
2394 2395 ASSERT(lp_addr < e_sptaddr);
2395 2396
2396 2397 /*
2397 2398 * ierr == -1 means we failed to allocate a large page.
2398 2399 * so do a size down operation.
2399 2400 *
2400 2401 * ierr == -2 means some other process that privately shares
2401 2402 * pages with this process has allocated a larger page and we
2402 2403 * need to retry with larger pages. So do a size up
2403 2404 * operation. This relies on the fact that large pages are
2404 2405 * never partially shared i.e. if we share any constituent
2405 2406 * page of a large page with another process we must share the
2406 2407 * entire large page. Note this cannot happen for SOFTLOCK
2407 2408 * case, unless current address (lpaddr) is at the beginning
2408 2409 * of the next page size boundary because the other process
2409 2410 * couldn't have relocated locked pages.
2410 2411 */
2411 2412 ASSERT(ierr == -1 || ierr == -2);
2412 2413 if (segvn_anypgsz) {
2413 2414 ASSERT(ierr == -2 || szc != 0);
2414 2415 ASSERT(ierr == -1 || szc < sptseg->s_szc);
2415 2416 szc = (ierr == -1) ? szc - 1 : szc + 1;
2416 2417 } else {
2417 2418 /*
2418 2419 * For faults and segvn_anypgsz == 0
2419 2420 * we need to be careful not to loop forever
2420 2421 * if existing page is found with szc other
2421 2422 * than 0 or seg->s_szc. This could be due
2422 2423 * to page relocations on behalf of DR or
2423 2424 * more likely large page creation. For this
2424 2425 * case simply re-size to existing page's szc
2425 2426 * if returned by anon_map_getpages().
2426 2427 */
2427 2428 if (ppa_szc == (uint_t)-1) {
2428 2429 szc = (ierr == -1) ? 0 : sptseg->s_szc;
2429 2430 } else {
2430 2431 ASSERT(ppa_szc <= sptseg->s_szc);
2431 2432 ASSERT(ierr == -2 || ppa_szc < szc);
2432 2433 ASSERT(ierr == -1 || ppa_szc > szc);
2433 2434 szc = ppa_szc;
2434 2435 }
2435 2436 }
2436 2437 pg_sz = page_get_pagesize(szc);
2437 2438 lp_npgs = btop(pg_sz);
2438 2439 ASSERT(IS_P2ALIGNED(lp_addr, pg_sz));
2439 2440 }
2440 2441 if (anon_locked) {
2441 2442 anon_array_exit(&cookie);
2442 2443 }
2443 2444 ANON_LOCK_EXIT(&amp->a_rwlock);
2444 2445 return (0);
2445 2446
2446 2447 lpgs_err:
2447 2448 if (anon_locked) {
2448 2449 anon_array_exit(&cookie);
2449 2450 }
2450 2451 ANON_LOCK_EXIT(&amp->a_rwlock);
2451 2452 for (j = 0; j < ppa_idx; j++)
2452 2453 page_unlock(ppa[j]);
2453 2454 return (err);
2454 2455 }
2455 2456
2456 2457 /*
2457 2458 * count the number of bytes in a set of spt pages that are currently not
2458 2459 * locked
2459 2460 */
2460 2461 static rctl_qty_t
2461 2462 spt_unlockedbytes(pgcnt_t npages, page_t **ppa)
2462 2463 {
2463 2464 ulong_t i;
2464 2465 rctl_qty_t unlocked = 0;
2465 2466
2466 2467 for (i = 0; i < npages; i++) {
2467 2468 if (ppa[i]->p_lckcnt == 0)
2468 2469 unlocked += PAGESIZE;
2469 2470 }
2470 2471 return (unlocked);
2471 2472 }
2472 2473
2473 2474 extern u_longlong_t randtick(void);
2474 2475 /* number of locks to reserve/skip by spt_lockpages() and spt_unlockpages() */
2475 2476 #define NLCK (NCPU_P2)
2476 2477 /* Random number with a range [0, n-1], n must be power of two */
2477 2478 #define RAND_P2(n) \
2478 2479 ((((long)curthread >> PTR24_LSB) ^ (long)randtick()) & ((n) - 1))
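RAND_P2 relies on masking with n - 1, which maps any value into [0, n-1] only when n is a power of two. A hedged user-space illustration of the same masking idea follows; it substitutes rand() for the kernel's curthread/randtick() entropy sources, which are not available outside the kernel, and assumes a hypothetical NLCK of 8.

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    /* Same range trick as RAND_P2: n must be a power of two. */
    #define RAND_P2_DEMO(n) ((unsigned)rand() & ((n) - 1))

    int
    main(void)
    {
            srand((unsigned)time(NULL));
            /* With NLCK == 8, NLCK + RAND_P2(NLCK) falls in [8, 15]. */
            unsigned nlck = 8 + RAND_P2_DEMO(8);
            printf("nlck = %u\n", nlck);
            return (0);
    }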
2479 2480
2480 2481 int
2481 2482 spt_lockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2482 2483 page_t **ppa, ulong_t *lockmap, size_t pos,
2483 2484 rctl_qty_t *locked)
2484 2485 {
2485 2486 struct shm_data *shmd = seg->s_data;
2486 2487 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2487 2488 ulong_t i;
2488 2489 int kernel;
2489 2490 pgcnt_t nlck = 0;
2490 2491 int rv = 0;
2491 2492 int use_reserved = 1;
2492 2493
2493 2494 /* return the number of bytes actually locked */
2494 2495 *locked = 0;
2495 2496
2496 2497 /*
2497 2498 * To avoid contention on freemem_lock, availrmem and pages_locked
2498 2499 * global counters are updated only every nlck locked pages instead of
2499 2500 * every time. Reserve nlck locks up front and deduct from this
2500 2501 * reservation for each page that requires a lock. When the reservation
2501 2502 * is consumed, reserve again. nlck is randomized, so the competing
2502 2503 * threads do not fall into a cyclic lock contention pattern. When
2503 2504 * memory is low, the lock ahead is disabled, and instead page_pp_lock()
2504 2505 * is used to lock pages.
2505 2506 */
2506 2507 for (i = 0; i < npages; anon_index++, pos++, i++) {
2507 2508 if (nlck == 0 && use_reserved == 1) {
2508 2509 nlck = NLCK + RAND_P2(NLCK);
2509 2510 /* if fewer loops left, decrease nlck */
2510 2511 nlck = MIN(nlck, npages - i);
2511 2512 /*
2512 2513 * Reserve nlck locks up front and deduct from this
2513 2514 * reservation for each page that requires a lock. When
2514 2515 * the reservation is consumed, reserve again.
2515 2516 */
2516 2517 mutex_enter(&freemem_lock);
2517 2518 if ((availrmem - nlck) < pages_pp_maximum) {
2518 2519 /* Do not do advance memory reserves */
2519 2520 use_reserved = 0;
2520 2521 } else {
2521 2522 availrmem -= nlck;
2522 2523 pages_locked += nlck;
2523 2524 }
2524 2525 mutex_exit(&freemem_lock);
2525 2526 }
2526 2527 if (!(shmd->shm_vpage[anon_index] & DISM_PG_LOCKED)) {
2527 2528 if (sptd->spt_ppa_lckcnt[anon_index] <
2528 2529 (ushort_t)DISM_LOCK_MAX) {
2529 2530 if (++sptd->spt_ppa_lckcnt[anon_index] ==
2530 2531 (ushort_t)DISM_LOCK_MAX) {
2531 2532 cmn_err(CE_WARN,
2532 2533 "DISM page lock limit "
2533 2534 "reached on DISM offset 0x%lx\n",
2534 2535 anon_index << PAGESHIFT);
2535 2536 }
2536 2537 kernel = (sptd->spt_ppa &&
2537 2538 sptd->spt_ppa[anon_index]);
2538 2539 if (!page_pp_lock(ppa[i], 0, kernel ||
2539 2540 use_reserved)) {
2540 2541 sptd->spt_ppa_lckcnt[anon_index]--;
2541 2542 rv = EAGAIN;
2542 2543 break;
2543 2544 }
2544 2545 /* if this is a newly locked page, count it */
2545 2546 if (ppa[i]->p_lckcnt == 1) {
2546 2547 if (kernel == 0 && use_reserved == 1)
2547 2548 nlck--;
2548 2549 *locked += PAGESIZE;
2549 2550 }
2550 2551 shmd->shm_lckpgs++;
2551 2552 shmd->shm_vpage[anon_index] |= DISM_PG_LOCKED;
2552 2553 if (lockmap != NULL)
2553 2554 BT_SET(lockmap, pos);
2554 2555 }
2555 2556 }
2556 2557 }
2557 2558 /* Return unused lock reservation */
2558 2559 if (nlck != 0 && use_reserved == 1) {
2559 2560 mutex_enter(&freemem_lock);
2560 2561 availrmem += nlck;
2561 2562 pages_locked -= nlck;
2562 2563 mutex_exit(&freemem_lock);
2563 2564 }
2564 2565
2565 2566 return (rv);
2566 2567 }
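As described in the comment at the top of spt_lockpages, freemem_lock acquisitions are amortized by carving nlck locks out of availrmem in one step and drawing the reservation down locally. The sketch below shows that batching pattern in plain pthreads; the names (global_avail, take_batch, give_back) and the BATCH constant are illustrative only, not kernel interfaces.

    #include <pthread.h>

    #define BATCH 8                         /* plays the role of NLCK */

    pthread_mutex_t freemem_lock = PTHREAD_MUTEX_INITIALIZER;
    long global_avail = 1L << 20;           /* stands in for availrmem */

    /*
     * Reserve up to 'want' units in one lock round trip; the caller then
     * consumes the reservation one unit at a time without touching the lock.
     * Returns the number of units actually reserved (0 if memory is short).
     */
    long
    take_batch(long want)
    {
            long got = 0;

            pthread_mutex_lock(&freemem_lock);
            if (global_avail >= want) {
                    global_avail -= want;
                    got = want;
            }
            pthread_mutex_unlock(&freemem_lock);
            return (got);
    }

    /* Return whatever part of the reservation went unused. */
    void
    give_back(long unused)
    {
            pthread_mutex_lock(&freemem_lock);
            global_avail += unused;
            pthread_mutex_unlock(&freemem_lock);
    }

A caller would invoke take_batch() whenever its local reservation reaches zero and give_back() with the leftover count when its loop ends, mirroring the "Return unused lock reservation" step above; randomizing the batch size, as RAND_P2 does, keeps competing threads from falling into a cyclic contention pattern on the same lock.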
2567 2568
2568 2569 int
2569 2570 spt_unlockpages(struct seg *seg, pgcnt_t anon_index, pgcnt_t npages,
2570 2571 rctl_qty_t *unlocked)
2571 2572 {
2572 2573 struct shm_data *shmd = seg->s_data;
2573 2574 struct spt_data *sptd = shmd->shm_sptseg->s_data;
2574 2575 struct anon_map *amp = sptd->spt_amp;
2575 2576 struct anon *ap;
2576 2577 struct vnode *vp;
2577 2578 u_offset_t off;
2578 2579 struct page *pp;
2579 2580 int kernel;
2580 2581 anon_sync_obj_t cookie;
2581 2582 ulong_t i;
2582 2583 pgcnt_t nlck = 0;
2583 2584 pgcnt_t nlck_limit = NLCK;
2584 2585
2585 2586 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2586 2587 for (i = 0; i < npages; i++, anon_index++) {
2587 2588 if (shmd->shm_vpage[anon_index] & DISM_PG_LOCKED) {
2588 2589 anon_array_enter(amp, anon_index, &cookie);
2589 2590 ap = anon_get_ptr(amp->ahp, anon_index);
2590 2591 ASSERT(ap);
2591 2592
2592 2593 swap_xlate(ap, &vp, &off);
2593 2594 anon_array_exit(&cookie);
2594 2595 pp = page_lookup(vp, off, SE_SHARED);
2595 2596 ASSERT(pp);
2596 2597 /*
2597 2598 * availrmem is decremented only for pages which are not
2598 2599 * in seg pcache, for pages in seg pcache availrmem was
2599 2600 * decremented in _dismpagelock()
2600 2601 */
2601 2602 kernel = (sptd->spt_ppa && sptd->spt_ppa[anon_index]);
2602 2603 ASSERT(pp->p_lckcnt > 0);
2603 2604
2604 2605 /*
2605 2606 * unlock page but do not change availrmem, we do it
2606 2607 * ourselves every nlck loops.
2607 2608 */
2608 2609 page_pp_unlock(pp, 0, 1);
2609 2610 if (pp->p_lckcnt == 0) {
2610 2611 if (kernel == 0)
2611 2612 nlck++;
2612 2613 *unlocked += PAGESIZE;
2613 2614 }
2614 2615 page_unlock(pp);
2615 2616 shmd->shm_vpage[anon_index] &= ~DISM_PG_LOCKED;
2616 2617 sptd->spt_ppa_lckcnt[anon_index]--;
2617 2618 shmd->shm_lckpgs--;
2618 2619 }
2619 2620
2620 2621 /*
2621 2622 * To reduce freemem_lock contention, do not update availrmem
2622 2623 * until at least NLCK pages have been unlocked.
2623 2624 * 1. No need to update if nlck is zero
2624 2625 * 2. Always update if the last iteration
2625 2626 */
2626 2627 if (nlck > 0 && (nlck == nlck_limit || i == npages - 1)) {
2627 2628 mutex_enter(&freemem_lock);
2628 2629 availrmem += nlck;
2629 2630 pages_locked -= nlck;
2630 2631 mutex_exit(&freemem_lock);
2631 2632 nlck = 0;
2632 2633 nlck_limit = NLCK + RAND_P2(NLCK);
2633 2634 }
2634 2635 }
2635 2636 ANON_LOCK_EXIT(&amp->a_rwlock);
2636 2637
2637 2638 return (0);
2638 2639 }
2639 2640
2640 2641 /*ARGSUSED*/
2641 2642 static int
2642 2643 segspt_shmlockop(struct seg *seg, caddr_t addr, size_t len,
2643 2644 int attr, int op, ulong_t *lockmap, size_t pos)
2644 2645 {
2645 2646 struct shm_data *shmd = seg->s_data;
2646 2647 struct seg *sptseg = shmd->shm_sptseg;
2647 2648 struct spt_data *sptd = sptseg->s_data;
2648 2649 struct kshmid *sp = sptd->spt_amp->a_sp;
2649 2650 pgcnt_t npages, a_npages;
2650 2651 page_t **ppa;
2651 2652 pgcnt_t an_idx, a_an_idx, ppa_idx;
2652 2653 caddr_t spt_addr, a_addr; /* spt and aligned address */
2653 2654 size_t a_len; /* aligned len */
2654 2655 size_t share_sz;
2655 2656 ulong_t i;
2656 2657 int sts = 0;
2657 2658 rctl_qty_t unlocked = 0;
2658 2659 rctl_qty_t locked = 0;
2659 2660 struct proc *p = curproc;
2660 2661 kproject_t *proj;
2661 2662
2662 2663 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2663 2664 ASSERT(sp != NULL);
2664 2665
2665 2666 if ((sptd->spt_flags & SHM_PAGEABLE) == 0) {
2666 2667 return (0);
2667 2668 }
2668 2669
2669 2670 addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
2670 2671 an_idx = seg_page(seg, addr);
2671 2672 npages = btopr(len);
2672 2673
2673 2674 if (an_idx + npages > btopr(shmd->shm_amp->size)) {
2674 2675 return (ENOMEM);
2675 2676 }
2676 2677
2677 2678 /*
2678 2679 * A shm's project never changes, so no lock needed.
2679 2680 * The shm has a hold on the project, so it will not go away.
2680 2681 * Since we have a mapping to shm within this zone, we know
2681 2682 * that the zone will not go away.
2682 2683 */
2683 2684 proj = sp->shm_perm.ipc_proj;
2684 2685
2685 2686 if (op == MC_LOCK) {
2686 2687
2687 2688 /*
2688 2689 * Need to align addr and size request if they are not
2689 2690 * aligned so we can always allocate large page(s) however
2690 2691 * we only lock what was requested in initial request.
2691 2692 */
2692 2693 share_sz = page_get_pagesize(sptseg->s_szc);
2693 2694 a_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_sz);
2694 2695 a_len = P2ROUNDUP((uintptr_t)(((addr + len) - a_addr)),
2695 2696 share_sz);
2696 2697 a_npages = btop(a_len);
2697 2698 a_an_idx = seg_page(seg, a_addr);
2698 2699 spt_addr = sptseg->s_base + ptob(a_an_idx);
2699 2700 ppa_idx = an_idx - a_an_idx;
2700 2701
2701 2702 if ((ppa = kmem_zalloc(((sizeof (page_t *)) * a_npages),
2702 2703 KM_NOSLEEP)) == NULL) {
2703 2704 return (ENOMEM);
2704 2705 }
2705 2706
2706 2707 /*
2707 2708 * Don't cache any new pages for IO and
2708 2709 * flush any cached pages.
2709 2710 */
2710 2711 mutex_enter(&sptd->spt_lock);
2711 2712 if (sptd->spt_ppa != NULL)
2712 2713 sptd->spt_flags |= DISM_PPA_CHANGED;
2713 2714
2714 2715 sts = spt_anon_getpages(sptseg, spt_addr, a_len, ppa);
2715 2716 if (sts != 0) {
2716 2717 mutex_exit(&sptd->spt_lock);
2717 2718 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2718 2719 return (sts);
2719 2720 }
2720 2721
2721 2722 mutex_enter(&sp->shm_mlock);
2722 2723 /* enforce locked memory rctl */
2723 2724 unlocked = spt_unlockedbytes(npages, &ppa[ppa_idx]);
2724 2725
2725 2726 mutex_enter(&p->p_lock);
2726 2727 if (rctl_incr_locked_mem(p, proj, unlocked, 0)) {
2727 2728 mutex_exit(&p->p_lock);
2728 2729 sts = EAGAIN;
2729 2730 } else {
2730 2731 mutex_exit(&p->p_lock);
2731 2732 sts = spt_lockpages(seg, an_idx, npages,
2732 2733 &ppa[ppa_idx], lockmap, pos, &locked);
2733 2734
2734 2735 /*
2735 2736 * correct locked count if not all pages could be
2736 2737 * locked
2737 2738 */
2738 2739 if ((unlocked - locked) > 0) {
2739 2740 rctl_decr_locked_mem(NULL, proj,
2740 2741 (unlocked - locked), 0);
2741 2742 }
2742 2743 }
2743 2744 /*
2744 2745 * unlock pages
2745 2746 */
2746 2747 for (i = 0; i < a_npages; i++)
2747 2748 page_unlock(ppa[i]);
2748 2749 if (sptd->spt_ppa != NULL)
2749 2750 sptd->spt_flags |= DISM_PPA_CHANGED;
2750 2751 mutex_exit(&sp->shm_mlock);
2751 2752 mutex_exit(&sptd->spt_lock);
2752 2753
2753 2754 kmem_free(ppa, ((sizeof (page_t *)) * a_npages));
2754 2755
2755 2756 } else if (op == MC_UNLOCK) { /* unlock */
2756 2757 page_t **ppa;
2757 2758
2758 2759 mutex_enter(&sptd->spt_lock);
2759 2760 if (shmd->shm_lckpgs == 0) {
2760 2761 mutex_exit(&sptd->spt_lock);
2761 2762 return (0);
2762 2763 }
2763 2764 /*
2764 2765 * Don't cache new IO pages.
2765 2766 */
2766 2767 if (sptd->spt_ppa != NULL)
2767 2768 sptd->spt_flags |= DISM_PPA_CHANGED;
2768 2769
2769 2770 mutex_enter(&sp->shm_mlock);
2770 2771 sts = spt_unlockpages(seg, an_idx, npages, &unlocked);
2771 2772 if ((ppa = sptd->spt_ppa) != NULL)
2772 2773 sptd->spt_flags |= DISM_PPA_CHANGED;
2773 2774 mutex_exit(&sptd->spt_lock);
2774 2775
2775 2776 rctl_decr_locked_mem(NULL, proj, unlocked, 0);
2776 2777 mutex_exit(&sp->shm_mlock);
2777 2778
2778 2779 if (ppa != NULL)
2779 2780 seg_ppurge_wiredpp(ppa);
2780 2781 }
2781 2782 return (sts);
2782 2783 }
2783 2784
2784 2785 /*ARGSUSED*/
2785 2786 int
2786 2787 segspt_shmgetprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv)
2787 2788 {
2788 2789 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2789 2790 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2790 2791 spgcnt_t pgno = seg_page(seg, addr+len) - seg_page(seg, addr) + 1;
2791 2792
2792 2793 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2793 2794
2794 2795 /*
2795 2796 * ISM segment is always rw.
2796 2797 */
2797 2798 while (--pgno >= 0)
2798 2799 *protv++ = sptd->spt_prot;
2799 2800 return (0);
2800 2801 }
2801 2802
2802 2803 /*ARGSUSED*/
2803 2804 u_offset_t
2804 2805 segspt_shmgetoffset(struct seg *seg, caddr_t addr)
2805 2806 {
2806 2807 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2807 2808
2808 2809 /* Offset does not matter in ISM memory */
2809 2810
2810 2811 return ((u_offset_t)0);
2811 2812 }
2812 2813
2813 2814 /* ARGSUSED */
2814 2815 int
2815 2816 segspt_shmgettype(struct seg *seg, caddr_t addr)
2816 2817 {
2817 2818 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2818 2819 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2819 2820
2820 2821 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2821 2822
2822 2823 /*
2823 2824 * The shared memory mapping is always MAP_SHARED, SWAP is only
2824 2825 * reserved for DISM
2825 2826 */
2826 2827 return (MAP_SHARED |
2827 2828 ((sptd->spt_flags & SHM_PAGEABLE) ? 0 : MAP_NORESERVE));
2828 2829 }
2829 2830
2830 2831 /*ARGSUSED*/
2831 2832 int
2832 2833 segspt_shmgetvp(struct seg *seg, caddr_t addr, struct vnode **vpp)
2833 2834 {
2834 2835 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2835 2836 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2836 2837
2837 2838 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2838 2839
2839 2840 *vpp = sptd->spt_vp;
2840 2841 return (0);
2841 2842 }
2842 2843
2843 2844 /*
2844 2845 * We need to wait for pending IO to complete to a DISM segment in order for
2845 2846 * pages to get kicked out of the seg_pcache. 120 seconds should be more
2846 2847 * than enough time to wait.
2847 2848 */
2848 2849 static clock_t spt_pcache_wait = 120;
2849 2850
2850 2851 /*ARGSUSED*/
2851 2852 static int
2852 2853 segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
2853 2854 {
2854 2855 struct shm_data *shmd = (struct shm_data *)seg->s_data;
2855 2856 struct spt_data *sptd = (struct spt_data *)shmd->shm_sptseg->s_data;
2856 2857 struct anon_map *amp;
2857 2858 pgcnt_t pg_idx;
2858 2859 ushort_t gen;
2859 2860 clock_t end_lbolt;
2860 2861 int writer;
2861 2862 page_t **ppa;
2862 2863
2863 2864 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
2864 2865
2865 2866 if (behav == MADV_FREE || behav == MADV_PURGE) {
2866 2867 if ((sptd->spt_flags & SHM_PAGEABLE) == 0)
2867 2868 return (0);
2868 2869
2869 2870 amp = sptd->spt_amp;
2870 2871 pg_idx = seg_page(seg, addr);
2871 2872
2872 2873 mutex_enter(&sptd->spt_lock);
2873 2874 if ((ppa = sptd->spt_ppa) == NULL) {
2874 2875 mutex_exit(&sptd->spt_lock);
2875 2876 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2876 2877 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
2877 2878 ANON_LOCK_EXIT(&amp->a_rwlock);
2878 2879 return (0);
2879 2880 }
2880 2881
2881 2882 sptd->spt_flags |= DISM_PPA_CHANGED;
2882 2883 gen = sptd->spt_gen;
2883 2884
2884 2885 mutex_exit(&sptd->spt_lock);
2885 2886
2886 2887 /*
2887 2888 * Purge all DISM cached pages
2888 2889 */
2889 2890 seg_ppurge_wiredpp(ppa);
2890 2891
2891 2892 /*
2892 2893 * Drop the AS_LOCK so that other threads can grab it
2893 2894 * in the as_pageunlock path and hopefully get the segment
2894 2895 * kicked out of the seg_pcache. We bump the shm_softlockcnt
2895 2896 * to keep this segment resident.
2896 2897 */
2897 2898 writer = AS_WRITE_HELD(seg->s_as);
2898 2899 atomic_inc_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2899 2900 AS_LOCK_EXIT(seg->s_as);
2900 2901
2901 2902 mutex_enter(&sptd->spt_lock);
2902 2903
2903 2904 end_lbolt = ddi_get_lbolt() + (hz * spt_pcache_wait);
2904 2905
2905 2906 /*
2906 2907 * Try to wait for pages to get kicked out of the seg_pcache.
2907 2908 */
2908 2909 while (sptd->spt_gen == gen &&
2909 2910 (sptd->spt_flags & DISM_PPA_CHANGED) &&
2910 2911 ddi_get_lbolt() < end_lbolt) {
2911 2912 if (!cv_timedwait_sig(&sptd->spt_cv,
2912 2913 &sptd->spt_lock, end_lbolt)) {
2913 2914 break;
2914 2915 }
2915 2916 }
2916 2917
2917 2918 mutex_exit(&sptd->spt_lock);
2918 2919
2919 2920 /* Regrab the AS_LOCK and release our hold on the segment */
2920 2921 AS_LOCK_ENTER(seg->s_as, writer ? RW_WRITER : RW_READER);
2921 2922 atomic_dec_ulong((ulong_t *)(&(shmd->shm_softlockcnt)));
2922 2923 if (shmd->shm_softlockcnt <= 0) {
2923 2924 if (AS_ISUNMAPWAIT(seg->s_as)) {
2924 2925 mutex_enter(&seg->s_as->a_contents);
2925 2926 if (AS_ISUNMAPWAIT(seg->s_as)) {
2926 2927 AS_CLRUNMAPWAIT(seg->s_as);
2927 2928 cv_broadcast(&seg->s_as->a_cv);
2928 2929 }
2929 2930 mutex_exit(&seg->s_as->a_contents);
2930 2931 }
2931 2932 }
2932 2933
2933 2934 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
2934 2935 (void) anon_disclaim(amp, pg_idx, len, behav, NULL);
2935 2936 ANON_LOCK_EXIT(&amp->a_rwlock);
2936 2937 } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP ||
2937 2938 behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) {
2938 2939 int already_set;
2939 2940 ulong_t anon_index;
2940 2941 lgrp_mem_policy_t policy;
2941 2942 caddr_t shm_addr;
2942 2943 size_t share_size;
2943 2944 size_t size;
2944 2945 struct seg *sptseg = shmd->shm_sptseg;
2945 2946 caddr_t sptseg_addr;
2946 2947
2947 2948 /*
2948 2949 * Align address and length to page size of underlying segment
2949 2950 */
2950 2951 share_size = page_get_pagesize(shmd->shm_sptseg->s_szc);
2951 2952 shm_addr = (caddr_t)P2ALIGN((uintptr_t)(addr), share_size);
2952 2953 size = P2ROUNDUP((uintptr_t)(((addr + len) - shm_addr)),
2953 2954 share_size);
2954 2955
2955 2956 amp = shmd->shm_amp;
2956 2957 anon_index = seg_page(seg, shm_addr);
2957 2958
2958 2959 /*
2959 2960 * And now we may have to adjust size downward if we have
2960 2961 * exceeded the realsize of the segment or initial anon
2961 2962 * allocations.
2962 2963 */
2963 2964 sptseg_addr = sptseg->s_base + ptob(anon_index);
2964 2965 if ((sptseg_addr + size) >
2965 2966 (sptseg->s_base + sptd->spt_realsize))
2966 2967 size = (sptseg->s_base + sptd->spt_realsize) -
2967 2968 sptseg_addr;
2968 2969
2969 2970 /*
2970 2971 * Set memory allocation policy for this segment
2971 2972 */
2972 2973 policy = lgrp_madv_to_policy(behav, len, MAP_SHARED);
2973 2974 already_set = lgrp_shm_policy_set(policy, amp, anon_index,
2974 2975 NULL, 0, len);
2975 2976
2976 2977 /*
2977 2978 * If random memory allocation policy set already,
2978 2979 * don't bother reapplying it.
2979 2980 */
2980 2981 if (already_set && !LGRP_MEM_POLICY_REAPPLICABLE(policy))
2981 2982 return (0);
2982 2983
2983 2984 /*
2984 2985 * Mark any existing pages in the given range for
2985 2986 * migration, flushing the I/O page cache, and using
2986 2987 * underlying segment to calculate anon index and get
2987 2988 * anonmap and vnode pointer from
2988 2989 */
2989 2990 if (shmd->shm_softlockcnt > 0)
2990 2991 segspt_purge(seg);
2991 2992
2992 2993 page_mark_migrate(seg, shm_addr, size, amp, 0, NULL, 0, 0);
2993 2994 }
2994 2995
2995 2996 return (0);
2996 2997 }
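The MADV_FREE/MADV_PURGE path above bounds its wait for the pcache purge: it sleeps on spt_cv until the segment generation moves on, the DISM_PPA_CHANGED flag clears, a signal arrives, or spt_pcache_wait seconds elapse. A rough user-space analogue of that bounded wait, built on pthread_cond_timedwait and a generation counter protected by the same mutex, is sketched below; it is an assumption-laden illustration of the waiting pattern only and omits the signal case.

    #include <pthread.h>
    #include <time.h>
    #include <stdbool.h>

    pthread_mutex_t spt_lock = PTHREAD_MUTEX_INITIALIZER;
    pthread_cond_t  spt_cv = PTHREAD_COND_INITIALIZER;
    unsigned        spt_gen;                /* bumped when the purge completes */

    /* Wait at most 'secs' seconds for spt_gen to move past 'gen'. */
    bool
    wait_for_purge(unsigned gen, unsigned secs)
    {
            struct timespec deadline;
            bool purged;

            clock_gettime(CLOCK_REALTIME, &deadline);
            deadline.tv_sec += secs;

            pthread_mutex_lock(&spt_lock);
            while (spt_gen == gen) {
                    /* Nonzero return means the deadline passed; give up. */
                    if (pthread_cond_timedwait(&spt_cv, &spt_lock,
                        &deadline) != 0)
                            break;
            }
            purged = (spt_gen != gen);
            pthread_mutex_unlock(&spt_lock);
            return (purged);
    }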
2997 2998
2998 2999 /*ARGSUSED*/
2999 3000 void
3000 3001 segspt_shmdump(struct seg *seg)
3001 3002 {
3002 3003 /* no-op for ISM segment */
3003 3004 }
3004 3005
3005 3006 /*ARGSUSED*/
3006 3007 static faultcode_t
3007 3008 segspt_shmsetpgsz(struct seg *seg, caddr_t addr, size_t len, uint_t szc)
3008 3009 {
3009 3010 return (ENOTSUP);
3010 3011 }
3011 3012
3012 3013 /*
3013 3014 * get a memory ID for an addr in a given segment
3014 3015 */
3015 3016 static int
3016 3017 segspt_shmgetmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
3017 3018 {
3018 3019 struct shm_data *shmd = (struct shm_data *)seg->s_data;
3019 3020 struct anon *ap;
3020 3021 size_t anon_index;
3021 3022 struct anon_map *amp = shmd->shm_amp;
3022 3023 struct spt_data *sptd = shmd->shm_sptseg->s_data;
3023 3024 struct seg *sptseg = shmd->shm_sptseg;
3024 3025 anon_sync_obj_t cookie;
3025 3026
3026 3027 anon_index = seg_page(seg, addr);
3027 3028
3028 3029 if (addr > (seg->s_base + sptd->spt_realsize)) {
3029 3030 return (EFAULT);
3030 3031 }
3031 3032
3032 3033 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
3033 3034 anon_array_enter(amp, anon_index, &cookie);
3034 3035 ap = anon_get_ptr(amp->ahp, anon_index);
3035 3036 if (ap == NULL) {
3036 3037 struct page *pp;
3037 3038 caddr_t spt_addr = sptseg->s_base + ptob(anon_index);
3038 3039
3039 3040 pp = anon_zero(sptseg, spt_addr, &ap, kcred);
3040 3041 if (pp == NULL) {
3041 3042 anon_array_exit(&cookie);
3042 3043 ANON_LOCK_EXIT(&amp->a_rwlock);
3043 3044 return (ENOMEM);
3044 3045 }
3045 3046 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP);
3046 3047 page_unlock(pp);
3047 3048 }
3048 3049 anon_array_exit(&cookie);
3049 3050 ANON_LOCK_EXIT(&amp->a_rwlock);
3050 3051 memidp->val[0] = (uintptr_t)ap;
3051 3052 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
3052 3053 return (0);
3053 3054 }
3054 3055
3055 3056 /*
3056 3057 * Get memory allocation policy info for specified address in given segment
3057 3058 */
3058 3059 static lgrp_mem_policy_info_t *
3059 3060 segspt_shmgetpolicy(struct seg *seg, caddr_t addr)
3060 3061 {
3061 3062 struct anon_map *amp;
3062 3063 ulong_t anon_index;
3063 3064 lgrp_mem_policy_info_t *policy_info;
3064 3065 struct shm_data *shm_data;
3065 3066
3066 3067 ASSERT(seg != NULL);
3067 3068
3068 3069 /*
3069 3070 * Get anon_map from segshm
3070 3071 *
3071 3072 * Assume that no lock needs to be held on anon_map, since
3072 3073 * it should be protected by its reference count which must be
3073 3074 * nonzero for an existing segment
3074 3075 * Need to grab readers lock on policy tree though
3075 3076 */
3076 3077 shm_data = (struct shm_data *)seg->s_data;
3077 3078 if (shm_data == NULL)
3078 3079 return (NULL);
3079 3080 amp = shm_data->shm_amp;
3080 3081 ASSERT(amp->refcnt != 0);
3081 3082
3082 3083 /*
3083 3084 * Get policy info
3084 3085 *
3085 3086 * Assume starting anon index of 0
3086 3087 */
3087 3088 anon_index = seg_page(seg, addr);
3088 3089 policy_info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
3089 3090
3090 3091 return (policy_info);
3091 3092 }
3092 3093
3093 3094 /*ARGSUSED*/
3094 3095 static int
3095 3096 segspt_shmcapable(struct seg *seg, segcapability_t capability)
3096 3097 {
3097 3098 return (0);
3098 3099 }