NEX-5164 backport illumos 6514 AS_* lock macros simplification
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
6514 AS_* lock macros simplification
Reviewed by: Piotr Jasiukajtis <estibi@me.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Albert Lee <trisk@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
re #13613 rb4516 Tunables needs volatile keyword
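For context on the backported change: illumos 6514 drops the redundant rwlock argument from the AS_* lock macros, so callers name only the address space (plus the lock type for AS_LOCK_ENTER). The AS_LOCK_ENTER/AS_LOCK_EXIT calls in cow_mapin() below already use the new form. A minimal before/after sketch, assuming the pre-6514 macros took an explicit pointer to as->a_lock:

	/* pre-6514: the caller had to name as->a_lock explicitly */
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	/* ... critical section ... */
	AS_LOCK_EXIT(as, &as->a_lock);

	/* post-6514: the macros derive the lock from the as itself */
	AS_LOCK_ENTER(as, RW_WRITER);
	/* ... critical section ... */
	AS_LOCK_EXIT(as);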
--- old/usr/src/uts/common/os/vm_subr.c
+++ new/usr/src/uts/common/os/vm_subr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 */
24 25
25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26 27 /* All Rights Reserved */
27 28
28 29 /*
29 30 * University Copyright- Copyright (c) 1982, 1986, 1988
30 31 * The Regents of the University of California
31 32 * All Rights Reserved
32 33 *
33 34 * University Acknowledgment- Portions of this document are derived from
34 35 * software developed by the University of California, Berkeley, and its
35 36 * contributors.
36 37 */
37 38
38 39 #include <sys/types.h>
39 40 #include <sys/t_lock.h>
40 41 #include <sys/param.h>
41 42 #include <sys/errno.h>
42 43 #include <sys/debug.h>
43 44 #include <sys/cmn_err.h>
44 45 #include <sys/kmem.h>
45 46 #include <sys/sysmacros.h>
46 47 #include <sys/inline.h>
47 48 #include <sys/buf.h>
48 49 #include <sys/uio.h>
49 50 #include <sys/user.h>
50 51 #include <sys/proc.h>
51 52 #include <sys/systm.h>
52 53 #include <sys/vmsystm.h>
53 54 #include <sys/cpuvar.h>
54 55 #include <sys/mman.h>
55 56 #include <sys/cred.h>
56 57 #include <sys/vnode.h>
57 58 #include <sys/file.h>
58 59 #include <sys/vm.h>
59 60
60 61 #include <sys/swap.h>
61 62 #include <sys/vtrace.h>
62 63 #include <sys/tnf_probe.h>
63 64 #include <sys/fs/snode.h>
64 65 #include <sys/copyops.h>
65 66 #include <sys/conf.h>
66 67 #include <sys/sdt.h>
67 68
68 69 #include <vm/anon.h>
69 70 #include <vm/hat.h>
70 71 #include <vm/as.h>
71 72 #include <vm/seg.h>
72 73 #include <vm/page.h>
73 74 #include <vm/seg_vn.h>
74 75 #include <vm/seg_kmem.h>
75 76
76 -extern int maxphys;
77 +#include <sys/sunddi.h>
77 78
78 79 void
79 80 minphys(struct buf *bp)
80 81 {
81 82 if (bp->b_bcount > maxphys)
82 83 bp->b_bcount = maxphys;
83 84 }
84 85
85 86 /*
86 87 * use kmem_cache_create for physio buffers. This has shown
87 88 * a better cache distribution compared to buffers on the
88 89 * stack. It also avoids semaphore construction/deconstruction
89 90 * per request
90 91 */
91 92
92 93 static struct kmem_cache *physio_buf_cache;
93 94
94 95 /* ARGSUSED */
95 96 static int
96 97 physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
97 98 {
98 99 bioinit((struct buf *)buf);
99 100 return (0);
100 101 }
101 102
102 103 /* ARGSUSED */
103 104 static void
104 105 physio_buf_destructor(void *buf, void *cdrarg)
105 106 {
106 107 biofini((struct buf *)buf);
107 108 }
108 109
109 110 void
110 111 physio_bufs_init(void)
111 112 {
112 113 physio_buf_cache = kmem_cache_create("physio_buf_cache",
113 114 sizeof (struct buf), 0, physio_buf_constructor,
114 115 physio_buf_destructor, NULL, NULL, NULL, 0);
115 116 }
116 117
117 118
118 119
119 120 /*
120 121 * initiate raw I/O request
121 122 *
122 123 * allocate buf header if necessary
123 124 * adjust max size of each I/O request
124 125 * lock down user pages and verify access protections
125 126 * call driver's strategy routine to submit request
126 127 * wait for I/O completion
127 128 * unlock user pages and free allocated buf header
128 129 */
129 130
130 131 int
131 132 default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
132 133 int rw, void (*mincnt)(struct buf *), struct uio *uio)
133 134 {
134 135 struct iovec *iov;
135 136 struct proc *procp;
136 137 struct as *asp;
137 138 ssize_t c;
138 139 char *a;
139 140 int error = 0;
140 141 page_t **pplist;
141 142 int allocbuf = 0;
142 143
143 144 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
144 145
145 146 /* Kernel probe */
146 147 TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
147 148 tnf_device, device, dev,
148 149 tnf_offset, offset, uio->uio_loffset,
149 150 tnf_size, size, uio->uio_resid,
150 151 tnf_bioflags, rw, rw);
151 152
152 153 if (rw == B_READ) {
153 154 CPU_STATS_ADD_K(sys, phread, 1);
154 155 } else {
155 156 CPU_STATS_ADD_K(sys, phwrite, 1);
156 157 }
157 158
158 159 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
159 160 "getbuf_start: bp %p", bp);
160 161
161 162 if (bp == NULL) {
162 163 bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
163 164 bp->b_iodone = NULL;
164 165 bp->b_resid = 0;
165 166 allocbuf = 1;
166 167 }
167 168 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
168 169
169 170 if (uio->uio_segflg == UIO_USERSPACE) {
170 171 procp = ttoproc(curthread);
171 172 asp = procp->p_as;
172 173 } else {
173 174 procp = NULL;
174 175 asp = &kas;
175 176 }
176 177 ASSERT(SEMA_HELD(&bp->b_sem));
177 178
178 179 /*
179 180 * We need to prepare this buffer for the io:::start probe, including
180 181 * NULL'ing out the file, clearing the offset, and filling in the
181 182 * b_dip field.
182 183 */
183 184 bp->b_file = NULL;
184 185 bp->b_offset = -1;
185 186
186 187 if (dev != NODEV) {
187 188 (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
188 189 DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
189 190 } else {
190 191 bp->b_dip = NULL;
191 192 }
192 193
193 194 while (uio->uio_iovcnt > 0) {
194 195 iov = uio->uio_iov;
195 196
196 197 bp->b_error = 0;
197 198 bp->b_proc = procp;
198 199
199 200 while (iov->iov_len > 0) {
200 201 if (uio->uio_resid == 0)
201 202 break;
202 203 if (uio->uio_loffset < 0) {
203 204 error = EINVAL;
204 205 break;
205 206 }
206 207 #ifdef _ILP32
207 208 /*
208 209 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
209 210 * which represents the maximum size that can be
210 211 * supported by the IO subsystem.
211 212 * XXX this code assumes a D_64BIT driver.
212 213 */
213 214 if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
214 215 error = EINVAL;
215 216 break;
216 217 }
217 218 #endif /* _ILP32 */
218 219 bp->b_flags = B_BUSY | B_PHYS | rw;
219 220 bp->b_edev = dev;
220 221 bp->b_lblkno = btodt(uio->uio_loffset);
221 222
222 223 /*
223 224 * Don't count on b_addr remaining untouched by the
224 225 * code below (it may be reset because someone does
225 226 * a bp_mapin on the buffer) -- reset from the iov
226 227 * each time through, updating the iov's base address
227 228 * instead.
228 229 */
229 230 a = bp->b_un.b_addr = iov->iov_base;
230 231 bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
231 232 (*mincnt)(bp);
232 233 c = bp->b_bcount;
233 234
234 235 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
235 236 "as_pagelock_start: bp %p", bp);
236 237
237 238 error = as_pagelock(asp, &pplist, a,
238 239 c, rw == B_READ? S_WRITE : S_READ);
239 240
240 241 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
241 242 "as_pagelock_end:");
242 243
243 244 if (error != 0) {
244 245 bp->b_flags |= B_ERROR;
245 246 bp->b_error = error;
246 247 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
247 248 break;
248 249 }
249 250 bp->b_shadow = pplist;
250 251 if (pplist != NULL) {
251 252 bp->b_flags |= B_SHADOW;
252 253 }
253 254
254 255 DTRACE_IO1(start, struct buf *, bp);
255 256 bp->b_flags |= B_STARTED;
256 257
257 258 (void) (*strat)(bp);
258 259 error = biowait(bp);
259 260
260 261 /*
261 262 * unlock the pages
262 263 */
263 264 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
264 265 "as_pageunlock_start: bp %p", bp);
265 266
266 267 as_pageunlock(asp, pplist, a, c,
267 268 rw == B_READ? S_WRITE : S_READ);
268 269
269 270 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
270 271 "as_pageunlock_end:");
271 272
272 273 c -= bp->b_resid;
273 274 iov->iov_base += c;
274 275 iov->iov_len -= c;
275 276 uio->uio_resid -= c;
276 277 uio->uio_loffset += c;
277 278 /* bp->b_resid - temp kludge for tape drives */
278 279 if (bp->b_resid || error)
279 280 break;
280 281 }
281 282 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
282 283 /* bp->b_resid - temp kludge for tape drives */
283 284 if (bp->b_resid || error)
284 285 break;
285 286 uio->uio_iov++;
286 287 uio->uio_iovcnt--;
287 288 }
288 289
289 290 if (allocbuf) {
290 291 kmem_cache_free(physio_buf_cache, bp);
291 292 }
292 293
293 294 /* Kernel probe */
294 295 TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
295 296 tnf_device, device, dev);
296 297
297 298 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
298 299
299 300 return (error);
300 301 }
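As an aside on how this path is reached: a character driver's read(9E) or write(9E) entry point normally calls physio(9F), which resolves to default_physio() above with the driver's strategy routine and, typically, minphys() as the mincnt callback. A minimal sketch with a hypothetical driver — xx_read() and xx_strategy() are placeholder names, not part of this file:

	#include <sys/types.h>
	#include <sys/buf.h>
	#include <sys/uio.h>
	#include <sys/cred.h>
	#include <sys/ddi.h>
	#include <sys/sunddi.h>

	static int xx_strategy(struct buf *bp);	/* driver strategy(9E) routine */

	/* ARGSUSED */
	static int
	xx_read(dev_t dev, struct uio *uiop, cred_t *credp)
	{
		/*
		 * Passing bp == NULL lets default_physio() allocate a buf
		 * header from physio_buf_cache; minphys() clamps each
		 * transfer to maxphys.
		 */
		return (physio(xx_strategy, NULL, dev, B_READ, minphys, uiop));
	}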
301 302
302 303 /*
303 304 * Returns 0 on success, or an error on failure.
304 305 *
305 306 * This function is no longer a part of the DDI/DKI.
306 307 * However, for compatibility, its interface should not
307 308 * be changed and it should not be removed from the kernel.
308 309 */
309 310 int
310 311 useracc(void *addr, size_t count, int access)
311 312 {
312 313 uint_t prot;
313 314
314 315 prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
315 316 return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
316 317 }
317 318
318 319 #define MAX_MAPIN_PAGES 8
319 320
320 321 /*
321 322 * This function temporarily "borrows" user pages for kernel use. If
322 323 * "cow" is on, it also sets up copy-on-write protection (only feasible
323 324 * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
324 325 * pages from any changes by the user. The caller is responsible for
325 326 * unlocking and tearing down cow settings when it's done with the pages.
326 327 * For an example, see kcfree().
327 328 *
328 329 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
329 330 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
330 331 * kaddr != -1. On entering this function, cached_ppp contains a list
331 332 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
332 333 * previous call). Thus if the same pages remain behind [uaddr..uaddr+*lenp],
333 334 * the kernel map won't need to be reloaded again.
334 335 *
335 336 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
336 337 * reference count, and changes the user-mapping to read-only. This
337 338 * scheme should work on all types of segment drivers. But to be safe,
338 339 * we check against segvn here.
339 340 *
340 341 * Since this function is used to emulate copyin() semantics, it checks
341 342 * to make sure the user-mappings allow "user-read".
342 343 *
343 344 * On exit "lenp" contains the number of bytes successfully locked and
344 345 * mapped in. For the unsuccessful ones, the caller can fall back to
345 346 * copyin().
346 347 *
347 348 * Error return:
348 349 * ENOTSUP - operation like this is not supported either on this segment
349 350 * type, or on this platform type.
350 351 */
351 352 int
352 353 cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
353 354 struct anon **app, size_t *lenp, int cow)
354 355 {
355 356 struct hat *hat;
356 357 struct seg *seg;
357 358 caddr_t base;
358 359 page_t *pp, *ppp[MAX_MAPIN_PAGES];
359 360 long i;
360 361 int flags;
361 362 size_t size, total = *lenp;
362 363 char first = 1;
363 364 faultcode_t res;
364 365
365 366 *lenp = 0;
366 367 if (cow) {
367 368 AS_LOCK_ENTER(as, RW_WRITER);
368 369 seg = as_findseg(as, uaddr, 0);
369 370 if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
370 371 (uaddr + total) > base + seg->s_size) {
371 372 AS_LOCK_EXIT(as);
372 373 return (EINVAL);
373 374 }
374 375 /*
375 376 * The COW scheme should work for all segment types.
376 377 * But to be safe, we check against segvn.
377 378 */
378 379 if (seg->s_ops != &segvn_ops) {
379 380 AS_LOCK_EXIT(as);
380 381 return (ENOTSUP);
381 382 } else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
382 383 AS_LOCK_EXIT(as);
383 384 return (ENOTSUP);
384 385 }
385 386 }
386 387 hat = as->a_hat;
387 388 size = total;
388 389 tryagain:
389 390 /*
390 391 * If (cow), hat_softlock will also change the usr protection to RO.
391 392 * This is the first step toward setting up cow. Before we
392 393 * bump up an_refcnt, we can't allow any cow-fault on this
393 394 * address. Otherwise segvn_fault will change the protection back
394 395 * to RW upon seeing an_refcnt == 1.
395 396 * The solution is to hold the writer lock on "as".
396 397 */
397 398 res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
398 399 size = total - size;
399 400 *lenp += size;
400 401 size = size >> PAGESHIFT;
401 402 i = 0;
402 403 while (i < size) {
403 404 pp = ppp[i];
404 405 if (cow) {
405 406 kmutex_t *ahm;
406 407 /*
407 408 * Another solution is to hold SE_EXCL on pp, and
408 409 * disable PROT_WRITE. This also works for MAP_SHARED
409 410 * segment. The disadvantage is that it locks the
410 411 * page from being used by anybody else.
411 412 */
412 413 ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
413 414 mutex_enter(ahm);
414 415 *app = swap_anon(pp->p_vnode, pp->p_offset);
415 416 /*
416 417 * Since we are holding the as lock, this avoids a
417 418 * potential race with anon_decref. (segvn_unmap and
418 419 * segvn_free need the as writer lock to do anon_free.)
419 420 */
420 421 if (*app != NULL) {
421 422 #if 0
422 423 if ((*app)->an_refcnt == 0)
423 424 /*
424 425 * Consider the following scenario (unlikely
425 426 * though):
426 427 * 1. an_refcnt == 2
427 428 * 2. we softlock the page.
428 429 * 3. cow occurs on this addr. So a new ap,
429 430 * page and mapping is established on addr.
430 431 * 4. an_refcnt drops to 1 (segvn_faultpage
431 432 * -> anon_decref(oldap))
432 433 * 5. the last ref to ap also drops (from
433 434 * another as). It ends up blocked inside
434 435 * anon_decref trying to get page's excl lock.
435 436 * 6. Later kcfree unlocks the page, call
436 437 * anon_decref -> oops, ap is gone already.
437 438 *
438 439 * Holding as writer lock solves all problems.
439 440 */
440 441 *app = NULL;
441 442 else
442 443 #endif
443 444 (*app)->an_refcnt++;
444 445 }
445 446 mutex_exit(ahm);
446 447 } else {
447 448 *app = NULL;
448 449 }
449 450 if (kaddr != (caddr_t)-1) {
450 451 if (pp != *cached_ppp) {
451 452 if (*cached_ppp == NULL)
452 453 flags = HAT_LOAD_LOCK | HAT_NOSYNC |
453 454 HAT_LOAD_NOCONSIST;
454 455 else
455 456 flags = HAT_LOAD_REMAP |
456 457 HAT_LOAD_NOCONSIST;
457 458 /*
458 459 * In order to cache the kernel mapping after
459 460 * the user page is unlocked, we call
460 461 * hat_devload instead of hat_memload so
461 462 * that the kernel mapping we set up here is
462 463 * "invisible" to the rest of the world. This
463 464 * is not very pretty. But as long as the
464 465 * caller bears the responsibility of keeping
465 466 * cache consistency, we should be ok -
466 467 * HAT_NOCONSIST will get us an uncached
467 468 * mapping on VAC. hat_softlock will flush
468 469 * a VAC_WRITEBACK cache. Therefore the kaddr
469 470 * doesn't have to be of the same vcolor as
470 471 * uaddr.
471 472 * The alternative is - change hat_devload
472 473 * to get a cached mapping. Allocate a kaddr
473 474 * with the same vcolor as uaddr. Then
474 475 * hat_softlock won't need to flush the VAC.
475 476 */
476 477 hat_devload(kas.a_hat, kaddr, PAGESIZE,
477 478 page_pptonum(pp), PROT_READ, flags);
478 479 *cached_ppp = pp;
479 480 }
480 481 kaddr += PAGESIZE;
481 482 }
482 483 cached_ppp++;
483 484 app++;
484 485 ++i;
485 486 }
486 487 if (cow) {
487 488 AS_LOCK_EXIT(as);
488 489 }
489 490 if (first && res == FC_NOMAP) {
490 491 /*
491 492 * If the address is not mapped yet, we call as_fault to
492 493 * fault the pages in. We could've fallen back to copy and
493 494 * let it fault in the pages. But for a mapped file, we
494 495 * normally reference each page only once. For zero-copy to
495 496 * be of any use, we'd better fault in the page now and try
496 497 * again.
497 498 */
498 499 first = 0;
499 500 size = size << PAGESHIFT;
500 501 uaddr += size;
501 502 total -= size;
502 503 size = total;
503 504 res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
504 505 if (cow)
505 506 AS_LOCK_ENTER(as, RW_WRITER);
506 507 goto tryagain;
507 508 }
508 509 switch (res) {
509 510 case FC_NOSUPPORT:
510 511 return (ENOTSUP);
511 512 case FC_PROT: /* Pretend we don't know about it. This will be */
512 513 /* caught by the caller when uiomove fails. */
513 514 case FC_NOMAP:
514 515 case FC_OBJERR:
515 516 default:
516 517 return (0);
517 518 }
518 519 }
(remaining 432 lines of the file elided)
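Finally, the cow_mapin() contract described in the block comment above is easiest to see from a caller's perspective. The sketch below is hypothetical (xx_borrow_user_pages() is not part of this file) and shows only the lock-and-map half; the unlock and copy-on-write teardown remain the caller's responsibility, for which the comment points at kcfree():

	#include <sys/types.h>
	#include <sys/proc.h>
	#include <vm/as.h>
	#include <vm/anon.h>
	#include <vm/page.h>

	extern int cow_mapin(struct as *, caddr_t, caddr_t, struct page **,
	    struct anon **, size_t *, int);

	/*
	 * Hypothetical caller: borrow *lenp bytes of the current process's
	 * pages behind uaddr and map them at the kernel range kaddr, with
	 * COW protection so the borrowed pages cannot change underneath
	 * the kernel.  cached_ppp and app need one entry per page of *lenp.
	 */
	static int
	xx_borrow_user_pages(caddr_t uaddr, caddr_t kaddr,
	    struct page **cached_ppp, struct anon **app, size_t *lenp)
	{
		/*
		 * On return *lenp is the number of bytes actually locked
		 * and mapped; the caller falls back to copyin() for the
		 * remainder, keeps cached_ppp so an unchanged kernel
		 * mapping can be reused on the next call, and must
		 * eventually drop the anon references and undo the COW
		 * protection.
		 */
		return (cow_mapin(curproc->p_as, uaddr, kaddr, cached_ppp,
		    app, lenp, 1));
	}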