/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/sunddi.h>

/*
 * Bound a single raw transfer to maxphys bytes; commonly passed to
 * physio(9F) as the mincnt routine.
 */
void
minphys(struct buf *bp)
{
        if (bp->b_bcount > maxphys)
                bp->b_bcount = maxphys;
}

/*
 * Use kmem_cache_create() for physio buffers.  This has shown better
 * cache distribution than buffers allocated on the stack.  It also
 * avoids semaphore construction/destruction per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
        bioinit((struct buf *)buf);
        return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
        biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
        physio_buf_cache = kmem_cache_create("physio_buf_cache",
            sizeof (struct buf), 0, physio_buf_constructor,
            physio_buf_destructor, NULL, NULL, NULL, 0);
}


/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */

int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
        struct iovec *iov;
        struct proc *procp;
        struct as *asp;
        ssize_t c;
        char *a;
        int error = 0;
        page_t **pplist;
        int allocbuf = 0;

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

        /* Kernel probe */
        TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
            tnf_device, device, dev,
            tnf_offset, offset, uio->uio_loffset,
            tnf_size, size, uio->uio_resid,
            tnf_bioflags, rw, rw);

        if (rw == B_READ) {
                CPU_STATS_ADD_K(sys, phread, 1);
        } else {
                CPU_STATS_ADD_K(sys, phwrite, 1);
        }

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
            "getbuf_start: bp %p", bp);

        if (bp == NULL) {
                bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
                bp->b_iodone = NULL;
                bp->b_resid = 0;
                allocbuf = 1;
        }
        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                asp = procp->p_as;
        } else {
                procp = NULL;
                asp = &kas;
        }
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * We need to prepare this buffer for the io:::start probe, including
         * NULL'ing out the file, clearing the offset, and filling in the
         * b_dip field.
         */
        bp->b_file = NULL;
        bp->b_offset = -1;

        if (dev != NODEV) {
                (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
                    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
        } else {
                bp->b_dip = NULL;
        }

        while (uio->uio_iovcnt > 0) {
                iov = uio->uio_iov;

                bp->b_error = 0;
                bp->b_proc = procp;

                while (iov->iov_len > 0) {
                        if (uio->uio_resid == 0)
                                break;
                        if (uio->uio_loffset < 0) {
                                error = EINVAL;
                                break;
                        }
#ifdef _ILP32
                        /*
                         * For 32-bit kernels, check against SPEC_MAXOFFSET_T
                         * which represents the maximum size that can be
                         * supported by the IO subsystem.
                         * XXX this code assumes a D_64BIT driver.
                         */
                        if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
                                error = EINVAL;
                                break;
                        }
#endif /* _ILP32 */
                        bp->b_flags = B_BUSY | B_PHYS | rw;
                        bp->b_edev = dev;
                        bp->b_lblkno = btodt(uio->uio_loffset);

                        /*
                         * Don't count on b_addr remaining untouched by the
                         * code below (it may be reset because someone does
                         * a bp_mapin on the buffer) -- reset from the iov
                         * each time through, updating the iov's base address
                         * instead.
                         */
                        a = bp->b_un.b_addr = iov->iov_base;
                        bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
                        (*mincnt)(bp);
                        c = bp->b_bcount;

                        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
                            "as_pagelock_start: bp %p", bp);

                        error = as_pagelock(asp, &pplist, a,
                            c, rw == B_READ ? S_WRITE : S_READ);

                        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
                            "as_pagelock_end:");

                        if (error != 0) {
                                bp->b_flags |= B_ERROR;
                                bp->b_error = error;
                                bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
                                break;
                        }
                        bp->b_shadow = pplist;
                        if (pplist != NULL) {
                                bp->b_flags |= B_SHADOW;
                        }

                        DTRACE_IO1(start, struct buf *, bp);
                        bp->b_flags |= B_STARTED;

                        (void) (*strat)(bp);
                        error = biowait(bp);

                        /*
                         * unlock the pages
                         */
                        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
                            "as_pageunlock_start: bp %p", bp);

                        as_pageunlock(asp, pplist, a, c,
                            rw == B_READ ? S_WRITE : S_READ);

                        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
                            "as_pageunlock_end:");

                        c -= bp->b_resid;
                        iov->iov_base += c;
                        iov->iov_len -= c;
                        uio->uio_resid -= c;
                        uio->uio_loffset += c;
                        /* bp->b_resid - temp kludge for tape drives */
                        if (bp->b_resid || error)
                                break;
                }
                bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
                /* bp->b_resid - temp kludge for tape drives */
                if (bp->b_resid || error)
                        break;
                uio->uio_iov++;
                uio->uio_iovcnt--;
        }

        if (allocbuf) {
                kmem_cache_free(physio_buf_cache, bp);
        }

        /* Kernel probe */
        TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
            tnf_device, device, dev);

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

        return (error);
}
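
/*
 * Illustrative sketch (not part of the original file): a character driver
 * normally reaches default_physio() through the physio(9F) wrapper from its
 * read(9E)/write(9E) entry points, letting minphys() (or a driver-specific
 * mincnt routine) bound each transfer.  The xx_* names below are
 * hypothetical.
 */
#if 0
static int
xx_read(dev_t dev, struct uio *uiop, cred_t *credp)
{
        /* Lock down the user buffer, call xx_strategy(), wait for I/O. */
        return (physio(xx_strategy, NULL, dev, B_READ, minphys, uiop));
}

static int
xx_write(dev_t dev, struct uio *uiop, cred_t *credp)
{
        return (physio(xx_strategy, NULL, dev, B_WRITE, minphys, uiop));
}
#endif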

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
        uint_t prot;

        prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
        return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}
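
/*
 * Illustrative sketch (not part of the original file): a legacy driver could
 * use useracc() to verify a user buffer is readable before starting a raw
 * transfer out of it; a return of 0 means the access check passed, per the
 * comment above.  The xx_ name is hypothetical.
 */
#if 0
static int
xx_check_source(void *uaddr, size_t len)
{
        /* B_READ asks whether the user mappings allow PROT_READ. */
        return (useracc(uaddr, len, B_READ));
}
#endif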

#define MAX_MAPIN_PAGES 8

/*
 * This function temporarily "borrows" user pages for kernel use.  If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on a MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user.  The caller is responsible for
 * unlocking and tearing down the cow settings when it's done with the
 * pages.  For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1.  On entering this function, cached_ppp contains a list
 * of pages that are already mapped into [kaddr..kaddr+*lenp] (from a
 * previous call).  Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count and changes the user mapping to read-only.  This
 * scheme should work on all types of segment drivers, but to be safe
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in.  For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - an operation like this is not supported either on this
 * segment type, or on this platform.
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
        struct hat *hat;
        struct seg *seg;
        caddr_t base;
        page_t *pp, *ppp[MAX_MAPIN_PAGES];
        long i;
        int flags;
        size_t size, total = *lenp;
        char first = 1;
        faultcode_t res;

        *lenp = 0;
        if (cow) {
                AS_LOCK_ENTER(as, RW_WRITER);
                seg = as_findseg(as, uaddr, 0);
                if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
                    (uaddr + total) > base + seg->s_size) {
                        AS_LOCK_EXIT(as);
                        return (EINVAL);
                }
                /*
                 * The COW scheme should work for all segment types.
                 * But to be safe, we check against segvn.
                 */
                if (seg->s_ops != &segvn_ops) {
                        AS_LOCK_EXIT(as);
                        return (ENOTSUP);
                } else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
                        AS_LOCK_EXIT(as);
                        return (ENOTSUP);
                }
        }
        hat = as->a_hat;
        size = total;
tryagain:
        /*
         * If (cow), hat_softlock will also change the user protection to RO.
         * This is the first step toward setting up cow.  Before we
         * bump up an_refcnt, we can't allow any cow-fault on this
         * address.  Otherwise segvn_fault will change the protection back
         * to RW upon seeing an_refcnt == 1.
         * The solution is to hold the writer lock on "as".
         */
        res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
        size = total - size;
        *lenp += size;
        size = size >> PAGESHIFT;
        i = 0;
        while (i < size) {
                pp = ppp[i];
                if (cow) {
                        kmutex_t *ahm;
                        /*
                         * Another solution is to hold SE_EXCL on pp, and
                         * disable PROT_WRITE.  This also works for MAP_SHARED
                         * segments.  The disadvantage is that it locks the
                         * page from being used by anybody else.
                         */
                        ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
                        mutex_enter(ahm);
                        *app = swap_anon(pp->p_vnode, pp->p_offset);
                        /*
                         * Since we are holding the as lock, this avoids a
                         * potential race with anon_decref.  (segvn_unmap and
                         * segvn_free need the as writer lock to do anon_free.)
                         */
                        if (*app != NULL) {
#if 0
                                if ((*app)->an_refcnt == 0)
                                        /*
                                         * Consider the following scenario
                                         * (unlikely though):
                                         * 1. an_refcnt == 2
                                         * 2. we softlock the page.
                                         * 3. cow occurs on this addr.  So a
                                         * new ap, page and mapping is
                                         * established on addr.
                                         * 4. an_refcnt drops to 1
                                         * (segvn_faultpage -> anon_decref(oldap))
                                         * 5. the last ref to ap also drops
                                         * (from another as).  It ends up
                                         * blocked inside anon_decref trying
                                         * to get the page's excl lock.
                                         * 6. Later kcfree unlocks the page,
                                         * calls anon_decref -> oops, ap is
                                         * gone already.
                                         *
                                         * Holding the as writer lock solves
                                         * all problems.
                                         */
                                        *app = NULL;
                                else
#endif
                                        (*app)->an_refcnt++;
                        }
                        mutex_exit(ahm);
                } else {
                        *app = NULL;
                }
                if (kaddr != (caddr_t)-1) {
                        if (pp != *cached_ppp) {
                                if (*cached_ppp == NULL)
                                        flags = HAT_LOAD_LOCK | HAT_NOSYNC |
                                            HAT_LOAD_NOCONSIST;
                                else
                                        flags = HAT_LOAD_REMAP |
                                            HAT_LOAD_NOCONSIST;
                                /*
                                 * In order to cache the kernel mapping after
                                 * the user page is unlocked, we call
                                 * hat_devload instead of hat_memload so
                                 * that the kernel mapping we set up here is
                                 * "invisible" to the rest of the world.  This
                                 * is not very pretty, but as long as the
                                 * caller bears the responsibility of keeping
                                 * cache consistency, we should be ok -
                                 * HAT_LOAD_NOCONSIST will get us an uncached
                                 * mapping on VAC.  hat_softlock will flush
                                 * a VAC_WRITEBACK cache.  Therefore the kaddr
                                 * doesn't have to be of the same vcolor as
                                 * uaddr.
                                 * The alternative is to change hat_devload
                                 * to get a cached mapping and allocate a
                                 * kaddr with the same vcolor as uaddr; then
                                 * hat_softlock won't need to flush the VAC.
                                 */
                                hat_devload(kas.a_hat, kaddr, PAGESIZE,
                                    page_pptonum(pp), PROT_READ, flags);
                                *cached_ppp = pp;
                        }
                        kaddr += PAGESIZE;
                }
                cached_ppp++;
                app++;
                ++i;
        }
        if (cow) {
                AS_LOCK_EXIT(as);
        }
        if (first && res == FC_NOMAP) {
                /*
                 * If the address is not mapped yet, we call as_fault to
                 * fault the pages in.  We could have fallen back to copyin()
                 * and let it fault in the pages, but for a mapped file we
                 * normally reference each page only once.  For zero-copy to
                 * be of any use, we'd better fault the page in now and try
                 * again.
                 */
                first = 0;
                size = size << PAGESHIFT;
                uaddr += size;
                total -= size;
                size = total;
                res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
                if (cow)
                        AS_LOCK_ENTER(as, RW_WRITER);
                goto tryagain;
        }
        switch (res) {
        case FC_NOSUPPORT:
                return (ENOTSUP);
        case FC_PROT:   /* Pretend we don't know about it.  This will be */
                        /* caught by the caller when uiomove fails. */
        case FC_NOMAP:
        case FC_OBJERR:
        default:
                return (0);
        }
}
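
/*
 * Illustrative sketch (not part of the original file): the calling pattern
 * for cow_mapin() as described in the block comment above.  The caller
 * supplies per-page arrays for the cached kernel mappings and the anon
 * references, checks how many bytes were actually locked, and falls back
 * to copyin() for the remainder.  The xx_ name and single-page request are
 * hypothetical; a real caller must also undo the COW state and unlock the
 * pages when done (see kcfree()).
 */
#if 0
static int
xx_borrow_user_page(struct as *as, caddr_t uaddr, caddr_t kaddr)
{
        struct page *cached_pp = NULL;
        struct anon *ap = NULL;
        size_t len = PAGESIZE;
        int error;

        /* Lock one user page with COW protection and map it at kaddr. */
        error = cow_mapin(as, uaddr, kaddr, &cached_pp, &ap, &len, 1);
        if (error != 0)
                return (error);
        if (len < PAGESIZE) {
                /*
                 * Only part of the range could be locked and mapped;
                 * a real caller would fall back to copyin() here.
                 */
                return (EAGAIN);
        }
        /* Use the data through kaddr; later undo the COW setup (cf. kcfree). */
        return (0);
}
#endif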