/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

#include <sys/sunddi.h>

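/*
 * Default mincnt routine for raw I/O: clamp a request to the system-wide
 * maximum transfer size (maxphys). Drivers pass this (or a routine of
 * their own) as the mincnt callback to physio(9F).
 */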
void
minphys(struct buf *bp)
{
        if (bp->b_bcount > maxphys)
                bp->b_bcount = maxphys;
}

/*
 * Use kmem_cache_create for physio buffers. This has shown a better
 * cache distribution compared to buffers on the stack. It also avoids
 * semaphore construction/destruction per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
        bioinit((struct buf *)buf);
        return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
        biofini((struct buf *)buf);
}

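/*
 * Create the cache used for physio buf headers; called once during
 * system initialization.
 */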
void
physio_bufs_init(void)
{
        physio_buf_cache = kmem_cache_create("physio_buf_cache",
            sizeof (struct buf), 0, physio_buf_constructor,
            physio_buf_destructor, NULL, NULL, NULL, 0);
}

/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */

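/*
 * Illustrative use only (not part of this file): a character driver's
 * read(9E) or write(9E) entry point normally reaches this code through
 * physio(9F), passing a NULL buf pointer so that a header is allocated
 * from the cache above. The xx_* names below are hypothetical:
 *
 *	static int
 *	xx_read(dev_t dev, struct uio *uio, cred_t *cr)
 *	{
 *		return (physio(xx_strategy, NULL, dev, B_READ,
 *		    xx_minphys, uio));
 *	}
 */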
int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
    int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
        struct iovec *iov;
        struct proc *procp;
        struct as *asp;
        ssize_t c;
        char *a;
        int error = 0;
        page_t **pplist;
        int allocbuf = 0;

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

        /* Kernel probe */
        TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
            tnf_device,         device,         dev,
            tnf_offset,         offset,         uio->uio_loffset,
            tnf_size,           size,           uio->uio_resid,
            tnf_bioflags,       rw,             rw);

        if (rw == B_READ) {
                CPU_STATS_ADD_K(sys, phread, 1);
        } else {
                CPU_STATS_ADD_K(sys, phwrite, 1);
        }

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
            "getbuf_start: bp %p", bp);

        if (bp == NULL) {
                bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
                bp->b_iodone = NULL;
                bp->b_resid = 0;
                allocbuf = 1;
        }
        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

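        /*
         * Raw I/O may be directed at either a user address space or the
         * kernel's own (kas), depending on the uio segment flag.
         */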
        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                asp = procp->p_as;
        } else {
                procp = NULL;
                asp = &kas;
        }
        ASSERT(SEMA_HELD(&bp->b_sem));

        /*
         * We need to prepare this buffer for the io:::start probe, including
         * NULL'ing out the file, clearing the offset, and filling in the
         * b_dip field.
         */
        bp->b_file = NULL;
        bp->b_offset = -1;

        if (dev != NODEV) {
                (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
                    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
        } else {
                bp->b_dip = NULL;
        }

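        /*
         * Process the request one iovec at a time; each iovec is further
         * split into chunks no larger than the limit imposed by mincnt.
         */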
        while (uio->uio_iovcnt > 0) {
                iov = uio->uio_iov;

                bp->b_error = 0;
                bp->b_proc = procp;

                while (iov->iov_len > 0) {
                        if (uio->uio_resid == 0)
                                break;
                        if (uio->uio_loffset < 0) {
                                error = EINVAL;
                                break;
                        }
#ifdef  _ILP32
                        /*
                         * For 32-bit kernels, check against SPEC_MAXOFFSET_T
                         * which represents the maximum size that can be
                         * supported by the IO subsystem.
                         * XXX this code assumes a D_64BIT driver.
                         */
                        if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
                                error = EINVAL;
                                break;
                        }
#endif  /* _ILP32 */
                        bp->b_flags = B_BUSY | B_PHYS | rw;
                        bp->b_edev = dev;
                        bp->b_lblkno = btodt(uio->uio_loffset);

                        /*
                         * Don't count on b_addr remaining untouched by the
                         * code below (it may be reset because someone does
                         * a bp_mapin on the buffer) -- reset from the iov
                         * each time through, updating the iov's base address
                         * instead.
                         */
                        a = bp->b_un.b_addr = iov->iov_base;
                        bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
                        (*mincnt)(bp);
                        c = bp->b_bcount;

                        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
                            "as_pagelock_start: bp %p", bp);

                        error = as_pagelock(asp, &pplist, a,
                            c, rw == B_READ ? S_WRITE : S_READ);

                        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
                            "as_pagelock_end:");

                        if (error != 0) {
                                bp->b_flags |= B_ERROR;
                                bp->b_error = error;
                                bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
                                break;
                        }
                        bp->b_shadow = pplist;
                        if (pplist != NULL) {
                                bp->b_flags |= B_SHADOW;
                        }

                        DTRACE_IO1(start, struct buf *, bp);
                        bp->b_flags |= B_STARTED;

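                        /*
                         * Hand the buffer to the driver's strategy routine
                         * and wait for the transfer to complete.
                         */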
                        (void) (*strat)(bp);
                        error = biowait(bp);

                        /*
                         * unlock the pages
                         */
                        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
                            "as_pageunlock_start: bp %p", bp);

                        as_pageunlock(asp, pplist, a, c,
                            rw == B_READ ? S_WRITE : S_READ);

                        TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
                            "as_pageunlock_end:");

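                        /*
                         * Advance the iovec and the uio by the number of
                         * bytes actually transferred (b_bcount minus the
                         * residual left by the driver).
                         */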
                        c -= bp->b_resid;
                        iov->iov_base += c;
                        iov->iov_len -= c;
                        uio->uio_resid -= c;
                        uio->uio_loffset += c;
                        /* bp->b_resid - temp kludge for tape drives */
                        if (bp->b_resid || error)
                                break;
                }
                bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
                /* bp->b_resid - temp kludge for tape drives */
                if (bp->b_resid || error)
                        break;
                uio->uio_iov++;
                uio->uio_iovcnt--;
        }

        if (allocbuf) {
                kmem_cache_free(physio_buf_cache, bp);
        }

        /* Kernel probe */
        TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
            tnf_device,         device,         dev);

        TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

        return (error);
}

/*
 * Returns non-zero if the user address range can be accessed with the
 * requested protection, and 0 if it cannot.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
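/*
 * Illustrative (hypothetical) caller pattern, for reference only:
 *
 *	if (!useracc(uaddr, len, B_WRITE))
 *		return (EFAULT);
 */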
int
useracc(void *addr, size_t count, int access)
{
        uint_t prot;

        prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
        return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}

#define MAX_MAPIN_PAGES 8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on a MAP_PRIVATE segment) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down the cow settings when it's done with the
 * pages. For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are already mapped into [kaddr..kaddr+*lenp] (from a
 * previous call). Thus if the same pages remain behind [uaddr..uaddr+*lenp],
 * the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - an operation like this is not supported on this segment
 * type or on this platform.
 */
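/*
 * Sketch of the intended call pattern (for orientation only; the actual
 * consumer and cleanup code, e.g. kcfree(), live elsewhere): the caller
 * invokes cow_mapin() to lock and optionally map the pages, uses the
 * kernel mapping, and later drops any anon references taken here, unloads
 * its kernel mappings, and unlocks the user pages.
 */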
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
        struct          hat *hat;
        struct seg      *seg;
        caddr_t         base;
        page_t          *pp, *ppp[MAX_MAPIN_PAGES];
        long            i;
        int             flags;
        size_t          size, total = *lenp;
        char            first = 1;
        faultcode_t     res;

        *lenp = 0;
        if (cow) {
                AS_LOCK_ENTER(as, RW_WRITER);
                seg = as_findseg(as, uaddr, 0);
                if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
                    (uaddr + total) > base + seg->s_size) {
                        AS_LOCK_EXIT(as);
                        return (EINVAL);
                }
                /*
                 * The COW scheme should work for all segment types.
                 * But to be safe, we check against segvn.
                 */
                if (seg->s_ops != &segvn_ops) {
                        AS_LOCK_EXIT(as);
                        return (ENOTSUP);
                } else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
                        AS_LOCK_EXIT(as);
                        return (ENOTSUP);
                }
        }
        hat = as->a_hat;
        size = total;
tryagain:
        /*
         * If (cow), hat_softlock will also change the user protection to RO.
         * This is the first step toward setting up cow. Before we
         * bump up an_refcnt, we can't allow any cow-fault on this
         * address. Otherwise segvn_fault will change the protection back
         * to RW upon seeing an_refcnt == 1.
         * The solution is to hold the writer lock on "as".
         */
        res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
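        /*
         * On return hat_softlock() leaves in "size" the number of bytes it
         * could not lock; convert that into the number of bytes (and then
         * pages) that were locked on this pass.
         */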
        size = total - size;
        *lenp += size;
        size = size >> PAGESHIFT;
        i = 0;
        while (i < size) {
                pp = ppp[i];
                if (cow) {
                        kmutex_t *ahm;
                        /*
                         * Another solution is to hold SE_EXCL on pp, and
                         * disable PROT_WRITE. This also works for MAP_SHARED
                         * segments. The disadvantage is that it locks the
                         * page from being used by anybody else.
                         */
                        ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
                        mutex_enter(ahm);
                        *app = swap_anon(pp->p_vnode, pp->p_offset);
                        /*
                         * Since we are holding the as lock, this avoids a
                         * potential race with anon_decref. (segvn_unmap and
                         * segvn_free need the as writer lock to do anon_free.)
                         */
                        if (*app != NULL) {
#if 0
                                if ((*app)->an_refcnt == 0)
                                /*
                                 * Consider the following scenario (unlikely
                                 * though):
                                 * 1. an_refcnt == 2
                                 * 2. we softlock the page.
                                 * 3. cow occurs on this addr. So a new ap,
                                 * page and mapping is established on addr.
                                 * 4. an_refcnt drops to 1 (segvn_faultpage
                                 * -> anon_decref(oldap))
                                 * 5. the last ref to ap also drops (from
                                 * another as). It ends up blocked inside
                                 * anon_decref trying to get page's excl lock.
                                 * 6. Later kcfree unlocks the page, calls
                                 * anon_decref -> oops, ap is gone already.
                                 *
                                 * Holding the as writer lock solves all
                                 * these problems.
                                 */
                                        *app = NULL;
                                else
#endif
                                        (*app)->an_refcnt++;
                        }
                        mutex_exit(ahm);
                } else {
                        *app = NULL;
                }
                if (kaddr != (caddr_t)-1) {
                        if (pp != *cached_ppp) {
                                if (*cached_ppp == NULL)
                                        flags = HAT_LOAD_LOCK | HAT_NOSYNC |
                                            HAT_LOAD_NOCONSIST;
                                else
                                        flags = HAT_LOAD_REMAP |
                                            HAT_LOAD_NOCONSIST;
                                /*
                                 * In order to cache the kernel mapping after
                                 * the user page is unlocked, we call
                                 * hat_devload instead of hat_memload so
                                 * that the kernel mapping we set up here is
                                 * "invisible" to the rest of the world. This
                                 * is not very pretty. But as long as the
                                 * caller bears the responsibility of keeping
                                 * cache consistency, we should be ok -
                                 * HAT_LOAD_NOCONSIST will get us an uncached
                                 * mapping on VAC. hat_softlock will flush
                                 * a VAC_WRITEBACK cache. Therefore the kaddr
                                 * doesn't have to be of the same vcolor as
                                 * uaddr.
                                 * The alternative is to change hat_devload
                                 * to get a cached mapping. Allocate a kaddr
                                 * with the same vcolor as uaddr. Then
                                 * hat_softlock won't need to flush the VAC.
                                 */
                                hat_devload(kas.a_hat, kaddr, PAGESIZE,
                                    page_pptonum(pp), PROT_READ, flags);
                                *cached_ppp = pp;
                        }
                        kaddr += PAGESIZE;
                }
                cached_ppp++;
                app++;
                ++i;
        }
        if (cow) {
                AS_LOCK_EXIT(as);
        }
        if (first && res == FC_NOMAP) {
                /*
                 * If the address is not mapped yet, we call as_fault to
                 * fault the pages in. We could have fallen back to copyin()
                 * and let it fault the pages in. But for a mapped file, we
                 * normally reference each page only once. For zero-copy to
                 * be of any use, we had better fault the pages in now and
                 * try again.
                 */
                first = 0;
                size = size << PAGESHIFT;
                uaddr += size;
                total -= size;
                size = total;
                res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
                if (cow)
                        AS_LOCK_ENTER(as, RW_WRITER);
                goto tryagain;
        }
        switch (res) {
        case FC_NOSUPPORT:
                return (ENOTSUP);
        case FC_PROT:   /* Pretend we don't know about it. This will be */
                        /* caught by the caller when uiomove fails. */
        case FC_NOMAP:
        case FC_OBJERR:
        default:
                return (0);
        }
}