NEX-5164 backport illumos 6514 AS_* lock macros simplification
Reviewed by: Kevin Crowe <kevin.crowe@nexenta.com>
6514 AS_* lock macros simplification
Reviewed by: Piotr Jasiukajtis <estibi@me.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Albert Lee <trisk@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>
re #13613 rb4516 Tunables needs volatile keyword
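For context on the backported change: illumos 6514 drops the redundant rwlock argument from the AS_* lock macros, so callers name only the address space (plus the lock type for AS_LOCK_ENTER). The AS_LOCK_ENTER/AS_LOCK_EXIT calls in cow_mapin() below already use the new form. A minimal before/after sketch, assuming the pre-6514 macros took an explicit pointer to as->a_lock:

	/* pre-6514: the caller had to name as->a_lock explicitly */
	AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
	/* ... critical section ... */
	AS_LOCK_EXIT(as, &as->a_lock);

	/* post-6514: the macros derive the lock from the as itself */
	AS_LOCK_ENTER(as, RW_WRITER);
	/* ... critical section ... */
	AS_LOCK_EXIT(as);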
--- old/usr/src/uts/common/os/vm_subr.c
+++ new/usr/src/uts/common/os/vm_subr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 */
24 25
25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
26 27 /* All Rights Reserved */
27 28
28 29 /*
29 30 * University Copyright- Copyright (c) 1982, 1986, 1988
30 31 * The Regents of the University of California
31 32 * All Rights Reserved
32 33 *
33 34 * University Acknowledgment- Portions of this document are derived from
34 35 * software developed by the University of California, Berkeley, and its
35 36 * contributors.
36 37 */
37 38
38 39 #include <sys/types.h>
39 40 #include <sys/t_lock.h>
40 41 #include <sys/param.h>
41 42 #include <sys/errno.h>
42 43 #include <sys/debug.h>
43 44 #include <sys/cmn_err.h>
44 45 #include <sys/kmem.h>
45 46 #include <sys/sysmacros.h>
46 47 #include <sys/inline.h>
47 48 #include <sys/buf.h>
48 49 #include <sys/uio.h>
49 50 #include <sys/user.h>
50 51 #include <sys/proc.h>
51 52 #include <sys/systm.h>
52 53 #include <sys/vmsystm.h>
53 54 #include <sys/cpuvar.h>
54 55 #include <sys/mman.h>
55 56 #include <sys/cred.h>
56 57 #include <sys/vnode.h>
57 58 #include <sys/file.h>
58 59 #include <sys/vm.h>
59 60
60 61 #include <sys/swap.h>
61 62 #include <sys/vtrace.h>
62 63 #include <sys/tnf_probe.h>
63 64 #include <sys/fs/snode.h>
64 65 #include <sys/copyops.h>
65 66 #include <sys/conf.h>
66 67 #include <sys/sdt.h>
67 68
68 69 #include <vm/anon.h>
69 70 #include <vm/hat.h>
70 71 #include <vm/as.h>
71 72 #include <vm/seg.h>
72 73 #include <vm/page.h>
73 74 #include <vm/seg_vn.h>
74 75 #include <vm/seg_kmem.h>
75 76
76 -extern int maxphys;
77 +#include <sys/sunddi.h>
77 78
78 79 void
79 80 minphys(struct buf *bp)
80 81 {
81 82 if (bp->b_bcount > maxphys)
82 83 bp->b_bcount = maxphys;
83 84 }
84 85
85 86 /*
86 87 * use kmem_cache_create for physio buffers. This has shown
87 88 * a better cache distribution compared to buffers on the
88 89 * stack. It also avoids semaphore construction/deconstruction
89 90 * per request
90 91 */
91 92
92 93 static struct kmem_cache *physio_buf_cache;
93 94
94 95 /* ARGSUSED */
95 96 static int
96 97 physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
97 98 {
98 99 bioinit((struct buf *)buf);
99 100 return (0);
100 101 }
101 102
102 103 /* ARGSUSED */
103 104 static void
104 105 physio_buf_destructor(void *buf, void *cdrarg)
105 106 {
106 107 biofini((struct buf *)buf);
107 108 }
108 109
109 110 void
110 111 physio_bufs_init(void)
111 112 {
112 113 physio_buf_cache = kmem_cache_create("physio_buf_cache",
113 114 sizeof (struct buf), 0, physio_buf_constructor,
114 115 physio_buf_destructor, NULL, NULL, NULL, 0);
115 116 }
116 117
117 118
118 119
119 120 /*
120 121 * initiate raw I/O request
121 122 *
122 123 * allocate buf header if necessary
123 124 * adjust max size of each I/O request
124 125 * lock down user pages and verify access protections
125 126 * call driver's strategy routine to submit request
126 127 * wait for I/O completion
127 128 * unlock user pages and free allocated buf header
128 129 */
129 130
130 131 int
131 132 default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
132 133 int rw, void (*mincnt)(struct buf *), struct uio *uio)
133 134 {
134 135 struct iovec *iov;
135 136 struct proc *procp;
136 137 struct as *asp;
137 138 ssize_t c;
138 139 char *a;
139 140 int error = 0;
140 141 page_t **pplist;
141 142 int allocbuf = 0;
142 143
143 144 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);
144 145
145 146 /* Kernel probe */
146 147 TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
147 148 tnf_device, device, dev,
148 149 tnf_offset, offset, uio->uio_loffset,
149 150 tnf_size, size, uio->uio_resid,
150 151 tnf_bioflags, rw, rw);
151 152
152 153 if (rw == B_READ) {
153 154 CPU_STATS_ADD_K(sys, phread, 1);
154 155 } else {
155 156 CPU_STATS_ADD_K(sys, phwrite, 1);
156 157 }
157 158
158 159 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
159 160 "getbuf_start: bp %p", bp);
160 161
161 162 if (bp == NULL) {
162 163 bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
163 164 bp->b_iodone = NULL;
164 165 bp->b_resid = 0;
165 166 allocbuf = 1;
166 167 }
167 168 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);
168 169
169 170 if (uio->uio_segflg == UIO_USERSPACE) {
170 171 procp = ttoproc(curthread);
171 172 asp = procp->p_as;
172 173 } else {
173 174 procp = NULL;
174 175 asp = &kas;
175 176 }
176 177 ASSERT(SEMA_HELD(&bp->b_sem));
177 178
178 179 /*
179 180 * We need to prepare this buffer for the io:::start probe, including
180 181 * NULL'ing out the file, clearing the offset, and filling in the
181 182 * b_dip field.
182 183 */
183 184 bp->b_file = NULL;
184 185 bp->b_offset = -1;
185 186
186 187 if (dev != NODEV) {
187 188 (void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
188 189 DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
189 190 } else {
190 191 bp->b_dip = NULL;
191 192 }
192 193
193 194 while (uio->uio_iovcnt > 0) {
194 195 iov = uio->uio_iov;
195 196
196 197 bp->b_error = 0;
197 198 bp->b_proc = procp;
198 199
199 200 while (iov->iov_len > 0) {
200 201 if (uio->uio_resid == 0)
201 202 break;
202 203 if (uio->uio_loffset < 0) {
203 204 error = EINVAL;
204 205 break;
205 206 }
206 207 #ifdef _ILP32
207 208 /*
208 209 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
209 210 * which represents the maximum size that can be
210 211 * supported by the IO subsystem.
211 212 * XXX this code assumes a D_64BIT driver.
212 213 */
213 214 if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
214 215 error = EINVAL;
215 216 break;
216 217 }
217 218 #endif /* _ILP32 */
218 219 bp->b_flags = B_BUSY | B_PHYS | rw;
219 220 bp->b_edev = dev;
220 221 bp->b_lblkno = btodt(uio->uio_loffset);
221 222
222 223 /*
223 224 * Don't count on b_addr remaining untouched by the
224 225 * code below (it may be reset because someone does
225 226 * a bp_mapin on the buffer) -- reset from the iov
226 227 * each time through, updating the iov's base address
227 228 * instead.
228 229 */
229 230 a = bp->b_un.b_addr = iov->iov_base;
230 231 bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
231 232 (*mincnt)(bp);
232 233 c = bp->b_bcount;
233 234
234 235 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
235 236 "as_pagelock_start: bp %p", bp);
236 237
237 238 error = as_pagelock(asp, &pplist, a,
238 239 c, rw == B_READ? S_WRITE : S_READ);
239 240
240 241 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
241 242 "as_pagelock_end:");
242 243
243 244 if (error != 0) {
244 245 bp->b_flags |= B_ERROR;
245 246 bp->b_error = error;
246 247 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
247 248 break;
248 249 }
249 250 bp->b_shadow = pplist;
250 251 if (pplist != NULL) {
251 252 bp->b_flags |= B_SHADOW;
252 253 }
253 254
254 255 DTRACE_IO1(start, struct buf *, bp);
255 256 bp->b_flags |= B_STARTED;
256 257
257 258 (void) (*strat)(bp);
258 259 error = biowait(bp);
259 260
260 261 /*
261 262 * unlock the pages
262 263 */
263 264 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
264 265 "as_pageunlock_start: bp %p", bp);
265 266
266 267 as_pageunlock(asp, pplist, a, c,
267 268 rw == B_READ? S_WRITE : S_READ);
268 269
269 270 TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
270 271 "as_pageunlock_end:");
271 272
272 273 c -= bp->b_resid;
273 274 iov->iov_base += c;
274 275 iov->iov_len -= c;
275 276 uio->uio_resid -= c;
276 277 uio->uio_loffset += c;
277 278 /* bp->b_resid - temp kludge for tape drives */
278 279 if (bp->b_resid || error)
279 280 break;
280 281 }
281 282 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
282 283 /* bp->b_resid - temp kludge for tape drives */
283 284 if (bp->b_resid || error)
284 285 break;
285 286 uio->uio_iov++;
286 287 uio->uio_iovcnt--;
287 288 }
288 289
289 290 if (allocbuf) {
290 291 kmem_cache_free(physio_buf_cache, bp);
291 292 }
292 293
293 294 /* Kernel probe */
294 295 TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
295 296 tnf_device, device, dev);
296 297
297 298 TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);
298 299
299 300 return (error);
300 301 }
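As an aside on how this path is reached: a character driver's read(9E) or write(9E) entry point normally calls physio(9F), which resolves to default_physio() above with the driver's strategy routine and, typically, minphys() as the mincnt callback. A minimal sketch with a hypothetical driver — xx_read() and xx_strategy() are placeholder names, not part of this file:

	#include <sys/types.h>
	#include <sys/buf.h>
	#include <sys/uio.h>
	#include <sys/cred.h>
	#include <sys/ddi.h>
	#include <sys/sunddi.h>

	static int xx_strategy(struct buf *bp);	/* driver strategy(9E) routine */

	/* ARGSUSED */
	static int
	xx_read(dev_t dev, struct uio *uiop, cred_t *credp)
	{
		/*
		 * Passing bp == NULL lets default_physio() allocate a buf
		 * header from physio_buf_cache; minphys() clamps each
		 * transfer to maxphys.
		 */
		return (physio(xx_strategy, NULL, dev, B_READ, minphys, uiop));
	}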
301 302
302 303 /*
303 304 * Returns 0 on success, or an error on failure.
304 305 *
305 306 * This function is no longer a part of the DDI/DKI.
306 307 * However, for compatibility, its interface should not
307 308 * be changed and it should not be removed from the kernel.
308 309 */
309 310 int
310 311 useracc(void *addr, size_t count, int access)
311 312 {
312 313 uint_t prot;
313 314
314 315 prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
315 316 return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
316 317 }
317 318
318 319 #define MAX_MAPIN_PAGES 8
319 320
320 321 /*
321 322 * This function temporarily "borrows" user pages for kernel use. If
322 323 * "cow" is on, it also sets up copy-on-write protection (only feasible
323 324 * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
324 325 * pages from any changes by the user. The caller is responsible for
325 326 * unlocking and tearing down cow settings when it's done with the pages.
326 327 * For an example, see kcfree().
327 328 *
328 329 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
329 330 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
330 331 * kaddr != -1. On entering this function, cached_ppp contains a list
331 332 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
332 333 * previous call). Thus if the same pages remain behind [uaddr..uaddr+*lenp],
333 334 * the kernel map won't need to be reloaded again.
334 335 *
335 336 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
336 337 * reference count, and changes the user-mapping to read-only. This
337 338 * scheme should work on all types of segment drivers. But to be safe,
338 339 * we check against segvn here.
339 340 *
340 341 * Since this function is used to emulate copyin() semantics, it checks
341 342 * to make sure the user-mappings allow "user-read".
342 343 *
343 344 * On exit "lenp" contains the number of bytes successfully locked and
344 345 * mapped in. For the unsuccessful ones, the caller can fall back to
345 346 * copyin().
346 347 *
347 348 * Error return:
348 349 * ENOTSUP - operation like this is not supported either on this segment
349 350 * type, or on this platform type.
350 351 */
351 352 int
352 353 cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
353 354 struct anon **app, size_t *lenp, int cow)
354 355 {
355 356 struct hat *hat;
356 357 struct seg *seg;
357 358 caddr_t base;
358 359 page_t *pp, *ppp[MAX_MAPIN_PAGES];
359 360 long i;
360 361 int flags;
361 362 size_t size, total = *lenp;
362 363 char first = 1;
363 364 faultcode_t res;
364 365
365 366 *lenp = 0;
366 367 if (cow) {
367 368 AS_LOCK_ENTER(as, RW_WRITER);
368 369 seg = as_findseg(as, uaddr, 0);
369 370 if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
370 371 (uaddr + total) > base + seg->s_size) {
371 372 AS_LOCK_EXIT(as);
372 373 return (EINVAL);
373 374 }
374 375 /*
375 376 * The COW scheme should work for all segment types.
376 377 * But to be safe, we check against segvn.
377 378 */
378 379 if (seg->s_ops != &segvn_ops) {
379 380 AS_LOCK_EXIT(as);
380 381 return (ENOTSUP);
381 382 } else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
382 383 AS_LOCK_EXIT(as);
383 384 return (ENOTSUP);
384 385 }
385 386 }
386 387 hat = as->a_hat;
387 388 size = total;
388 389 tryagain:
389 390 /*
390 391 * If (cow), hat_softlock will also change the usr protection to RO.
391 392 * This is the first step toward setting up cow. Before we
392 393 * bump up an_refcnt, we can't allow any cow-fault on this
393 394 * address. Otherwise segvn_fault will change the protection back
394 395 * to RW upon seeing an_refcnt == 1.
395 396 * The solution is to hold the writer lock on "as".
396 397 */
397 398 res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
398 399 size = total - size;
399 400 *lenp += size;
400 401 size = size >> PAGESHIFT;
401 402 i = 0;
402 403 while (i < size) {
403 404 pp = ppp[i];
404 405 if (cow) {
405 406 kmutex_t *ahm;
406 407 /*
407 408 * Another solution is to hold SE_EXCL on pp, and
408 409 * disable PROT_WRITE. This also works for MAP_SHARED
409 410 * segment. The disadvantage is that it locks the
410 411 * page from being used by anybody else.
411 412 */
412 413 ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
413 414 mutex_enter(ahm);
414 415 *app = swap_anon(pp->p_vnode, pp->p_offset);
415 416 /*
416 417 * Since we are holding the as lock, this avoids a
417 418 * potential race with anon_decref. (segvn_unmap and
418 419 * segvn_free need the as writer lock to do anon_free.)
419 420 */
420 421 if (*app != NULL) {
421 422 #if 0
422 423 if ((*app)->an_refcnt == 0)
423 424 /*
424 425 * Consider the following scenario (unlikely
425 426 * though):
426 427 * 1. an_refcnt == 2
427 428 * 2. we softlock the page.
428 429 * 3. cow occurs on this addr. So a new ap,
429 430 * page and mapping is established on addr.
430 431 * 4. an_refcnt drops to 1 (segvn_faultpage
431 432 * -> anon_decref(oldap))
432 433 * 5. the last ref to ap also drops (from
433 434 * another as). It ends up blocked inside
434 435 * anon_decref trying to get page's excl lock.
435 436 * 6. Later kcfree unlocks the page, call
436 437 * anon_decref -> oops, ap is gone already.
437 438 *
438 439 * Holding as writer lock solves all problems.
439 440 */
440 441 *app = NULL;
441 442 else
442 443 #endif
443 444 (*app)->an_refcnt++;
444 445 }
445 446 mutex_exit(ahm);
446 447 } else {
447 448 *app = NULL;
448 449 }
449 450 if (kaddr != (caddr_t)-1) {
450 451 if (pp != *cached_ppp) {
451 452 if (*cached_ppp == NULL)
452 453 flags = HAT_LOAD_LOCK | HAT_NOSYNC |
453 454 HAT_LOAD_NOCONSIST;
454 455 else
455 456 flags = HAT_LOAD_REMAP |
456 457 HAT_LOAD_NOCONSIST;
457 458 /*
458 459 * In order to cache the kernel mapping after
459 460 * the user page is unlocked, we call
460 461 * hat_devload instead of hat_memload so
461 462 * that the kernel mapping we set up here is
462 463 * "invisible" to the rest of the world. This
463 464 * is not very pretty. But as long as the
464 465 * caller bears the responsibility of keeping
465 466 * cache consistency, we should be ok -
466 467 * HAT_NOCONSIST will get us an uncached
467 468 * mapping on VAC. hat_softlock will flush
468 469 * a VAC_WRITEBACK cache. Therefore the kaddr
469 470 * doesn't have to be of the same vcolor as
470 471 * uaddr.
471 472 * The alternative is - change hat_devload
472 473 * to get a cached mapping. Allocate a kaddr
473 474 * with the same vcolor as uaddr. Then
474 475 * hat_softlock won't need to flush the VAC.
475 476 */
476 477 hat_devload(kas.a_hat, kaddr, PAGESIZE,
477 478 page_pptonum(pp), PROT_READ, flags);
478 479 *cached_ppp = pp;
479 480 }
480 481 kaddr += PAGESIZE;
481 482 }
482 483 cached_ppp++;
483 484 app++;
484 485 ++i;
485 486 }
486 487 if (cow) {
487 488 AS_LOCK_EXIT(as);
488 489 }
489 490 if (first && res == FC_NOMAP) {
490 491 /*
491 492 * If the address is not mapped yet, we call as_fault to
492 493 * fault the pages in. We could've fallen back to copy and
493 494 * let it fault in the pages. But for a mapped file, we
494 495 * normally reference each page only once. For zero-copy to
495 496 * be of any use, we'd better fault in the page now and try
496 497 * again.
497 498 */
498 499 first = 0;
499 500 size = size << PAGESHIFT;
500 501 uaddr += size;
501 502 total -= size;
502 503 size = total;
503 504 res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
504 505 if (cow)
505 506 AS_LOCK_ENTER(as, RW_WRITER);
506 507 goto tryagain;
507 508 }
508 509 switch (res) {
509 510 case FC_NOSUPPORT:
510 511 return (ENOTSUP);
511 512 case FC_PROT: /* Pretend we don't know about it. This will be */
512 513 /* caught by the caller when uiomove fails. */
513 514 case FC_NOMAP:
514 515 case FC_OBJERR:
515 516 default:
516 517 return (0);
517 518 }
518 519 }
(remaining 432 lines of the file elided)
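Finally, the cow_mapin() contract described in the block comment above is easiest to see from a caller's perspective. The sketch below is hypothetical (xx_borrow_user_pages() is not part of this file) and shows only the lock-and-map half; the unlock and copy-on-write teardown remain the caller's responsibility, for which the comment points at kcfree():

	#include <sys/types.h>
	#include <sys/proc.h>
	#include <vm/as.h>
	#include <vm/anon.h>
	#include <vm/page.h>

	extern int cow_mapin(struct as *, caddr_t, caddr_t, struct page **,
	    struct anon **, size_t *, int);

	/*
	 * Hypothetical caller: borrow *lenp bytes of the current process's
	 * pages behind uaddr and map them at the kernel range kaddr, with
	 * COW protection so the borrowed pages cannot change underneath
	 * the kernel.  cached_ppp and app need one entry per page of *lenp.
	 */
	static int
	xx_borrow_user_pages(caddr_t uaddr, caddr_t kaddr,
	    struct page **cached_ppp, struct anon **app, size_t *lenp)
	{
		/*
		 * On return *lenp is the number of bytes actually locked
		 * and mapped; the caller falls back to copyin() for the
		 * remainder, keeps cached_ppp so an unchanged kernel
		 * mapping can be reused on the next call, and must
		 * eventually drop the anon references and undo the COW
		 * protection.
		 */
		return (cow_mapin(curproc->p_as, uaddr, kaddr, cached_ppp,
		    app, lenp, 1));
	}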