epoll Wdiff usr/src/uts/common/fs/ufs/ufs_vnops.c

Print this page

8634 epoll fails to wake on certain edge-triggered conditions
8635 epoll should not emit POLLNVAL
8636 recursive epoll should emit EPOLLRDNORM
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/ufs/ufs_vnops.c
          +++ new/usr/src/uts/common/fs/ufs/ufs_vnops.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each

↓ open down ↓

13 lines elided

↑ open up ↑

  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
  24      - * Copyright 2015, Joyent, Inc.
       24 + * Copyright 2017 Joyent, Inc.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   */
  27   27  
  28   28  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  29   29  /*        All Rights Reserved   */
  30   30  
  31   31  /*
  32   32   * Portions of this source code were derived from Berkeley 4.3 BSD
  33   33   * under license from the Regents of the University of California.
  34   34   */

  35   35  
  36   36  #include <sys/types.h>
  37   37  #include <sys/t_lock.h>
  38   38  #include <sys/ksynch.h>
  39   39  #include <sys/param.h>
  40   40  #include <sys/time.h>
  41   41  #include <sys/systm.h>
  42   42  #include <sys/sysmacros.h>
  43   43  #include <sys/resource.h>
  44   44  #include <sys/signal.h>
  45   45  #include <sys/cred.h>
  46   46  #include <sys/user.h>
  47   47  #include <sys/buf.h>
  48   48  #include <sys/vfs.h>
  49   49  #include <sys/vfs_opreg.h>
  50   50  #include <sys/vnode.h>
  51   51  #include <sys/proc.h>
  52   52  #include <sys/disp.h>
  53   53  #include <sys/file.h>
  54   54  #include <sys/fcntl.h>
  55   55  #include <sys/flock.h>
  56   56  #include <sys/atomic.h>
  57   57  #include <sys/kmem.h>
  58   58  #include <sys/uio.h>
  59   59  #include <sys/dnlc.h>
  60   60  #include <sys/conf.h>
  61   61  #include <sys/mman.h>
  62   62  #include <sys/pathname.h>
  63   63  #include <sys/debug.h>
  64   64  #include <sys/vmsystm.h>
  65   65  #include <sys/cmn_err.h>
  66   66  #include <sys/filio.h>
  67   67  #include <sys/policy.h>
  68   68  
  69   69  #include <sys/fs/ufs_fs.h>
  70   70  #include <sys/fs/ufs_lockfs.h>
  71   71  #include <sys/fs/ufs_filio.h>
  72   72  #include <sys/fs/ufs_inode.h>
  73   73  #include <sys/fs/ufs_fsdir.h>
  74   74  #include <sys/fs/ufs_quota.h>
  75   75  #include <sys/fs/ufs_log.h>
  76   76  #include <sys/fs/ufs_snap.h>
  77   77  #include <sys/fs/ufs_trans.h>
  78   78  #include <sys/fs/ufs_panic.h>
  79   79  #include <sys/fs/ufs_bio.h>
  80   80  #include <sys/dirent.h>         /* must be AFTER <sys/fs/fsdir.h>! */
  81   81  #include <sys/errno.h>
  82   82  #include <sys/fssnap_if.h>
  83   83  #include <sys/unistd.h>
  84   84  #include <sys/sunddi.h>
  85   85  
  86   86  #include <sys/filio.h>          /* _FIOIO */
  87   87  
  88   88  #include <vm/hat.h>
  89   89  #include <vm/page.h>
  90   90  #include <vm/pvn.h>
  91   91  #include <vm/as.h>
  92   92  #include <vm/seg.h>
  93   93  #include <vm/seg_map.h>
  94   94  #include <vm/seg_vn.h>
  95   95  #include <vm/seg_kmem.h>
  96   96  #include <vm/rm.h>
  97   97  #include <sys/swap.h>
  98   98  
  99   99  #include <fs/fs_subr.h>
 100  100  
 101  101  #include <sys/fs/decomp.h>
 102  102  
 103  103  static struct instats ins;
 104  104  
 105  105  static  int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
 106  106  static  int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
 107  107                  caddr_t, struct page **, size_t, enum seg_rw, int);
 108  108  static  int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
 109  109  static  int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
 110  110                  caller_context_t *);
 111  111  static  int ufs_read(struct vnode *, struct uio *, int, struct cred *,
 112  112                  struct caller_context *);
 113  113  static  int ufs_write(struct vnode *, struct uio *, int, struct cred *,
 114  114                  struct caller_context *);
 115  115  static  int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
 116  116                  int *, caller_context_t *);
 117  117  static  int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
 118  118                  caller_context_t *);
 119  119  static  int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
 120  120                  caller_context_t *);
 121  121  static  int ufs_access(struct vnode *, int, int, struct cred *,
 122  122                  caller_context_t *);
 123  123  static  int ufs_lookup(struct vnode *, char *, struct vnode **,
 124  124                  struct pathname *, int, struct vnode *, struct cred *,
 125  125                  caller_context_t *, int *, pathname_t *);
 126  126  static  int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
 127  127                  int, struct vnode **, struct cred *, int,
 128  128                  caller_context_t *, vsecattr_t  *);
 129  129  static  int ufs_remove(struct vnode *, char *, struct cred *,
 130  130                  caller_context_t *, int);
 131  131  static  int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
 132  132                  caller_context_t *, int);
 133  133  static  int ufs_rename(struct vnode *, char *, struct vnode *, char *,
 134  134                  struct cred *, caller_context_t *, int);
 135  135  static  int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
 136  136                  struct cred *, caller_context_t *, int, vsecattr_t *);
 137  137  static  int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
 138  138                  caller_context_t *, int);
 139  139  static  int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
 140  140                  caller_context_t *, int);
 141  141  static  int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
 142  142                  struct cred *, caller_context_t *, int);
 143  143  static  int ufs_readlink(struct vnode *, struct uio *, struct cred *,
 144  144                  caller_context_t *);
 145  145  static  int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
 146  146  static  void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
 147  147  static  int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
 148  148  static  int ufs_rwlock(struct vnode *, int, caller_context_t *);
 149  149  static  void ufs_rwunlock(struct vnode *, int, caller_context_t *);
 150  150  static  int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
 151  151  static  int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
 152  152                  struct flk_callback *, struct cred *,
 153  153                  caller_context_t *);
 154  154  static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
 155  155                  cred_t *, caller_context_t *);
 156  156  static  int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
 157  157                  struct page **, size_t, struct seg *, caddr_t,
 158  158                  enum seg_rw, struct cred *, caller_context_t *);
 159  159  static  int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
 160  160                  caller_context_t *);
 161  161  static  int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
 162  162  static  int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
 163  163                  uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 164  164  static  int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 165  165                  uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 166  166  static  int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 167  167                  uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
 168  168  static  int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
 169  169                  caller_context_t *);
 170  170  static  int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
 171  171      caller_context_t *);
 172  172  static  int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
 173  173                  caller_context_t *);
 174  174  static  int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
 175  175                  struct cred *, caller_context_t *);
 176  176  static  int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
 177  177  static  daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
 178  178                  daddr32_t *, int, int);
 179  179  static  int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 180  180                  caller_context_t *);
 181  181  static  int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 182  182                  caller_context_t *);
 183  183  static  int ufs_priv_access(void *, int, struct cred *);
 184  184  static  int ufs_eventlookup(struct vnode *, char *, struct cred *,
 185  185      struct vnode **);
 186  186  extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
 187  187  
 188  188  /*
 189  189   * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 190  190   *
 191  191   * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 192  192   */
 193  193  struct vnodeops *ufs_vnodeops;
 194  194  
 195  195  /* NOTE: "not blkd" below  means that the operation isn't blocked by lockfs */
           /*
            * Table pairing each VOPNAME_* operation name with the routine that
            * implements it for UFS.  Every entry points at a ufs_* function from
            * this file except VOPNAME_VNEVENT, which uses fs_vnevent_support.
            * The table is terminated by a NULL, NULL pair.
            *
            * Presumably this is the template from which ufs_vnodeops is built;
            * the code that consumes it is not in this part of the file.
            */
 196  196  const fs_operation_def_t ufs_vnodeops_template[] = {
 197  197          VOPNAME_OPEN,           { .vop_open = ufs_open },       /* not blkd */
 198  198          VOPNAME_CLOSE,          { .vop_close = ufs_close },     /* not blkd */
 199  199          VOPNAME_READ,           { .vop_read = ufs_read },
 200  200          VOPNAME_WRITE,          { .vop_write = ufs_write },
 201  201          VOPNAME_IOCTL,          { .vop_ioctl = ufs_ioctl },
 202  202          VOPNAME_GETATTR,        { .vop_getattr = ufs_getattr },
 203  203          VOPNAME_SETATTR,        { .vop_setattr = ufs_setattr },
 204  204          VOPNAME_ACCESS,         { .vop_access = ufs_access },
 205  205          VOPNAME_LOOKUP,         { .vop_lookup = ufs_lookup },
 206  206          VOPNAME_CREATE,         { .vop_create = ufs_create },
 207  207          VOPNAME_REMOVE,         { .vop_remove = ufs_remove },
 208  208          VOPNAME_LINK,           { .vop_link = ufs_link },
 209  209          VOPNAME_RENAME,         { .vop_rename = ufs_rename },
 210  210          VOPNAME_MKDIR,          { .vop_mkdir = ufs_mkdir },
 211  211          VOPNAME_RMDIR,          { .vop_rmdir = ufs_rmdir },
 212  212          VOPNAME_READDIR,        { .vop_readdir = ufs_readdir },
 213  213          VOPNAME_SYMLINK,        { .vop_symlink = ufs_symlink },
 214  214          VOPNAME_READLINK,       { .vop_readlink = ufs_readlink },
 215  215          VOPNAME_FSYNC,          { .vop_fsync = ufs_fsync },
 216  216          VOPNAME_INACTIVE,       { .vop_inactive = ufs_inactive }, /* not blkd */
 217  217          VOPNAME_FID,            { .vop_fid = ufs_fid },
 218  218          VOPNAME_RWLOCK,         { .vop_rwlock = ufs_rwlock },   /* not blkd */
 219  219          VOPNAME_RWUNLOCK,       { .vop_rwunlock = ufs_rwunlock }, /* not blkd */
 220  220          VOPNAME_SEEK,           { .vop_seek = ufs_seek },
 221  221          VOPNAME_FRLOCK,         { .vop_frlock = ufs_frlock },
 222  222          VOPNAME_SPACE,          { .vop_space = ufs_space },
 223  223          VOPNAME_GETPAGE,        { .vop_getpage = ufs_getpage },
 224  224          VOPNAME_PUTPAGE,        { .vop_putpage = ufs_putpage },
 225  225          VOPNAME_MAP,            { .vop_map = ufs_map },
 226  226          VOPNAME_ADDMAP,         { .vop_addmap = ufs_addmap },   /* not blkd */
 227  227          VOPNAME_DELMAP,         { .vop_delmap = ufs_delmap },   /* not blkd */
 228  228          VOPNAME_POLL,           { .vop_poll = ufs_poll },       /* not blkd */
 229  229          VOPNAME_DUMP,           { .vop_dump = ufs_dump },
 230  230          VOPNAME_PATHCONF,       { .vop_pathconf = ufs_l_pathconf },
 231  231          VOPNAME_PAGEIO,         { .vop_pageio = ufs_pageio },
 232  232          VOPNAME_DUMPCTL,        { .vop_dumpctl = ufs_dumpctl },
 233  233          VOPNAME_GETSECATTR,     { .vop_getsecattr = ufs_getsecattr },
 234  234          VOPNAME_SETSECATTR,     { .vop_setsecattr = ufs_setsecattr },
 235  235          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 236  236          NULL,                   NULL
 237  237  };
 238  238  
 239  239  #define MAX_BACKFILE_COUNT      9999
 240  240  
 241  241  /*
 242  242   * Created by ufs_dumpctl() to store a file's disk block info into memory.
 243  243   * Used by ufs_dump() to dump data to disk directly.
            *
            * dblk[] is declared with a single element as a placeholder; presumably
            * the structure is allocated with room for fsbs entries (confirm in
            * ufs_dumpctl()).
 244  244   */
 245  245  struct dump {
 246  246          struct inode    *ip;            /* the file we contain */
 247  247          daddr_t         fsbs;           /* number of blocks stored */
 248  248          struct timeval32 time;          /* time stamp for the struct */
 249  249          daddr32_t       dblk[1];        /* place holder for block info */
 250  250  };
 251  251  
           /* File-scope pointer to the current struct dump; initially NULL. */
 252  252  static struct dump *dump_info = NULL;
 253  253  
 254  254  /*
            * ufs_open - open entry point (.vop_open in ufs_vnodeops_template).
            * Takes no action and always returns 0.
            *
 255  255   * Previously there was no special action required for ordinary files.
 256  256   * (Devices are handled through the device file system.)
 257  257   * Now we support Large Files and Large File API requires open to
 258  258   * fail if file is large.
 259  259   * We could take care to prevent data corruption
 260  260   * by doing an atomic check of size and truncate if file is opened with
 261  261   * FTRUNC flag set but traditionally this is being done by the vfs/vnode
 262  262   * layers. So taking care of truncation here is a change in the existing
 263  263   * semantics of VOP_OPEN and therefore we chose not to implement anything
 264  264   * here. The check for the size of the file > 2GB is being done at the
 265  265   * vfs layer in routine vn_open().
 266  266   */
 267  267

↓ open down ↓

233 lines elided

↑ open up ↑

 268  268  /* ARGSUSED */
 269  269  static int
 270  270  ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
 271  271  {
 272  272          return (0);
 273  273  }
 274  274  
           /*
            * ufs_close - close entry point (.vop_close in ufs_vnodeops_template).
            *
            * Calls cleanlocks() and cleanshares() on vp for the pid of the current
            * process (presumably dropping that process's file locks and share
            * reservations on the vnode).  Then, if v_count is at most 2 and the
            * vnode is not VBAD, and the inode has a non-zero i_delaylen, the
            * delayed range [i_delayoff, i_delayoff + i_delaylen) is pushed with
            * ufs_putpages(B_ASYNC | B_FREE) and i_delaylen is reset to 0.
            *
            * Always returns 0; the result of ufs_putpages() is explicitly
            * discarded.
            */
 275  275  /*ARGSUSED*/
 276  276  static int
 277  277  ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
 278      -        struct cred *cr, caller_context_t *ct)
      278 +    struct cred *cr, caller_context_t *ct)
 279  279  {
 280  280          cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 281  281          cleanshares(vp, ttoproc(curthread)->p_pid);
 282  282  
 283  283          /*
 284  284           * Push partially filled cluster at last close.
 285  285           * ``last close'' is approximated because the dnlc
 286  286           * may have a hold on the vnode.
 287  287           * Checking for VBAD here will also act as a forced umount check.
 288  288           */

 289  289          if (vp->v_count <= 2 && vp->v_type != VBAD) {
 290  290                  struct inode *ip = VTOI(vp);
 291  291                  if (ip->i_delaylen) {
 292  292                          ins.in_poc.value.ul++;
 293  293                          (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
 294  294                              B_ASYNC | B_FREE, cr);

↓ open down ↓

6 lines elided

↑ open up ↑

 295  295                          ip->i_delaylen = 0;
 296  296                  }
 297  297          }
 298  298  
 299  299          return (0);
 300  300  }
 301  301  
           /*
            * ufs_read - read entry point (.vop_read in ufs_vnodeops_template).
            *
            * Expects ip->i_rwlock to be read-held on entry (asserted).  After the
            * mandatory-lock check and ufs_lockfs_begin(), the data is read with
            * rdip() while ip->i_contents is held as reader.  When the caller asked
            * for synchronized reads (FRSYNC together with FSYNC or FDSYNC),
            * ufs_lockfs_begin() returned a ulockfs, and TRANS_ISTRANS(ufsvfsp) is
            * true, a TOP_READ_SYNC transaction is also issued: around rdip() for
            * non-directories, and after rdip() with i_rwlock temporarily dropped
            * for directories.
            *
            * Returns 0 on success or a non-zero error value.
            */
 302  302  /*ARGSUSED*/
 303  303  static int
 304  304  ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
 305      -        struct caller_context *ct)
      305 +    struct caller_context *ct)
 306  306  {
 307  307          struct inode *ip = VTOI(vp);
 308  308          struct ufsvfs *ufsvfsp;
 309  309          struct ulockfs *ulp = NULL;
 310  310          int error = 0;
 311  311          int intrans = 0;
 312  312  
 313  313          ASSERT(RW_READ_HELD(&ip->i_rwlock));
 314  314  
 315  315          /*

 316  316           * Mandatory locking needs to be done before ufs_lockfs_begin()
 317  317           * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
 318  318           */
 319  319          if (MANDLOCK(vp, ip->i_mode)) {
 320  320                  /*
 321  321                   * ufs_getattr ends up being called by chklock
 322  322                   */
 323  323                  error = chklock(vp, FREAD, uiop->uio_loffset,
 324  324                      uiop->uio_resid, uiop->uio_fmode, ct);
 325  325                  if (error)
 326  326                          goto out;
 327  327          }
 328  328  
 329  329          ufsvfsp = ip->i_ufsvfs;
 330  330          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 331  331          if (error)
 332  332                  goto out;
 333  333  
 334  334          /*
 335  335           * In the case that a directory is opened for reading as a file
 336  336           * (eg "cat .") with the  O_RSYNC, O_SYNC and O_DSYNC flags set.
 337  337           * The locking order had to be changed to avoid a deadlock with
 338  338           * an update taking place on that directory at the same time.
 339  339           */
 340  340          if ((ip->i_mode & IFMT) == IFDIR) {
 341  341  
 342  342                  rw_enter(&ip->i_contents, RW_READER);
 343  343                  error = rdip(ip, uiop, ioflag, cr);
 344  344                  rw_exit(&ip->i_contents);
 345  345  
 346  346                  if (error) {
 347  347                          if (ulp)
 348  348                                  ufs_lockfs_end(ulp);
 349  349                          goto out;
 350  350                  }
 351  351  
 352  352                  if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 353  353                      TRANS_ISTRANS(ufsvfsp)) {
 354  354                          rw_exit(&ip->i_rwlock);
 355  355                          TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
 356  356                              error);
 357  357                          ASSERT(!error);
 358  358                          TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
 359  359                              TOP_READ_SIZE);
 360  360                          rw_enter(&ip->i_rwlock, RW_READER);
 361  361                  }
 362  362          } else {
 363  363                  /*
 364  364                   * Only transact reads to files opened for sync-read and
 365  365                   * sync-write on a file system that is not write locked.
 366  366                   *
 367  367                   * The ``not write locked'' check prevents problems with
 368  368                   * enabling/disabling logging on a busy file system.  E.g.,
 369  369                   * logging exists at the beginning of the read but does not
 370  370                   * at the end.
 371  371                   *
 372  372                   */
 373  373                  if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 374  374                      TRANS_ISTRANS(ufsvfsp)) {
 375  375                          TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
 376  376                              error);
 377  377                          ASSERT(!error);
 378  378                          intrans = 1;
 379  379                  }
 380  380  
 381  381                  rw_enter(&ip->i_contents, RW_READER);
 382  382                  error = rdip(ip, uiop, ioflag, cr);
 383  383                  rw_exit(&ip->i_contents);
 384  384  
 385  385                  if (intrans) {
 386  386                          TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
 387  387                              TOP_READ_SIZE);
 388  388                  }
 389  389          }
 390  390  
 391  391          if (ulp) {
 392  392                  ufs_lockfs_end(ulp);
 393  393          }
           /*
            * Every "goto out" above is taken either before ufs_lockfs_begin() has
            * succeeded or after ufs_lockfs_end() has been called, so there is
            * nothing left to release here.
            */
 394  394  out:
 395  395  
 396  396          return (error);
 397  397  }
 398  398  
           /*
            * Globals consulted by the write path below.  ufs_write() makes a writer
            * wait while ip->i_writes exceeds ufs_HW (only when ufs_WRITES is
            * non-zero) and increments ufs_throttles for each wait;
            * ufs_check_rewrite() reads ufs_allow_shared_writes.  ufs_LW is not
            * referenced in this part of the file.
            */
 399  399  extern  int     ufs_HW;         /* high water mark */
 400  400  extern  int     ufs_LW;         /* low water mark */
 401  401  int     ufs_WRITES = 1;         /* XXX - enable/disable */
 402  402  int     ufs_throttles = 0;      /* throttling count */
 403  403  int     ufs_allow_shared_writes = 1;    /* directio shared writes */
 404  404  
           /*
            * ufs_check_rewrite - decide whether a write request may be treated as a
            * concurrent rewrite of data that already exists in the file.
            *
            * Returns non-zero only if all of the following hold: ip is a regular
            * file; neither FAPPEND nor FSYNC is set in ioflag; the request is
            * non-empty, starts at a non-negative offset below i_size and ends at or
            * before i_size; bmap_has_holes(ip) is false; and either FDSYNC is set
            * in ioflag or ufs_allow_shared_writes is non-zero.  Returns 0
            * otherwise.
            */
 405  405  static int
 406  406  ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
 407  407  {
 408  408          int     shared_write;
 409  409  
 410  410          /*
 411  411           * If the FDSYNC flag is set then ignore the global
 412  412           * ufs_allow_shared_writes in this case.
 413  413           */
 414  414          shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
 415  415  
 416  416          /*
 417  417           * Filter to determine if this request is suitable as a
 418  418           * concurrent rewrite. This write must not allocate blocks
 419  419           * by extending the file or filling in holes. No use trying
 420  420           * through FSYNC descriptors as the inode will be synchronously
 421  421           * updated after the write. The uio structure has not yet been
 422  422           * checked for sanity, so assume nothing.
 423  423           */
 424  424          return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&

↓ open down ↓

109 lines elided

↑ open up ↑

 425  425              (uiop->uio_loffset >= (offset_t)0) &&
 426  426              (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
 427  427              ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
 428  428              !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
 429  429              shared_write);
 430  430  }
 431  431  
           /*
            * ufs_write - write entry point (.vop_write in ufs_vnodeops_template).
            *
            * Expects ip->i_rwlock to be held on entry (asserted).  In order, it:
            *   1. fails with EIO if the inode has no ufsvfs;
            *   2. performs the mandatory-lock check via chklock();
            *   3. tries the directio rewrite fast path when i_rwlock is not held
            *      as writer and ufs_check_rewrite() accepts the request;
            *   4. otherwise makes sure i_rwlock is held as writer, begins lockfs
            *      and (where needed) a sync or async transaction, and writes the
            *      data through TRANS_WRITE() or wrip().
            *
            * If the result is ENOSPC and TRANS_ISTRANS(ufsvfsp) is true, the whole
            * sequence is retried once after ufs_delete_drain_wait().  An ENOSPC
            * that follows a partial transfer is reported as success.
            *
            * Returns 0 on success or a non-zero error value.
            */
 432  432  /*ARGSUSED*/
 433  433  static int
 434  434  ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
 435      -        caller_context_t *ct)
      435 +    caller_context_t *ct)
 436  436  {
 437  437          struct inode *ip = VTOI(vp);
 438  438          struct ufsvfs *ufsvfsp;
 439  439          struct ulockfs *ulp;
 440  440          int retry = 1;
 441  441          int error, resv, resid = 0;
 442  442          int directio_status;
 443  443          int exclusive;
 444  444          int rewriteflg;
 445  445          long start_resid = uiop->uio_resid;

 446  446  
 447  447          ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
 448  448  
 449  449  retry_mandlock:
 450  450          /*
 451  451           * Mandatory locking needs to be done before ufs_lockfs_begin()
 452  452           * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
 453  453           * Check for forced unmounts normally done in ufs_lockfs_begin().
 454  454           */
 455  455          if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
 456  456                  error = EIO;
 457  457                  goto out;
 458  458          }
 459  459          if (MANDLOCK(vp, ip->i_mode)) {
 460  460  
 461  461                  ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
 462  462  
 463  463                  /*
 464  464                   * ufs_getattr ends up being called by chklock
 465  465                   */
 466  466                  error = chklock(vp, FWRITE, uiop->uio_loffset,
 467  467                      uiop->uio_resid, uiop->uio_fmode, ct);
 468  468                  if (error)
 469  469                          goto out;
 470  470          }
 471  471  
 472  472          /* i_rwlock can change in chklock */
 473  473          exclusive = rw_write_held(&ip->i_rwlock);
 474  474          rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
 475  475  
 476  476          /*
 477  477           * Check for fast-path special case of directio re-writes.
 478  478           */
 479  479          if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
 480  480              !exclusive && rewriteflg) {
 481  481  
 482  482                  error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 483  483                  if (error)
 484  484                          goto out;
 485  485  
 486  486                  rw_enter(&ip->i_contents, RW_READER);
 487  487                  error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
 488  488                      &directio_status);
 489  489                  if (directio_status == DIRECTIO_SUCCESS) {
 490  490                          uint_t i_flag_save;
 491  491  
                                   /*
                                    * If any data was transferred, report
                                    * success for the partial write.
                                    */
 492  492                          if (start_resid != uiop->uio_resid)
 493  493                                  error = 0;
 494  494                          /*
 495  495                           * Special treatment of access times for re-writes.
 496  496                           * If IMOD is not already set, then convert it
 497  497                           * to IMODACC for this operation. This defers
 498  498                           * entering a delta into the log until the inode
 499  499                           * is flushed. This mimics what is done for read
 500  500                           * operations and inode access time.
 501  501                           */
 502  502                          mutex_enter(&ip->i_tlock);
 503  503                          i_flag_save = ip->i_flag;
 504  504                          ip->i_flag |= IUPD | ICHG;
 505  505                          ip->i_seq++;
 506  506                          ITIMES_NOLOCK(ip);
 507  507                          if ((i_flag_save & IMOD) == 0) {
 508  508                                  ip->i_flag &= ~IMOD;
 509  509                                  ip->i_flag |= IMODACC;
 510  510                          }
 511  511                          mutex_exit(&ip->i_tlock);
 512  512                          rw_exit(&ip->i_contents);
 513  513                          if (ulp)
 514  514                                  ufs_lockfs_end(ulp);
 515  515                          goto out;
 516  516                  }
 517  517                  rw_exit(&ip->i_contents);
 518  518                  if (ulp)
 519  519                          ufs_lockfs_end(ulp);
 520  520          }
 521  521  
 522  522          if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
 523  523                  rw_exit(&ip->i_rwlock);
 524  524                  rw_enter(&ip->i_rwlock, RW_WRITER);
 525  525                  /*
 526  526                   * Mandatory locking could have been enabled
 527  527                   * after dropping the i_rwlock.
 528  528                   */
 529  529                  if (MANDLOCK(vp, ip->i_mode))
 530  530                          goto retry_mandlock;
 531  531          }
 532  532  
 533  533          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 534  534          if (error)
 535  535                  goto out;
 536  536  
 537  537          /*
 538  538           * Amount of log space needed for this write
 539  539           */
 540  540          if (!rewriteflg || !(ioflag & FDSYNC))
 541  541                  TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
 542  542  
 543  543          /*
 544  544           * Throttle writes.
 545  545           */
 546  546          if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
 547  547                  mutex_enter(&ip->i_tlock);
 548  548                  while (ip->i_writes > ufs_HW) {
 549  549                          ufs_throttles++;
 550  550                          cv_wait(&ip->i_wrcv, &ip->i_tlock);
 551  551                  }
 552  552                  mutex_exit(&ip->i_tlock);
 553  553          }
 554  554  
 555  555          /*
 556  556           * Enter Transaction
 557  557           *
 558  558           * If the write is a rewrite there is no need to open a transaction
 559  559           * if the FDSYNC flag is set and not the FSYNC.  In this case just
 560  560           * set the IMODACC flag so that the inode update is done at a later
 561  561           * time, thus avoiding the overhead of the logging transaction that is
 562  562           * not required.
 563  563           */
 564  564          if (ioflag & (FSYNC|FDSYNC)) {
 565  565                  if (ulp) {
 566  566                          if (rewriteflg) {
 567  567                                  uint_t i_flag_save;
 568  568  
 569  569                                  rw_enter(&ip->i_contents, RW_READER);
 570  570                                  mutex_enter(&ip->i_tlock);
 571  571                                  i_flag_save = ip->i_flag;
 572  572                                  ip->i_flag |= IUPD | ICHG;
 573  573                                  ip->i_seq++;
 574  574                                  ITIMES_NOLOCK(ip);
 575  575                                  if ((i_flag_save & IMOD) == 0) {
 576  576                                          ip->i_flag &= ~IMOD;
 577  577                                          ip->i_flag |= IMODACC;
 578  578                                  }
 579  579                                  mutex_exit(&ip->i_tlock);
 580  580                                  rw_exit(&ip->i_contents);
 581  581                          } else {
 582  582                                  int terr = 0;
 583  583                                  TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
 584  584                                      terr);
 585  585                                  ASSERT(!terr);
 586  586                          }
 587  587                  }
 588  588          } else {
 589  589                  if (ulp)
 590  590                          TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
 591  591          }
 592  592  
 593  593          /*
 594  594           * Write the file
 595  595           */
 596  596          rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
 597  597          rw_enter(&ip->i_contents, RW_WRITER);
 598  598          if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
 599  599                  /*
 600  600                   * In append mode start at end of file.
 601  601                   */
 602  602                  uiop->uio_loffset = ip->i_size;
 603  603          }
 604  604  
 605  605          /*
 606  606           * Mild optimisation, don't call ufs_trans_write() unless we have to
 607  607           * Also, suppress file system full messages if we will retry.
 608  608           */
 609  609          if (retry)
 610  610                  ip->i_flag |= IQUIET;
 611  611          if (resid) {
 612  612                  TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
 613  613          } else {
 614  614                  error = wrip(ip, uiop, ioflag, cr);
 615  615          }
 616  616          ip->i_flag &= ~IQUIET;
 617  617  
 618  618          rw_exit(&ip->i_contents);
 619  619          rw_exit(&ufsvfsp->vfs_dqrwlock);
 620  620  
 621  621          /*
 622  622           * Leave Transaction
 623  623           */
 624  624          if (ulp) {
 625  625                  if (ioflag & (FSYNC|FDSYNC)) {
 626  626                          if (!rewriteflg) {
 627  627                                  int terr = 0;
 628  628  
 629  629                                  TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
 630  630                                      resv);
 631  631                                  if (error == 0)
 632  632                                          error = terr;
 633  633                          }
 634  634                  } else {
 635  635                          TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
 636  636                  }
 637  637                  ufs_lockfs_end(ulp);
 638  638          }
 639  639  out:
 640  640          if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
 641  641                  /*
 642  642                   * Any blocks tied up in pending deletes?
 643  643                   */
 644  644                  ufs_delete_drain_wait(ufsvfsp, 1);
                           /* Clear retry so this path is taken at most once. */
 645  645                  retry = 0;
 646  646                  goto retry_mandlock;
 647  647          }
 648  648  
                   /*
                    * Running out of space after part of the request was transferred
                    * is reported as success.
                    */
 649  649          if (error == ENOSPC && (start_resid != uiop->uio_resid))
 650  650                  error = 0;
 651  651  
 652  652          return (error);
 653  653  }
 654  654  
 655  655  /*
 656  656   * Don't cache write blocks to files with the sticky bit set.
 657  657   * Used to keep swap files from blowing the page cache on a server.
 658  658   */
 659  659  int stickyhack = 1;
 660  660  
 661  661  /*
 662  662   * Free behind hacks.  The pager is busted.
 663  663   * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 664  664   * or B_FREE_IF_TIGHT_ON_MEMORY.
 665  665   */
 666  666  int     freebehind = 1;
 667  667  int     smallfile = 0;          /* lower bound for smallfile1/2 */
 668  668  u_offset_t smallfile64 = 32 * 1024;     /* lower bound for smallfile1/2 */
 669  669  
 670  670  /*
 671  671   * While we should, in most cases, cache the pages for write, we
 672  672   * may also want to cache the pages for read as long as they are
 673  673   * frequently re-usable.
 674  674   *
 675  675   * If cache_read_ahead = 1, the pages for read will go to the tail
 676  676   * of the cache list when they are released, otherwise go to the head.
 677  677   */
 678  678  int     cache_read_ahead = 0;
 679  679  
 680  680  /*
 681  681   * Freebehind exists  so that as we read  large files  sequentially we
 682  682   * don't consume most of memory with pages  from a few files. It takes
 683  683   * longer to re-read from disk multiple small files than it does reading
 684  684   * one large one sequentially.  As system  memory grows customers need
 685  685   * to retain bigger chunks   of files in  memory.   The advent of  the
 686  686   * cachelist opens up the possibility of freeing pages  to the head or
 687  687   * tail of the list.
 688  688   *
 689  689   * Not freeing a page is a bet that the page will be read again before
 690  690   * its segmap slot is needed for something else. If we lose the bet,
 691  691   * it means some  other thread is  burdened with the  page free we did
 692  692   * not do. If we win we save a free and reclaim.
 693  693   *
 694  694   * Freeing it at the tail  vs the head of cachelist  is a bet that the
 695  695   * page will survive until the next  read.  It's also saying that this
 696  696   * page is more likely to  be re-used than a  page freed some time ago
 697  697   * and never reclaimed.
 698  698   *
 699  699   * Freebehind maintains a  range of  file offset [smallfile1; smallfile2]
 700  700   *
 701  701   *            0 < offset < smallfile1 : pages are not freed.
 702  702   *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 703  703   *   smallfile2 < offset              : pages freed to head of cachelist.
 704  704   *
 705  705   * The range  is  computed  at most  once  per second  and  depends on
 706  706   * freemem  and  ncpus_online.  Both parameters  are   bounded to be
 707  707   * >= smallfile && >= smallfile64.
 708  708   *
 709  709   * smallfile1 = (free memory / ncpu) / 1000
 710  710   * smallfile2 = (free memory / ncpu) / 10
 711  711   *
 712  712   * A few example values:
 713  713   *
 714  714   *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
 715  715   *                                 ncpus_online = 4          ncpus_online = 64
 716  716   *       ------------------  -----------------------   -----------------------
 717  717   *             1G                   [256K;  25M]               [32K; 1.5M]
 718  718   *            10G                   [2.5M; 250M]              [156K; 15M]
 719  719   *           100G                    [25M; 2.5G]              [1.5M; 150M]
 720  720   *
 721  721   */
 722  722  
 723  723  #define SMALLFILE1_D 1000 /* default divisor for smallfile1 (see above) */
 724  724  #define SMALLFILE2_D 10 /* default divisor for smallfile2 (see above) */
 725  725  static u_offset_t smallfile1 = 32 * 1024; /* initial value of lower offset */
 726  726  static u_offset_t smallfile2 = 32 * 1024; /* initial value of upper offset */
 727  727  static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
 728  728  uint_t smallfile1_d = SMALLFILE1_D; /* non-static; defaults to the macro */
 729  729  uint_t smallfile2_d = SMALLFILE2_D; /* non-static; defaults to the macro */
 730  730  
 731  731  /*
 732  732   * wrip does the real work of write requests for ufs.
 733  733   */
 734  734  int
 735  735  wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
 736  736  {
 737  737          rlim64_t limit = uio->uio_llimit;
 738  738          u_offset_t off;
 739  739          u_offset_t old_i_size;
 740  740          struct fs *fs;
 741  741          struct vnode *vp;
 742  742          struct ufsvfs *ufsvfsp;
 743  743          caddr_t base;
 744  744          long start_resid = uio->uio_resid;      /* save starting resid */
 745  745          long premove_resid;                     /* resid before uiomove() */
 746  746          uint_t flags;
 747  747          int newpage;
 748  748          int iupdat_flag, directio_status;
 749  749          int n, on, mapon;
 750  750          int error, pagecreate;
 751  751          int do_dqrwlock;                /* drop/reacquire vfs_dqrwlock */
 752  752          int32_t iblocks;
 753  753          int     new_iblocks;
 754  754  
 755  755          /*
 756  756           * ip->i_size is incremented before the uiomove
 757  757           * is done on a write.  If the move fails (bad user
 758  758           * address) reset ip->i_size.
 759  759           * The better way would be to increment ip->i_size
 760  760           * only if the uiomove succeeds.
 761  761           */
 762  762          int i_size_changed = 0;
 763  763          o_mode_t type;
 764  764          int i_seq_needed = 0;
 765  765  
 766  766          vp = ITOV(ip);
 767  767  
 768  768          /*
 769  769           * check for forced unmount - should not happen as
 770  770           * the request passed the lockfs checks.
 771  771           */
 772  772          if ((ufsvfsp = ip->i_ufsvfs) == NULL)
 773  773                  return (EIO);
 774  774  
 775  775          fs = ip->i_fs;
 776  776  
 777  777          ASSERT(RW_WRITE_HELD(&ip->i_contents));
 778  778  
 779  779          /* check for valid filetype */
 780  780          type = ip->i_mode & IFMT;
 781  781          if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
 782  782              (type != IFLNK) && (type != IFSHAD)) {
 783  783                  return (EIO);
 784  784          }
 785  785  
 786  786          /*
 787  787           * the actual limit of UFS file size
 788  788           * is UFS_MAXOFFSET_T
 789  789           */
 790  790          if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 791  791                  limit = MAXOFFSET_T;
 792  792  
 793  793          if (uio->uio_loffset >= limit) {
 794  794                  proc_t *p = ttoproc(curthread);
 795  795  
 796  796                  mutex_enter(&p->p_lock);
 797  797                  (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
 798  798                      p, RCA_UNSAFE_SIGINFO);
 799  799                  mutex_exit(&p->p_lock);
 800  800                  return (EFBIG);
 801  801          }
 802  802  
 803  803          /*
 804  804           * if largefiles are disallowed, the limit is
 805  805           * the pre-largefiles value of 2GB
 806  806           */
 807  807          if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
 808  808                  limit = MIN(UFS_MAXOFFSET_T, limit);
 809  809          else
 810  810                  limit = MIN(MAXOFF32_T, limit);
 811  811  
 812  812          if (uio->uio_loffset < (offset_t)0) {
 813  813                  return (EINVAL);
 814  814          }
 815  815          if (uio->uio_resid == 0) {
 816  816                  return (0);
 817  817          }
 818  818  
 819  819          if (uio->uio_loffset >= limit)
 820  820                  return (EFBIG);
 821  821  
 822  822          ip->i_flag |= INOACC;   /* don't update ref time in getpage */
 823  823  
 824  824          if (ioflag & (FSYNC|FDSYNC)) {
 825  825                  ip->i_flag |= ISYNC;
 826  826                  iupdat_flag = 1;
 827  827          }
 828  828          /*
 829  829           * Try to go direct
 830  830           */
 831  831          if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
 832  832                  uio->uio_llimit = limit;
 833  833                  error = ufs_directio_write(ip, uio, ioflag, 0, cr,
 834  834                      &directio_status);
 835  835                  /*
 836  836                   * If ufs_directio wrote to the file or set the flags,
 837  837                   * we need to update i_seq, but it may be deferred.
 838  838                   */
 839  839                  if (start_resid != uio->uio_resid ||
 840  840                      (ip->i_flag & (ICHG|IUPD))) {
 841  841                          i_seq_needed = 1;
 842  842                          ip->i_flag |= ISEQ;
 843  843                  }
 844  844                  if (directio_status == DIRECTIO_SUCCESS)
 845  845                          goto out;
 846  846          }
 847  847  
 848  848          /*
 849  849           * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
 850  850           *
 851  851           * o shadow inodes: vfs_dqrwlock is not held at all
 852  852           * o quota updates: vfs_dqrwlock is read or write held
 853  853           * o other updates: vfs_dqrwlock is read held
 854  854           *
 855  855           * The first case is the only one where we do not hold
 856  856           * vfs_dqrwlock at all while entering wrip().
 857  857           * We must make sure not to downgrade/drop vfs_dqrwlock if we
 858  858           * have it as writer, i.e. if we are updating the quota inode.
 859  859           * There is no potential deadlock scenario in this case as
 860  860           * ufs_getpage() takes care of this and avoids reacquiring
 861  861           * vfs_dqrwlock in that case.
 862  862           *
 863  863           * This check is done here since the above conditions do not change
 864  864           * and we possibly loop below, so save a few cycles.
 865  865           */
 866  866          if ((type == IFSHAD) ||
 867  867              (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
 868  868                  do_dqrwlock = 0;
 869  869          } else {
 870  870                  do_dqrwlock = 1;
 871  871          }
 872  872  
 873  873          /*
 874  874           * Large Files: We cast MAXBMASK to offset_t
 875  875           * inorder to mask out the higher bits. Since offset_t
 876  876           * is a signed value, the high order bit set in MAXBMASK
 877  877           * value makes it do the right thing by having all bits 1
 878  878           * in the higher word. May be removed for _SOLARIS64_.
 879  879           */
 880  880  
 881  881          fs = ip->i_fs;
 882  882          do {
 883  883                  u_offset_t uoff = uio->uio_loffset;
 884  884                  off = uoff & (offset_t)MAXBMASK;
 885  885                  mapon = (int)(uoff & (offset_t)MAXBOFFSET);
 886  886                  on = (int)blkoff(fs, uoff);
 887  887                  n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
 888  888                  new_iblocks = 1;
 889  889  
 890  890                  if (type == IFREG && uoff + n >= limit) {
 891  891                          if (uoff >= limit) {
 892  892                                  error = EFBIG;
 893  893                                  goto out;
 894  894                          }
 895  895                          /*
 896  896                           * since uoff + n >= limit,
 897  897                           * therefore n >= limit - uoff, and n is an int
 898  898                           * so it is safe to cast it to an int
 899  899                           */
 900  900                          n = (int)(limit - (rlim64_t)uoff);
 901  901                  }
 902  902                  if (uoff + n > ip->i_size) {
 903  903                          /*
 904  904                           * We are extending the length of the file.
 905  905                           * bmap is used so that we are sure that
 906  906                           * if we need to allocate new blocks, that it
 907  907                           * is done here before we up the file size.
 908  908                           */
 909  909                          error = bmap_write(ip, uoff, (int)(on + n),
 910  910                              mapon == 0, NULL, cr);
 911  911                          /*
 912  912                           * bmap_write never drops i_contents so if
 913  913                           * the flags are set it changed the file.
 914  914                           */
 915  915                          if (ip->i_flag & (ICHG|IUPD)) {
 916  916                                  i_seq_needed = 1;
 917  917                                  ip->i_flag |= ISEQ;
 918  918                          }
 919  919                          if (error)
 920  920                                  break;
 921  921                          /*
 922  922                           * There is a window of vulnerability here.
 923  923                           * The sequence of operations: allocate file
 924  924                           * system blocks, uiomove the data into pages,
 925  925                           * and then update the size of the file in the
 926  926                           * inode, must happen atomically.  However, due
 927  927                           * to current locking constraints, this can not
 928  928                           * be done.
 929  929                           */
 930  930                          ASSERT(ip->i_writer == NULL);
 931  931                          ip->i_writer = curthread;
 932  932                          i_size_changed = 1;
 933  933                          /*
 934  934                           * If we are writing from the beginning of
 935  935                           * the mapping, we can just create the
 936  936                           * pages without having to read them.
 937  937                           */
 938  938                          pagecreate = (mapon == 0);
 939  939                  } else if (n == MAXBSIZE) {
 940  940                          /*
 941  941                           * Going to do a whole mappings worth,
 942  942                           * so we can just create the pages w/o
 943  943                           * having to read them in.  But before
 944  944                           * we do that, we need to make sure any
 945  945                           * needed blocks are allocated first.
 946  946                           */
 947  947                          iblocks = ip->i_blocks;
 948  948                          error = bmap_write(ip, uoff, (int)(on + n),
 949  949                              BI_ALLOC_ONLY, NULL, cr);
 950  950                          /*
 951  951                           * bmap_write never drops i_contents so if
 952  952                           * the flags are set it changed the file.
 953  953                           */
 954  954                          if (ip->i_flag & (ICHG|IUPD)) {
 955  955                                  i_seq_needed = 1;
 956  956                                  ip->i_flag |= ISEQ;
 957  957                          }
 958  958                          if (error)
 959  959                                  break;
 960  960                          pagecreate = 1;
 961  961                          /*
 962  962                           * check if the new created page needed the
 963  963                           * allocation of new disk blocks.
 964  964                           */
 965  965                          if (iblocks == ip->i_blocks)
 966  966                                  new_iblocks = 0; /* no new blocks allocated */
 967  967                  } else {
 968  968                          pagecreate = 0;
 969  969                          /*
 970  970                           * In sync mode flush the indirect blocks which
 971  971                           * may have been allocated and not written on
 972  972                           * disk. In above cases bmap_write will allocate
 973  973                           * in sync mode.
 974  974                           */
 975  975                          if (ioflag & (FSYNC|FDSYNC)) {
 976  976                                  error = ufs_indirblk_sync(ip, uoff);
 977  977                                  if (error)
 978  978                                          break;
 979  979                          }
 980  980                  }
 981  981  
 982  982                  /*
 983  983                   * At this point we can enter ufs_getpage() in one
 984  984                   * of two ways:
 985  985                   * 1) segmap_getmapflt() calls ufs_getpage() when the
 986  986                   *    forcefault parameter is true (pagecreate == 0)
 987  987                   * 2) uiomove() causes a page fault.
 988  988                   *
 989  989                   * We have to drop the contents lock to prevent the VM
 990  990                   * system from trying to reacquire it in ufs_getpage()
 991  991                   * should the uiomove cause a pagefault.
 992  992                   *
 993  993                   * We have to drop the reader vfs_dqrwlock here as well.
 994  994                   */
 995  995                  rw_exit(&ip->i_contents);
 996  996                  if (do_dqrwlock) {
 997  997                          ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
 998  998                          ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
 999  999                          rw_exit(&ufsvfsp->vfs_dqrwlock);
1000 1000                  }
1001 1001  
1002 1002                  newpage = 0;
1003 1003                  premove_resid = uio->uio_resid;
1004 1004  
1005 1005                  /*
1006 1006                   * Touch the page and fault it in if it is not in core
1007 1007                   * before segmap_getmapflt or vpm_data_copy can lock it.
1008 1008                   * This is to avoid the deadlock if the buffer is mapped
1009 1009                   * to the same file through mmap which we want to write.
1010 1010                   */
1011 1011                  uio_prefaultpages((long)n, uio);
1012 1012  
1013 1013                  if (vpm_enable) {
1014 1014                          /*
1015 1015                           * Copy data. If new pages are created, part of
1016 1016                           * the page that is not written will be initizliazed
1017 1017                           * with zeros.
1018 1018                           */
1019 1019                          error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1020 1020                              uio, !pagecreate, &newpage, 0, S_WRITE);
1021 1021                  } else {
1022 1022  
1023 1023                          base = segmap_getmapflt(segkmap, vp, (off + mapon),
1024 1024                              (uint_t)n, !pagecreate, S_WRITE);
1025 1025  
1026 1026                          /*
1027 1027                           * segmap_pagecreate() returns 1 if it calls
1028 1028                           * page_create_va() to allocate any pages.
1029 1029                           */
1030 1030  
1031 1031                          if (pagecreate)
1032 1032                                  newpage = segmap_pagecreate(segkmap, base,
1033 1033                                      (size_t)n, 0);
1034 1034  
1035 1035                          error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
1036 1036                  }
1037 1037  
1038 1038                  /*
1039 1039                   * If "newpage" is set, then a new page was created and it
1040 1040                   * does not contain valid data, so it needs to be initialized
1041 1041                   * at this point.
1042 1042                   * Otherwise the page contains old data, which was overwritten
1043 1043                   * partially or as a whole in uiomove.
1044 1044                   * If there is only one iovec structure within uio, then
1045 1045                   * on error uiomove will not be able to update uio->uio_loffset
1046 1046                   * and we would zero the whole page here!
1047 1047                   *
1048 1048                   * If uiomove fails because of an error, the old valid data
1049 1049                   * is kept instead of filling the rest of the page with zero's.
1050 1050                   */
1051 1051                  if (!vpm_enable && newpage &&
1052 1052                      uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
1053 1053                          /*
1054 1054                           * We created pages w/o initializing them completely,
1055 1055                           * thus we need to zero the part that wasn't set up.
1056 1056                           * This happens on most EOF write cases and if
1057 1057                           * we had some sort of error during the uiomove.
1058 1058                           */
1059 1059                          int nzero, nmoved;
1060 1060  
1061 1061                          nmoved = (int)(uio->uio_loffset - (off + mapon));
1062 1062                          ASSERT(nmoved >= 0 && nmoved <= n);
1063 1063                          nzero = roundup(on + n, PAGESIZE) - nmoved;
1064 1064                          ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
1065 1065                          (void) kzero(base + mapon + nmoved, (uint_t)nzero);
1066 1066                  }
1067 1067  
1068 1068                  /*
1069 1069                   * Unlock the pages allocated by page_create_va()
1070 1070                   * in segmap_pagecreate()
1071 1071                   */
1072 1072                  if (!vpm_enable && newpage)
1073 1073                          segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
1074 1074  
1075 1075                  /*
1076 1076                   * If the size of the file changed, then update the
1077 1077                   * size field in the inode now.  This can't be done
1078 1078                   * before the call to segmap_pageunlock or there is
1079 1079                   * a potential deadlock with callers to ufs_putpage().
1080 1080                   * They will be holding i_contents and trying to lock
1081 1081                   * a page, while this thread is holding a page locked
1082 1082                   * and trying to acquire i_contents.
1083 1083                   */
1084 1084                  if (i_size_changed) {
1085 1085                          rw_enter(&ip->i_contents, RW_WRITER);
1086 1086                          old_i_size = ip->i_size;
1087 1087                          UFS_SET_ISIZE(uoff + n, ip);
1088 1088                          TRANS_INODE(ufsvfsp, ip);
1089 1089                          /*
1090 1090                           * file has grown larger than 2GB. Set flag
1091 1091                           * in superblock to indicate this, if it
1092 1092                           * is not already set.
1093 1093                           */
1094 1094                          if ((ip->i_size > MAXOFF32_T) &&
1095 1095                              !(fs->fs_flags & FSLARGEFILES)) {
1096 1096                                  ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1097 1097                                  mutex_enter(&ufsvfsp->vfs_lock);
1098 1098                                  fs->fs_flags |= FSLARGEFILES;
1099 1099                                  ufs_sbwrite(ufsvfsp);
1100 1100                                  mutex_exit(&ufsvfsp->vfs_lock);
1101 1101                          }
1102 1102                          mutex_enter(&ip->i_tlock);
1103 1103                          ip->i_writer = NULL;
1104 1104                          cv_broadcast(&ip->i_wrcv);
1105 1105                          mutex_exit(&ip->i_tlock);
1106 1106                          rw_exit(&ip->i_contents);
1107 1107                  }
1108 1108  
1109 1109                  if (error) {
1110 1110                          /*
1111 1111                           * If we failed on a write, we may have already
1112 1112                           * allocated file blocks as well as pages.  It's
1113 1113                           * hard to undo the block allocation, but we must
1114 1114                           * be sure to invalidate any pages that may have
1115 1115                           * been allocated.
1116 1116                           *
1117 1117                           * If the page was created without initialization
1118 1118                           * then we must check if it should be possible
1119 1119                           * to destroy the new page and to keep the old data
1120 1120                           * on the disk.
1121 1121                           *
1122 1122                           * It is possible to destroy the page without
1123 1123                           * having to write back its contents only when
1124 1124                           * - the size of the file keeps unchanged
1125 1125                           * - bmap_write() did not allocate new disk blocks
1126 1126                           *   it is possible to create big files using "seek" and
1127 1127                           *   write to the end of the file. A "write" to a
1128 1128                           *   position before the end of the file would not
1129 1129                           *   change the size of the file but it would allocate
1130 1130                           *   new disk blocks.
1131 1131                           * - uiomove intended to overwrite the whole page.
1132 1132                           * - a new page was created (newpage == 1).
1133 1133                           */
1134 1134  
1135 1135                          if (i_size_changed == 0 && new_iblocks == 0 &&
1136 1136                              newpage) {
1137 1137  
1138 1138                                  /* unwind what uiomove eventually last did */
1139 1139                                  uio->uio_resid = premove_resid;
1140 1140  
1141 1141                                  /*
1142 1142                                   * destroy the page, do not write ambiguous
1143 1143                                   * data to the disk.
1144 1144                                   */
1145 1145                                  flags = SM_DESTROY;
1146 1146                          } else {
1147 1147                                  /*
1148 1148                                   * write the page back to the disk, if dirty,
1149 1149                                   * and remove the page from the cache.
1150 1150                                   */
1151 1151                                  flags = SM_INVAL;
1152 1152                          }
1153 1153  
1154 1154                          if (vpm_enable) {
1155 1155                                  /*
1156 1156                                   *  Flush pages.
1157 1157                                   */
1158 1158                                  (void) vpm_sync_pages(vp, off, n, flags);
1159 1159                          } else {
1160 1160                                  (void) segmap_release(segkmap, base, flags);
1161 1161                          }
1162 1162                  } else {
1163 1163                          flags = 0;
1164 1164                          /*
1165 1165                           * Force write back for synchronous write cases.
1166 1166                           */
1167 1167                          if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
1168 1168                                  /*
1169 1169                                   * If the sticky bit is set but the
1170 1170                                   * execute bit is not set, we do a
1171 1171                                   * synchronous write back and free
1172 1172                                   * the page when done.  We set up swap
1173 1173                                   * files to be handled this way to
1174 1174                                   * prevent servers from keeping around
1175 1175                                   * the client's swap pages too long.
1176 1176                                   * XXX - there ought to be a better way.
1177 1177                                   */
1178 1178                                  if (IS_SWAPVP(vp)) {
1179 1179                                          flags = SM_WRITE | SM_FREE |
1180 1180                                              SM_DONTNEED;
1181 1181                                          iupdat_flag = 0;
1182 1182                                  } else {
1183 1183                                          flags = SM_WRITE;
1184 1184                                  }
1185 1185                          } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
1186 1186                                  /*
1187 1187                                   * Have written a whole block.
1188 1188                                   * Start an asynchronous write and
1189 1189                                   * mark the buffer to indicate that
1190 1190                                   * it won't be needed again soon.
1191 1191                                   */
1192 1192                                  flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
1193 1193                          }
1194 1194                          if (vpm_enable) {
1195 1195                                  /*
1196 1196                                   * Flush pages.
1197 1197                                   */
1198 1198                                  error = vpm_sync_pages(vp, off, n, flags);
1199 1199                          } else {
1200 1200                                  error = segmap_release(segkmap, base, flags);
1201 1201                          }
1202 1202                          /*
1203 1203                           * If the operation failed and is synchronous,
1204 1204                           * then we need to unwind what uiomove() last
1205 1205                           * did so we can potentially return an error to
1206 1206                           * the caller.  If this write operation was
1207 1207                           * done in two pieces and the first succeeded,
1208 1208                           * then we won't return an error for the second
1209 1209                           * piece that failed.  However, we only want to
1210 1210                           * return a resid value that reflects what was
1211 1211                           * really done.
1212 1212                           *
1213 1213                           * Failures for non-synchronous operations can
1214 1214                           * be ignored since the page subsystem will
1215 1215                           * retry the operation until it succeeds or the
1216 1216                           * file system is unmounted.
1217 1217                           */
1218 1218                          if (error) {
1219 1219                                  if ((ioflag & (FSYNC | FDSYNC)) ||
1220 1220                                      type == IFDIR) {
1221 1221                                          uio->uio_resid = premove_resid;
1222 1222                                  } else {
1223 1223                                          error = 0;
1224 1224                                  }
1225 1225                          }
1226 1226                  }
1227 1227  
1228 1228                  /*
1229 1229                   * Re-acquire contents lock.
1230 1230                   * If it was dropped, reacquire reader vfs_dqrwlock as well.
1231 1231                   */
1232 1232                  if (do_dqrwlock)
1233 1233                          rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1234 1234                  rw_enter(&ip->i_contents, RW_WRITER);
1235 1235  
1236 1236                  /*
1237 1237                   * If the uiomove() failed or if a synchronous
1238 1238                   * page push failed, fix up i_size.
1239 1239                   */
1240 1240                  if (error) {
1241 1241                          if (i_size_changed) {
1242 1242                                  /*
1243 1243                                   * The uiomove failed, and we
1244 1244                                   * allocated blocks, so get rid
1245 1245                                   * of them.
1246 1246                                   */
1247 1247                                  (void) ufs_itrunc(ip, old_i_size, 0, cr);
1248 1248                          }
1249 1249                  } else {
1250 1250                          /*
1251 1251                           * XXX - Can this be out of the loop?
1252 1252                           */
1253 1253                          ip->i_flag |= IUPD | ICHG;
1254 1254                          /*
1255 1255                           * Only do one increase of i_seq for multiple
1256 1256                           * pieces.  Because we drop locks, record
1257 1257                           * the fact that we changed the timestamp and
1258 1258                           * are deferring the increase in case another thread
1259 1259                           * pushes our timestamp update.
1260 1260                           */
1261 1261                          i_seq_needed = 1;
1262 1262                          ip->i_flag |= ISEQ;
1263 1263                          if (i_size_changed)
1264 1264                                  ip->i_flag |= IATTCHG;
1265 1265                          if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
1266 1266                              (IEXEC >> 6))) != 0 &&
1267 1267                              (ip->i_mode & (ISUID | ISGID)) != 0 &&
1268 1268                              secpolicy_vnode_setid_retain(cr,
1269 1269                              (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
1270 1270                                  /*
1271 1271                                   * Clear Set-UID & Set-GID bits on
1272 1272                                   * successful write if not privileged
1273 1273                                   * and at least one of the execute bits
1274 1274                                   * is set.  If we always clear Set-GID,
1275 1275                                   * mandatory file and record locking is
1276 1276                                   * unusable.
1277 1277                                   */
1278 1278                                  ip->i_mode &= ~(ISUID | ISGID);
1279 1279                          }
1280 1280                  }
1281 1281                  /*
1282 1282                   * In the case the FDSYNC flag is set and this is a
1283 1283                   * "rewrite" we won't log a delta.
1284 1284                   * The FSYNC flag overrides all cases.
1285 1285                   */
1286 1286                  if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
1287 1287                          TRANS_INODE(ufsvfsp, ip);
1288 1288                  }
1289 1289          } while (error == 0 && uio->uio_resid > 0 && n != 0);
1290 1290  
1291 1291  out:
1292 1292          /*
1293 1293           * Make sure i_seq is increased at least once per write
1294 1294           */
1295 1295          if (i_seq_needed) {
1296 1296                  ip->i_seq++;
1297 1297                  ip->i_flag &= ~ISEQ;    /* no longer deferred */
1298 1298          }
1299 1299  
1300 1300          /*
1301 1301           * Inode is updated according to this table -
1302 1302           *
1303 1303           *   FSYNC        FDSYNC(posix.4)
1304 1304           *   --------------------------
1305 1305           *   always@      IATTCHG|IBDWRITE
1306 1306           *
1307 1307           * @ -  If we are doing synchronous write the only time we should
1308 1308           *      not be sync'ing the ip here is if we have the stickyhack
1309 1309           *      activated, the file is marked with the sticky bit and
1310 1310           *      no exec bit, the file length has not been changed and
1311 1311           *      no new blocks have been allocated during this write.
1312 1312           */
1313 1313  
1314 1314          if ((ip->i_flag & ISYNC) != 0) {
1315 1315                  /*
1316 1316                   * we have eliminated nosync
1317 1317                   */
1318 1318                  if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
1319 1319                      ((ioflag & FSYNC) && iupdat_flag)) {
1320 1320                          ufs_iupdat(ip, 1);
1321 1321                  }
1322 1322          }
1323 1323  
1324 1324          /*
1325 1325           * If we've already done a partial-write, terminate
1326 1326           * the write but return no error unless the error is ENOSPC
1327 1327           * because the caller can detect this and free resources and
1328 1328           * try again.
1329 1329           */
1330 1330          if ((start_resid != uio->uio_resid) && (error != ENOSPC))
1331 1331                  error = 0;
1332 1332  
1333 1333          ip->i_flag &= ~(INOACC | ISYNC);
1334 1334          ITIMES_NOLOCK(ip);
1335 1335          return (error);
1336 1336  }
1337 1337  
1338 1338  /*
1339 1339   * rdip does the real work of read requests for ufs.
            *
            * Data is copied from the file behind ip into uio, at most one file
            * system block (fs_bsize) per loop pass, and never past i_size.
            *
            * Locking: the caller must hold ip->i_contents (asserted).  When it is
            * held as reader it is dropped around each copy/release and retaken
            * before the next pass; when held as writer it is kept throughout.
            *
            * ioflag: FRSYNC combined with FSYNC or FDSYNC adds SM_WRITE (and
            * clears SM_ASYNC) on the release flags and may trigger ufs_iupdat()
            * on exit.  cr is only handed to ufs_directio_read().
            *
            * Returns 0 on success and whenever uio_resid changed (a partial read
            * hides the error).  Otherwise: EIO if the inode has no ufsvfs or its
            * type is not IFREG, IFDIR, IFATTRDIR, IFLNK or IFSHAD; EINVAL for a
            * negative offset; or the error from direct I/O, the copy, or the
            * page release.
1340 1340   */
1341 1341  int
1342 1342  rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
1343 1343  {
1344 1344          u_offset_t off;
1345 1345          caddr_t base;
1346 1346          struct fs *fs;
1347 1347          struct ufsvfs *ufsvfsp;
1348 1348          struct vnode *vp;
                   /* resid on entry; compared at exit to detect a partial read */
1349 1349          long oresid = uio->uio_resid;
1350 1350          u_offset_t n, on, mapon;
1351 1351          int error = 0;
1352 1352          int doupdate = 1;
1353 1353          uint_t flags;
1354 1354          int dofree, directio_status;
1355 1355          krw_t rwtype;
1356 1356          o_mode_t type;
1357 1357          clock_t now;
1358 1358  
1359 1359          vp = ITOV(ip);
1360 1360  
1361 1361          ASSERT(RW_LOCK_HELD(&ip->i_contents));
1362 1362  
1363 1363          ufsvfsp = ip->i_ufsvfs;
1364 1364  
                   /*
                    * No ufsvfs attached to this inode.  ufs_ioctl() treats the same
                    * condition as a forcible unmount.
                    */
1365 1365          if (ufsvfsp == NULL)
1366 1366                  return (EIO);
1367 1367  
1368 1368          fs = ufsvfsp->vfs_fs;
1369 1369  
1370 1370          /* check for valid filetype */
1371 1371          type = ip->i_mode & IFMT;
1372 1372          if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
1373 1373              (type != IFLNK) && (type != IFSHAD)) {
1374 1374                  return (EIO);
1375 1375          }
1376 1376  
                   /*
                    * An offset beyond UFS_MAXOFFSET_T yields no data but is not an
                    * error; it still goes through the common exit path at "out".
                    */
1377 1377          if (uio->uio_loffset > UFS_MAXOFFSET_T) {
1378 1378                  error = 0;
1379 1379                  goto out;
1380 1380          }
1381 1381          if (uio->uio_loffset < (offset_t)0) {
1382 1382                  return (EINVAL);
1383 1383          }
                   /* Nothing requested: return without touching the inode. */
1384 1384          if (uio->uio_resid == 0) {
1385 1385                  return (0);
1386 1386          }
1387 1387  
                   /*
                    * Flag the inode as accessed (IACC) under i_tlock, unless
                    * ULOCKFS_IS_NOIACC() holds for ITOUL(ip), the file system is
                    * read-only, or vfs_noatime is set.
                    */
1388 1388          if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
1389 1389              (!ufsvfsp->vfs_noatime)) {
1390 1390                  mutex_enter(&ip->i_tlock);
1391 1391                  ip->i_flag |= IACC;
1392 1392                  mutex_exit(&ip->i_tlock);
1393 1393          }
1394 1394          /*
1395 1395           * Try to go direct
                    *
                    * Anything other than DIRECTIO_SUCCESS falls through to the
                    * mapped path below, where error is assigned afresh on the
                    * first loop pass.
1396 1396           */
1397 1397          if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
1398 1398                  error = ufs_directio_read(ip, uio, cr, &directio_status);
1399 1399                  if (directio_status == DIRECTIO_SUCCESS)
1400 1400                          goto out;
1401 1401          }
1402 1402  
                   /*
                    * Remember how the caller holds i_contents so the loop knows
                    * whether it has to drop and retake the lock.
                    */
1403 1403          rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
1404 1404  
1405 1405          do {
1406 1406                  offset_t diff;
1407 1407                  u_offset_t uoff = uio->uio_loffset;
                           /*
                            * off   - uoff masked with MAXBMASK (start of the window)
                            * mapon - uoff masked with MAXBOFFSET (offset inside it)
                            * on    - offset of uoff within its file system block
                            * n     - bytes to move this pass: the rest of the file
                            *         system block or the remaining resid, whichever
                            *         is smaller
                            */
1408 1408                  off = uoff & (offset_t)MAXBMASK;
1409 1409                  mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET);
1410 1410                  on = (u_offset_t)blkoff(fs, uoff);
1411 1411                  n = MIN((u_offset_t)fs->fs_bsize - on,
1412 1412                      (u_offset_t)uio->uio_resid);
1413 1413  
                           /* Bytes left before end of file. */
1414 1414                  diff = ip->i_size - uoff;
1415 1415  
                           /* At or past end of file: finish without error. */
1416 1416                  if (diff <= (offset_t)0) {
1417 1417                          error = 0;
1418 1418                          goto out;
1419 1419                  }
                           /* Clip the transfer so it does not run past i_size. */
1420 1420                  if (diff < (offset_t)n)
1421 1421                          n = (int)diff;
1422 1422  
1423 1423                  /*
1424 1424                   * We update smallfile2 and smallfile1 at most every second.
                            *
                            * Both thresholds are derived from free memory (in bytes)
                            * per online CPU divided by smallfile1_d / smallfile2_d;
                            * a zero divisor is first reset to its SMALLFILE*_D
                            * default.  smallfile1 is kept at least as large as
                            * smallfile and smallfile64, and smallfile2 at least as
                            * large as smallfile1.
1425 1425                   */
1426 1426                  now = ddi_get_lbolt();
1427 1427                  if (now >= smallfile_update) {
1428 1428                          uint64_t percpufreeb;
1429 1429                          if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
1430 1430                          if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
1431 1431                          percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
1432 1432                          smallfile1 = percpufreeb / smallfile1_d;
1433 1433                          smallfile2 = percpufreeb / smallfile2_d;
1434 1434                          smallfile1 = MAX(smallfile1, smallfile);
1435 1435                          smallfile1 = MAX(smallfile1, smallfile64);
1436 1436                          smallfile2 = MAX(smallfile1, smallfile2);
1437 1437                          smallfile_update = now + hz;
1438 1438                  }
1439 1439  
                           /*
                            * Free-behind is considered only when freebehind is set,
                            * this window starts where i_nextr points, and the offset
                            * is beyond smallfile1.
                            * NOTE(review): i_nextr looks like the expected offset of
                            * the next sequential read, maintained outside this
                            * function -- confirm.
                            */
1440 1440                  dofree = freebehind &&
1441 1441                      ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
1442 1442  
1443 1443                  /*
1444 1444                   * At this point we can enter ufs_getpage() in one of two
1445 1445                   * ways:
1446 1446                   * 1) segmap_getmapflt() calls ufs_getpage() when the
1447 1447                   *    forcefault parameter is true (value of 1 is passed)
1448 1448                   * 2) uiomove() causes a page fault.
1449 1449                   *
1450 1450                   * We cannot hold onto an i_contents reader lock without
1451 1451                   * risking deadlock in ufs_getpage() so drop a reader lock.
1452 1452                   * The ufs_getpage() dolock logic already allows for a
1453 1453                   * thread holding i_contents as writer to work properly
1454 1454                   * so we keep a writer lock.
1455 1455                   */
1456 1456                  if (rwtype == RW_READER)
1457 1457                          rw_exit(&ip->i_contents);
1458 1458  
1459 1459                  if (vpm_enable) {
1460 1460                          /*
1461 1461                           * Copy data.
1462 1462                           */
1463 1463                          error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1464 1464                              uio, 1, NULL, 0, S_READ);
1465 1465                  } else {
                                   /*
                                    * Map the range with forcefault set (1), then
                                    * copy out of the mapping into the caller's uio.
                                    */
1466 1466                          base = segmap_getmapflt(segkmap, vp, (off + mapon),
1467 1467                              (uint_t)n, 1, S_READ);
1468 1468                          error = uiomove(base + mapon, (long)n, UIO_READ, uio);
1469 1469                  }
1470 1470  
1471 1471                  flags = 0;
1472 1472                  if (!error) {
1473 1473                          /*
1474 1474                           * If  reading sequential  we won't need  this
1475 1475                           * buffer again  soon.  For  offsets in  range
1476 1476                           * [smallfile1,  smallfile2] release the pages
1477 1477                           * at   the  tail  of the   cache list, larger
1478 1478                           * offsets are released at the head.
1479 1479                           */
1480 1480                          if (dofree) {
1481 1481                                  flags = SM_FREE | SM_ASYNC;
1482 1482                                  if ((cache_read_ahead == 0) &&
1483 1483                                      (off > smallfile2))
1484 1484                                          flags |=  SM_DONTNEED;
1485 1485                          }
1486 1486                          /*
1487 1487                           * In POSIX SYNC (FSYNC and FDSYNC) read mode,
1488 1488                           * we want to make sure that the page which has
1489 1489                           * been read, is written on disk if it is dirty.
1490 1490                           * And corresponding indirect blocks should also
1491 1491                           * be flushed out.
1492 1492                           */
1493 1493                          if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
1494 1494                                  flags &= ~SM_ASYNC;
1495 1495                                  flags |= SM_WRITE;
1496 1496                          }
                                   /* A failure of the release becomes the read error. */
1497 1497                          if (vpm_enable) {
1498 1498                                  error = vpm_sync_pages(vp, off, n, flags);
1499 1499                          } else {
1500 1500                                  error = segmap_release(segkmap, base, flags);
1501 1501                          }
1502 1502                  } else {
                                   /*
                                    * The copy failed.  Still release the range (flags
                                    * is 0 here) but ignore the result so that the
                                    * copy error is the one reported.
                                    */
1503 1503                          if (vpm_enable) {
1504 1504                                  (void) vpm_sync_pages(vp, off, n, flags);
1505 1505                          } else {
1506 1506                                  (void) segmap_release(segkmap, base, flags);
1507 1507                          }
1508 1508                  }
1509 1509  
                           /* Retake the reader lock dropped before the copy. */
1510 1510                  if (rwtype == RW_READER)
1511 1511                          rw_enter(&ip->i_contents, rwtype);
1512 1512          } while (error == 0 && uio->uio_resid > 0 && n != 0);
1513 1513  out:
1514 1514          /*
1515 1515           * Inode is updated according to this table if FRSYNC is set.
1516 1516           *
1517 1517           *   FSYNC        FDSYNC(posix.4)
1518 1518           *   --------------------------
1519 1519           *   always       IATTCHG|IBDWRITE
1520 1520           */
1521 1521          /*
1522 1522           * The inode is not updated if we're logging and the inode is a
1523 1523           * directory with FRSYNC, FSYNC and FDSYNC flags set.
1524 1524           */
1525 1525          if (ioflag & FRSYNC) {
1526 1526                  if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
1527 1527                          doupdate = 0;
1528 1528                  }
1529 1529                  if (doupdate) {
1530 1530                          if ((ioflag & FSYNC) ||
1531 1531                              ((ioflag & FDSYNC) &&
1532 1532                              (ip->i_flag & (IATTCHG|IBDWRITE)))) {
1533 1533                                  ufs_iupdat(ip, 1);
1534 1534                          }
1535 1535                  }
1536 1536          }
1537 1537          /*
1538 1538           * If we've already done a partial read, terminate
1539 1539           * the read but return no error.
1540 1540           */
1541 1541          if (oresid != uio->uio_resid)
1542 1542                  error = 0;
1543 1543          ITIMES(ip);
1544 1544  
1545 1545          return (error);
1546 1546  }
1547 1547  
1548 1548  /* ARGSUSED */
1549 1549  static int
1550 1550  ufs_ioctl(
1551 1551          struct vnode    *vp,
1552 1552          int             cmd,
1553 1553          intptr_t        arg,
1554 1554          int             flag,
1555 1555          struct cred     *cr,
1556 1556          int             *rvalp,
1557 1557          caller_context_t *ct)
1558 1558  {
1559 1559          struct lockfs   lockfs, lockfs_out;
1560 1560          struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
1561 1561          char            *comment, *original_comment;
1562 1562          struct fs       *fs;
1563 1563          struct ulockfs  *ulp;
1564 1564          offset_t        off;
1565 1565          extern int      maxphys;
1566 1566          int             error;
1567 1567          int             issync;
1568 1568          int             trans_size;
1569 1569  
1570 1570  
1571 1571          /*
1572 1572           * forcibly unmounted
1573 1573           */
1574 1574          if (ufsvfsp == NULL || vp->v_vfsp == NULL ||
1575 1575              vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
1576 1576                  return (EIO);
1577 1577          fs = ufsvfsp->vfs_fs;
1578 1578  
1579 1579          if (cmd == Q_QUOTACTL) {
1580 1580                  error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
1581 1581                  if (error)
1582 1582                          return (error);
1583 1583  
1584 1584                  if (ulp) {
1585 1585                          TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
1586 1586                              TOP_SETQUOTA_SIZE(fs));
1587 1587                  }
1588 1588  
1589 1589                  error = quotactl(vp, arg, flag, cr);
1590 1590  
1591 1591                  if (ulp) {
1592 1592                          TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
1593 1593                              TOP_SETQUOTA_SIZE(fs));
1594 1594                          ufs_lockfs_end(ulp);
1595 1595                  }
1596 1596                  return (error);
1597 1597          }
1598 1598  
1599 1599          switch (cmd) {
1600 1600                  case _FIOLFS:
1601 1601                          /*
1602 1602                           * file system locking
1603 1603                           */
1604 1604                          if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1605 1605                                  return (EPERM);
1606 1606  
1607 1607                          if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1608 1608                                  if (copyin((caddr_t)arg, &lockfs,
1609 1609                                      sizeof (struct lockfs)))
1610 1610                                          return (EFAULT);
1611 1611                          }
1612 1612  #ifdef _SYSCALL32_IMPL
1613 1613                          else {
1614 1614                                  struct lockfs32 lockfs32;
1615 1615                                  /* Translate ILP32 lockfs to LP64 lockfs */
1616 1616                                  if (copyin((caddr_t)arg, &lockfs32,
1617 1617                                      sizeof (struct lockfs32)))
1618 1618                                          return (EFAULT);
1619 1619                                  lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1620 1620                                  lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1621 1621                                  lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1622 1622                                  lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1623 1623                                  lockfs.lf_comment =
1624 1624                                      (caddr_t)(uintptr_t)lockfs32.lf_comment;
1625 1625                          }
1626 1626  #endif /* _SYSCALL32_IMPL */
1627 1627  
1628 1628                          if (lockfs.lf_comlen) {
1629 1629                                  if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
1630 1630                                          return (ENAMETOOLONG);
1631 1631                                  comment =
1632 1632                                      kmem_alloc(lockfs.lf_comlen, KM_SLEEP);
1633 1633                                  if (copyin(lockfs.lf_comment, comment,
1634 1634                                      lockfs.lf_comlen)) {
1635 1635                                          kmem_free(comment, lockfs.lf_comlen);
1636 1636                                          return (EFAULT);
1637 1637                                  }
1638 1638                                  original_comment = lockfs.lf_comment;
1639 1639                                  lockfs.lf_comment = comment;
1640 1640                          }
1641 1641                          if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
1642 1642                                  lockfs.lf_comment = original_comment;
1643 1643  
1644 1644                                  if ((flag & DATAMODEL_MASK) ==
1645 1645                                      DATAMODEL_NATIVE) {
1646 1646                                          (void) copyout(&lockfs, (caddr_t)arg,
1647 1647                                              sizeof (struct lockfs));
1648 1648                                  }
1649 1649  #ifdef _SYSCALL32_IMPL
1650 1650                                  else {
1651 1651                                          struct lockfs32 lockfs32;
1652 1652                                          /* Translate LP64 to ILP32 lockfs */
1653 1653                                          lockfs32.lf_lock =
1654 1654                                              (uint32_t)lockfs.lf_lock;
1655 1655                                          lockfs32.lf_flags =
1656 1656                                              (uint32_t)lockfs.lf_flags;
1657 1657                                          lockfs32.lf_key =
1658 1658                                              (uint32_t)lockfs.lf_key;
1659 1659                                          lockfs32.lf_comlen =
1660 1660                                              (uint32_t)lockfs.lf_comlen;
1661 1661                                          lockfs32.lf_comment =
1662 1662                                              (uint32_t)(uintptr_t)
1663 1663                                              lockfs.lf_comment;
1664 1664                                          (void) copyout(&lockfs32, (caddr_t)arg,
1665 1665                                              sizeof (struct lockfs32));
1666 1666                                  }
1667 1667  #endif /* _SYSCALL32_IMPL */
1668 1668  
1669 1669                          } else {
1670 1670                                  if (lockfs.lf_comlen)
1671 1671                                          kmem_free(comment, lockfs.lf_comlen);
1672 1672                          }
1673 1673                          return (error);
1674 1674  
1675 1675                  case _FIOLFSS:
1676 1676                          /*
1677 1677                           * get file system locking status
1678 1678                           */
1679 1679  
1680 1680                          if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1681 1681                                  if (copyin((caddr_t)arg, &lockfs,
1682 1682                                      sizeof (struct lockfs)))
1683 1683                                          return (EFAULT);
1684 1684                          }
1685 1685  #ifdef _SYSCALL32_IMPL
1686 1686                          else {
1687 1687                                  struct lockfs32 lockfs32;
1688 1688                                  /* Translate ILP32 lockfs to LP64 lockfs */
1689 1689                                  if (copyin((caddr_t)arg, &lockfs32,
1690 1690                                      sizeof (struct lockfs32)))
1691 1691                                          return (EFAULT);
1692 1692                                  lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1693 1693                                  lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1694 1694                                  lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1695 1695                                  lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1696 1696                                  lockfs.lf_comment =
1697 1697                                      (caddr_t)(uintptr_t)lockfs32.lf_comment;
1698 1698                          }
1699 1699  #endif /* _SYSCALL32_IMPL */
1700 1700  
1701 1701                          if (error =  ufs_fiolfss(vp, &lockfs_out))
1702 1702                                  return (error);
1703 1703                          lockfs.lf_lock = lockfs_out.lf_lock;
1704 1704                          lockfs.lf_key = lockfs_out.lf_key;
1705 1705                          lockfs.lf_flags = lockfs_out.lf_flags;
1706 1706                          lockfs.lf_comlen = MIN(lockfs.lf_comlen,
1707 1707                              lockfs_out.lf_comlen);
1708 1708  
1709 1709                          if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1710 1710                                  if (copyout(&lockfs, (caddr_t)arg,
1711 1711                                      sizeof (struct lockfs)))
1712 1712                                          return (EFAULT);
1713 1713                          }
1714 1714  #ifdef _SYSCALL32_IMPL
1715 1715                          else {
1716 1716                                  /* Translate LP64 to ILP32 lockfs */
1717 1717                                  struct lockfs32 lockfs32;
1718 1718                                  lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
1719 1719                                  lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
1720 1720                                  lockfs32.lf_key = (uint32_t)lockfs.lf_key;
1721 1721                                  lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
1722 1722                                  lockfs32.lf_comment =
1723 1723                                      (uint32_t)(uintptr_t)lockfs.lf_comment;
1724 1724                                  if (copyout(&lockfs32, (caddr_t)arg,
1725 1725                                      sizeof (struct lockfs32)))
1726 1726                                          return (EFAULT);
1727 1727                          }
1728 1728  #endif /* _SYSCALL32_IMPL */
1729 1729  
1730 1730                          if (lockfs.lf_comlen &&
1731 1731                              lockfs.lf_comment && lockfs_out.lf_comment)
1732 1732                                  if (copyout(lockfs_out.lf_comment,
1733 1733                                      lockfs.lf_comment, lockfs.lf_comlen))
1734 1734                                          return (EFAULT);
1735 1735                          return (0);
1736 1736  
1737 1737                  case _FIOSATIME:
1738 1738                          /*
1739 1739                           * set access time
1740 1740                           */
1741 1741  
1742 1742                          /*
1743 1743                           * if mounted w/o atime, return quietly.
1744 1744                           * I briefly thought about returning ENOSYS, but
1745 1745                           * figured that most apps would consider this fatal
1746 1746                           * but the idea is to make this as seamless as poss.
1747 1747                           */
1748 1748                          if (ufsvfsp->vfs_noatime)
1749 1749                                  return (0);
1750 1750  
1751 1751                          error = ufs_lockfs_begin(ufsvfsp, &ulp,
1752 1752                              ULOCKFS_SETATTR_MASK);
1753 1753                          if (error)
1754 1754                                  return (error);
1755 1755  
1756 1756                          if (ulp) {
1757 1757                                  trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
1758 1758                                  TRANS_BEGIN_CSYNC(ufsvfsp, issync,
1759 1759                                      TOP_SETATTR, trans_size);
1760 1760                          }
1761 1761  
1762 1762                          error = ufs_fiosatime(vp, (struct timeval *)arg,
1763 1763                              flag, cr);
1764 1764  
1765 1765                          if (ulp) {
1766 1766                                  TRANS_END_CSYNC(ufsvfsp, error, issync,
1767 1767                                      TOP_SETATTR, trans_size);
1768 1768                                  ufs_lockfs_end(ulp);
1769 1769                          }
1770 1770                          return (error);
1771 1771  
1772 1772                  case _FIOSDIO:
1773 1773                          /*
1774 1774                           * set delayed-io
1775 1775                           */
1776 1776                          return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
1777 1777  
1778 1778                  case _FIOGDIO:
1779 1779                          /*
1780 1780                           * get delayed-io
1781 1781                           */
1782 1782                          return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
1783 1783  
1784 1784                  case _FIOIO:
1785 1785                          /*
1786 1786                           * inode open
1787 1787                           */
1788 1788                          error = ufs_lockfs_begin(ufsvfsp, &ulp,
1789 1789                              ULOCKFS_VGET_MASK);
1790 1790                          if (error)
1791 1791                                  return (error);
1792 1792  
1793 1793                          error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
1794 1794  
1795 1795                          if (ulp) {
1796 1796                                  ufs_lockfs_end(ulp);
1797 1797                          }
1798 1798                          return (error);
1799 1799  
1800 1800                  case _FIOFFS:
1801 1801                          /*
1802 1802                           * file system flush (push w/invalidate)
1803 1803                           */
1804 1804                          if ((caddr_t)arg != NULL)
1805 1805                                  return (EINVAL);
1806 1806                          return (ufs_fioffs(vp, NULL, cr));
1807 1807  
1808 1808                  case _FIOISBUSY:
1809 1809                          /*
1810 1810                           * Contract-private interface for Legato
1811 1811                           * Purge this vnode from the DNLC and decide
1812 1812                           * if this vnode is busy (*arg == 1) or not
1813 1813                           * (*arg == 0)
1814 1814                           */
1815 1815                          if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1816 1816                                  return (EPERM);
1817 1817                          error = ufs_fioisbusy(vp, (int *)arg, cr);
1818 1818                          return (error);
1819 1819  
1820 1820                  case _FIODIRECTIO:
1821 1821                          return (ufs_fiodirectio(vp, (int)arg, cr));
1822 1822  
1823 1823                  case _FIOTUNE:
1824 1824                          /*
1825 1825                           * Tune the file system (aka setting fs attributes)
1826 1826                           */
1827 1827                          error = ufs_lockfs_begin(ufsvfsp, &ulp,
1828 1828                              ULOCKFS_SETATTR_MASK);
1829 1829                          if (error)
1830 1830                                  return (error);
1831 1831  
1832 1832                          error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
1833 1833  
1834 1834                          if (ulp)
1835 1835                                  ufs_lockfs_end(ulp);
1836 1836                          return (error);
1837 1837  
1838 1838                  case _FIOLOGENABLE:
1839 1839                          if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1840 1840                                  return (EPERM);
1841 1841                          return (ufs_fiologenable(vp, (void *)arg, cr, flag));
1842 1842  
1843 1843                  case _FIOLOGDISABLE:
1844 1844                          if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1845 1845                                  return (EPERM);
1846 1846                          return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
1847 1847  
1848 1848                  case _FIOISLOG:
1849 1849                          return (ufs_fioislog(vp, (void *)arg, cr, flag));
1850 1850  
1851 1851                  case _FIOSNAPSHOTCREATE_MULTI:
1852 1852                  {
1853 1853                          struct fiosnapcreate_multi      fc, *fcp;
1854 1854                          size_t  fcm_size;
1855 1855  
1856 1856                          if (copyin((void *)arg, &fc, sizeof (fc)))
1857 1857                                  return (EFAULT);
1858 1858                          if (fc.backfilecount > MAX_BACKFILE_COUNT)
1859 1859                                  return (EINVAL);
1860 1860                          fcm_size = sizeof (struct fiosnapcreate_multi) +
1861 1861                              (fc.backfilecount - 1) * sizeof (int);
1862 1862                          fcp = (struct fiosnapcreate_multi *)
1863 1863                              kmem_alloc(fcm_size, KM_SLEEP);
1864 1864                          if (copyin((void *)arg, fcp, fcm_size)) {
1865 1865                                  kmem_free(fcp, fcm_size);
1866 1866                                  return (EFAULT);
1867 1867                          }
1868 1868                          error = ufs_snap_create(vp, fcp, cr);
1869 1869                          /*
1870 1870                           * Do copyout even if there is an error because
1871 1871                           * the details of the error are stored in fcp.
1872 1872                           */
1873 1873                          if (copyout(fcp, (void *)arg, fcm_size))
1874 1874                                  error = EFAULT;
1875 1875                          kmem_free(fcp, fcm_size);
1876 1876                          return (error);
1877 1877                  }
1878 1878  
1879 1879                  case _FIOSNAPSHOTDELETE:
1880 1880                  {
1881 1881                          struct fiosnapdelete    fc;
1882 1882  
1883 1883                          if (copyin((void *)arg, &fc, sizeof (fc)))
1884 1884                                  return (EFAULT);
1885 1885                          error = ufs_snap_delete(vp, &fc, cr);
1886 1886                          if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
1887 1887                                  error = EFAULT;
1888 1888                          return (error);
1889 1889                  }
1890 1890  
1891 1891                  case _FIOGETSUPERBLOCK:
1892 1892                          if (copyout(fs, (void *)arg, SBSIZE))
1893 1893                                  return (EFAULT);
1894 1894                          return (0);
1895 1895  
1896 1896                  case _FIOGETMAXPHYS:
1897 1897                          if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
1898 1898                                  return (EFAULT);
1899 1899                          return (0);
1900 1900  
1901 1901                  /*
1902 1902                   * The following 3 ioctls are for TSufs support
1903 1903                   * although could potentially be used elsewhere
1904 1904                   */
1905 1905                  case _FIO_SET_LUFS_DEBUG:
1906 1906                          if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1907 1907                                  return (EPERM);
1908 1908                          lufs_debug = (uint32_t)arg;
1909 1909                          return (0);
1910 1910  
1911 1911                  case _FIO_SET_LUFS_ERROR:
1912 1912                          if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1913 1913                                  return (EPERM);
1914 1914                          TRANS_SETERROR(ufsvfsp);
1915 1915                          return (0);
1916 1916  
1917 1917                  case _FIO_GET_TOP_STATS:
1918 1918                  {
1919 1919                          fio_lufs_stats_t *ls;
1920 1920                          ml_unit_t *ul = ufsvfsp->vfs_log;
1921 1921  
1922 1922                          ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1923 1923                          ls->ls_debug = ul->un_debug; /* return debug value */
1924 1924                          /* Copy structure if statistics are being kept */
1925 1925                          if (ul->un_logmap->mtm_tops) {
1926 1926                                  ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1927 1927                          }
1928 1928                          error = 0;
1929 1929                          if (copyout(ls, (void *)arg, sizeof (*ls)))
1930 1930                                  error = EFAULT;
1931 1931                          kmem_free(ls, sizeof (*ls));
1932 1932                          return (error);
1933 1933                  }
1934 1934  
1935 1935                  case _FIO_SEEK_DATA:
1936 1936                  case _FIO_SEEK_HOLE:
1937 1937                          if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1938 1938                                  return (EFAULT);
1939 1939                          /* offset parameter is in/out */
1940 1940                          error = ufs_fio_holey(vp, cmd, &off);
1941 1941                          if (error)
1942 1942                                  return (error);
1943 1943                          if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1944 1944                                  return (EFAULT);
1945 1945                          return (0);
1946 1946  
1947 1947                  case _FIO_COMPRESSED:
1948 1948                  {
1949 1949                          /*
1950 1950                           * This is a project private ufs ioctl() to mark
1951 1951                           * the inode as that belonging to a compressed
1952 1952                           * file. This is used to mark individual
1953 1953                           * compressed files in a miniroot archive.
1954 1954                           * The files compressed in this manner are
1955 1955                           * automatically decompressed by the dcfs filesystem
1956 1956                           * (via an interception in ufs_lookup - see decompvp())
1957 1957                           * which is layered on top of ufs on a system running
1958 1958                           * from the archive. See uts/common/fs/dcfs for details.
1959 1959                           * This ioctl only marks the file as compressed - the
1960 1960                           * actual compression is done by fiocompress (a
1961 1961                           * userland utility) which invokes this ioctl().
1962 1962                           */
1963 1963                          struct inode *ip = VTOI(vp);
1964 1964  
1965 1965                          error = ufs_lockfs_begin(ufsvfsp, &ulp,
1966 1966                              ULOCKFS_SETATTR_MASK);
1967 1967                          if (error)
1968 1968                                  return (error);
1969 1969  
1970 1970                          if (ulp) {
1971 1971                                  TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT,
1972 1972                                      TOP_IUPDAT_SIZE(ip));
1973 1973                          }
1974 1974  
1975 1975                          error = ufs_mark_compressed(vp);
1976 1976  
1977 1977                          if (ulp) {
1978 1978                                  TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT,
1979 1979                                      TOP_IUPDAT_SIZE(ip));
1980 1980                                  ufs_lockfs_end(ulp);
1981 1981                          }
1982 1982  
1983 1983                          return (error);
1984 1984  
1985 1985                  }

↓ open down ↓

1540 lines elided

↑ open up ↑

1986 1986  
1987 1987                  default:
1988 1988                          return (ENOTTY);
1989 1989          }
1990 1990  }
1991 1991  
1992 1992  
1993 1993  /* ARGSUSED */
1994 1994  static int
1995 1995  ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1996      -        struct cred *cr, caller_context_t *ct)
     1996 +    struct cred *cr, caller_context_t *ct)
1997 1997  {
1998 1998          struct inode *ip = VTOI(vp);
1999 1999          struct ufsvfs *ufsvfsp;
2000 2000          int err;
2001 2001  
2002 2002          if (vap->va_mask == AT_SIZE) {
2003 2003                  /*
2004 2004                   * for performance, if only the size is requested don't bother
2005 2005                   * with anything else.
2006 2006                   */

2007 2007                  UFS_GET_ISIZE(&vap->va_size, ip);
2008 2008                  return (0);
2009 2009          }
2010 2010  
2011 2011          /*
2012 2012           * inlined lockfs checks
2013 2013           */
2014 2014          ufsvfsp = ip->i_ufsvfs;
2015 2015          if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
2016 2016                  err = EIO;
2017 2017                  goto out;
2018 2018          }
2019 2019  
2020 2020          rw_enter(&ip->i_contents, RW_READER);
2021 2021          /*
2022 2022           * Return all the attributes.  This should be refined so
2023 2023           * that it only returns what's asked for.
2024 2024           */
2025 2025  
2026 2026          /*
2027 2027           * Copy from inode table.
2028 2028           */
2029 2029          vap->va_type = vp->v_type;
2030 2030          vap->va_mode = ip->i_mode & MODEMASK;
2031 2031          /*
2032 2032           * If there is an ACL and there is a mask entry, then do the
2033 2033           * extra work that completes the equivalent of an acltomode(3)
2034 2034           * call.  According to POSIX P1003.1e, the acl mask should be
2035 2035           * returned in the group permissions field.
2036 2036           *
2037 2037           * - start with the original permission and mode bits (from above)
2038 2038           * - clear the group owner bits
2039 2039           * - add in the mask bits.
2040 2040           */
2041 2041          if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
2042 2042                  vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
2043 2043                  vap->va_mode |=
2044 2044                      (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
2045 2045          }
2046 2046          vap->va_uid = ip->i_uid;
2047 2047          vap->va_gid = ip->i_gid;
2048 2048          vap->va_fsid = ip->i_dev;
2049 2049          vap->va_nodeid = (ino64_t)ip->i_number;
2050 2050          vap->va_nlink = ip->i_nlink;
2051 2051          vap->va_size = ip->i_size;
2052 2052          if (vp->v_type == VCHR || vp->v_type == VBLK)
2053 2053                  vap->va_rdev = ip->i_rdev;
2054 2054          else
2055 2055                  vap->va_rdev = 0;       /* not a b/c spec. */
2056 2056          mutex_enter(&ip->i_tlock);
2057 2057          ITIMES_NOLOCK(ip);      /* mark correct time in inode */
2058 2058          vap->va_seq = ip->i_seq;
2059 2059          vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
2060 2060          vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
2061 2061          vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
2062 2062          vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
2063 2063          vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
2064 2064          vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
2065 2065          mutex_exit(&ip->i_tlock);
2066 2066  
2067 2067          switch (ip->i_mode & IFMT) {
2068 2068  
2069 2069          case IFBLK:
2070 2070                  vap->va_blksize = MAXBSIZE;             /* was BLKDEV_IOSIZE */
2071 2071                  break;
2072 2072  
2073 2073          case IFCHR:
2074 2074                  vap->va_blksize = MAXBSIZE;
2075 2075                  break;
2076 2076  
2077 2077          default:
2078 2078                  vap->va_blksize = ip->i_fs->fs_bsize;
2079 2079                  break;
2080 2080          }
2081 2081          vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
2082 2082          rw_exit(&ip->i_contents);
2083 2083          err = 0;
2084 2084  
2085 2085  out:
2086 2086          return (err);
2087 2087  }
2088 2088  
2089 2089  /*
2090 2090   * Special wrapper to provide a callback for secpolicy_vnode_setattr().
2091 2091   * The i_contents lock is already held by the caller and we need to
2092 2092   * declare the inode as 'void *' argument.
2093 2093   */

↓ open down ↓

87 lines elided

↑ open up ↑

2094 2094  static int
2095 2095  ufs_priv_access(void *vip, int mode, struct cred *cr)
2096 2096  {
2097 2097          struct inode *ip = vip;
2098 2098  
2099 2099          return (ufs_iaccess(ip, mode, cr, 0));
2100 2100  }
2101 2101  
2102 2102  /*ARGSUSED4*/
2103 2103  static int
2104      -ufs_setattr(
2105      -        struct vnode *vp,
2106      -        struct vattr *vap,
2107      -        int flags,
2108      -        struct cred *cr,
2109      -        caller_context_t *ct)
     2104 +ufs_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
     2105 +    caller_context_t *ct)
2110 2106  {
2111 2107          struct inode *ip = VTOI(vp);
2112 2108          struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2113 2109          struct fs *fs;
2114 2110          struct ulockfs *ulp;
2115 2111          char *errmsg1;
2116 2112          char *errmsg2;
2117 2113          long blocks;
2118 2114          long int mask = vap->va_mask;
2119 2115          size_t len1, len2;

2120 2116          int issync;
2121 2117          int trans_size;
2122 2118          int dotrans;
2123 2119          int dorwlock;
2124 2120          int error;
2125 2121          int owner_change;
2126 2122          int dodqlock;
2127 2123          timestruc_t now;
2128 2124          vattr_t oldva;
2129 2125          int retry = 1;
2130 2126          int indeadlock;
2131 2127  
2132 2128          /*
2133 2129           * Cannot set these attributes.
2134 2130           */
2135 2131          if ((mask & AT_NOSET) || (mask & AT_XVATTR))
2136 2132                  return (EINVAL);
2137 2133  
2138 2134          /*
2139 2135           * check for forced unmount
2140 2136           */
2141 2137          if (ufsvfsp == NULL)
2142 2138                  return (EIO);
2143 2139  
2144 2140          fs = ufsvfsp->vfs_fs;
2145 2141          if (fs->fs_ronly != 0)
2146 2142                  return (EROFS);
2147 2143  
2148 2144  again:
2149 2145          errmsg1 = NULL;
2150 2146          errmsg2 = NULL;
2151 2147          dotrans = 0;
2152 2148          dorwlock = 0;
2153 2149          dodqlock = 0;
2154 2150  
2155 2151          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
2156 2152          if (error)
2157 2153                  goto out;
2158 2154  
2159 2155          /*
2160 2156           * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
2161 2157           * This follows the protocol for read()/write().
2162 2158           */
2163 2159          if (vp->v_type != VDIR) {
2164 2160                  /*
2165 2161                   * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
2166 2162                   * avoid an i_rwlock/ufs_lockfs_begin deadlock. If a deadlock
2167 2163                   * is possible, the operation is retried.
2168 2164                   */
2169 2165                  ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_file);
2170 2166                  if (indeadlock) {
2171 2167                          if (ulp)
2172 2168                                  ufs_lockfs_end(ulp);
2173 2169                          goto again;
2174 2170                  }
2175 2171                  dorwlock = 1;
2176 2172          }
2177 2173  
2178 2174          /*
2179 2175           * Truncate file.  Must have write permission and not be a directory.
2180 2176           */
2181 2177          if (mask & AT_SIZE) {
2182 2178                  rw_enter(&ip->i_contents, RW_WRITER);
2183 2179                  if (vp->v_type == VDIR) {
2184 2180                          error = EISDIR;
2185 2181                          goto update_inode;
2186 2182                  }
2187 2183                  if (error = ufs_iaccess(ip, IWRITE, cr, 0))
2188 2184                          goto update_inode;
2189 2185  
2190 2186                  rw_exit(&ip->i_contents);
2191 2187                  error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
2192 2188                  if (error) {
2193 2189                          rw_enter(&ip->i_contents, RW_WRITER);
2194 2190                          goto update_inode;
2195 2191                  }
2196 2192  
2197 2193                  if (error == 0 && vap->va_size)
2198 2194                          vnevent_truncate(vp, ct);
2199 2195          }
2200 2196  
2201 2197          if (ulp) {
2202 2198                  trans_size = (int)TOP_SETATTR_SIZE(ip);
2203 2199                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size);
2204 2200                  ++dotrans;
2205 2201          }
2206 2202  
2207 2203          /*
2208 2204           * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
2209 2205           * This follows the protocol established by
2210 2206           * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
2211 2207           */
2212 2208          if (vp->v_type == VDIR) {
2213 2209                  ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_SETATTR,
2214 2210                      retry_dir);
2215 2211                  if (indeadlock)
2216 2212                          goto again;
2217 2213                  dorwlock = 1;
2218 2214          }
2219 2215  
2220 2216          /*
2221 2217           * Grab quota lock if we are changing the file's owner.
2222 2218           */
2223 2219          if (mask & AT_UID) {
2224 2220                  rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2225 2221                  dodqlock = 1;
2226 2222          }
2227 2223          rw_enter(&ip->i_contents, RW_WRITER);
2228 2224  
2229 2225          oldva.va_mode = ip->i_mode;
2230 2226          oldva.va_uid = ip->i_uid;
2231 2227          oldva.va_gid = ip->i_gid;
2232 2228  
2233 2229          vap->va_mask &= ~AT_SIZE;
2234 2230  
2235 2231          error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2236 2232              ufs_priv_access, ip);
2237 2233          if (error)
2238 2234                  goto update_inode;
2239 2235  
2240 2236          mask = vap->va_mask;
2241 2237  
2242 2238          /*
2243 2239           * Change file access modes.
2244 2240           */
2245 2241          if (mask & AT_MODE) {
2246 2242                  ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
2247 2243                  TRANS_INODE(ufsvfsp, ip);
2248 2244                  ip->i_flag |= ICHG;
2249 2245                  if (stickyhack) {
2250 2246                          mutex_enter(&vp->v_lock);
2251 2247                          if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
2252 2248                                  vp->v_flag |= VSWAPLIKE;
2253 2249                          else
2254 2250                                  vp->v_flag &= ~VSWAPLIKE;
2255 2251                          mutex_exit(&vp->v_lock);
2256 2252                  }
2257 2253          }
2258 2254          if (mask & (AT_UID|AT_GID)) {
2259 2255                  if (mask & AT_UID) {
2260 2256                          /*
2261 2257                           * Don't change ownership of the quota inode.
2262 2258                           */
2263 2259                          if (ufsvfsp->vfs_qinod == ip) {
2264 2260                                  ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
2265 2261                                  error = EINVAL;
2266 2262                                  goto update_inode;
2267 2263                          }
2268 2264  
2269 2265                          /*
2270 2266                           * No real ownership change.
2271 2267                           */
2272 2268                          if (ip->i_uid == vap->va_uid) {
2273 2269                                  blocks = 0;
2274 2270                                  owner_change = 0;
2275 2271                          }
2276 2272                          /*
2277 2273                           * Remove the blocks and the file, from the old user's
2278 2274                           * quota.
2279 2275                           */
2280 2276                          else {
2281 2277                                  blocks = ip->i_blocks;
2282 2278                                  owner_change = 1;
2283 2279  
2284 2280                                  (void) chkdq(ip, -blocks, /* force */ 1, cr,
2285 2281                                      (char **)NULL, (size_t *)NULL);
2286 2282                                  (void) chkiq(ufsvfsp, /* change */ -1, ip,
2287 2283                                      (uid_t)ip->i_uid, /* force */ 1, cr,
2288 2284                                      (char **)NULL, (size_t *)NULL);
2289 2285                                  dqrele(ip->i_dquot);
2290 2286                          }
2291 2287  
2292 2288                          ip->i_uid = vap->va_uid;
2293 2289  
2294 2290                          /*
2295 2291                           * There is a real ownership change.
2296 2292                           */
2297 2293                          if (owner_change) {
2298 2294                                  /*
2299 2295                                   * Add the blocks and the file to the new
2300 2296                                   * user's quota.
2301 2297                                   */
2302 2298                                  ip->i_dquot = getinoquota(ip);
2303 2299                                  (void) chkdq(ip, blocks, /* force */ 1, cr,
2304 2300                                      &errmsg1, &len1);
2305 2301                                  (void) chkiq(ufsvfsp, /* change */ 1,
2306 2302                                      (struct inode *)NULL, (uid_t)ip->i_uid,
2307 2303                                      /* force */ 1, cr, &errmsg2, &len2);
2308 2304                          }
2309 2305                  }
2310 2306                  if (mask & AT_GID) {
2311 2307                          ip->i_gid = vap->va_gid;
2312 2308                  }
2313 2309                  TRANS_INODE(ufsvfsp, ip);
2314 2310                  ip->i_flag |= ICHG;
2315 2311          }
2316 2312          /*
2317 2313           * Change file access or modified times.
2318 2314           */
2319 2315          if (mask & (AT_ATIME|AT_MTIME)) {
2320 2316                  /* Check that the time value is within ufs range */
2321 2317                  if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2322 2318                      ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2323 2319                          error = EOVERFLOW;
2324 2320                          goto update_inode;
2325 2321                  }
2326 2322  
2327 2323                  /*
2328 2324                   * If the "noatime" mount option is set and only an atime
2329 2325                   * update is requested, do nothing. No error is returned.
2330 2326                   */
2331 2327                  if ((ufsvfsp->vfs_noatime) &&
2332 2328                      ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
2333 2329                          goto skip_atime;
2334 2330  
2335 2331                  if (mask & AT_ATIME) {
2336 2332                          ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2337 2333                          ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2338 2334                          ip->i_flag &= ~IACC;
2339 2335                  }
2340 2336                  if (mask & AT_MTIME) {
2341 2337                          ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2342 2338                          ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2343 2339                          gethrestime(&now);
2344 2340                          if (now.tv_sec > TIME32_MAX) {
2345 2341                                  /*
2346 2342                                   * In 2038, ctime sticks forever..
2347 2343                                   */
2348 2344                                  ip->i_ctime.tv_sec = TIME32_MAX;
2349 2345                                  ip->i_ctime.tv_usec = 0;
2350 2346                          } else {
2351 2347                                  ip->i_ctime.tv_sec = now.tv_sec;
2352 2348                                  ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2353 2349                          }
2354 2350                          ip->i_flag &= ~(IUPD|ICHG);
2355 2351                          ip->i_flag |= IMODTIME;
2356 2352                  }
2357 2353                  TRANS_INODE(ufsvfsp, ip);
2358 2354                  ip->i_flag |= IMOD;
2359 2355          }
2360 2356  
2361 2357  skip_atime:
2362 2358          /*
2363 2359           * The presence of a shadow inode may indicate an ACL, but does
2364 2360           * not imply an ACL.  Future FSD types should be handled here too
2365 2361           * and check for the presence of the attribute-specific data
2366 2362           * before referencing it.
2367 2363           */
2368 2364          if (ip->i_shadow) {
2369 2365                  /*
2370 2366                   * XXX if ufs_iupdat is changed to sandbagged write fix
2371 2367                   * ufs_acl_setattr to push ip to keep acls consistent
2372 2368                   *
2373 2369                   * Suppress out of inodes messages if we will retry.
2374 2370                   */
2375 2371                  if (retry)
2376 2372                          ip->i_flag |= IQUIET;
2377 2373                  error = ufs_acl_setattr(ip, vap, cr);
2378 2374                  ip->i_flag &= ~IQUIET;
2379 2375          }
2380 2376  
2381 2377  update_inode:
2382 2378          /*
2383 2379           * Setattr always increases the sequence number
2384 2380           */
2385 2381          ip->i_seq++;
2386 2382  
2387 2383          /*
2388 2384           * if nfsd and not logging; push synchronously
2389 2385           */
2390 2386          if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2391 2387                  ufs_iupdat(ip, 1);
2392 2388          } else {
2393 2389                  ITIMES_NOLOCK(ip);
2394 2390          }
2395 2391  
2396 2392          rw_exit(&ip->i_contents);
2397 2393          if (dodqlock) {
2398 2394                  rw_exit(&ufsvfsp->vfs_dqrwlock);
2399 2395          }
2400 2396          if (dorwlock)
2401 2397                  rw_exit(&ip->i_rwlock);
2402 2398  
2403 2399          if (ulp) {
2404 2400                  if (dotrans) {
2405 2401                          int terr = 0;
2406 2402                          TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR,
2407 2403                              trans_size);
2408 2404                          if (error == 0)
2409 2405                                  error = terr;
2410 2406                  }
2411 2407                  ufs_lockfs_end(ulp);
2412 2408          }
2413 2409  out:
2414 2410          /*
2415 2411           * If out of inodes or blocks, see if we can free something
2416 2412           * up from the delete queue.
2417 2413           */
2418 2414          if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2419 2415                  ufs_delete_drain_wait(ufsvfsp, 1);
2420 2416                  retry = 0;
2421 2417                  if (errmsg1 != NULL)
2422 2418                          kmem_free(errmsg1, len1);
2423 2419                  if (errmsg2 != NULL)
2424 2420                          kmem_free(errmsg2, len2);
2425 2421                  goto again;
2426 2422          }
2427 2423          if (errmsg1 != NULL) {
2428 2424                  uprintf(errmsg1);
2429 2425                  kmem_free(errmsg1, len1);
2430 2426          }

↓ open down ↓

311 lines elided

↑ open up ↑

2431 2427          if (errmsg2 != NULL) {
2432 2428                  uprintf(errmsg2);
2433 2429                  kmem_free(errmsg2, len2);
2434 2430          }
2435 2431          return (error);
2436 2432  }
2437 2433  
2438 2434  /*ARGSUSED*/
2439 2435  static int
2440 2436  ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
2441      -        caller_context_t *ct)
     2437 +    caller_context_t *ct)
2442 2438  {
2443 2439          struct inode *ip = VTOI(vp);
2444 2440  
2445 2441          if (ip->i_ufsvfs == NULL)
2446 2442                  return (EIO);
2447 2443  
2448 2444          /*
2449 2445           * The ufs_iaccess function wants to be called with
2450 2446           * mode bits expressed as "ufs specific" bits.
2451 2447           * I.e., VWRITE|VREAD|VEXEC do not make sense to

2452 2448           * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
2453 2449           * Since they're the same, we just pass the vnode mode
2454 2450           * bits through and verify that assumption at compile time.

↓ open down ↓

3 lines elided

↑ open up ↑

2455 2451           */
2456 2452  #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
2457 2453  #error "ufs_access needs to map Vmodes to Imodes"
2458 2454  #endif
2459 2455          return (ufs_iaccess(ip, mode, cr, 1));
2460 2456  }
2461 2457  
2462 2458  /* ARGSUSED */
           /*
            * ufs_readlink - copy the contents of symbolic link vp into uiop.
            *
            * Fails with EINVAL when vp is not a VLNK and returns 0 at once for
            * a zero-length link.  Otherwise, inside a ufs_lockfs_begin()/
            * ufs_lockfs_end() pair, the link is either copied directly out of
            * &ip->i_db[1] (IFASTSYMLNK already set) or read with rdip().  In the
            * rdip() case a completely read link of at most FSL_SIZE bytes is
            * also copied into &ip->i_db[1] and IFASTSYMLNK is set, so that later
            * calls can take the direct path.
            *
            * Returns 0 or the error from ufs_lockfs_begin(), rdip(), kcopy() or
            * uiomove().
            */
2463 2459  static int
2464 2460  ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr,
2465      -        caller_context_t *ct)
     2461 +    caller_context_t *ct)
2466 2462  {
2467 2463          struct inode *ip = VTOI(vp);
2468 2464          struct ufsvfs *ufsvfsp;
2469 2465          struct ulockfs *ulp;
2470 2466          int error;
2471 2467          int fastsymlink;
2472 2468  
2473 2469          if (vp->v_type != VLNK) {
2474 2470                  error = EINVAL;
2475 2471                  goto nolockout;

2476 2472          }
2477 2473  
2478 2474          /*
2479 2475           * If the symbolic link is empty there is nothing to read.
2480 2476           * Fast-track these empty symbolic links
2481 2477           */
2482 2478          if (ip->i_size == 0) {
2483 2479                  error = 0;
2484 2480                  goto nolockout;
2485 2481          }
2486 2482  
2487 2483          ufsvfsp = ip->i_ufsvfs;
2488 2484          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
2489 2485          if (error)
2490 2486                  goto nolockout;
2491 2487          /*
2492 2488           * The ip->i_rwlock protects the data blocks used for FASTSYMLINK
2493 2489           */
2494 2490  again:
2495 2491          fastsymlink = 0;
                   /*
                    * Fast path: IFASTSYMLNK is tested once without locks and then
                    * again after both locks are held as reader.
                    */
2496 2492          if (ip->i_flag & IFASTSYMLNK) {
2497 2493                  rw_enter(&ip->i_rwlock, RW_READER);
2498 2494                  rw_enter(&ip->i_contents, RW_READER);
2499 2495                  if (ip->i_flag & IFASTSYMLNK) {
                                   /*
                                    * Set IACC (under i_tlock) unless
                                    * ULOCKFS_IS_NOIACC() holds, fs_ronly is
                                    * set, or vfs_noatime is set.
                                    */
2500 2496                          if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
2501 2497                              (ip->i_fs->fs_ronly == 0) &&
2502 2498                              (!ufsvfsp->vfs_noatime)) {
2503 2499                                  mutex_enter(&ip->i_tlock);
2504 2500                                  ip->i_flag |= IACC;
2505 2501                                  mutex_exit(&ip->i_tlock);
2506 2502                          }
2507 2503                          error = uiomove((caddr_t)&ip->i_db[1],
2508 2504                              MIN(ip->i_size, uiop->uio_resid),
2509 2505                              UIO_READ, uiop);
2510 2506                          ITIMES(ip);
2511 2507                          ++fastsymlink;
2512 2508                  }
2513 2509                  rw_exit(&ip->i_contents);
2514 2510                  rw_exit(&ip->i_rwlock);
2515 2511          }
2516 2512          if (!fastsymlink) {
2517 2513                  ssize_t size;   /* number of bytes read  */
2518 2514                  caddr_t basep;  /* pointer to input data */
2519 2515                  ino_t ino;
2520 2516                  long  igen;
2521 2517                  struct uio tuio;        /* temp uio struct */
2522 2518                  struct uio *tuiop;
2523 2519                  iovec_t tiov;           /* temp iovec struct */
2524 2520                  char kbuf[FSL_SIZE];    /* buffer to hold fast symlink */
2525 2521                  int tflag = 0;          /* flag to indicate temp vars used */
2526 2522  
2527 2523                  ino = ip->i_number;
2528 2524                  igen = ip->i_gen;
2529 2525                  size = uiop->uio_resid;
2530 2526                  basep = uiop->uio_iov->iov_base;
2531 2527                  tuiop = uiop;
2532 2528  
                           /*
                            * The slow path takes both locks as writer because it may
                            * fill ip->i_db[] and set IFASTSYMLNK.  If the flag is
                            * already set once the locks are held, drop them and
                            * restart so the fast path above serves the request.
                            */
2533 2529                  rw_enter(&ip->i_rwlock, RW_WRITER);
2534 2530                  rw_enter(&ip->i_contents, RW_WRITER);
2535 2531                  if (ip->i_flag & IFASTSYMLNK) {
2536 2532                          rw_exit(&ip->i_contents);
2537 2533                          rw_exit(&ip->i_rwlock);
2538 2534                          goto again;
2539 2535                  }
2540 2536  
2541 2537                  /* can this be a fast symlink and is it a user buffer? */
2542 2538                  if (ip->i_size <= FSL_SIZE &&
2543 2539                      (uiop->uio_segflg == UIO_USERSPACE ||
2544 2540                      uiop->uio_segflg == UIO_USERISPACE)) {
2545 2541  
2546 2542                          bzero(&tuio, sizeof (struct uio));
2547 2543                          /*
2548 2544                           * setup a kernel buffer to read link into.  this
2549 2545                           * is to fix a race condition where the user buffer
2550 2546                           * got corrupted before copying it into the inode.
2551 2547                           */
2552 2548                          size = ip->i_size;
2553 2549                          tiov.iov_len = size;
2554 2550                          tiov.iov_base = kbuf;
2555 2551                          tuio.uio_iov = &tiov;
2556 2552                          tuio.uio_iovcnt = 1;
2557 2553                          tuio.uio_offset = uiop->uio_offset;
2558 2554                          tuio.uio_segflg = UIO_SYSSPACE;
2559 2555                          tuio.uio_fmode = uiop->uio_fmode;
2560 2556                          tuio.uio_extflg = uiop->uio_extflg;
2561 2557                          tuio.uio_limit = uiop->uio_limit;
2562 2558                          tuio.uio_resid = size;
2563 2559  
2564 2560                          basep = tuio.uio_iov->iov_base;
2565 2561                          tuiop = &tuio;
2566 2562                          tflag = 1;
2567 2563                  }
2568 2564  
2569 2565                  error = rdip(ip, tuiop, 0, cr);
                           /*
                            * Give up, returning rdip()'s result (possibly 0), if the
                            * read failed or ip->i_number / ip->i_gen no longer match
                            * the values sampled before the locks were taken.
                            */
2570 2566                  if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
2571 2567                          rw_exit(&ip->i_contents);
2572 2568                          rw_exit(&ip->i_rwlock);
2573 2569                          goto out;
2574 2570                  }
2575 2571  
                           /*
                            * When rdip() used the caller's uio directly, turn size
                            * (the resid before the read) into the bytes transferred.
                            */
2576 2572                  if (tflag == 0)
2577 2573                          size -= uiop->uio_resid;
2578 2574  
                           /*
                            * Cache the link in the inode only when all of it was read:
                            * either directly (size equals i_size, at most FSL_SIZE)
                            * or via kbuf (temp uio fully consumed).
                            */
2579 2575                  if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
2580 2576                      ip->i_size == size) || (tflag == 1 &&
2581 2577                      tuio.uio_resid == 0)) {
2582 2578                          error = kcopy(basep, &ip->i_db[1], ip->i_size);
2583 2579                          if (error == 0) {
2584 2580                                  ip->i_flag |= IFASTSYMLNK;
2585 2581                                  /*
2586 2582                                   * free page
2587 2583                                   */
2588 2584                                  (void) VOP_PUTPAGE(ITOV(ip),
2589 2585                                      (offset_t)0, PAGESIZE,
2590 2586                                      (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
2591 2587                                      cr, ct);
2592 2588                          } else {
2593 2589                                  int i;
2594 2590                                  /* error, clear garbage left behind */
2595 2591                                  for (i = 1; i < NDADDR; i++)
2596 2592                                          ip->i_db[i] = 0;
2597 2593                                  for (i = 0; i < NIADDR; i++)
2598 2594                                          ip->i_ib[i] = 0;
2599 2595                          }
2600 2596                  }
2601 2597                  if (tflag == 1) {
2602 2598                          /* now, copy it into the user buffer */
                                   /*
                                    * NOTE(review): this assignment replaces any
                                    * kcopy() error stored in error just above.
                                    */
2603 2599                          error = uiomove((caddr_t)kbuf,
2604 2600                              MIN(size, uiop->uio_resid),
2605 2601                              UIO_READ, uiop);
2606 2602                  }
2607 2603                  rw_exit(&ip->i_contents);
2608 2604                  rw_exit(&ip->i_rwlock);
2609 2605          }

↓ open down ↓

134 lines elided

↑ open up ↑

2610 2606  out:
                   /* Reached only after ufs_lockfs_begin() succeeded. */
2611 2607          if (ulp) {
2612 2608                  ufs_lockfs_end(ulp);
2613 2609          }
2614 2610  nolockout:
2615 2611          return (error);
2616 2612  }
2617 2613  
2618 2614  /* ARGSUSED */
           /*
            * ufs_fsync - synchronize vp's cached data and inode, inside a
            * ufs_lockfs_begin()/ufs_lockfs_end() pair.
            *
            * syncflag bits examined here:
            *   FNODSYNC - inode only; the data pages are not pushed.
            *   FDSYNC   - data-synchronous; on the logging path the delayed
            *              inode time update is skipped.
            * The work differs depending on TRANS_ISTRANS(ufsvfsp).  cr is not
            * referenced; the VOP_PUTPAGE() call uses CRED().
            *
            * Returns 0 or the error from ufs_lockfs_begin(), VOP_PUTPAGE(),
            * TRANS_SYNCIP(), ufs_sync_indir(), or whatever TRANS_END_SYNC()
            * leaves in error.  An error from TRANS_BEGIN_SYNC() is deliberately
            * converted to 0.
            */
2619 2615  static int
2620      -ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr,
2621      -        caller_context_t *ct)
     2616 +ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr, caller_context_t *ct)
2622 2617  {
2623 2618          struct inode *ip = VTOI(vp);
2624 2619          struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2625 2620          struct ulockfs *ulp;
2626 2621          int error;
2627 2622  
2628 2623          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
2629 2624          if (error)
2630 2625                  return (error);
2631 2626

2632 2627          if (TRANS_ISTRANS(ufsvfsp)) {
2633 2628                  /*
2634 2629                   * First push out any data pages
2635 2630                   */
2636 2631                  if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2637 2632                      (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
2638 2633                          error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
2639 2634                              0, CRED(), ct);
2640 2635                          if (error)
2641 2636                                  goto out;
2642 2637                  }
2643 2638  
2644 2639                  /*
2645 2640                   * Delta any delayed inode times updates
2646 2641                   * and push inode to log.
2647 2642                   * All other inode deltas will have already been delta'd
2648 2643                   * and will be pushed during the commit.
2649 2644                   */
                           /*
                            * The flag test is true only when IMODACC is set and IMOD
                            * is clear.
                            */
2650 2645                  if (!(syncflag & FDSYNC) &&
2651 2646                      ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
2652 2647                          if (ulp) {
2653 2648                                  TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
2654 2649                                      TOP_SYNCIP_SIZE);
2655 2650                          }
2656 2651                          rw_enter(&ip->i_contents, RW_READER);
2657 2652                          mutex_enter(&ip->i_tlock);
2658 2653                          ip->i_flag &= ~IMODTIME;
2659 2654                          mutex_exit(&ip->i_tlock);
2660 2655                          ufs_iupdat(ip, I_SYNC);
2661 2656                          rw_exit(&ip->i_contents);
2662 2657                          if (ulp) {
2663 2658                                  TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
2664 2659                                      TOP_SYNCIP_SIZE);
2665 2660                          }
2666 2661                  }
2667 2662  
2668 2663                  /*
2669 2664                   * Commit the Moby transaction
2670 2665                   *
2671 2666                   * Deltas have already been made so we just need to
2672 2667                   * commit them with a synchronous transaction.
2673 2668                   * TRANS_BEGIN_SYNC() will return an error
2674 2669                   * if there are no deltas to commit, for an
2675 2670                   * empty transaction.
2676 2671                   */
2677 2672                  if (ulp) {
2678 2673                          TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
2679 2674                              error);
2680 2675                          if (error) {
2681 2676                                  error = 0; /* commit wasn't needed */
2682 2677                                  goto out;
2683 2678                          }
2684 2679                          TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC,
2685 2680                              TOP_COMMIT_SIZE);
2686 2681                  }
2687 2682          } else {        /* not logging */
                           /*
                            * No braces: this if governs only the if / else-if / else
                            * chain that follows.  The ufs_sync_indir() step after
                            * the chain runs for swap vnodes as well.
                            */
2688 2683                  if (!(IS_SWAPVP(vp)))
2689 2684                          if (syncflag & FNODSYNC) {
2690 2685                                  /* Just update the inode only */
2691 2686                                  TRANS_IUPDAT(ip, 1);
2692 2687                                  error = 0;
2693 2688                          } else if (syncflag & FDSYNC)
2694 2689                                  /* Do data-synchronous writes */
2695 2690                                  error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
2696 2691                          else
2697 2692                                  /* Do synchronous writes */
2698 2693                                  error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
2699 2694  
                           /* ufs_sync_indir() is skipped when an error is pending. */
2700 2695                  rw_enter(&ip->i_contents, RW_WRITER);
2701 2696                  if (!error)
2702 2697                          error = ufs_sync_indir(ip);
2703 2698                  rw_exit(&ip->i_contents);
2704 2699          }
2705 2700  out:
2706 2701          if (ulp) {
2707 2702                  ufs_lockfs_end(ulp);
2708 2703          }
2709 2704          return (error);
2710 2705  }
2711 2706  
2712 2707  /*ARGSUSED*/
           /*
            * ufs_inactive - pass the inode behind vp to ufs_iinactive().
            * cr and ct are not referenced.
            */
2713 2708  static void
2714 2709  ufs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
2715 2710  {

↓ open down ↓

84 lines elided

↑ open up ↑

2716 2711          ufs_iinactive(VTOI(vp));
2717 2712  }
2718 2713  
2719 2714  /*
2720 2715   * Unix file system operations having to do with directory manipulation.
2721 2716   */
2722 2717  int ufs_lookup_idle_count = 2;  /* Number of inodes to idle each time */
2723 2718  /* ARGSUSED */
           /*
            * ufs_lookup - look up component nm in directory dvp and return the
            * resulting vnode in *vpp.
            *
            * Cases, in the order they are tested:
            *   - LOOKUP_XATTR set in flags: return dvp's extended attribute
            *     directory after an IEXEC check on it.  EINVAL if the mount lacks
            *     VFS_XATTR or dvp itself has IXATTR set.
            *   - empty nm: return dvp (held) with no access check.
            *   - ".": return dvp (held) after an IEXEC check.
            *   - dnlc hit: IEXEC check on dvp; ENOENT for DNLC_NO_VNODE.
            *   - otherwise: ufs_diraccess() and ufs_dirlook() inside a
            *     ufs_lockfs_begin()/ufs_lockfs_end() pair, restarted whenever
            *     the result is EAGAIN.
            * On the last two paths a device vnode is replaced by the vnode from
            * specvp() and an ICOMPRESS inode by the vnode from decompvp(); the
            * result is ENOSYS if either returns NULL.
            *
            * pnp, rdir, direntflags and realpnp are not referenced.
            * Returns 0 on success, otherwise an errno value.
            */
2724 2719  static int
2725 2720  ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
2726      -        struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
2727      -        caller_context_t *ct, int *direntflags, pathname_t *realpnp)
     2721 +    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
     2722 +    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
2728 2723  {
2729 2724          struct inode *ip;
2730 2725          struct inode *sip;
2731 2726          struct inode *xip;
2732 2727          struct ufsvfs *ufsvfsp;
2733 2728          struct ulockfs *ulp;
2734 2729          struct vnode *vp;
2735 2730          int error;
2736 2731  
2737 2732          /*

2738 2733           * Check flags for type of lookup (regular file or attribute file)
2739 2734           */
2740 2735  
2741 2736          ip = VTOI(dvp);
2742 2737  
2743 2738          if (flags & LOOKUP_XATTR) {
2744 2739  
2745 2740                  /*
2746 2741                   * If not mounted with XATTR support then return EINVAL
2747 2742                   */
2748 2743  
2749 2744                  if (!(ip->i_ufsvfs->vfs_vfs->vfs_flag & VFS_XATTR))
2750 2745                          return (EINVAL);
2751 2746                  /*
2752 2747                   * We don't allow recursive attributes...
2753 2748                   * Maybe someday we will.
2754 2749                   */
2755 2750                  if ((ip->i_cflags & IXATTR)) {
2756 2751                          return (EINVAL);
2757 2752                  }
2758 2753  
                           /*
                            * On a dnlc miss, fetch the attribute directory with
                            * ufs_xattr_getattrdir() and enter it in the dnlc.
                            */
2759 2754                  if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
2760 2755                          error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
2761 2756                          if (error) {
2762 2757                                  *vpp = NULL;
2763 2758                                  goto out;
2764 2759                          }
2765 2760  
2766 2761                          vp = ITOV(sip);
2767 2762                          dnlc_update(dvp, XATTR_DIR_NAME, vp);
2768 2763                  }
2769 2764  
2770 2765                  /*
2771 2766                   * Check accessibility of directory.
2772 2767                   */
2773 2768                  if (vp == DNLC_NO_VNODE) {
2774 2769                          VN_RELE(vp);
2775 2770                          error = ENOENT;
2776 2771                          goto out;
2777 2772                  }
2778 2773                  if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr, 1)) != 0) {
2779 2774                          VN_RELE(vp);
2780 2775                          goto out;
2781 2776                  }
2782 2777  
2783 2778                  *vpp = vp;
2784 2779                  return (0);
2785 2780          }
2786 2781  
2787 2782          /*
2788 2783           * Check for a null component, which we should treat as
2789 2784           * looking at dvp from within its parent, so we don't
2790 2785           * need a call to ufs_iaccess(), as it has already been
2791 2786           * done.
2792 2787           */
2793 2788          if (nm[0] == 0) {
2794 2789                  VN_HOLD(dvp);
2795 2790                  error = 0;
2796 2791                  *vpp = dvp;
2797 2792                  goto out;
2798 2793          }
2799 2794  
2800 2795          /*
2801 2796           * Check for "." ie itself. this is a quick check and
2802 2797           * avoids adding "." into the dnlc (which have been seen
2803 2798           * to occupy >10% of the cache).
2804 2799           */
2805 2800          if ((nm[0] == '.') && (nm[1] == 0)) {
2806 2801                  /*
2807 2802                   * Don't return without checking accessibility
2808 2803                   * of the directory. We only need the lock if
2809 2804                   * we are going to return it.
2810 2805                   */
2811 2806                  if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) == 0) {
2812 2807                          VN_HOLD(dvp);
2813 2808                          *vpp = dvp;
2814 2809                  }
2815 2810                  goto out;
2816 2811          }
2817 2812  
2818 2813          /*
2819 2814           * Fast path: Check the directory name lookup cache.
2820 2815           */
2821 2816          if (vp = dnlc_lookup(dvp, nm)) {
2822 2817                  /*
2823 2818                   * Check accessibility of directory.
2824 2819                   */
2825 2820                  if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) != 0) {
2826 2821                          VN_RELE(vp);
2827 2822                          goto out;
2828 2823                  }
2829 2824                  if (vp == DNLC_NO_VNODE) {
2830 2825                          VN_RELE(vp);
2831 2826                          error = ENOENT;
2832 2827                          goto out;
2833 2828                  }
2834 2829                  xip = VTOI(vp);
                           /*
                            * ufs_lockfs_begin() was not called on this path; a NULL
                            * ulp keeps the ufs_lockfs_end() after fastpath from
                            * running.  error is 0 here from the ufs_iaccess() above.
                            */
2835 2830                  ulp = NULL;
2836 2831                  goto fastpath;
2837 2832          }
2838 2833  
2839 2834          /*
2840 2835           * Keep the idle queue from getting too long by
2841 2836           * idling two inodes before attempting to allocate another.
2842 2837           *    This operation must be performed before entering
2843 2838           *    lockfs or a transaction.
2844 2839           */
2845 2840          if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
2846 2841                  if ((curthread->t_flag & T_DONTBLOCK) == 0) {
2847 2842                          ins.in_lidles.value.ul += ufs_lookup_idle_count;
2848 2843                          ufs_idle_some(ufs_lookup_idle_count);
2849 2844                  }
2850 2845  
2851 2846  retry_lookup:
2852 2847          /*
2853 2848           * Check accessibility of directory.
2854 2849           */
2855 2850          if (error = ufs_diraccess(ip, IEXEC, cr))
2856 2851                  goto out;
2857 2852  
2858 2853          ufsvfsp = ip->i_ufsvfs;
2859 2854          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
2860 2855          if (error)
2861 2856                  goto out;
2862 2857  
2863 2858          error = ufs_dirlook(ip, nm, &xip, cr, 1, 0);
2864 2859  
2865 2860  fastpath:
                   /* From here on ip refers to the inode that was found, not dvp's. */
2866 2861          if (error == 0) {
2867 2862                  ip = xip;
2868 2863                  *vpp = ITOV(ip);
2869 2864  
2870 2865                  /*
2871 2866                   * If vnode is a device return special vnode instead.
2872 2867                   */
2873 2868                  if (IS_DEVVP(*vpp)) {
2874 2869                          struct vnode *newvp;
2875 2870  
2876 2871                          newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
2877 2872                              cr);
2878 2873                          VN_RELE(*vpp);
2879 2874                          if (newvp == NULL)
2880 2875                                  error = ENOSYS;
2881 2876                          else
2882 2877                                  *vpp = newvp;
2883 2878                  } else if (ip->i_cflags & ICOMPRESS) {
2884 2879                          struct vnode *newvp;
2885 2880  
2886 2881                          /*
2887 2882                           * Compressed file, substitute dcfs vnode
2888 2883                           */
2889 2884                          newvp = decompvp(*vpp, cr, ct);
2890 2885                          VN_RELE(*vpp);
2891 2886                          if (newvp == NULL)
2892 2887                                  error = ENOSYS;
2893 2888                          else
2894 2889                                  *vpp = newvp;
2895 2890                  }
2896 2891          }
2897 2892          if (ulp) {
2898 2893                  ufs_lockfs_end(ulp);
2899 2894          }
2900 2895

↓ open down ↓

163 lines elided

↑ open up ↑

                   /*
                    * EAGAIN restarts the lookup from the directory access check;
                    * lockfs has already been ended above.
                    */
2901 2896          if (error == EAGAIN)
2902 2897                  goto retry_lookup;
2903 2898  
2904 2899  out:
2905 2900          return (error);
2906 2901  }
2907 2902  
2908 2903  /*ARGSUSED*/
2909 2904  static int
2910 2905  ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
2911      -        int mode, struct vnode **vpp, struct cred *cr, int flag,
2912      -        caller_context_t *ct, vsecattr_t *vsecp)
     2906 +    int mode, struct vnode **vpp, struct cred *cr, int flag,
     2907 +    caller_context_t *ct, vsecattr_t *vsecp)
2913 2908  {
2914 2909          struct inode *ip;
2915 2910          struct inode *xip;
2916 2911          struct inode *dip;
2917 2912          struct vnode *xvp;
2918 2913          struct ufsvfs *ufsvfsp;
2919 2914          struct ulockfs *ulp;
2920 2915          int error;
2921 2916          int issync;
2922 2917          int truncflag;

2923 2918          int trans_size;
2924 2919          int noentry;
2925 2920          int defer_dip_seq_update = 0;   /* need to defer update of dip->i_seq */
2926 2921          int retry = 1;
2927 2922          int indeadlock;
2928 2923  
2929 2924  again:
2930 2925          ip = VTOI(dvp);
2931 2926          ufsvfsp = ip->i_ufsvfs;
2932 2927          truncflag = 0;
2933 2928  
2934 2929          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
2935 2930          if (error)
2936 2931                  goto out;
2937 2932  
2938 2933          if (ulp) {
2939 2934                  trans_size = (int)TOP_CREATE_SIZE(ip);
2940 2935                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size);
2941 2936          }
2942 2937  
2943 2938          if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
2944 2939                  vap->va_mode &= ~VSVTX;
2945 2940  
2946 2941          if (*name == '\0') {
2947 2942                  /*
2948 2943                   * Null component name refers to the directory itself.
2949 2944                   */
2950 2945                  VN_HOLD(dvp);
2951 2946                  /*
2952 2947                   * Even though this is an error case, we need to grab the
2953 2948                   * quota lock since the error handling code below is common.
2954 2949                   */
2955 2950                  rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2956 2951                  rw_enter(&ip->i_contents, RW_WRITER);
2957 2952                  error = EEXIST;
2958 2953          } else {
2959 2954                  xip = NULL;
2960 2955                  noentry = 0;
2961 2956                  /*
2962 2957                   * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
2963 2958                   * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
2964 2959                   * possible, retries the operation.
2965 2960                   */
2966 2961                  ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_CREATE,
2967 2962                      retry_dir);
2968 2963                  if (indeadlock)
2969 2964                          goto again;
2970 2965  
2971 2966                  xvp = dnlc_lookup(dvp, name);
2972 2967                  if (xvp == DNLC_NO_VNODE) {
2973 2968                          noentry = 1;
2974 2969                          VN_RELE(xvp);
2975 2970                          xvp = NULL;
2976 2971                  }
2977 2972                  if (xvp) {
2978 2973                          rw_exit(&ip->i_rwlock);
2979 2974                          if (error = ufs_iaccess(ip, IEXEC, cr, 1)) {
2980 2975                                  VN_RELE(xvp);
2981 2976                          } else {
2982 2977                                  error = EEXIST;
2983 2978                                  xip = VTOI(xvp);
2984 2979                          }
2985 2980                  } else {
2986 2981                          /*
2987 2982                           * Suppress file system full message if we will retry
2988 2983                           */
2989 2984                          error = ufs_direnter_cm(ip, name, DE_CREATE,
2990 2985                              vap, &xip, cr, (noentry | (retry ? IQUIET : 0)));
2991 2986                          if (error == EAGAIN) {
2992 2987                                  if (ulp) {
2993 2988                                          TRANS_END_CSYNC(ufsvfsp, error, issync,
2994 2989                                              TOP_CREATE, trans_size);
2995 2990                                          ufs_lockfs_end(ulp);
2996 2991                                  }
2997 2992                                  goto again;
2998 2993                          }
2999 2994                          rw_exit(&ip->i_rwlock);
3000 2995                  }
3001 2996                  ip = xip;
3002 2997                  if (ip != NULL) {
3003 2998                          rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3004 2999                          rw_enter(&ip->i_contents, RW_WRITER);
3005 3000                  }
3006 3001          }
3007 3002  
3008 3003          /*
3009 3004           * If the file already exists and this is a non-exclusive create,
3010 3005           * check permissions and allow access for non-directories.
3011 3006           * Read-only create of an existing directory is also allowed.
3012 3007           * We fail an exclusive create of anything which already exists.
3013 3008           */
3014 3009          if (error == EEXIST) {
3015 3010                  dip = VTOI(dvp);
3016 3011                  if (excl == NONEXCL) {
3017 3012                          if ((((ip->i_mode & IFMT) == IFDIR) ||
3018 3013                              ((ip->i_mode & IFMT) == IFATTRDIR)) &&
3019 3014                              (mode & IWRITE))
3020 3015                                  error = EISDIR;
3021 3016                          else if (mode)
3022 3017                                  error = ufs_iaccess(ip, mode, cr, 0);
3023 3018                          else
3024 3019                                  error = 0;
3025 3020                  }
3026 3021                  if (error) {
3027 3022                          rw_exit(&ip->i_contents);
3028 3023                          rw_exit(&ufsvfsp->vfs_dqrwlock);
3029 3024                          VN_RELE(ITOV(ip));
3030 3025                          goto unlock;
3031 3026                  }
3032 3027                  /*
3033 3028                   * If the error EEXIST was set, then i_seq can not
3034 3029                   * have been updated. The sequence number interface
3035 3030                   * is defined such that a non-error VOP_CREATE must
3036 3031                   * increase the dir va_seq it by at least one. If we
3037 3032                   * have cleared the error, increase i_seq. Note that
3038 3033                   * we are increasing the dir i_seq and in rare cases
3039 3034                   * ip may actually be from the dvp, so we already have
3040 3035                   * the locks and it will not be subject to truncation.
3041 3036                   * In case we have to update i_seq of the parent
3042 3037                   * directory dip, we have to defer it till we have
3043 3038                   * released our locks on ip due to lock ordering requirements.
3044 3039                   */
3045 3040                  if (ip != dip)
3046 3041                          defer_dip_seq_update = 1;
3047 3042                  else
3048 3043                          ip->i_seq++;
3049 3044  
3050 3045                  if (((ip->i_mode & IFMT) == IFREG) &&
3051 3046                      (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
3052 3047                          /*
3053 3048                           * Truncate regular files, if requested by caller.
3054 3049                           * Grab i_rwlock to make sure no one else is
3055 3050                           * currently writing to the file (we promised
3056 3051                           * bmap we would do this).
3057 3052                           * Must get the locks in the correct order.
3058 3053                           */
3059 3054                          if (ip->i_size == 0) {
3060 3055                                  ip->i_flag |= ICHG | IUPD;
3061 3056                                  ip->i_seq++;
3062 3057                                  TRANS_INODE(ufsvfsp, ip);
3063 3058                          } else {
3064 3059                                  /*
3065 3060                                   * Large Files: Why this check here?
3066 3061                                   * Though we do it in vn_create() we really
3067 3062                                   * want to guarantee that we do not destroy
3068 3063                                   * Large file data by atomically checking
3069 3064                                   * the size while holding the contents
3070 3065                                   * lock.
3071 3066                                   */
3072 3067                                  if (flag && !(flag & FOFFMAX) &&
3073 3068                                      ((ip->i_mode & IFMT) == IFREG) &&
3074 3069                                      (ip->i_size > (offset_t)MAXOFF32_T)) {
3075 3070                                          rw_exit(&ip->i_contents);
3076 3071                                          rw_exit(&ufsvfsp->vfs_dqrwlock);
3077 3072                                          error = EOVERFLOW;
3078 3073                                          goto unlock;
3079 3074                                  }
3080 3075                                  if (TRANS_ISTRANS(ufsvfsp))
3081 3076                                          truncflag++;
3082 3077                                  else {
3083 3078                                          rw_exit(&ip->i_contents);
3084 3079                                          rw_exit(&ufsvfsp->vfs_dqrwlock);
3085 3080                                          ufs_tryirwlock_trans(&ip->i_rwlock,
3086 3081                                              RW_WRITER, TOP_CREATE,
3087 3082                                              retry_file);
3088 3083                                          if (indeadlock) {
3089 3084                                                  VN_RELE(ITOV(ip));
3090 3085                                                  goto again;
3091 3086                                          }
3092 3087                                          rw_enter(&ufsvfsp->vfs_dqrwlock,
3093 3088                                              RW_READER);
3094 3089                                          rw_enter(&ip->i_contents, RW_WRITER);
3095 3090                                          (void) ufs_itrunc(ip, (u_offset_t)0, 0,
3096 3091                                              cr);
3097 3092                                          rw_exit(&ip->i_rwlock);
3098 3093                                  }
3099 3094  
3100 3095                          }
3101 3096                          if (error == 0) {
3102 3097                                  vnevent_create(ITOV(ip), ct);
3103 3098                          }
3104 3099                  }
3105 3100          }
3106 3101  
3107 3102          if (error) {
3108 3103                  if (ip != NULL) {
3109 3104                          rw_exit(&ufsvfsp->vfs_dqrwlock);
3110 3105                          rw_exit(&ip->i_contents);
3111 3106                  }
3112 3107                  goto unlock;
3113 3108          }
3114 3109  
3115 3110          *vpp = ITOV(ip);
3116 3111          ITIMES(ip);
3117 3112          rw_exit(&ip->i_contents);
3118 3113          rw_exit(&ufsvfsp->vfs_dqrwlock);
3119 3114  
3120 3115          /*
3121 3116           * If vnode is a device return special vnode instead.
3122 3117           */
3123 3118          if (!error && IS_DEVVP(*vpp)) {
3124 3119                  struct vnode *newvp;
3125 3120  
3126 3121                  newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
3127 3122                  VN_RELE(*vpp);
3128 3123                  if (newvp == NULL) {
3129 3124                          error = ENOSYS;
3130 3125                          goto unlock;
3131 3126                  }
3132 3127                  truncflag = 0;
3133 3128                  *vpp = newvp;
3134 3129          }
3135 3130  unlock:
3136 3131  
3137 3132          /*
3138 3133           * Do the deferred update of the parent directory's sequence
3139 3134           * number now.
3140 3135           */
3141 3136          if (defer_dip_seq_update == 1) {
3142 3137                  rw_enter(&dip->i_contents, RW_READER);
3143 3138                  mutex_enter(&dip->i_tlock);
3144 3139                  dip->i_seq++;
3145 3140                  mutex_exit(&dip->i_tlock);
3146 3141                  rw_exit(&dip->i_contents);
3147 3142          }
3148 3143  
3149 3144          if (ulp) {
3150 3145                  int terr = 0;
3151 3146  
3152 3147                  TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE,
3153 3148                      trans_size);
3154 3149  
3155 3150                  /*
3156 3151                   * If we haven't had a more interesting failure
3157 3152                   * already, then anything that might've happened
3158 3153                   * here should be reported.
3159 3154                   */
3160 3155                  if (error == 0)
3161 3156                          error = terr;
3162 3157          }
3163 3158  
3164 3159          if (!error && truncflag) {
3165 3160                  ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_trunc);
3166 3161                  if (indeadlock) {
3167 3162                          if (ulp)
3168 3163                                  ufs_lockfs_end(ulp);
3169 3164                          VN_RELE(ITOV(ip));
3170 3165                          goto again;
3171 3166                  }
3172 3167                  (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr);
3173 3168                  rw_exit(&ip->i_rwlock);
3174 3169          }
3175 3170  
3176 3171          if (ulp)
3177 3172                  ufs_lockfs_end(ulp);
3178 3173  
3179 3174          /*
3180 3175           * If no inodes available, try to free one up out of the
3181 3176           * pending delete queue.
3182 3177           */
3183 3178          if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3184 3179                  ufs_delete_drain_wait(ufsvfsp, 1);
3185 3180                  retry = 0;

↓ open down ↓

263 lines elided

↑ open up ↑

3186 3181                  goto again;
3187 3182          }
3188 3183  
3189 3184  out:
3190 3185          return (error);
3191 3186  }
3192 3187  
3193 3188  extern int ufs_idle_max;
3194 3189  /*ARGSUSED*/
3195 3190  static int
3196      -ufs_remove(struct vnode *vp, char *nm, struct cred *cr,
3197      -        caller_context_t *ct, int flags)
     3191 +ufs_remove(struct vnode *vp, char *nm, struct cred *cr, caller_context_t *ct,
     3192 +    int flags)
3198 3193  {
3199 3194          struct inode *ip = VTOI(vp);
3200 3195          struct ufsvfs *ufsvfsp  = ip->i_ufsvfs;
3201 3196          struct ulockfs *ulp;
3202 3197          vnode_t *rmvp = NULL;   /* Vnode corresponding to name being removed */
3203 3198          int indeadlock;
3204 3199          int error;
3205 3200          int issync;
3206 3201          int trans_size;
3207 3202

3208 3203          /*
3209 3204           * don't let the delete queue get too long
3210 3205           */
3211 3206          if (ufsvfsp == NULL) {
3212 3207                  error = EIO;
3213 3208                  goto out;
3214 3209          }
3215 3210          if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3216 3211                  ufs_delete_drain(vp->v_vfsp, 1, 1);
3217 3212  
3218 3213          error = ufs_eventlookup(vp, nm, cr, &rmvp);
3219 3214          if (rmvp != NULL) {
3220 3215                  /* Only send the event if there were no errors */
3221 3216                  if (error == 0)
3222 3217                          vnevent_remove(rmvp, vp, nm, ct);
3223 3218                  VN_RELE(rmvp);
3224 3219          }
3225 3220  
3226 3221  retry_remove:
3227 3222          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
3228 3223          if (error)
3229 3224                  goto out;
3230 3225  
3231 3226          if (ulp)
3232 3227                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
3233 3228                      trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
3234 3229  
3235 3230          /*
3236 3231           * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3237 3232           * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3238 3233           * possible, retries the operation.
3239 3234           */
3240 3235          ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_REMOVE, retry);
3241 3236          if (indeadlock)
3242 3237                  goto retry_remove;
3243 3238          error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0,
3244 3239              DR_REMOVE, cr);
3245 3240          rw_exit(&ip->i_rwlock);
3246 3241  
3247 3242          if (ulp) {
3248 3243                  TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size);
3249 3244                  ufs_lockfs_end(ulp);
3250 3245          }
3251 3246  
3252 3247  out:

↓ open down ↓

45 lines elided

↑ open up ↑

3253 3248          return (error);
3254 3249  }
3255 3250  
3256 3251  /*
3257 3252   * Link a file or a directory.  Only privileged processes are allowed to
3258 3253   * make links to directories.
3259 3254   */
3260 3255  /*ARGSUSED*/
3261 3256  static int
3262 3257  ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
3263      -        caller_context_t *ct, int flags)
     3258 +    caller_context_t *ct, int flags)
3264 3259  {
3265 3260          struct inode *sip;
3266 3261          struct inode *tdp = VTOI(tdvp);
3267 3262          struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
3268 3263          struct ulockfs *ulp;
3269 3264          struct vnode *realvp;
3270 3265          int error;
3271 3266          int issync;
3272 3267          int trans_size;
3273 3268          int isdev;

3274 3269          int indeadlock;
3275 3270  
3276 3271  retry_link:
3277 3272          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
3278 3273          if (error)
3279 3274                  goto out;
3280 3275  
3281 3276          if (ulp)
3282 3277                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK,
3283 3278                      trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
3284 3279  
3285 3280          if (VOP_REALVP(svp, &realvp, ct) == 0)
3286 3281                  svp = realvp;
3287 3282  
3288 3283          /*
3289 3284           * Make sure link for extended attributes is valid
3290 3285           * We only support hard linking of attr in ATTRDIR to ATTRDIR
3291 3286           *
3292 3287           * Make certain we don't attempt to look at a device node as
3293 3288           * a ufs inode.
3294 3289           */
3295 3290  
3296 3291          isdev = IS_DEVVP(svp);
3297 3292          if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
3298 3293              ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
3299 3294              ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
3300 3295              ((tdp->i_mode & IFMT) == IFDIR))) {
3301 3296                  error = EINVAL;
3302 3297                  goto unlock;
3303 3298          }
3304 3299  
3305 3300          sip = VTOI(svp);
3306 3301          if ((svp->v_type == VDIR &&
3307 3302              secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
3308 3303              (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
3309 3304                  error = EPERM;
3310 3305                  goto unlock;
3311 3306          }
3312 3307  
3313 3308          /*
3314 3309           * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3315 3310           * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3316 3311           * is possible, the operation is retried.
3317 3312           */
3318 3313          ufs_tryirwlock_trans(&tdp->i_rwlock, RW_WRITER, TOP_LINK, retry);
3319 3314          if (indeadlock)
3320 3315                  goto retry_link;
3321 3316          error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0,
3322 3317              sip, cr);
3323 3318          rw_exit(&tdp->i_rwlock);
3324 3319  
3325 3320  unlock:
3326 3321          if (ulp) {
3327 3322                  TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size);
3328 3323                  ufs_lockfs_end(ulp);
3329 3324          }
3330 3325  
3331 3326          if (!error) {
3332 3327                  vnevent_link(svp, ct);
3333 3328          }
3334 3329  out:
3335 3330          return (error);
3336 3331  }
3337 3332  
3338 3333  uint64_t ufs_rename_retry_cnt;
3339 3334  uint64_t ufs_rename_upgrade_retry_cnt;
3340 3335  uint64_t ufs_rename_dircheck_retry_cnt;
3341 3336  clock_t  ufs_rename_backoff_delay = 1;
3342 3337  
3343 3338  /*
3344 3339   * Rename a file or directory.
3345 3340   * We are given the vnode and entry string of the source and the
3346 3341   * vnode and entry string of the place we want to move the source
3347 3342   * to (the target). The essential operation is:

↓ open down ↓

74 lines elided

↑ open up ↑

3348 3343   *      unlink(target);
3349 3344   *      link(source, target);
3350 3345   *      unlink(source);
3351 3346   * but "atomically".  Can't do full commit without saving state in
3352 3347   * the inode on disk, which isn't feasible at this time.  Best we
3353 3348   * can do is always guarantee that the TARGET exists.
3354 3349   */
3355 3350  
3356 3351  /*ARGSUSED*/
3357 3352  static int
3358      -ufs_rename(
3359      -        struct vnode *sdvp,             /* old (source) parent vnode */
3360      -        char *snm,                      /* old (source) entry name */
3361      -        struct vnode *tdvp,             /* new (target) parent vnode */
3362      -        char *tnm,                      /* new (target) entry name */
3363      -        struct cred *cr,
3364      -        caller_context_t *ct,
3365      -        int flags)
     3353 +ufs_rename(struct vnode *sdvp, char *snm, struct vnode *tdvp, char *tnm,
     3354 +    struct cred *cr, caller_context_t *ct, int flags)
3366 3355  {
3367 3356          struct inode *sip = NULL;       /* source inode */
3368 3357          struct inode *ip = NULL;        /* check inode */
3369 3358          struct inode *sdp;              /* old (source) parent inode */
3370 3359          struct inode *tdp;              /* new (target) parent inode */
3371 3360          struct vnode *svp = NULL;       /* source vnode */
3372 3361          struct vnode *tvp = NULL;       /* target vnode, if it exists */
3373 3362          struct vnode *realvp;
3374 3363          struct ufsvfs *ufsvfsp;
3375 3364          struct ulockfs *ulp = NULL;

3376 3365          struct ufs_slot slot;
3377 3366          timestruc_t now;
3378 3367          int error;
3379 3368          int issync;
3380 3369          int trans_size;
3381 3370          krwlock_t *first_lock;
3382 3371          krwlock_t *second_lock;
3383 3372          krwlock_t *reverse_lock;
3384 3373          int serr, terr;
3385 3374  
3386 3375          sdp = VTOI(sdvp);
3387 3376          slot.fbp = NULL;
3388 3377          ufsvfsp = sdp->i_ufsvfs;
3389 3378  
3390 3379          if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3391 3380                  tdvp = realvp;
3392 3381  
3393 3382          /* Must do this before taking locks in case of DNLC miss */
3394 3383          terr = ufs_eventlookup(tdvp, tnm, cr, &tvp);
3395 3384          serr = ufs_eventlookup(sdvp, snm, cr, &svp);
3396 3385  
3397 3386          if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) {
3398 3387                  if (tvp != NULL)
3399 3388                          vnevent_pre_rename_dest(tvp, tdvp, tnm, ct);
3400 3389  
3401 3390                  /*
3402 3391                   * Notify the target directory of the rename event
3403 3392                   * if source and target directories are not the same.
3404 3393                   */
3405 3394                  if (sdvp != tdvp)
3406 3395                          vnevent_pre_rename_dest_dir(tdvp, svp, tnm, ct);
3407 3396  
3408 3397                  if (svp != NULL)
3409 3398                          vnevent_pre_rename_src(svp, sdvp, snm, ct);
3410 3399          }
3411 3400  
3412 3401          if (svp != NULL)
3413 3402                  VN_RELE(svp);
3414 3403  
3415 3404  retry_rename:
3416 3405          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
3417 3406          if (error)
3418 3407                  goto unlock;
3419 3408  
3420 3409          if (ulp)
3421 3410                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME,
3422 3411                      trans_size = (int)TOP_RENAME_SIZE(sdp));
3423 3412  
3424 3413          if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3425 3414                  tdvp = realvp;
3426 3415  
3427 3416          tdp = VTOI(tdvp);
3428 3417  
3429 3418          /*
3430 3419           * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
3431 3420           */
3432 3421          if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
3433 3422                  error = EINVAL;
3434 3423                  goto unlock;
3435 3424          }
3436 3425  
3437 3426          /*
3438 3427           * Check accessibility of directory.
3439 3428           */
3440 3429          if (error = ufs_diraccess(sdp, IEXEC, cr))
3441 3430                  goto unlock;
3442 3431  
3443 3432          /*
3444 3433           * Look up inode of file we're supposed to rename.
3445 3434           */
3446 3435          gethrestime(&now);
3447 3436          if (error = ufs_dirlook(sdp, snm, &sip, cr, 0, 0)) {
3448 3437                  if (error == EAGAIN) {
3449 3438                          if (ulp) {
3450 3439                                  TRANS_END_CSYNC(ufsvfsp, error, issync,
3451 3440                                      TOP_RENAME, trans_size);
3452 3441                                  ufs_lockfs_end(ulp);
3453 3442                          }
3454 3443                          goto retry_rename;
3455 3444                  }
3456 3445  
3457 3446                  goto unlock;
3458 3447          }
3459 3448  
3460 3449          /*
3461 3450           * Lock both the source and target directories (they may be
3462 3451           * the same) to provide the atomicity semantics that was
3463 3452           * previously provided by the per file system vfs_rename_lock.
3464 3453           *
3465 3454           * With vfs_rename_lock removed to allow simultaneous renames
3466 3455           * within a file system, ufs_dircheckpath can deadlock while
3467 3456           * traversing back to ensure that source is not a parent directory
3468 3457           * of target parent directory. This is because we get into
3469 3458           * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
3470 3459           * If the tdp and sdp of the simultaneous renames happen to be
3471 3460           * in the path of each other, it can lead to a deadlock. This
3472 3461           * can be avoided by getting the locks as RW_READER here and then
3473 3462           * upgrading to RW_WRITER after completing the ufs_dircheckpath.
3474 3463           *
3475 3464           * We hold the target directory's i_rwlock after calling
3476 3465           * ufs_lockfs_begin but in many other operations (like ufs_readdir)
3477 3466           * VOP_RWLOCK is explicitly called by the filesystem independent code
3478 3467           * before calling the file system operation. In these cases the order
3479 3468           * is reversed (i.e. i_rwlock is taken first and then ufs_lockfs_begin
3480 3469           * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
3481 3470           * counter but with ufs_quiesce setting the SLOCK bit this becomes a
3482 3471           * synchronizing object which might lead to a deadlock. So we use
3483 3472           * rw_tryenter instead of rw_enter. If we fail to get this lock and
3484 3473           * find that SLOCK bit is set, we call ufs_lockfs_end and restart the
3485 3474           * operation.
3486 3475           */
3487 3476  retry:
3488 3477          first_lock = &tdp->i_rwlock;
3489 3478          second_lock = &sdp->i_rwlock;
3490 3479  retry_firstlock:
3491 3480          if (!rw_tryenter(first_lock, RW_READER)) {
3492 3481                  /*
3493 3482                   * We didn't get the lock. Check if the SLOCK is set in the
3494 3483                   * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3495 3484                   * and wait for SLOCK to be cleared.
3496 3485                   */
3497 3486  
3498 3487                  if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3499 3488                          TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3500 3489                              trans_size);
3501 3490                          ufs_lockfs_end(ulp);
3502 3491                          goto retry_rename;
3503 3492  
3504 3493                  } else {
3505 3494                          /*
3506 3495                           * SLOCK isn't set so this is a genuine synchronization
3507 3496                           * case. Let's try again after giving them a breather.
3508 3497                           */
3509 3498                          delay(RETRY_LOCK_DELAY);
3510 3499                          goto  retry_firstlock;
3511 3500                  }
3512 3501          }
3513 3502          /*
3514 3503           * Need to check whether tdp and sdp are the same!
3515 3504           */
3516 3505          if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
3517 3506                  /*
3518 3507                   * We didn't get the lock. Check if the SLOCK is set in the
3519 3508                   * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3520 3509                   * and wait for SLOCK to be cleared.
3521 3510                   */
3522 3511  
3523 3512                  rw_exit(first_lock);
3524 3513                  if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3525 3514                          TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3526 3515                              trans_size);
3527 3516                          ufs_lockfs_end(ulp);
3528 3517                          goto retry_rename;
3529 3518  
3530 3519                  } else {
3531 3520                          /*
3532 3521                           * So we couldn't get the second level peer lock *and*
3533 3522                           * the SLOCK bit isn't set. Too bad we can be
3534 3523                           * contending with someone wanting these locks the other
3535 3524                           * way round. Reverse the locks in case there is a heavy
3536 3525                           * contention for the second level lock.
3537 3526                           */
3538 3527                          reverse_lock = first_lock;
3539 3528                          first_lock = second_lock;
3540 3529                          second_lock = reverse_lock;
3541 3530                          ufs_rename_retry_cnt++;
3542 3531                          goto  retry_firstlock;
3543 3532                  }
3544 3533          }
3545 3534  
3546 3535          if (sip == tdp) {
3547 3536                  error = EINVAL;
3548 3537                  goto errout;
3549 3538          }
3550 3539          /*
3551 3540           * Make sure we can delete the source entry.  This requires
3552 3541           * write permission on the containing directory.
3553 3542           * Check for sticky directories.
3554 3543           */
3555 3544          rw_enter(&sdp->i_contents, RW_READER);
3556 3545          rw_enter(&sip->i_contents, RW_READER);
3557 3546          if ((error = ufs_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
3558 3547              (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
3559 3548                  rw_exit(&sip->i_contents);
3560 3549                  rw_exit(&sdp->i_contents);
3561 3550                  goto errout;
3562 3551          }
3563 3552  
3564 3553          /*
3565 3554           * If this is a rename of a directory and the parent is
3566 3555           * different (".." must be changed), then the source
3567 3556           * directory must not be in the directory hierarchy
3568 3557           * above the target, as this would orphan everything
3569 3558           * below the source directory.  Also the user must have
3570 3559           * write permission in the source so as to be able to
3571 3560           * change "..".
3572 3561           */
3573 3562          if ((((sip->i_mode & IFMT) == IFDIR) ||
3574 3563              ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
3575 3564                  ino_t   inum;
3576 3565  
3577 3566                  if (error = ufs_iaccess(sip, IWRITE, cr, 0)) {
3578 3567                          rw_exit(&sip->i_contents);
3579 3568                          rw_exit(&sdp->i_contents);
3580 3569                          goto errout;
3581 3570                  }
3582 3571                  inum = sip->i_number;
3583 3572                  rw_exit(&sip->i_contents);
3584 3573                  rw_exit(&sdp->i_contents);
3585 3574                  if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
3586 3575                          /*
3587 3576                           * If we got EAGAIN ufs_dircheckpath detected a
3588 3577                           * potential deadlock and backed out. We need
3589 3578                           * to retry the operation since sdp and tdp have
3590 3579                           * to be released to avoid the deadlock.
3591 3580                           */
3592 3581                          if (error == EAGAIN) {
3593 3582                                  rw_exit(&tdp->i_rwlock);
3594 3583                                  if (tdp != sdp)
3595 3584                                          rw_exit(&sdp->i_rwlock);
3596 3585                                  delay(ufs_rename_backoff_delay);
3597 3586                                  ufs_rename_dircheck_retry_cnt++;
3598 3587                                  goto retry;
3599 3588                          }
3600 3589                          goto errout;
3601 3590                  }
3602 3591          } else {
3603 3592                  rw_exit(&sip->i_contents);
3604 3593                  rw_exit(&sdp->i_contents);
3605 3594          }
3606 3595  
3607 3596  
3608 3597          /*
3609 3598           * Check for renaming '.' or '..' or alias of '.'
3610 3599           */
3611 3600          if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
3612 3601                  error = EINVAL;
3613 3602                  goto errout;
3614 3603          }
3615 3604  
3616 3605          /*
3617 3606           * Simultaneous renames can deadlock in ufs_dircheckpath since it
3618 3607           * tries to traverse back the file tree with both tdp and sdp held
3619 3608           * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks
3620 3609           * as RW_READER until ufs_dircheckpath is done.
3621 3610           * Now that ufs_dircheckpath is done with, we can upgrade the locks
3622 3611           * to RW_WRITER.
3623 3612           */
3624 3613          if (!rw_tryupgrade(&tdp->i_rwlock)) {
3625 3614                  /*
3626 3615                   * The upgrade failed. We have to give up the lock
3627 3616                   * so as to avoid deadlocking with someone else who is
3628 3617                   * waiting for writer lock. With the lock gone, we
3629 3618                   * cannot be sure the checks done above will hold
3630 3619                   * good when we eventually get them back as writer.
3631 3620                   * So if we can't upgrade we drop the locks and retry
3632 3621                   * everything again.
3633 3622                   */
3634 3623                  rw_exit(&tdp->i_rwlock);
3635 3624                  if (tdp != sdp)
3636 3625                          rw_exit(&sdp->i_rwlock);
3637 3626                  delay(ufs_rename_backoff_delay);
3638 3627                  ufs_rename_upgrade_retry_cnt++;
3639 3628                  goto retry;
3640 3629          }
3641 3630          if (tdp != sdp) {
3642 3631                  if (!rw_tryupgrade(&sdp->i_rwlock)) {
3643 3632                          /*
3644 3633                           * The upgrade failed. We have to give up the lock
3645 3634                           * so as to avoid deadlocking with someone else who is
3646 3635                           * waiting for writer lock. With the lock gone, we
3647 3636                           * cannot be sure the checks done above will hold
3648 3637                           * good when we eventually get them back as writer.
3649 3638                           * So if we can't upgrade we drop the locks and retry
3650 3639                           * everything again.
3651 3640                           */
3652 3641                          rw_exit(&tdp->i_rwlock);
3653 3642                          rw_exit(&sdp->i_rwlock);
3654 3643                          delay(ufs_rename_backoff_delay);
3655 3644                          ufs_rename_upgrade_retry_cnt++;
3656 3645                          goto retry;
3657 3646                  }
3658 3647          }
3659 3648  
3660 3649          /*
3661 3650           * Now that all the locks are held check to make sure another thread
3662 3651           * didn't slip in and take out the sip.
3663 3652           */
3664 3653          slot.status = NONE;
3665 3654          if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec ||
3666 3655              sip->i_ctime.tv_sec > now.tv_sec) {
3667 3656                  rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
3668 3657                  rw_enter(&sdp->i_contents, RW_WRITER);
3669 3658                  error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot,
3670 3659                      &ip, cr, 0);
3671 3660                  rw_exit(&sdp->i_contents);
3672 3661                  rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock);
3673 3662                  if (error) {
3674 3663                          goto errout;
3675 3664                  }
3676 3665                  if (ip == NULL) {
3677 3666                          error = ENOENT;
3678 3667                          goto errout;
3679 3668                  } else {
3680 3669                          /*
3681 3670                           * If the inode was found, we need to drop the v_count
3682 3671                           * so as not to keep the filesystem from being
3683 3672                           * unmounted at a later time.
3684 3673                           */
3685 3674                          VN_RELE(ITOV(ip));
3686 3675                  }
3687 3676  
3688 3677                  /*
3689 3678                   * Release the slot.fbp that has the page mapped and
3690 3679                   * locked SE_SHARED, and could be used in
3691 3680                   * ufs_direnter_lr() which needs to get the SE_EXCL lock
3692 3681                   * on said page.
3693 3682                   */
3694 3683                  if (slot.fbp) {
3695 3684                          fbrelse(slot.fbp, S_OTHER);
3696 3685                          slot.fbp = NULL;
3697 3686                  }
3698 3687          }
3699 3688  
3700 3689          /*
3701 3690           * Link source to the target.
3702 3691           */
3703 3692          if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr)) {
3704 3693                  /*
3705 3694                   * ESAME isn't really an error; it indicates that the
3706 3695                   * operation should not be done because the source and target
3707 3696                   * are the same file, but that no error should be reported.
3708 3697                   */
3709 3698                  if (error == ESAME)
3710 3699                          error = 0;
3711 3700                  goto errout;
3712 3701          }
3713 3702  
3714 3703          if (error == 0 && tvp != NULL)
3715 3704                  vnevent_rename_dest(tvp, tdvp, tnm, ct);
3716 3705  
3717 3706          /*
3718 3707           * Unlink the source.
3719 3708           * Remove the source entry.  ufs_dirremove() checks that the entry
3720 3709           * still reflects sip, and returns an error if it doesn't.
3721 3710           * If the entry has changed just forget about it.  Release
3722 3711           * the source inode.
3723 3712           */
3724 3713          if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0,
3725 3714              DR_RENAME, cr)) == ENOENT)
3726 3715                  error = 0;
3727 3716  
3728 3717          if (error == 0) {
3729 3718                  vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
3730 3719                  /*
3731 3720                   * Notify the target directory of the rename event
3732 3721                   * if source and target directories are not the same.
3733 3722                   */
3734 3723                  if (sdvp != tdvp)
3735 3724                          vnevent_rename_dest_dir(tdvp, ct);
3736 3725          }
3737 3726  
3738 3727  errout:
3739 3728          if (slot.fbp)
3740 3729                  fbrelse(slot.fbp, S_OTHER);
3741 3730  
3742 3731          rw_exit(&tdp->i_rwlock);
3743 3732          if (sdp != tdp) {
3744 3733                  rw_exit(&sdp->i_rwlock);
3745 3734          }
3746 3735  
3747 3736  unlock:
3748 3737          if (tvp != NULL)
3749 3738                  VN_RELE(tvp);
3750 3739          if (sip != NULL)
3751 3740                  VN_RELE(ITOV(sip));
3752 3741  
3753 3742          if (ulp) {

↓ open down ↓

378 lines elided

↑ open up ↑

3754 3743                  TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size);
3755 3744                  ufs_lockfs_end(ulp);
3756 3745          }
3757 3746  
3758 3747          return (error);
3759 3748  }
3760 3749  
3761 3750  /*ARGSUSED*/
3762 3751  static int
3763 3752  ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
3764      -        struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
3765      -        vsecattr_t *vsecp)
     3753 +    struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
     3754 +    vsecattr_t *vsecp)
3766 3755  {
3767 3756          struct inode *ip;
3768 3757          struct inode *xip;
3769 3758          struct ufsvfs *ufsvfsp;
3770 3759          struct ulockfs *ulp;
3771 3760          int error;
3772 3761          int issync;
3773 3762          int trans_size;
3774 3763          int indeadlock;
3775 3764          int retry = 1;

3776 3765  
3777 3766          ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
3778 3767  
3779 3768          /*
3780 3769           * Can't make directory in attr hidden dir
3781 3770           */
3782 3771          if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3783 3772                  return (EINVAL);
3784 3773  
3785 3774  again:
3786 3775          ip = VTOI(dvp);
3787 3776          ufsvfsp = ip->i_ufsvfs;
3788 3777          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3789 3778          if (error)
3790 3779                  goto out;
3791 3780          if (ulp)
3792 3781                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR,
3793 3782                      trans_size = (int)TOP_MKDIR_SIZE(ip));
3794 3783  
3795 3784          /*
3796 3785           * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3797 3786           * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3798 3787           * is possible, the operation is retried.
3799 3788           */
3800 3789          ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_MKDIR, retry);
3801 3790          if (indeadlock)
3802 3791                  goto again;
3803 3792  
3804 3793          error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
3805 3794              (retry ? IQUIET : 0));
3806 3795          if (error == EAGAIN) {
3807 3796                  if (ulp) {
3808 3797                          TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_MKDIR,
3809 3798                              trans_size);
3810 3799                          ufs_lockfs_end(ulp);
3811 3800                  }
3812 3801                  goto again;
3813 3802          }
3814 3803  
3815 3804          rw_exit(&ip->i_rwlock);
3816 3805          if (error == 0) {
3817 3806                  ip = xip;
3818 3807                  *vpp = ITOV(ip);
3819 3808          } else if (error == EEXIST)
3820 3809                  VN_RELE(ITOV(xip));
3821 3810  
3822 3811          if (ulp) {
3823 3812                  int terr = 0;
3824 3813                  TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size);
3825 3814                  ufs_lockfs_end(ulp);
3826 3815                  if (error == 0)
3827 3816                          error = terr;
3828 3817          }
3829 3818  out:
3830 3819          if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3831 3820                  ufs_delete_drain_wait(ufsvfsp, 1);

↓ open down ↓

56 lines elided

↑ open up ↑

3832 3821                  retry = 0;
3833 3822                  goto again;
3834 3823          }
3835 3824  
3836 3825          return (error);
3837 3826  }
3838 3827  
3839 3828  /*ARGSUSED*/
3840 3829  static int
3841 3830  ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
3842      -        caller_context_t *ct, int flags)
     3831 +    caller_context_t *ct, int flags)
3843 3832  {
3844 3833          struct inode *ip = VTOI(vp);
3845 3834          struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
3846 3835          struct ulockfs *ulp;
3847 3836          vnode_t *rmvp = NULL;   /* Vnode of removed directory */
3848 3837          int error;
3849 3838          int issync;
3850 3839          int trans_size;
3851 3840          int indeadlock;
3852 3841

3853 3842          /*
3854 3843           * don't let the delete queue get too long
3855 3844           */
3856 3845          if (ufsvfsp == NULL) {
3857 3846                  error = EIO;
3858 3847                  goto out;
3859 3848          }
3860 3849          if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3861 3850                  ufs_delete_drain(vp->v_vfsp, 1, 1);
3862 3851  
3863 3852          error = ufs_eventlookup(vp, nm, cr, &rmvp);
3864 3853          if (rmvp != NULL) {
3865 3854                  /* Only send the event if there were no errors */
3866 3855                  if (error == 0)
3867 3856                          vnevent_rmdir(rmvp, vp, nm, ct);
3868 3857                  VN_RELE(rmvp);
3869 3858          }
3870 3859  
3871 3860  retry_rmdir:
3872 3861          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
3873 3862          if (error)
3874 3863                  goto out;
3875 3864  
3876 3865          if (ulp)
3877 3866                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR,
3878 3867                      trans_size = TOP_RMDIR_SIZE);
3879 3868  
3880 3869          /*
3881 3870           * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3882 3871           * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3883 3872           * possible, retries the operation.
3884 3873           */
3885 3874          ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_RMDIR, retry);
3886 3875          if (indeadlock)
3887 3876                  goto retry_rmdir;
3888 3877          error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr);
3889 3878  
3890 3879          rw_exit(&ip->i_rwlock);
3891 3880  
3892 3881          if (ulp) {
3893 3882                  TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR,

↓ open down ↓

41 lines elided

↑ open up ↑

3894 3883                      trans_size);
3895 3884                  ufs_lockfs_end(ulp);
3896 3885          }
3897 3886  
3898 3887  out:
3899 3888          return (error);
3900 3889  }
3901 3890  
3902 3891  /* ARGSUSED */
3903 3892  static int
3904      -ufs_readdir(
3905      -        struct vnode *vp,
3906      -        struct uio *uiop,
3907      -        struct cred *cr,
3908      -        int *eofp,
3909      -        caller_context_t *ct,
3910      -        int flags)
     3893 +ufs_readdir(struct vnode *vp, struct uio *uiop, struct cred *cr, int *eofp,
     3894 +    caller_context_t *ct, int flags)
3911 3895  {
3912 3896          struct iovec *iovp;
3913 3897          struct inode *ip;
3914 3898          struct direct *idp;
3915 3899          struct dirent64 *odp;
3916 3900          struct fbuf *fbp;
3917 3901          struct ufsvfs *ufsvfsp;
3918 3902          struct ulockfs *ulp;
3919 3903          caddr_t outbuf;
3920 3904          size_t bufsize;

3921 3905          uint_t offset;
3922 3906          uint_t bytes_wanted, total_bytes_wanted;
3923 3907          int incount = 0;
3924 3908          int outcount = 0;
3925 3909          int error;
3926 3910  
3927 3911          ip = VTOI(vp);
3928 3912          ASSERT(RW_READ_HELD(&ip->i_rwlock));
3929 3913  
3930 3914          if (uiop->uio_loffset >= MAXOFF32_T) {
3931 3915                  if (eofp)
3932 3916                          *eofp = 1;
3933 3917                  return (0);
3934 3918          }
3935 3919  
3936 3920          /*
3937 3921           * Check if we have been called with a valid iov_len
3938 3922           * and bail out if not, otherwise we may potentially loop
3939 3923           * forever further down.
3940 3924           */
3941 3925          if (uiop->uio_iov->iov_len <= 0) {
3942 3926                  error = EINVAL;
3943 3927                  goto out;
3944 3928          }
3945 3929  
3946 3930          /*
3947 3931           * Large Files: When we come here we are guaranteed that
3948 3932           * uio_offset can be used safely. The high word is zero.
3949 3933           */
3950 3934  
3951 3935          ufsvfsp = ip->i_ufsvfs;
3952 3936          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
3953 3937          if (error)
3954 3938                  goto out;
3955 3939  
3956 3940          iovp = uiop->uio_iov;
3957 3941          total_bytes_wanted = iovp->iov_len;
3958 3942  
3959 3943          /* Large Files: directory files should not be "large" */
3960 3944  
3961 3945          ASSERT(ip->i_size <= MAXOFF32_T);
3962 3946  
3963 3947          /* Force offset to be valid (to guard against bogus lseek() values) */
3964 3948          offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
3965 3949  
3966 3950          /* Quit if at end of file or link count of zero (posix) */
3967 3951          if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
3968 3952                  if (eofp)
3969 3953                          *eofp = 1;
3970 3954                  error = 0;
3971 3955                  goto unlock;
3972 3956          }
3973 3957  
3974 3958          /*
3975 3959           * Get space to change directory entries into fs independent format.
3976 3960           * Do fast alloc for the most commonly used-request size (filesystem
3977 3961           * block size).
3978 3962           */
3979 3963          if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
3980 3964                  bufsize = total_bytes_wanted;
3981 3965                  outbuf = kmem_alloc(bufsize, KM_SLEEP);
3982 3966                  odp = (struct dirent64 *)outbuf;
3983 3967          } else {
3984 3968                  bufsize = total_bytes_wanted;
3985 3969                  odp = (struct dirent64 *)iovp->iov_base;
3986 3970          }
3987 3971  
3988 3972  nextblk:
3989 3973          bytes_wanted = total_bytes_wanted;
3990 3974  
3991 3975          /* Truncate request to file size */
3992 3976          if (offset + bytes_wanted > (int)ip->i_size)
3993 3977                  bytes_wanted = (int)(ip->i_size - offset);
3994 3978  
3995 3979          /* Comply with MAXBSIZE boundary restrictions of fbread() */
3996 3980          if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
3997 3981                  bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
3998 3982  
3999 3983          /*
4000 3984           * Read in the next chunk.
4001 3985           * We are still holding the i_rwlock.
4002 3986           */
4003 3987          error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
4004 3988  
4005 3989          if (error)
4006 3990                  goto update_inode;
4007 3991          if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
4008 3992              (!ufsvfsp->vfs_noatime)) {
4009 3993                  ip->i_flag |= IACC;
4010 3994          }
4011 3995          incount = 0;
4012 3996          idp = (struct direct *)fbp->fb_addr;
4013 3997          if (idp->d_ino == 0 && idp->d_reclen == 0 && idp->d_namlen == 0) {
4014 3998                  cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
4015 3999                      "fs = %s\n",
4016 4000                      (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
4017 4001                  fbrelse(fbp, S_OTHER);
4018 4002                  error = ENXIO;
4019 4003                  goto update_inode;
4020 4004          }
4021 4005          /* Transform to file-system independent format */
4022 4006          while (incount < bytes_wanted) {
4023 4007                  /*
4024 4008                   * If the current directory entry is mangled, then skip
4025 4009                   * to the next block.  It would be nice to set the FSBAD
4026 4010                   * flag in the super-block so that a fsck is forced on
4027 4011                   * next reboot, but locking is a problem.
4028 4012                   */
4029 4013                  if (idp->d_reclen & 0x3) {
4030 4014                          offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4031 4015                          break;
4032 4016                  }
4033 4017  
4034 4018                  /* Skip to requested offset and skip empty entries */
4035 4019                  if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
4036 4020                          ushort_t this_reclen =
4037 4021                              DIRENT64_RECLEN(idp->d_namlen);
4038 4022                          /* Buffer too small for any entries */
4039 4023                          if (!outcount && this_reclen > bufsize) {
4040 4024                                  fbrelse(fbp, S_OTHER);
4041 4025                                  error = EINVAL;
4042 4026                                  goto update_inode;
4043 4027                          }
4044 4028                          /* If would overrun the buffer, quit */
4045 4029                          if (outcount + this_reclen > bufsize) {
4046 4030                                  break;
4047 4031                          }
4048 4032                          /* Take this entry */
4049 4033                          odp->d_ino = (ino64_t)idp->d_ino;
4050 4034                          odp->d_reclen = (ushort_t)this_reclen;
4051 4035                          odp->d_off = (offset_t)(offset + idp->d_reclen);
4052 4036  
4053 4037                          /* use strncpy(9f) to zero out uninitialized bytes */
4054 4038  
4055 4039                          ASSERT(strlen(idp->d_name) + 1 <=
4056 4040                              DIRENT64_NAMELEN(this_reclen));
4057 4041                          (void) strncpy(odp->d_name, idp->d_name,
4058 4042                              DIRENT64_NAMELEN(this_reclen));
4059 4043                          outcount += odp->d_reclen;
4060 4044                          odp = (struct dirent64 *)
4061 4045                              ((intptr_t)odp + odp->d_reclen);
4062 4046                          ASSERT(outcount <= bufsize);
4063 4047                  }
4064 4048                  if (idp->d_reclen) {
4065 4049                          incount += idp->d_reclen;
4066 4050                          offset += idp->d_reclen;
4067 4051                          idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
4068 4052                  } else {
4069 4053                          offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4070 4054                          break;
4071 4055                  }
4072 4056          }
4073 4057          /* Release the chunk */
4074 4058          fbrelse(fbp, S_OTHER);
4075 4059  
4076 4060          /* Read whole block, but got no entries, read another if not eof */
4077 4061  
4078 4062          /*
4079 4063           * Large Files: casting i_size to int here is not a problem
4080 4064           * because directory sizes are always less than MAXOFF32_T.
4081 4065           * See assertion above.
4082 4066           */
4083 4067  
4084 4068          if (offset < (int)ip->i_size && !outcount)
4085 4069                  goto nextblk;
4086 4070  
4087 4071          /* Copy out the entry data */
4088 4072          if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
4089 4073                  iovp->iov_base += outcount;
4090 4074                  iovp->iov_len -= outcount;
4091 4075                  uiop->uio_resid -= outcount;
4092 4076                  uiop->uio_offset = offset;
4093 4077          } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
4094 4078              uiop)) == 0)
4095 4079                  uiop->uio_offset = offset;
4096 4080  update_inode:
4097 4081          ITIMES(ip);
4098 4082          if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
4099 4083                  kmem_free(outbuf, bufsize);
4100 4084  
4101 4085          if (eofp && error == 0)
4102 4086                  *eofp = (uiop->uio_offset >= (int)ip->i_size);

↓ open down ↓

182 lines elided

↑ open up ↑

4103 4087  unlock:
4104 4088          if (ulp) {
4105 4089                  ufs_lockfs_end(ulp);
4106 4090          }
4107 4091  out:
4108 4092          return (error);
4109 4093  }
4110 4094  
4111 4095  /*ARGSUSED*/
4112 4096  static int
4113      -ufs_symlink(
4114      -        struct vnode *dvp,              /* ptr to parent dir vnode */
4115      -        char *linkname,                 /* name of symbolic link */
4116      -        struct vattr *vap,              /* attributes */
4117      -        char *target,                   /* target path */
4118      -        struct cred *cr,                /* user credentials */
4119      -        caller_context_t *ct,
4120      -        int flags)
     4097 +ufs_symlink(struct vnode *dvp, char *linkname, struct vattr *vap, char *target,
     4098 +    struct cred *cr, caller_context_t *ct, int flags)
4121 4099  {
4122 4100          struct inode *ip, *dip = VTOI(dvp);
4123 4101          struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
4124 4102          struct ulockfs *ulp;
4125 4103          int error;
4126 4104          int issync;
4127 4105          int trans_size;
4128 4106          int residual;
4129 4107          int ioflag;
4130 4108          int retry = 1;

4131 4109  
4132 4110          /*
4133 4111           * No symlinks in attrdirs at this time
4134 4112           */
4135 4113          if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
4136 4114                  return (EINVAL);
4137 4115  
4138 4116  again:
4139 4117          ip = (struct inode *)NULL;
4140 4118          vap->va_type = VLNK;
4141 4119          vap->va_rdev = 0;
4142 4120  
4143 4121          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
4144 4122          if (error)
4145 4123                  goto out;
4146 4124  
4147 4125          if (ulp)
4148 4126                  TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK,
4149 4127                      trans_size = (int)TOP_SYMLINK_SIZE(dip));
4150 4128  
4151 4129          /*
4152 4130           * We must create the inode before the directory entry, to avoid
4153 4131           * racing with readlink().  ufs_dirmakeinode requires that we
4154 4132           * hold the quota lock as reader, and directory locks as writer.
4155 4133           */
4156 4134  
4157 4135          rw_enter(&dip->i_rwlock, RW_WRITER);
4158 4136          rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4159 4137          rw_enter(&dip->i_contents, RW_WRITER);
4160 4138  
4161 4139          /*
4162 4140           * Suppress any out of inodes messages if we will retry on
4163 4141           * ENOSPC
4164 4142           */
4165 4143          if (retry)
4166 4144                  dip->i_flag |= IQUIET;
4167 4145  
4168 4146          error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
4169 4147  
4170 4148          dip->i_flag &= ~IQUIET;
4171 4149  
4172 4150          rw_exit(&dip->i_contents);
4173 4151          rw_exit(&ufsvfsp->vfs_dqrwlock);
4174 4152          rw_exit(&dip->i_rwlock);
4175 4153  
4176 4154          if (error)
4177 4155                  goto unlock;
4178 4156  
4179 4157          /*
4180 4158           * OK.  The inode has been created.  Write out the data of the
4181 4159           * symbolic link.  Since symbolic links are metadata, and should
4182 4160           * remain consistent across a system crash, we need to force the
4183 4161           * data out synchronously.
4184 4162           *
4185 4163           * (This is a change from the semantics in earlier releases, which
4186 4164           * only created symbolic links synchronously if the semi-documented
4187 4165           * 'syncdir' option was set, or if we were being invoked by the NFS
4188 4166           * server, which requires symbolic links to be created synchronously.)
4189 4167           *
4190 4168           * We need to pass in a pointer for the residual length; otherwise
4191 4169           * ufs_rdwri() will always return EIO if it can't write the data,
4192 4170           * even if the error was really ENOSPC or EDQUOT.
4193 4171           */
4194 4172  
4195 4173          ioflag = FWRITE | FDSYNC;
4196 4174          residual = 0;
4197 4175  
4198 4176          rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4199 4177          rw_enter(&ip->i_contents, RW_WRITER);
4200 4178  
4201 4179          /*
4202 4180           * Suppress file system full messages if we will retry
4203 4181           */
4204 4182          if (retry)
4205 4183                  ip->i_flag |= IQUIET;
4206 4184  
4207 4185          error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
4208 4186              (offset_t)0, UIO_SYSSPACE, &residual, cr);
4209 4187  
4210 4188          ip->i_flag &= ~IQUIET;
4211 4189  
4212 4190          if (error) {
4213 4191                  rw_exit(&ip->i_contents);
4214 4192                  rw_exit(&ufsvfsp->vfs_dqrwlock);
4215 4193                  goto remove;
4216 4194          }
4217 4195  
4218 4196          /*
4219 4197           * If the link's data is small enough, we can cache it in the inode.
4220 4198           * This is a "fast symbolic link".  We don't use the first direct
4221 4199           * block because that's actually used to point at the symbolic link's
4222 4200           * contents on disk; but we know that none of the other direct or
4223 4201           * indirect blocks can be used because symbolic links are restricted
4224 4202           * to be smaller than a file system block.
4225 4203           */
4226 4204  
4227 4205          ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
4228 4206  
4229 4207          if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
4230 4208                  if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
4231 4209                          ip->i_flag |= IFASTSYMLNK;
4232 4210                  } else {
4233 4211                          int i;
4234 4212                          /* error, clear garbage left behind */
4235 4213                          for (i = 1; i < NDADDR; i++)
4236 4214                                  ip->i_db[i] = 0;
4237 4215                          for (i = 0; i < NIADDR; i++)
4238 4216                                  ip->i_ib[i] = 0;
4239 4217                  }
4240 4218          }
4241 4219  
4242 4220          rw_exit(&ip->i_contents);
4243 4221          rw_exit(&ufsvfsp->vfs_dqrwlock);
4244 4222  
4245 4223          /*
4246 4224           * OK.  We've successfully created the symbolic link.  All that
4247 4225           * remains is to insert it into the appropriate directory.
4248 4226           */
4249 4227  
4250 4228          rw_enter(&dip->i_rwlock, RW_WRITER);
4251 4229          error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr);
4252 4230          rw_exit(&dip->i_rwlock);
4253 4231  
4254 4232          /*
4255 4233           * Fall through into remove-on-error code.  We're either done, or we
4256 4234           * need to remove the inode (if we couldn't insert it).
4257 4235           */
4258 4236  
4259 4237  remove:
4260 4238          if (error && (ip != NULL)) {
4261 4239                  rw_enter(&ip->i_contents, RW_WRITER);
4262 4240                  ip->i_nlink--;
4263 4241                  ip->i_flag |= ICHG;
4264 4242                  ip->i_seq++;
4265 4243                  ufs_setreclaim(ip);
4266 4244                  rw_exit(&ip->i_contents);
4267 4245          }
4268 4246  
4269 4247  unlock:
4270 4248          if (ip != NULL)
4271 4249                  VN_RELE(ITOV(ip));
4272 4250  
4273 4251          if (ulp) {
4274 4252                  int terr = 0;
4275 4253  
4276 4254                  TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK,
4277 4255                      trans_size);
4278 4256                  ufs_lockfs_end(ulp);
4279 4257                  if (error == 0)
4280 4258                          error = terr;
4281 4259          }
4282 4260  
4283 4261          /*
4284 4262           * We may have failed due to lack of an inode or of a block to
4285 4263           * store the target in.  Try flushing the delete queue to free
4286 4264           * logically-available things up and try again.
4287 4265           */
4288 4266          if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
4289 4267                  ufs_delete_drain_wait(ufsvfsp, 1);
4290 4268                  retry = 0;
4291 4269                  goto again;
4292 4270          }

↓ open down ↓

162 lines elided

↑ open up ↑

4293 4271  
4294 4272  out:
4295 4273          return (error);
4296 4274  }
4297 4275  
4298 4276  /*
4299 4277   * Ufs specific routine used to do ufs io.
4300 4278   */
4301 4279  int
4302 4280  ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
4303      -        ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
4304      -        struct cred *cr)
     4281 +    ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
     4282 +    struct cred *cr)
4305 4283  {
4306 4284          struct uio auio;
4307 4285          struct iovec aiov;
4308 4286          int error;
4309 4287  
4310 4288          ASSERT(RW_LOCK_HELD(&ip->i_contents));
4311 4289  
4312 4290          bzero((caddr_t)&auio, sizeof (uio_t));
4313 4291          bzero((caddr_t)&aiov, sizeof (iovec_t));
4314 4292

4315 4293          aiov.iov_base = base;
4316 4294          aiov.iov_len = len;
4317 4295          auio.uio_iov = &aiov;
4318 4296          auio.uio_iovcnt = 1;
4319 4297          auio.uio_loffset = offset;
4320 4298          auio.uio_segflg = (short)seg;
4321 4299          auio.uio_resid = len;
4322 4300  
4323 4301          if (rw == UIO_WRITE) {
4324 4302                  auio.uio_fmode = FWRITE;
4325 4303                  auio.uio_extflg = UIO_COPY_DEFAULT;
4326 4304                  auio.uio_llimit = curproc->p_fsz_ctl;
4327 4305                  error = wrip(ip, &auio, ioflag, cr);
4328 4306          } else {
4329 4307                  auio.uio_fmode = FREAD;
4330 4308                  auio.uio_extflg = UIO_COPY_CACHED;
4331 4309                  auio.uio_llimit = MAXOFFSET_T;
4332 4310                  error = rdip(ip, &auio, ioflag, cr);
4333 4311          }
4334 4312  
4335 4313          if (aresid) {
4336 4314                  *aresid = auio.uio_resid;
4337 4315          } else if (auio.uio_resid) {
4338 4316                  error = EIO;
4339 4317          }
4340 4318          return (error);
4341 4319  }
4342 4320  
4343 4321  /*ARGSUSED*/
4344 4322  static int
4345 4323  ufs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
4346 4324  {
4347 4325          struct ufid *ufid;
4348 4326          struct inode *ip = VTOI(vp);
4349 4327  
4350 4328          if (ip->i_ufsvfs == NULL)
4351 4329                  return (EIO);
4352 4330  
4353 4331          if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
4354 4332                  fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
4355 4333                  return (ENOSPC);
4356 4334          }
4357 4335  
4358 4336          ufid = (struct ufid *)fidp;
4359 4337          bzero((char *)ufid, sizeof (struct ufid));
4360 4338          ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
4361 4339          ufid->ufid_ino = ip->i_number;
4362 4340          ufid->ufid_gen = ip->i_gen;
4363 4341  
4364 4342          return (0);
4365 4343  }
4366 4344  
4367 4345  /* ARGSUSED2 */
4368 4346  static int
4369 4347  ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4370 4348  {
4371 4349          struct inode    *ip = VTOI(vp);
4372 4350          struct ufsvfs   *ufsvfsp;
4373 4351          int             forcedirectio;
4374 4352  
4375 4353          /*
4376 4354           * Read case is easy.
4377 4355           */
4378 4356          if (!write_lock) {
4379 4357                  rw_enter(&ip->i_rwlock, RW_READER);
4380 4358                  return (V_WRITELOCK_FALSE);
4381 4359          }
4382 4360  
4383 4361          /*
4384 4362           * Caller has requested a writer lock, but that inhibits any
4385 4363           * concurrency in the VOPs that follow. Acquire the lock shared
4386 4364           * and defer exclusive access until it is known to be needed in
4387 4365           * other VOP handlers. Some cases can be determined here.
4388 4366           */
4389 4367  
4390 4368          /*
4391 4369           * If directio is not set, there is no chance of concurrency,
4392 4370           * so just acquire the lock exclusive. Beware of a forced
4393 4371           * unmount before looking at the mount option.
4394 4372           */
4395 4373          ufsvfsp = ip->i_ufsvfs;
4396 4374          forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
4397 4375          if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
4398 4376              !ufs_allow_shared_writes) {
4399 4377                  rw_enter(&ip->i_rwlock, RW_WRITER);
4400 4378                  return (V_WRITELOCK_TRUE);
4401 4379          }
4402 4380  
4403 4381          /*
4404 4382           * Mandatory locking forces acquiring i_rwlock exclusive.
4405 4383           */
4406 4384          if (MANDLOCK(vp, ip->i_mode)) {
4407 4385                  rw_enter(&ip->i_rwlock, RW_WRITER);
4408 4386                  return (V_WRITELOCK_TRUE);
4409 4387          }
4410 4388  
4411 4389          /*
4412 4390           * Acquire the lock shared in case a concurrent write follows.
4413 4391           * Mandatory locking could have become enabled before the lock
4414 4392           * was acquired. Re-check and upgrade if needed.
4415 4393           */
4416 4394          rw_enter(&ip->i_rwlock, RW_READER);
4417 4395          if (MANDLOCK(vp, ip->i_mode)) {
4418 4396                  rw_exit(&ip->i_rwlock);
4419 4397                  rw_enter(&ip->i_rwlock, RW_WRITER);
4420 4398                  return (V_WRITELOCK_TRUE);
4421 4399          }
4422 4400          return (V_WRITELOCK_FALSE);
4423 4401  }
4424 4402  
4425 4403  /*ARGSUSED*/

↓ open down ↓

111 lines elided

↑ open up ↑

4426 4404  static void
4427 4405  ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4428 4406  {
4429 4407          struct inode    *ip = VTOI(vp);
4430 4408  
4431 4409          rw_exit(&ip->i_rwlock);
4432 4410  }
4433 4411  
4434 4412  /* ARGSUSED */
4435 4413  static int
4436      -ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
4437      -        caller_context_t *ct)
     4414 +ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4438 4415  {
4439 4416          return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4440 4417  }
4441 4418  
4442 4419  /* ARGSUSED */
4443 4420  static int
4444 4421  ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4445      -        offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
4446      -        caller_context_t *ct)
     4422 +    offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
     4423 +    caller_context_t *ct)
4447 4424  {
4448 4425          struct inode *ip = VTOI(vp);
4449 4426  
4450 4427          if (ip->i_ufsvfs == NULL)
4451 4428                  return (EIO);
4452 4429  
4453 4430          /*
4454 4431           * If file is being mapped, disallow frlock.
4455 4432           * XXX I am not holding tlock while checking i_mapcnt because the
4456 4433           * current locking strategy drops all locks before calling fs_frlock.

4457 4434           * So, mapcnt could change before we enter fs_frlock making it

↓ open down ↓

1 lines elided

↑ open up ↑

4458 4435           * meaningless to have held tlock in the first place.
4459 4436           */
4460 4437          if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
4461 4438                  return (EAGAIN);
4462 4439          return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4463 4440  }
4464 4441  
4465 4442  /* ARGSUSED */
4466 4443  static int
4467 4444  ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4468      -        offset_t offset, cred_t *cr, caller_context_t *ct)
     4445 +    offset_t offset, cred_t *cr, caller_context_t *ct)
4469 4446  {
4470 4447          struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
4471 4448          struct ulockfs *ulp;
4472 4449          int error;
4473 4450  
4474 4451          if ((error = convoff(vp, bfp, 0, offset)) == 0) {
4475 4452                  if (cmd == F_FREESP) {
4476 4453                          error = ufs_lockfs_begin(ufsvfsp, &ulp,
4477 4454                              ULOCKFS_SPACE_MASK);
4478 4455                          if (error)

4479 4456                                  return (error);
4480 4457                          error = ufs_freesp(vp, bfp, flag, cr);
4481 4458  
4482 4459                          if (error == 0 && bfp->l_start == 0)
4483 4460                                  vnevent_truncate(vp, ct);
4484 4461                  } else if (cmd == F_ALLOCSP) {
4485 4462                          error = ufs_lockfs_begin(ufsvfsp, &ulp,
4486 4463                              ULOCKFS_FALLOCATE_MASK);
4487 4464                          if (error)
4488 4465                                  return (error);
4489 4466                          error = ufs_allocsp(vp, bfp, cr);
4490 4467                  } else
4491 4468                          return (EINVAL); /* Command not handled here */
4492 4469  
4493 4470                  if (ulp)
4494 4471                          ufs_lockfs_end(ulp);
4495 4472  
4496 4473          }
4497 4474          return (error);
4498 4475  }
4499 4476  
4500 4477  /*
4501 4478   * Used to determine if read ahead should be done. Also used to
4502 4479   * determine when write back occurs.
4503 4480   */
4504 4481  #define CLUSTSZ(ip)             ((ip)->i_ufsvfs->vfs_ioclustsz)
4505 4482  
4506 4483  /*
4507 4484   * A faster version of ufs_getpage.
4508 4485   *
4509 4486   * We optimize by inlining the pvn_getpages iterator, eliminating
4510 4487   * calls to bmap_read if file doesn't have UFS holes, and avoiding
4511 4488   * the overhead of page_exists().
4512 4489   *
4513 4490   * When a file has UFS_HOLES and ufs_getpage is called with S_READ,
4514 4491   * we set *protp to PROT_READ to avoid calling bmap_read. This approach
4515 4492   * victimizes performance when a file with UFS holes is faulted
4516 4493   * first in the S_READ mode, and then in the S_WRITE mode. We will get
4517 4494   * two MMU faults in this case.
4518 4495   *
4519 4496   * XXX - the inode fields which control the sequential mode are not

↓ open down ↓

41 lines elided

↑ open up ↑

4520 4497   *       protected by any mutex. The read ahead will act wild if
4521 4498   *       multiple processes will access the file concurrently and
4522 4499   *       some of them in sequential mode. One particularly bad case
4523 4500   *       is if another thread will change the value of i_nextrio between
4524 4501   *       the time this thread tests the i_nextrio value and then reads it
4525 4502   *       again to use it as the offset for the read ahead.
4526 4503   */
4527 4504  /*ARGSUSED*/
4528 4505  static int
4529 4506  ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
4530      -        page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
4531      -        enum seg_rw rw, struct cred *cr, caller_context_t *ct)
     4507 +    page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
     4508 +    enum seg_rw rw, struct cred *cr, caller_context_t *ct)
4532 4509  {
4533 4510          u_offset_t      uoff = (u_offset_t)off; /* type conversion */
4534 4511          u_offset_t      pgoff;
4535 4512          u_offset_t      eoff;
4536 4513          struct inode    *ip = VTOI(vp);
4537 4514          struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
4538 4515          struct fs       *fs;
4539 4516          struct ulockfs  *ulp;
4540 4517          page_t          **pl;
4541 4518          caddr_t         pgaddr;

4542 4519          krw_t           rwtype;
4543 4520          int             err;
4544 4521          int             has_holes;
4545 4522          int             beyond_eof;
4546 4523          int             seqmode;
4547 4524          int             pgsize = PAGESIZE;
4548 4525          int             dolock;
4549 4526          int             do_qlock;
4550 4527          int             trans_size;
4551 4528  
4552 4529          ASSERT((uoff & PAGEOFFSET) == 0);
4553 4530  
4554 4531          if (protp)
4555 4532                  *protp = PROT_ALL;
4556 4533  
4557 4534          /*
4558 4535           * Obey the lockfs protocol
4559 4536           */
4560 4537          err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
4561 4538              rw == S_READ || rw == S_EXEC, protp);
4562 4539          if (err)
4563 4540                  goto out;
4564 4541  
4565 4542          fs = ufsvfsp->vfs_fs;
4566 4543  
4567 4544          if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
4568 4545              !(vp->v_flag & VISSWAP)) {
4569 4546                  /*
4570 4547                   * Try to start a transaction, will return if blocking is
4571 4548                   * expected to occur and the address space is not the
4572 4549                   * kernel address space.
4573 4550                   */
4574 4551                  trans_size = TOP_GETPAGE_SIZE(ip);
4575 4552                  if (seg->s_as != &kas) {
4576 4553                          TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
4577 4554                              trans_size, err)
4578 4555                          if (err == EWOULDBLOCK) {
4579 4556                                  /*
4580 4557                                   * Use EDEADLK here because the VM code
4581 4558                                   * can normally never see this error.
4582 4559                                   */
4583 4560                                  err = EDEADLK;
4584 4561                                  ufs_lockfs_end(ulp);
4585 4562                                  goto out;
4586 4563                          }
4587 4564                  } else {
4588 4565                          TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4589 4566                  }
4590 4567          }
4591 4568  
4592 4569          if (vp->v_flag & VNOMAP) {
4593 4570                  err = ENOSYS;
4594 4571                  goto unlock;
4595 4572          }
4596 4573  
4597 4574          seqmode = ip->i_nextr == uoff && rw != S_CREATE;
4598 4575  
4599 4576          rwtype = RW_READER;             /* start as a reader */
4600 4577          dolock = (rw_owner(&ip->i_contents) != curthread);
4601 4578          /*
4602 4579           * If this thread owns the lock, i.e., this thread grabbed it
4603 4580           * as writer somewhere above, then we don't need to grab the
4604 4581           * lock as reader in this routine.
4605 4582           */
4606 4583          do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);
4607 4584  
4608 4585  retrylock:
4609 4586          if (dolock) {
4610 4587                  /*
4611 4588                   * Grab the quota lock if we need to call
4612 4589                   * bmap_write() below (with i_contents as writer).
4613 4590                   */
4614 4591                  if (do_qlock && rwtype == RW_WRITER)
4615 4592                          rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4616 4593                  rw_enter(&ip->i_contents, rwtype);
4617 4594          }
4618 4595  
4619 4596          /*
4620 4597           * We may be getting called as a side effect of a bmap using
4621 4598           * fbread() when the blocks might be being allocated and the
4622 4599           * size has not yet been up'ed.  In this case we want to be
4623 4600           * able to return zero pages if we get back UFS_HOLE from
4624 4601           * calling bmap for a non write case here.  We also might have
4625 4602           * to read some frags from the disk into a page if we are
4626 4603           * extending the number of frags for a given lbn in bmap().
4627 4604           * Large Files: The read of i_size here is atomic because
4628 4605           * i_contents is held here. If dolock is zero, the lock
4629 4606           * is held in bmap routines.
4630 4607           */
4631 4608          beyond_eof = uoff + len >
4632 4609              P2ROUNDUP_TYPED(ip->i_size, PAGESIZE, u_offset_t);
4633 4610          if (beyond_eof && seg != segkmap) {
4634 4611                  if (dolock) {
4635 4612                          rw_exit(&ip->i_contents);
4636 4613                          if (do_qlock && rwtype == RW_WRITER)
4637 4614                                  rw_exit(&ufsvfsp->vfs_dqrwlock);
4638 4615                  }
4639 4616                  err = EFAULT;
4640 4617                  goto unlock;
4641 4618          }
4642 4619  
4643 4620          /*
4644 4621           * Must hold i_contents lock throughout the call to pvn_getpages
4645 4622           * since locked pages are returned from each call to ufs_getapage.
4646 4623           * Must *not* return locked pages and then try for contents lock
4647 4624           * due to lock ordering requirements (inode > page)
4648 4625           */
4649 4626  
4650 4627          has_holes = bmap_has_holes(ip);
4651 4628  
4652 4629          if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
4653 4630                  int     blk_size;
4654 4631                  u_offset_t offset;
4655 4632  
4656 4633                  /*
4657 4634                   * We must acquire the RW_WRITER lock in order to
4658 4635                   * call bmap_write().
4659 4636                   */
4660 4637                  if (dolock && rwtype == RW_READER) {
4661 4638                          rwtype = RW_WRITER;
4662 4639  
4663 4640                          /*
4664 4641                           * Grab the quota lock before
4665 4642                           * upgrading i_contents, but if we can't grab it
4666 4643                           * don't wait here due to lock order:
4667 4644                           * vfs_dqrwlock > i_contents.
4668 4645                           */
4669 4646                          if (do_qlock &&
4670 4647                              rw_tryenter(&ufsvfsp->vfs_dqrwlock, RW_READER)
4671 4648                              == 0) {
4672 4649                                  rw_exit(&ip->i_contents);
4673 4650                                  goto retrylock;
4674 4651                          }
4675 4652                          if (!rw_tryupgrade(&ip->i_contents)) {
4676 4653                                  rw_exit(&ip->i_contents);
4677 4654                                  if (do_qlock)
4678 4655                                          rw_exit(&ufsvfsp->vfs_dqrwlock);
4679 4656                                  goto retrylock;
4680 4657                          }
4681 4658                  }
4682 4659  
4683 4660                  /*
4684 4661                   * May be allocating disk blocks for holes here as
4685 4662                   * a result of mmap faults. write(2) does the bmap_write
4686 4663                   * in rdip/wrip, not here. We are not dealing with frags
4687 4664                   * in this case.
4688 4665                   */
4689 4666                  /*
4690 4667                   * Large Files: We cast fs_bmask field to offset_t
4691 4668                   * just as we do for MAXBMASK because uoff is a 64-bit
4692 4669                   * data type. fs_bmask will still be a 32-bit type
4693 4670                   * as we cannot change any ondisk data structures.
4694 4671                   */
4695 4672  
4696 4673                  offset = uoff & (offset_t)fs->fs_bmask;
4697 4674                  while (offset < uoff + len) {
4698 4675                          blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
4699 4676                          err = bmap_write(ip, offset, blk_size,
4700 4677                              BI_NORMAL, NULL, cr);
4701 4678                          if (ip->i_flag & (ICHG|IUPD))
4702 4679                                  ip->i_seq++;
4703 4680                          if (err)
4704 4681                                  goto update_inode;
4705 4682                          offset += blk_size; /* XXX - make this contig */
4706 4683                  }
4707 4684          }
4708 4685  
4709 4686          /*
4710 4687           * Can be a reader from now on.
4711 4688           */
4712 4689          if (dolock && rwtype == RW_WRITER) {
4713 4690                  rw_downgrade(&ip->i_contents);
4714 4691                  /*
4715 4692                   * We can release vfs_dqrwlock early so do it, but make
4716 4693                   * sure we don't try to release it again at the bottom.
4717 4694                   */
4718 4695                  if (do_qlock) {
4719 4696                          rw_exit(&ufsvfsp->vfs_dqrwlock);
4720 4697                          do_qlock = 0;
4721 4698                  }
4722 4699          }
4723 4700  
4724 4701          /*
4725 4702           * We remove PROT_WRITE in cases when the file has UFS holes
4726 4703           * because we don't want to call bmap_read() to check each
4727 4704           * page if it is backed with a disk block.
4728 4705           */
4729 4706          if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
4730 4707                  *protp &= ~PROT_WRITE;
4731 4708  
4732 4709          err = 0;
4733 4710  
4734 4711          /*
4735 4712           * The loop looks up pages in the range [off, off + len).
4736 4713           * For each page, we first check if we should initiate an asynchronous
4737 4714           * read ahead before we call page_lookup (we may sleep in page_lookup
4738 4715           * for a previously initiated disk read).
4739 4716           */
4740 4717          eoff = (uoff + len);
4741 4718          for (pgoff = uoff, pgaddr = addr, pl = plarr;
4742 4719              pgoff < eoff; /* empty */) {
4743 4720                  page_t  *pp;
4744 4721                  u_offset_t      nextrio;
4745 4722                  se_t    se;
4746 4723                  int retval;
4747 4724  
4748 4725                  se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);
4749 4726  
4750 4727                  /* Handle async getpage (faultahead) */
4751 4728                  if (plarr == NULL) {
4752 4729                          ip->i_nextrio = pgoff;
4753 4730                          (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4754 4731                          pgoff += pgsize;
4755 4732                          pgaddr += pgsize;
4756 4733                          continue;
4757 4734                  }
4758 4735                  /*
4759 4736                   * Check if we should initiate read ahead of next cluster.
4760 4737                   * We call page_exists only when we need to confirm that
4761 4738                   * we have the current page before we initiate the read ahead.
4762 4739                   */
4763 4740                  nextrio = ip->i_nextrio;
4764 4741                  if (seqmode &&
4765 4742                      pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
4766 4743                      nextrio < ip->i_size && page_exists(vp, pgoff)) {
4767 4744                          retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4768 4745                          /*
4769 4746                           * We always read ahead the next cluster of data
4770 4747                           * starting from i_nextrio. If the page (vp,nextrio)
4771 4748                           * is actually in core at this point, the routine
4772 4749                           * ufs_getpage_ra() will stop pre-fetching data
4773 4750                           * until we read that page in a synchronized manner
4774 4751                           * through ufs_getpage_miss(). So, we should increase
4775 4752                           * i_nextrio if the page (vp, nextrio) exists.
4776 4753                           */
4777 4754                          if ((retval == 0) && page_exists(vp, nextrio)) {
4778 4755                                  ip->i_nextrio = nextrio + pgsize;
4779 4756                          }
4780 4757                  }
4781 4758  
4782 4759                  if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
4783 4760                          /*
4784 4761                           * We found the page in the page cache.
4785 4762                           */
4786 4763                          *pl++ = pp;
4787 4764                          pgoff += pgsize;
4788 4765                          pgaddr += pgsize;
4789 4766                          len -= pgsize;
4790 4767                          plsz -= pgsize;
4791 4768                  } else  {
4792 4769                          /*
4793 4770                           * We have to create the page, or read it from disk.
4794 4771                           */
4795 4772                          if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
4796 4773                              pl, plsz, rw, seqmode))
4797 4774                                  goto error;
4798 4775  
4799 4776                          while (*pl != NULL) {
4800 4777                                  pl++;
4801 4778                                  pgoff += pgsize;
4802 4779                                  pgaddr += pgsize;
4803 4780                                  len -= pgsize;
4804 4781                                  plsz -= pgsize;
4805 4782                          }
4806 4783                  }
4807 4784          }
4808 4785  
4809 4786          /*
4810 4787           * Return pages up to plsz if they are in the page cache.
4811 4788           * We cannot return pages if there is a chance that they are
4812 4789           * backed with a UFS hole and rw is S_WRITE or S_CREATE.
4813 4790           */
4814 4791          if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
4815 4792  
4816 4793                  ASSERT((protp == NULL) ||
4817 4794                      !(has_holes && (*protp & PROT_WRITE)));
4818 4795  
4819 4796                  eoff = pgoff + plsz;
4820 4797                  while (pgoff < eoff) {
4821 4798                          page_t          *pp;
4822 4799  
4823 4800                          if ((pp = page_lookup_nowait(vp, pgoff,
4824 4801                              SE_SHARED)) == NULL)
4825 4802                                  break;
4826 4803  
4827 4804                          *pl++ = pp;
4828 4805                          pgoff += pgsize;
4829 4806                          plsz -= pgsize;
4830 4807                  }
4831 4808          }
4832 4809  
4833 4810          if (plarr)
4834 4811                  *pl = NULL;                     /* Terminate page list */
4835 4812          ip->i_nextr = pgoff;
4836 4813  
4837 4814  error:
4838 4815          if (err && plarr) {
4839 4816                  /*
4840 4817                   * Release any pages we have locked.
4841 4818                   */
4842 4819                  while (pl > &plarr[0])
4843 4820                          page_unlock(*--pl);
4844 4821  
4845 4822                  plarr[0] = NULL;
4846 4823          }
4847 4824  
4848 4825  update_inode:
4849 4826          /*
4850 4827           * If the inode is not already marked for IACC (in rdip() for read)
4851 4828           * and the inode is not marked for no access time update (in wrip()
4852 4829           * for write) then update the inode access time and mod time now.
4853 4830           */
4854 4831          if ((ip->i_flag & (IACC | INOACC)) == 0) {
4855 4832                  if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
4856 4833                          if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
4857 4834                              (fs->fs_ronly == 0) &&
4858 4835                              (!ufsvfsp->vfs_noatime)) {
4859 4836                                  mutex_enter(&ip->i_tlock);
4860 4837                                  ip->i_flag |= IACC;
4861 4838                                  ITIMES_NOLOCK(ip);
4862 4839                                  mutex_exit(&ip->i_tlock);
4863 4840                          }
4864 4841                  }
4865 4842          }
4866 4843  
4867 4844          if (dolock) {
4868 4845                  rw_exit(&ip->i_contents);
4869 4846                  if (do_qlock && rwtype == RW_WRITER)
4870 4847                          rw_exit(&ufsvfsp->vfs_dqrwlock);
4871 4848          }
4872 4849  
4873 4850  unlock:
4874 4851          if (ulp) {
4875 4852                  if ((rw == S_CREATE || rw == S_WRITE) &&
4876 4853                      !(vp->v_flag & VISSWAP)) {
4877 4854                          TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4878 4855                  }
4879 4856                  ufs_lockfs_end(ulp);
4880 4857          }
4881 4858  out:
4882 4859          return (err);
4883 4860  }

↓ open down ↓

342 lines elided

↑ open up ↑

4884 4861  
4885 4862  /*
4886 4863   * ufs_getpage_miss is called when ufs_getpage missed the page in the page
4887 4864   * cache. The page is either read from the disk, or it's created.
4888 4865   * A page is created (without disk read) if rw == S_CREATE, or if
4889 4866   * the page is not backed with a real disk block (UFS hole).
4890 4867   */
4891 4868  /* ARGSUSED */
4892 4869  static int
4893 4870  ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
4894      -        caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
     4871 +    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
4895 4872  {
4896 4873          struct inode    *ip = VTOI(vp);
4897 4874          page_t          *pp;
4898 4875          daddr_t         bn;
4899 4876          size_t          io_len;
4900 4877          int             crpage = 0;
4901 4878          int             err;
4902 4879          int             contig;
4903 4880          int             bsize = ip->i_fs->fs_bsize;
4904 4881

4905 4882          /*
4906 4883           * Figure out whether the page can be created, or must be
4907 4884           * read from the disk.
4908 4885           */
4909 4886          if (rw == S_CREATE)
4910 4887                  crpage = 1;
4911 4888          else {
4912 4889                  contig = 0;
4913 4890                  if (err = bmap_read(ip, off, &bn, &contig))
4914 4891                          return (err);
4915 4892  
4916 4893                  crpage = (bn == UFS_HOLE);
4917 4894  
4918 4895                  /*
4919 4896                   * If it's also a fallocated block that hasn't been written to
4920 4897                   * yet, we will treat it just like a UFS_HOLE and create
4921 4898                   * a zero page for it
4922 4899                   */
4923 4900                  if (ISFALLOCBLK(ip, bn))
4924 4901                          crpage = 1;
4925 4902          }
4926 4903  
4927 4904          if (crpage) {
4928 4905                  if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg,
4929 4906                      addr)) == NULL) {
4930 4907                          return (ufs_fault(vp,
4931 4908                              "ufs_getpage_miss: page_create == NULL"));
4932 4909                  }
4933 4910  
4934 4911                  if (rw != S_CREATE)
4935 4912                          pagezero(pp, 0, PAGESIZE);
4936 4913  
4937 4914                  io_len = PAGESIZE;
4938 4915          } else {
4939 4916                  u_offset_t      io_off;
4940 4917                  uint_t  xlen;
4941 4918                  struct buf      *bp;
4942 4919                  ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
4943 4920  
4944 4921                  /*
4945 4922                   * If access is not in sequential order, we read from disk
4946 4923                   * in bsize units.
4947 4924                   *
4948 4925                   * We limit the size of the transfer to bsize if we are reading
4949 4926                   * from the beginning of the file. Note in this situation we
4950 4927                   * will hedge our bets and initiate an async read ahead of
4951 4928                   * the second block.
4952 4929                   */
4953 4930                  if (!seq || off == 0)
4954 4931                          contig = MIN(contig, bsize);
4955 4932  
4956 4933                  pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4957 4934                      &io_len, off, contig, 0);
4958 4935  
4959 4936                  /*
4960 4937                   * Some other thread has entered the page.
4961 4938                   * ufs_getpage will retry page_lookup.
4962 4939                   */
4963 4940                  if (pp == NULL) {
4964 4941                          pl[0] = NULL;
4965 4942                          return (0);
4966 4943                  }
4967 4944  
4968 4945                  /*
4969 4946                   * Zero part of the page which we are not
4970 4947                   * going to read from the disk.
4971 4948                   */
4972 4949                  xlen = io_len & PAGEOFFSET;
4973 4950                  if (xlen != 0)
4974 4951                          pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4975 4952  
4976 4953                  bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
4977 4954                  bp->b_edev = ip->i_dev;
4978 4955                  bp->b_dev = cmpdev(ip->i_dev);
4979 4956                  bp->b_blkno = bn;
4980 4957                  bp->b_un.b_addr = (caddr_t)0;
4981 4958                  bp->b_file = ip->i_vnode;
4982 4959                  bp->b_offset = off;
4983 4960  
4984 4961                  if (ufsvfsp->vfs_log) {
4985 4962                          lufs_read_strategy(ufsvfsp->vfs_log, bp);
4986 4963                  } else if (ufsvfsp->vfs_snapshot) {
4987 4964                          fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
4988 4965                  } else {
4989 4966                          ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
4990 4967                          ub.ub_getpages.value.ul++;
4991 4968                          (void) bdev_strategy(bp);
4992 4969                          lwp_stat_update(LWP_STAT_INBLK, 1);
4993 4970                  }
4994 4971  
4995 4972                  ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
4996 4973  
4997 4974                  /*
4998 4975                   * If the file access is sequential, initiate read ahead
4999 4976                   * of the next cluster.
5000 4977                   */
5001 4978                  if (seq && ip->i_nextrio < ip->i_size)
5002 4979                          (void) ufs_getpage_ra(vp, off, seg, addr);
5003 4980                  err = biowait(bp);
5004 4981                  pageio_done(bp);
5005 4982  
5006 4983                  if (err) {
5007 4984                          pvn_read_done(pp, B_ERROR);
5008 4985                          return (err);
5009 4986                  }
5010 4987          }
5011 4988  
5012 4989          pvn_plist_init(pp, pl, plsz, off, io_len, rw);
5013 4990          return (0);
5014 4991  }
5015 4992  
5016 4993  /*
5017 4994   * Read ahead a cluster from the disk. Returns the length in bytes of the read ahead that was started, or 0 if none was started.
5018 4995   */
5019 4996  static int
5020 4997  ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
5021 4998  {
5022 4999          struct inode    *ip = VTOI(vp);
5023 5000          page_t          *pp;
5024 5001          u_offset_t      io_off = ip->i_nextrio;
5025 5002          ufsvfs_t        *ufsvfsp;
5026 5003          caddr_t         addr2 = addr + (io_off - off);
5027 5004          struct buf      *bp;
5028 5005          daddr_t         bn;
5029 5006          size_t          io_len;
5030 5007          int             err;
5031 5008          int             contig;
5032 5009          int             xlen;
5033 5010          int             bsize = ip->i_fs->fs_bsize;
5034 5011  
5035 5012          /*
5036 5013           * If the directio advisory is in effect on this file,
5037 5014           * then do not do buffered read ahead. Read ahead makes
5038 5015           * it more difficult on threads using directio as they
5039 5016           * will be forced to flush the pages from this vnode.
5040 5017           */
5041 5018          if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5042 5019                  return (0);
5043 5020          if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
5044 5021                  return (0);
5045 5022  
5046 5023          /*
5047 5024           * Is this test needed?
5048 5025           */
5049 5026          if (addr2 >= seg->s_base + seg->s_size)
5050 5027                  return (0);
5051 5028  
5052 5029          contig = 0;
5053 5030          err = bmap_read(ip, io_off, &bn, &contig);
5054 5031          /*
5055 5032           * If it's a UFS_HOLE or a fallocated block, do not perform
5056 5033           * any read ahead, since there is probably nothing to read ahead
5057 5034           */
5058 5035          if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
5059 5036                  return (0);
5060 5037  
5061 5038          /*
5062 5039           * Limit the transfer size to bsize if this is the 2nd block.
5063 5040           */
5064 5041          if (io_off == (u_offset_t)bsize)
5065 5042                  contig = MIN(contig, bsize);
5066 5043  
5067 5044          if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
5068 5045              &io_len, io_off, contig, 1)) == NULL)
5069 5046                  return (0);
5070 5047  
5071 5048          /*
5072 5049           * Zero part of page which we are not going to read from disk
5073 5050           */
5074 5051          if ((xlen = (io_len & PAGEOFFSET)) > 0)
5075 5052                  pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
5076 5053  
5077 5054          ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
5078 5055  
5079 5056          bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
5080 5057          bp->b_edev = ip->i_dev;
5081 5058          bp->b_dev = cmpdev(ip->i_dev);
5082 5059          bp->b_blkno = bn;
5083 5060          bp->b_un.b_addr = (caddr_t)0;
5084 5061          bp->b_file = ip->i_vnode;
5085 5062          bp->b_offset = off;
5086 5063  
5087 5064          if (ufsvfsp->vfs_log) {
5088 5065                  lufs_read_strategy(ufsvfsp->vfs_log, bp);
5089 5066          } else if (ufsvfsp->vfs_snapshot) {
5090 5067                  fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5091 5068          } else {
5092 5069                  ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5093 5070                  ub.ub_getras.value.ul++;
5094 5071                  (void) bdev_strategy(bp);
5095 5072                  lwp_stat_update(LWP_STAT_INBLK, 1);
5096 5073          }
5097 5074  
5098 5075          return (io_len);
5099 5076  }
5100 5077  
5101 5078  int     ufs_delay = 1;

↓ open down ↓

197 lines elided

↑ open up ↑

5102 5079  /*
5103 5080   * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
5104 5081   *
5105 5082   * LMXXX - the inode really ought to contain a pointer to one of these
5106 5083   * async args.  Stuff gunk in there and just hand the whole mess off.
5107 5084   * This would replace i_delaylen, i_delayoff.
5108 5085   */
5109 5086  /*ARGSUSED*/
5110 5087  static int
5111 5088  ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
5112      -        struct cred *cr, caller_context_t *ct)
     5089 +    struct cred *cr, caller_context_t *ct)
5113 5090  {
5114 5091          struct inode *ip = VTOI(vp);
5115 5092          int err = 0;
5116 5093  
5117 5094          if (vp->v_count == 0) {
5118 5095                  return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
5119 5096          }
5120 5097  
5121 5098          /*
5122 5099           * XXX - Why should this check be made here?

5123 5100           */
5124 5101          if (vp->v_flag & VNOMAP) {
5125 5102                  err = ENOSYS;
5126 5103                  goto errout;
5127 5104          }
5128 5105  
5129 5106          if (ip->i_ufsvfs == NULL) {
5130 5107                  err = EIO;
5131 5108                  goto errout;
5132 5109          }
5133 5110  
5134 5111          if (flags & B_ASYNC) {
5135 5112                  if (ufs_delay && len &&
5136 5113                      (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
5137 5114                          mutex_enter(&ip->i_tlock);
5138 5115                          /*
5139 5116                           * If nobody stalled, start a new cluster.
5140 5117                           */
5141 5118                          if (ip->i_delaylen == 0) {
5142 5119                                  ip->i_delayoff = off;
5143 5120                                  ip->i_delaylen = len;
5144 5121                                  mutex_exit(&ip->i_tlock);
5145 5122                                  goto errout;
5146 5123                          }
5147 5124                          /*
5148 5125                           * If we have a full cluster or they are not contig,
5149 5126                           * then push last cluster and start over.
5150 5127                           */
5151 5128                          if (ip->i_delaylen >= CLUSTSZ(ip) ||
5152 5129                              ip->i_delayoff + ip->i_delaylen != off) {
5153 5130                                  u_offset_t doff;
5154 5131                                  size_t dlen;
5155 5132  
5156 5133                                  doff = ip->i_delayoff;
5157 5134                                  dlen = ip->i_delaylen;
5158 5135                                  ip->i_delayoff = off;
5159 5136                                  ip->i_delaylen = len;
5160 5137                                  mutex_exit(&ip->i_tlock);
5161 5138                                  err = ufs_putpages(vp, doff, dlen,
5162 5139                                      flags, cr);
5163 5140                                  /* LMXXX - flags are new val, not old */
5164 5141                                  goto errout;
5165 5142                          }
5166 5143                          /*
5167 5144                           * There is something there, it's not full, and
5168 5145                           * it is contig.
5169 5146                           */
5170 5147                          ip->i_delaylen += len;
5171 5148                          mutex_exit(&ip->i_tlock);
5172 5149                          goto errout;
5173 5150                  }
5174 5151                  /*
5175 5152                   * Must have weird flags or we are not clustering.
5176 5153                   */
5177 5154          }
5178 5155  
5179 5156          err = ufs_putpages(vp, off, len, flags, cr);
5180 5157  
5181 5158  errout:
5182 5159          return (err);
5183 5160  }

↓ open down ↓

61 lines elided

↑ open up ↑

5184 5161  
5185 5162  /*
5186 5163   * If len == 0, do from off to EOF.
5187 5164   *
5188 5165   * The normal cases should be len == 0 & off == 0 (entire vp list),
5189 5166   * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
5190 5167   * (from pageout).
5191 5168   */
5192 5169  /*ARGSUSED*/
5193 5170  static int
5194      -ufs_putpages(
5195      -        struct vnode *vp,
5196      -        offset_t off,
5197      -        size_t len,
5198      -        int flags,
5199      -        struct cred *cr)
     5171 +ufs_putpages(struct vnode *vp, offset_t off, size_t len, int flags,
     5172 +    struct cred *cr)
5200 5173  {
5201 5174          u_offset_t io_off;
5202 5175          u_offset_t eoff;
5203 5176          struct inode *ip = VTOI(vp);
5204 5177          page_t *pp;
5205 5178          size_t io_len;
5206 5179          int err = 0;
5207 5180          int dolock;
5208 5181  
5209 5182          if (vp->v_count == 0)

5210 5183                  return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
5211 5184          /*
5212 5185           * Acquire the readers/writer inode lock before locking
5213 5186           * any pages in this inode.
5214 5187           * The inode lock is held during i/o.
5215 5188           */
5216 5189          if (len == 0) {
5217 5190                  mutex_enter(&ip->i_tlock);
5218 5191                  ip->i_delayoff = ip->i_delaylen = 0;
5219 5192                  mutex_exit(&ip->i_tlock);
5220 5193          }
5221 5194          dolock = (rw_owner(&ip->i_contents) != curthread);
5222 5195          if (dolock) {
5223 5196                  /*
5224 5197                   * Must synchronize this thread and any possible thread
5225 5198                   * operating in the window of vulnerability in wrip().
5226 5199                   * It is dangerous to allow both a thread doing a putpage
5227 5200                   * and a thread writing, so serialize them.  The exception
5228 5201                   * is when the thread in wrip() does something which causes
5229 5202                   * a putpage operation.  Then, the thread must be allowed
5230 5203                   * to continue.  It may encounter a bmap_read problem in
5231 5204                   * ufs_putapage, but that is handled in ufs_putapage.
5232 5205                   * Allow async writers to proceed, we don't want to block
5233 5206                   * the pageout daemon.
5234 5207                   */
5235 5208                  if (ip->i_writer == curthread)
5236 5209                          rw_enter(&ip->i_contents, RW_READER);
5237 5210                  else {
5238 5211                          for (;;) {
5239 5212                                  rw_enter(&ip->i_contents, RW_READER);
5240 5213                                  mutex_enter(&ip->i_tlock);
5241 5214                                  /*
5242 5215                                   * If there is no thread in the critical
5243 5216                                   * section of wrip(), then proceed.
5244 5217                                   * Otherwise, wait until there isn't one.
5245 5218                                   */
5246 5219                                  if (ip->i_writer == NULL) {
5247 5220                                          mutex_exit(&ip->i_tlock);
5248 5221                                          break;
5249 5222                                  }
5250 5223                                  rw_exit(&ip->i_contents);
5251 5224                                  /*
5252 5225                                   * Bounce async writers when we have a writer
5253 5226                                   * working on this file so we don't deadlock
5254 5227                                   * the pageout daemon.
5255 5228                                   */
5256 5229                                  if (flags & B_ASYNC) {
5257 5230                                          mutex_exit(&ip->i_tlock);
5258 5231                                          return (0);
5259 5232                                  }
5260 5233                                  cv_wait(&ip->i_wrcv, &ip->i_tlock);
5261 5234                                  mutex_exit(&ip->i_tlock);
5262 5235                          }
5263 5236                  }
5264 5237          }
5265 5238  
5266 5239          if (!vn_has_cached_data(vp)) {
5267 5240                  if (dolock)
5268 5241                          rw_exit(&ip->i_contents);
5269 5242                  return (0);
5270 5243          }
5271 5244  
5272 5245          if (len == 0) {
5273 5246                  /*
5274 5247                   * Search the entire vp list for pages >= off.
5275 5248                   */
5276 5249                  err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage,
5277 5250                      flags, cr);
5278 5251          } else {
5279 5252                  /*
5280 5253                   * Loop over all offsets in the range looking for
5281 5254                   * pages to deal with.
5282 5255                   */
5283 5256                  if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
5284 5257                          eoff = MIN(off + len, eoff);
5285 5258                  else
5286 5259                          eoff = off + len;
5287 5260  
5288 5261                  for (io_off = off; io_off < eoff; io_off += io_len) {
5289 5262                          /*
5290 5263                           * If we are not invalidating, synchronously
5291 5264                           * freeing or writing pages, use the routine
5292 5265                           * page_lookup_nowait() to prevent reclaiming
5293 5266                           * them from the free list.
5294 5267                           */
5295 5268                          if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
5296 5269                                  pp = page_lookup(vp, io_off,
5297 5270                                      (flags & (B_INVAL | B_FREE)) ?
5298 5271                                      SE_EXCL : SE_SHARED);
5299 5272                          } else {
5300 5273                                  pp = page_lookup_nowait(vp, io_off,
5301 5274                                      (flags & B_FREE) ? SE_EXCL : SE_SHARED);
5302 5275                          }
5303 5276  
5304 5277                          if (pp == NULL || pvn_getdirty(pp, flags) == 0)
5305 5278                                  io_len = PAGESIZE;
5306 5279                          else {
5307 5280                                  u_offset_t *io_offp = &io_off;
5308 5281  
5309 5282                                  err = ufs_putapage(vp, pp, io_offp, &io_len,
5310 5283                                      flags, cr);
5311 5284                                  if (err != 0)
5312 5285                                          break;
5313 5286                                  /*
5314 5287                                   * "io_off" and "io_len" are returned as
5315 5288                                   * the range of pages we actually wrote.
5316 5289                                   * This allows us to skip ahead more quickly
5317 5290                                   * since several pages may've been dealt
5318 5291                                   * with by this iteration of the loop.
5319 5292                                   */
5320 5293                          }
5321 5294                  }
5322 5295          }
5323 5296          if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
5324 5297                  /*
5325 5298                   * We have just sync'ed back all the pages on
5326 5299                   * the inode, turn off the IMODTIME flag.
5327 5300                   */
5328 5301                  mutex_enter(&ip->i_tlock);
5329 5302                  ip->i_flag &= ~IMODTIME;
5330 5303                  mutex_exit(&ip->i_tlock);
5331 5304          }
5332 5305          if (dolock)
5333 5306                  rw_exit(&ip->i_contents);
5334 5307          return (err);
5335 5308  }
5336 5309  
5337 5310  static void
5338 5311  ufs_iodone(buf_t *bp)
5339 5312  {
5340 5313          struct inode *ip;
5341 5314  
5342 5315          ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
5343 5316  
5344 5317          bp->b_iodone = NULL;
5345 5318  
5346 5319          ip = VTOI(bp->b_pages->p_vnode);
5347 5320  
5348 5321          mutex_enter(&ip->i_tlock);
5349 5322          if (ip->i_writes >= ufs_LW) {
5350 5323                  if ((ip->i_writes -= bp->b_bcount) <= ufs_LW)
5351 5324                          if (ufs_WRITES)
5352 5325                                  cv_broadcast(&ip->i_wrcv); /* wake all up */
5353 5326          } else {
5354 5327                  ip->i_writes -= bp->b_bcount;
5355 5328          }
5356 5329  
5357 5330          mutex_exit(&ip->i_tlock);
5358 5331          iodone(bp);

↓ open down ↓

149 lines elided

↑ open up ↑

5359 5332  }
5360 5333  
5361 5334  /*
5362 5335   * Write out a single page, possibly klustering adjacent
5363 5336   * dirty pages.  The inode lock must be held.
5364 5337   *
5365 5338   * LMXXX - bsize < pagesize not done.
5366 5339   */
5367 5340  /*ARGSUSED*/
5368 5341  int
5369      -ufs_putapage(
5370      -        struct vnode *vp,
5371      -        page_t *pp,
5372      -        u_offset_t *offp,
5373      -        size_t *lenp,           /* return values */
5374      -        int flags,
5375      -        struct cred *cr)
     5342 +ufs_putapage(struct vnode *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
     5343 +    int flags, struct cred *cr)
5376 5344  {
5377 5345          u_offset_t io_off;
5378 5346          u_offset_t off;
5379 5347          struct inode *ip = VTOI(vp);
5380 5348          struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
5381 5349          struct fs *fs;
5382 5350          struct buf *bp;
5383 5351          size_t io_len;
5384 5352          daddr_t bn;
5385 5353          int err;

5386 5354          int contig;
5387 5355          int dotrans;
5388 5356  
5389 5357          ASSERT(RW_LOCK_HELD(&ip->i_contents));
5390 5358  
5391 5359          if (ufsvfsp == NULL) {
5392 5360                  err = EIO;
5393 5361                  goto out_trace;
5394 5362          }
5395 5363  
5396 5364          fs = ip->i_fs;
5397 5365          ASSERT(fs->fs_ronly == 0);
5398 5366  
5399 5367          /*
5400 5368           * If the modified time on the inode has not already been
5401 5369           * set elsewhere (e.g. for write/setattr) we set the time now.
5402 5370           * This gives us approximate modified times for mmap'ed files
5403 5371           * which are modified via stores in the user address space.
5404 5372           */
5405 5373          if ((ip->i_flag & IMODTIME) == 0) {
5406 5374                  mutex_enter(&ip->i_tlock);
5407 5375                  ip->i_flag |= IUPD;
5408 5376                  ip->i_seq++;
5409 5377                  ITIMES_NOLOCK(ip);
5410 5378                  mutex_exit(&ip->i_tlock);
5411 5379          }
5412 5380  
5413 5381          /*
5414 5382           * Align the request to a block boundary (for old file systems),
5415 5383           * and go ask bmap() how contiguous things are for this file.
5416 5384           */
5417 5385          off = pp->p_offset & (offset_t)fs->fs_bmask;    /* block align it */
5418 5386          contig = 0;
5419 5387          err = bmap_read(ip, off, &bn, &contig);
5420 5388          if (err)
5421 5389                  goto out;
5422 5390          if (bn == UFS_HOLE) {                   /* putpage never allocates */
5423 5391                  /*
5424 5392                   * logging device is in error mode; simply return EIO
5425 5393                   */
5426 5394                  if (TRANS_ISERROR(ufsvfsp)) {
5427 5395                          err = EIO;
5428 5396                          goto out;
5429 5397                  }
5430 5398                  /*
5431 5399                   * Oops, the thread in the window in wrip() did some
5432 5400                   * sort of operation which caused a putpage in the bad
5433 5401                   * range.  In this case, just return an error which will
5434 5402                   * cause the software modified bit on the page to set
5435 5403                   * and the page will get written out again later.
5436 5404                   */
5437 5405                  if (ip->i_writer == curthread) {
5438 5406                          err = EIO;
5439 5407                          goto out;
5440 5408                  }
5441 5409                  /*
5442 5410                   * If the pager is trying to push a page in the bad range
5443 5411                   * just tell it to try again later when things are better.
5444 5412                   */
5445 5413                  if (flags & B_ASYNC) {
5446 5414                          err = EAGAIN;
5447 5415                          goto out;
5448 5416                  }
5449 5417                  err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
5450 5418                  goto out;
5451 5419          }
5452 5420  
5453 5421          /*
5454 5422           * If it is an fallocate'd block, reverse the negativity since
5455 5423           * we are now writing to it
5456 5424           */
5457 5425          if (ISFALLOCBLK(ip, bn)) {
5458 5426                  err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
5459 5427                  if (err)
5460 5428                          goto out;
5461 5429  
5462 5430                  bn = -bn;
5463 5431          }
5464 5432  
5465 5433          /*
5466 5434           * Take the length (of contiguous bytes) passed back from bmap()
5467 5435           * and _try_ and get a set of pages covering that extent.
5468 5436           */
5469 5437          pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
5470 5438  
5471 5439          /*
5472 5440           * May have run out of memory and not clustered backwards.
5473 5441           * off          p_offset
5474 5442           * [  pp - 1  ][   pp   ]
5475 5443           * [    block           ]
5476 5444           * We told bmap off, so we have to adjust the bn accordingly.
5477 5445           */
5478 5446          if (io_off > off) {
5479 5447                  bn += btod(io_off - off);
5480 5448                  contig -= (io_off - off);
5481 5449          }
5482 5450  
5483 5451          /*
5484 5452           * bmap was careful to tell us the right size so use that.
5485 5453           * There might be unallocated frags at the end.
5486 5454           * LMXXX - bzero the end of the page?  We must be writing after EOF.
5487 5455           */
5488 5456          if (io_len > contig) {
5489 5457                  ASSERT(io_len - contig < fs->fs_bsize);
5490 5458                  io_len -= (io_len - contig);
5491 5459          }
5492 5460  
5493 5461          /*
5494 5462           * Handle the case where we are writing the last page after EOF.
5495 5463           *
5496 5464           * XXX - just a patch for i-mt3.
5497 5465           */
5498 5466          if (io_len == 0) {
5499 5467                  ASSERT(pp->p_offset >=
5500 5468                      (u_offset_t)(roundup(ip->i_size, PAGESIZE)));
5501 5469                  io_len = PAGESIZE;
5502 5470          }
5503 5471  
5504 5472          bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
5505 5473  
5506 5474          ULOCKFS_SET_MOD(ITOUL(ip));
5507 5475  
5508 5476          bp->b_edev = ip->i_dev;
5509 5477          bp->b_dev = cmpdev(ip->i_dev);
5510 5478          bp->b_blkno = bn;
5511 5479          bp->b_un.b_addr = (caddr_t)0;
5512 5480          bp->b_file = ip->i_vnode;
5513 5481  
5514 5482          /*
5515 5483           * File contents of shadow or quota inodes are metadata, and updates
5516 5484           * to these need to be put into a logging transaction. All direct
5517 5485           * callers in UFS do that, but fsflush can come here _before_ the
5518 5486           * normal codepath. An example would be updating ACL information, for
5519 5487           * which the normal codepath would be:
5520 5488           *      ufs_si_store()
5521 5489           *      ufs_rdwri()
5522 5490           *      wrip()
5523 5491           *      segmap_release()
5524 5492           *      VOP_PUTPAGE()
5525 5493           * Here, fsflush can pick up the dirty page before segmap_release()
5526 5494           * forces it out. If that happens, there's no transaction.
5527 5495           * We therefore need to test whether a transaction exists, and if not
5528 5496           * create one - for fsflush.
5529 5497           */
5530 5498          dotrans =
5531 5499              (((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip) &&
5532 5500              ((curthread->t_flag & T_DONTBLOCK) == 0) &&
5533 5501              (TRANS_ISTRANS(ufsvfsp)));
5534 5502  
5535 5503          if (dotrans) {
5536 5504                  curthread->t_flag |= T_DONTBLOCK;
5537 5505                  TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5538 5506          }
5539 5507          if (TRANS_ISTRANS(ufsvfsp)) {
5540 5508                  if ((ip->i_mode & IFMT) == IFSHAD) {
5541 5509                          TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
5542 5510                  } else if (ufsvfsp->vfs_qinod == ip) {
5543 5511                          TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
5544 5512                              0, 0);
5545 5513                  }
5546 5514          }
5547 5515          if (dotrans) {
5548 5516                  TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5549 5517                  curthread->t_flag &= ~T_DONTBLOCK;
5550 5518          }
5551 5519  
5552 5520          /* write throttle */
5553 5521  
5554 5522          ASSERT(bp->b_iodone == NULL);
5555 5523          bp->b_iodone = (int (*)())ufs_iodone;
5556 5524          mutex_enter(&ip->i_tlock);
5557 5525          ip->i_writes += bp->b_bcount;
5558 5526          mutex_exit(&ip->i_tlock);
5559 5527  
5560 5528          if (bp->b_flags & B_ASYNC) {
5561 5529                  if (ufsvfsp->vfs_log) {
5562 5530                          lufs_write_strategy(ufsvfsp->vfs_log, bp);
5563 5531                  } else if (ufsvfsp->vfs_snapshot) {
5564 5532                          fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5565 5533                  } else {
5566 5534                          ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5567 5535                          ub.ub_putasyncs.value.ul++;
5568 5536                          (void) bdev_strategy(bp);
5569 5537                          lwp_stat_update(LWP_STAT_OUBLK, 1);
5570 5538                  }
5571 5539          } else {
5572 5540                  if (ufsvfsp->vfs_log) {
5573 5541                          lufs_write_strategy(ufsvfsp->vfs_log, bp);
5574 5542                  } else if (ufsvfsp->vfs_snapshot) {
5575 5543                          fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5576 5544                  } else {
5577 5545                          ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5578 5546                          ub.ub_putsyncs.value.ul++;
5579 5547                          (void) bdev_strategy(bp);
5580 5548                          lwp_stat_update(LWP_STAT_OUBLK, 1);
5581 5549                  }
5582 5550                  err = biowait(bp);
5583 5551                  pageio_done(bp);
5584 5552                  pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
5585 5553          }
5586 5554  
5587 5555          pp = NULL;
5588 5556  
5589 5557  out:
5590 5558          if (err != 0 && pp != NULL)
5591 5559                  pvn_write_done(pp, B_ERROR | B_WRITE | flags);
5592 5560  
5593 5561          if (offp)
5594 5562                  *offp = io_off;
5595 5563          if (lenp)

↓ open down ↓

210 lines elided

↑ open up ↑

5596 5564                  *lenp = io_len;
5597 5565  out_trace:
5598 5566          return (err);
5599 5567  }
5600 5568  
5601 5569  uint64_t ufs_map_alock_retry_cnt;
5602 5570  uint64_t ufs_map_lockfs_retry_cnt;
5603 5571  
5604 5572  /* ARGSUSED */
5605 5573  static int
5606      -ufs_map(struct vnode *vp,
5607      -        offset_t off,
5608      -        struct as *as,
5609      -        caddr_t *addrp,
5610      -        size_t len,
5611      -        uchar_t prot,
5612      -        uchar_t maxprot,
5613      -        uint_t flags,
5614      -        struct cred *cr,
5615      -        caller_context_t *ct)
     5574 +ufs_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
     5575 +    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
     5576 +    caller_context_t *ct)
5616 5577  {
5617 5578          struct segvn_crargs vn_a;
5618 5579          struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
5619 5580          struct ulockfs *ulp;
5620 5581          int error, sig;
5621 5582          k_sigset_t smask;
5622 5583          caddr_t hint = *addrp;
5623 5584  
5624 5585          if (vp->v_flag & VNOMAP) {
5625 5586                  error = ENOSYS;

5626 5587                  goto out;
5627 5588          }
5628 5589  
5629 5590          if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) {
5630 5591                  error = ENXIO;
5631 5592                  goto out;
5632 5593          }
5633 5594  
5634 5595          if (vp->v_type != VREG) {
5635 5596                  error = ENODEV;
5636 5597                  goto out;
5637 5598          }
5638 5599  
5639 5600  retry_map:
5640 5601          *addrp = hint;
5641 5602          /*
5642 5603           * If file is being locked, disallow mapping.
5643 5604           */
5644 5605          if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
5645 5606                  error = EAGAIN;
5646 5607                  goto out;
5647 5608          }
5648 5609  
5649 5610          as_rangelock(as);
5650 5611          /*
5651 5612           * Note that if we are retrying (because ufs_lockfs_trybegin failed in
5652 5613           * the previous attempt), some other thread could have grabbed
5653 5614           * the same VA range if MAP_FIXED is set. In that case, choose_addr
5654 5615           * would unmap the valid VA range, that is ok.
5655 5616           */
5656 5617          error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5657 5618          if (error != 0) {
5658 5619                  as_rangeunlock(as);
5659 5620                  goto out;
5660 5621          }
5661 5622  
5662 5623          /*
5663 5624           * a_lock has to be acquired before entering the lockfs protocol
5664 5625           * because that is the order in which pagefault works. Also we cannot
5665 5626           * block on a_lock here because this waiting writer will prevent
5666 5627           * further readers like ufs_read from progressing and could cause
5667 5628           * deadlock between ufs_read/ufs_map/pagefault when a quiesce is
5668 5629           * pending.
5669 5630           */
5670 5631          while (!AS_LOCK_TRYENTER(as, RW_WRITER)) {
5671 5632                  ufs_map_alock_retry_cnt++;
5672 5633                  delay(RETRY_LOCK_DELAY);
5673 5634          }
5674 5635  
5675 5636          /*
5676 5637           * We can't hold as->a_lock and wait for lockfs to succeed because
5677 5638           * the proc tools might hang on a_lock, so call ufs_lockfs_trybegin()
5678 5639           * instead.
5679 5640           */
5680 5641          if (error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK)) {
5681 5642                  /*
5682 5643                   * ufs_lockfs_trybegin() did not succeed. It is safer to give up
5683 5644                   * as->a_lock and wait for ulp->ul_fs_lock status to change.
5684 5645                   */
5685 5646                  ufs_map_lockfs_retry_cnt++;
5686 5647                  AS_LOCK_EXIT(as);
5687 5648                  as_rangeunlock(as);
5688 5649                  if (error == EIO)
5689 5650                          goto out;
5690 5651  
5691 5652                  mutex_enter(&ulp->ul_lock);
5692 5653                  while (ulp->ul_fs_lock & ULOCKFS_MAP_MASK) {
5693 5654                          if (ULOCKFS_IS_SLOCK(ulp) || ufsvfsp->vfs_nointr) {
5694 5655                                  cv_wait(&ulp->ul_cv, &ulp->ul_lock);
5695 5656                          } else {
5696 5657                                  sigintr(&smask, 1);
5697 5658                                  sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
5698 5659                                  sigunintr(&smask);
5699 5660                                  if (((ulp->ul_fs_lock & ULOCKFS_MAP_MASK) &&
5700 5661                                      !sig) || ufsvfsp->vfs_dontblock) {
5701 5662                                          mutex_exit(&ulp->ul_lock);
5702 5663                                          return (EINTR);
5703 5664                                  }
5704 5665                          }
5705 5666                  }
5706 5667                  mutex_exit(&ulp->ul_lock);
5707 5668                  goto retry_map;
5708 5669          }
5709 5670  
5710 5671          vn_a.vp = vp;
5711 5672          vn_a.offset = (u_offset_t)off;
5712 5673          vn_a.type = flags & MAP_TYPE;
5713 5674          vn_a.prot = prot;
5714 5675          vn_a.maxprot = maxprot;
5715 5676          vn_a.cred = cr;
5716 5677          vn_a.amp = NULL;
5717 5678          vn_a.flags = flags & ~MAP_TYPE;
5718 5679          vn_a.szc = 0;
5719 5680          vn_a.lgrp_mem_policy_flags = 0;
5720 5681

↓ open down ↓

95 lines elided

↑ open up ↑

5721 5682          error = as_map_locked(as, *addrp, len, segvn_create, &vn_a);
5722 5683          if (ulp)
5723 5684                  ufs_lockfs_end(ulp);
5724 5685          as_rangeunlock(as);
5725 5686  out:
5726 5687          return (error);
5727 5688  }
5728 5689  
5729 5690  /* ARGSUSED */
5730 5691  static int
5731      -ufs_addmap(struct vnode *vp,
5732      -        offset_t off,
5733      -        struct as *as,
5734      -        caddr_t addr,
5735      -        size_t  len,
5736      -        uchar_t  prot,
5737      -        uchar_t  maxprot,
5738      -        uint_t    flags,
5739      -        struct cred *cr,
5740      -        caller_context_t *ct)
     5692 +ufs_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
     5693 +    size_t len, uchar_t  prot, uchar_t  maxprot, uint_t    flags,
     5694 +    struct cred *cr, caller_context_t *ct)
5741 5695  {
5742 5696          struct inode *ip = VTOI(vp);
5743 5697  
5744 5698          if (vp->v_flag & VNOMAP) {
5745 5699                  return (ENOSYS);
5746 5700          }
5747 5701  
5748 5702          mutex_enter(&ip->i_tlock);
5749 5703          ip->i_mapcnt += btopr(len);
5750 5704          mutex_exit(&ip->i_tlock);
5751 5705          return (0);
5752 5706  }
5753 5707  
5754 5708  /*ARGSUSED*/
5755 5709  static int
5756 5710  ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5757      -        size_t len, uint_t prot,  uint_t maxprot,  uint_t flags,
5758      -        struct cred *cr, caller_context_t *ct)
     5711 +    size_t len, uint_t prot,  uint_t maxprot,  uint_t flags, struct cred *cr,
     5712 +    caller_context_t *ct)
5759 5713  {
5760 5714          struct inode *ip = VTOI(vp);
5761 5715  
5762 5716          if (vp->v_flag & VNOMAP) {
5763 5717                  return (ENOSYS);
5764 5718          }
5765 5719  
5766 5720          mutex_enter(&ip->i_tlock);
5767 5721          ip->i_mapcnt -= btopr(len);     /* Count released mappings */
5768 5722          ASSERT(ip->i_mapcnt >= 0);

5769 5723          mutex_exit(&ip->i_tlock);

↓ open down ↓

1 lines elided

↑ open up ↑

5770 5724          return (0);
5771 5725  }
5772 5726  /*
5773 5727   * Return the answer requested to poll() for non-device files
5774 5728   */
5775 5729  struct pollhead ufs_pollhd;
5776 5730  
5777 5731  /* ARGSUSED */
5778 5732  int
5779 5733  ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
5780      -        caller_context_t *ct)
     5734 +    caller_context_t *ct)
5781 5735  {
5782 5736          struct ufsvfs   *ufsvfsp;
5783 5737  
     5738 +        /*
     5739 +         * Regular files reject edge-triggered pollers.
     5740 +         * See the comment in fs_poll() for a more detailed explanation.
     5741 +         */
     5742 +        if (ev & POLLET) {
     5743 +                return (EPERM);
     5744 +        }
     5745 +
5784 5746          *revp = 0;
5785 5747          ufsvfsp = VTOI(vp)->i_ufsvfs;
5786 5748  
5787 5749          if (!ufsvfsp) {
5788 5750                  *revp = POLLHUP;
5789 5751                  goto out;
5790 5752          }
5791 5753  
5792 5754          if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) ||
5793 5755              ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) {

5794 5756                  *revp |= POLLERR;
5795 5757  
5796 5758          } else {
5797 5759                  if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
5798 5760                      !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5799 5761                          *revp |= POLLOUT;
5800 5762  
5801 5763                  if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly &&
5802 5764                      !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5803 5765                          *revp |= POLLWRBAND;
5804 5766  
5805 5767                  if (ev & POLLIN)
5806 5768                          *revp |= POLLIN;
5807 5769

↓ open down ↓

14 lines elided

↑ open up ↑

5808 5770                  if (ev & POLLRDNORM)
5809 5771                          *revp |= POLLRDNORM;
5810 5772  
5811 5773                  if (ev & POLLRDBAND)
5812 5774                          *revp |= POLLRDBAND;
5813 5775          }
5814 5776  
5815 5777          if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP)))
5816 5778                  *revp |= POLLPRI;
5817 5779  out:
5818      -        *phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL;
     5780 +        if (*revp == 0 && ! any) {
     5781 +                *phpp = &ufs_pollhd;
     5782 +        }
5819 5783  
5820 5784          return (0);
5821 5785  }
5822 5786  
5823 5787  /* ARGSUSED */
5824 5788  static int
5825 5789  ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
5826      -        caller_context_t *ct)
     5790 +    caller_context_t *ct)
5827 5791  {
5828 5792          struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
5829 5793          struct ulockfs  *ulp = NULL;
5830 5794          struct inode    *sip = NULL;
5831 5795          int             error;
5832 5796          struct inode    *ip = VTOI(vp);
5833 5797          int             issync;
5834 5798  
5835 5799          error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
5836 5800          if (error)

5837 5801                  return (error);
5838 5802  
5839 5803          switch (cmd) {
5840 5804                  /*
5841 5805                   * Have to handle _PC_NAME_MAX here, because the normal way
5842 5806                   * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
5843 5807                   * results in a lock ordering reversal between
5844 5808                   * ufs_lockfs_{begin,end}() and
5845 5809                   * ufs_thread_{suspend,continue}().
5846 5810                   *
5847 5811                   * Keep in sync with ufs_statvfs().
5848 5812                   */
5849 5813          case _PC_NAME_MAX:
5850 5814                  *valp = MAXNAMLEN;
5851 5815                  break;
5852 5816  
5853 5817          case _PC_FILESIZEBITS:
5854 5818                  if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
5855 5819                          *valp = UFS_FILESIZE_BITS;
5856 5820                  else
5857 5821                          *valp = 32;
5858 5822                  break;
5859 5823  
5860 5824          case _PC_XATTR_EXISTS:
5861 5825                  if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5862 5826  
5863 5827                          error =
5864 5828                              ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, cr);
5865 5829                          if (error ==  0 && sip != NULL) {
5866 5830                                  /* Start transaction */
5867 5831                                  if (ulp) {
5868 5832                                          TRANS_BEGIN_CSYNC(ufsvfsp, issync,
5869 5833                                              TOP_RMDIR, TOP_RMDIR_SIZE);
5870 5834                                  }
5871 5835                                  /*
5872 5836                                   * Is directory empty
5873 5837                                   */
5874 5838                                  rw_enter(&sip->i_rwlock, RW_WRITER);
5875 5839                                  rw_enter(&sip->i_contents, RW_WRITER);
5876 5840                                  if (ufs_xattrdirempty(sip,
5877 5841                                      sip->i_number, CRED())) {
5878 5842                                          rw_enter(&ip->i_contents, RW_WRITER);
5879 5843                                          ufs_unhook_shadow(ip, sip);
5880 5844                                          rw_exit(&ip->i_contents);
5881 5845  
5882 5846                                          *valp = 0;
5883 5847  
5884 5848                                  } else
5885 5849                                          *valp = 1;
5886 5850                                  rw_exit(&sip->i_contents);
5887 5851                                  rw_exit(&sip->i_rwlock);
5888 5852                                  if (ulp) {
5889 5853                                          TRANS_END_CSYNC(ufsvfsp, error, issync,
5890 5854                                              TOP_RMDIR, TOP_RMDIR_SIZE);
5891 5855                                  }
5892 5856                                  VN_RELE(ITOV(sip));
5893 5857                          } else if (error == ENOENT) {
5894 5858                                  *valp = 0;
5895 5859                                  error = 0;
5896 5860                          }
5897 5861                  } else {
5898 5862                          error = fs_pathconf(vp, cmd, valp, cr, ct);
5899 5863                  }
5900 5864                  break;
5901 5865  
5902 5866          case _PC_ACL_ENABLED:
5903 5867                  *valp = _ACL_ACLENT_ENABLED;
5904 5868                  break;
5905 5869  
5906 5870          case _PC_MIN_HOLE_SIZE:
5907 5871                  *valp = (ulong_t)ip->i_fs->fs_bsize;
5908 5872                  break;
5909 5873  
5910 5874          case _PC_SATTR_ENABLED:
5911 5875          case _PC_SATTR_EXISTS:
5912 5876                  *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5913 5877                      (vp->v_type == VREG || vp->v_type == VDIR);
5914 5878                  break;
5915 5879  
5916 5880          case _PC_TIMESTAMP_RESOLUTION:
5917 5881                  /*
5918 5882                   * UFS keeps only microsecond timestamp resolution.
5919 5883                   * This is historical and will probably never change.
5920 5884                   */
5921 5885                  *valp = 1000L;
5922 5886                  break;
5923 5887  
5924 5888          default:
5925 5889                  error = fs_pathconf(vp, cmd, valp, cr, ct);
5926 5890                  break;
5927 5891          }
5928 5892  
5929 5893          if (ulp != NULL) {

↓ open down ↓

93 lines elided

↑ open up ↑

5930 5894                  ufs_lockfs_end(ulp);
5931 5895          }
5932 5896          return (error);
5933 5897  }
5934 5898  
5935 5899  int ufs_pageio_writes, ufs_pageio_reads;
5936 5900  
5937 5901  /*ARGSUSED*/
5938 5902  static int
5939 5903  ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5940      -        int flags, struct cred *cr, caller_context_t *ct)
     5904 +    int flags, struct cred *cr, caller_context_t *ct)
5941 5905  {
5942 5906          struct inode *ip = VTOI(vp);
5943 5907          struct ufsvfs *ufsvfsp;
5944 5908          page_t *npp = NULL, *opp = NULL, *cpp = pp;
5945 5909          struct buf *bp;
5946 5910          daddr_t bn;
5947 5911          size_t done_len = 0, cur_len = 0;
5948 5912          int err = 0;
5949 5913          int contig = 0;
5950 5914          int dolock;

5951 5915          int vmpss = 0;
5952 5916          struct ulockfs *ulp;
5953 5917  
5954 5918          if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
5955 5919              vp->v_mpssdata != NULL) {
5956 5920                  vmpss = 1;
5957 5921          }
5958 5922  
5959 5923          dolock = (rw_owner(&ip->i_contents) != curthread);
5960 5924          /*
5961 5925           * We need a better check.  Ideally, we would use another
5962 5926           * vnodeops so that hlocked and forcibly unmounted file
5963 5927           * systems would return EIO where appropriate and w/o the
5964 5928           * need for these checks.
5965 5929           */
5966 5930          if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5967 5931                  return (EIO);
5968 5932  
5969 5933          /*
5970 5934           * For vmpss (pp can be NULL) case respect the quiesce protocol.
5971 5935           * ul_lock must be taken before locking pages so we can't use it here
5972 5936           * if pp is non NULL because segvn already locked pages
5973 5937           * SE_EXCL. Instead we rely on the fact that a forced umount or
5974 5938           * applying a filesystem lock via ufs_fiolfs() will block in the
5975 5939           * implicit call to ufs_flush() until we unlock the pages after the
5976 5940           * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend
5977 5941           * above 0 until they are done. We have to be careful not to increment
5978 5942           * ul_vnops_cnt here after forceful unmount hlocks the file system.
5979 5943           *
5980 5944           * If pp is NULL use ul_lock to make sure we don't increment
5981 5945           * ul_vnops_cnt after forceful unmount hlocks the file system.
5982 5946           */
5983 5947          if (vmpss || pp == NULL) {
5984 5948                  ulp = &ufsvfsp->vfs_ulockfs;
5985 5949                  if (pp == NULL)
5986 5950                          mutex_enter(&ulp->ul_lock);
5987 5951                  if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) {
5988 5952                          if (pp == NULL) {
5989 5953                                  mutex_exit(&ulp->ul_lock);
5990 5954                          }
5991 5955                          return (vmpss ? EIO : EINVAL);
5992 5956                  }
5993 5957                  atomic_inc_ulong(&ulp->ul_vnops_cnt);
5994 5958                  if (pp == NULL)
5995 5959                          mutex_exit(&ulp->ul_lock);
5996 5960                  if (ufs_quiesce_pend) {
5997 5961                          if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5998 5962                                  cv_broadcast(&ulp->ul_cv);
5999 5963                          return (vmpss ? EIO : EINVAL);
6000 5964                  }
6001 5965          }
6002 5966  
6003 5967          if (dolock) {
6004 5968                  /*
6005 5969                   * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to
6006 5970                   * handle a fault against a segment that maps vnode pages with
6007 5971                   * large mappings.  Segvn creates pages and holds them locked
6008 5972                   * SE_EXCL during VOP_PAGEIO() call. In this case we have to
6009 5973                   * use rw_tryenter() to avoid a potential deadlock since in
6010 5974                   * lock order i_contents needs to be taken first.
6011 5975                   * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails.
6012 5976                   */
6013 5977                  if (!vmpss) {
6014 5978                          rw_enter(&ip->i_contents, RW_READER);
6015 5979                  } else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
6016 5980                          if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6017 5981                                  cv_broadcast(&ulp->ul_cv);
6018 5982                          return (EDEADLK);
6019 5983                  }
6020 5984          }
6021 5985  
6022 5986          /*
6023 5987           * Return an error to segvn because the pagefault request is beyond
6024 5988           * PAGESIZE rounded EOF.
6025 5989           */
6026 5990          if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) {
6027 5991                  if (dolock)
6028 5992                          rw_exit(&ip->i_contents);
6029 5993                  if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6030 5994                          cv_broadcast(&ulp->ul_cv);
6031 5995                  return (EFAULT);
6032 5996          }
6033 5997  
6034 5998          if (pp == NULL) {
6035 5999                  if (bmap_has_holes(ip)) {
6036 6000                          err = ENOSYS;
6037 6001                  } else {
6038 6002                          err = EINVAL;
6039 6003                  }
6040 6004                  if (dolock)
6041 6005                          rw_exit(&ip->i_contents);
6042 6006                  if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6043 6007                          cv_broadcast(&ulp->ul_cv);
6044 6008                  return (err);
6045 6009          }
6046 6010  
6047 6011          /*
6048 6012           * Break the io request into chunks, one for each contiguous
6049 6013           * stretch of disk blocks in the target file.
6050 6014           */
6051 6015          while (done_len < io_len) {
6052 6016                  ASSERT(cpp);
6053 6017                  contig = 0;
6054 6018                  if (err = bmap_read(ip, (u_offset_t)(io_off + done_len),
6055 6019                      &bn, &contig))
6056 6020                          break;
6057 6021  
6058 6022                  if (bn == UFS_HOLE) {   /* No holey swapfiles */
6059 6023                          if (vmpss) {
6060 6024                                  err = EFAULT;
6061 6025                                  break;
6062 6026                          }
6063 6027                          err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
6064 6028                          break;
6065 6029                  }
6066 6030  
6067 6031                  cur_len = MIN(io_len - done_len, contig);
6068 6032                  /*
6069 6033                   * Zero out a page beyond EOF, when the last block of
6070 6034                   * a file is a UFS fragment so that ufs_pageio() can be used
6071 6035                   * instead of ufs_getpage() to handle faults against
6072 6036                   * segvn segments that use large pages.
6073 6037                   */
6074 6038                  page_list_break(&cpp, &npp, btopr(cur_len));
6075 6039                  if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
6076 6040                          size_t xlen = cur_len & PAGEOFFSET;
6077 6041                          pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
6078 6042                  }
6079 6043  
6080 6044                  bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
6081 6045                  ASSERT(bp != NULL);
6082 6046  
6083 6047                  bp->b_edev = ip->i_dev;
6084 6048                  bp->b_dev = cmpdev(ip->i_dev);
6085 6049                  bp->b_blkno = bn;
6086 6050                  bp->b_un.b_addr = (caddr_t)0;
6087 6051                  bp->b_file = ip->i_vnode;
6088 6052  
6089 6053                  ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
6090 6054                  ub.ub_pageios.value.ul++;
6091 6055                  if (ufsvfsp->vfs_snapshot)
6092 6056                          fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
6093 6057                  else
6094 6058                          (void) bdev_strategy(bp);
6095 6059  
6096 6060                  if (flags & B_READ)
6097 6061                          ufs_pageio_reads++;
6098 6062                  else
6099 6063                          ufs_pageio_writes++;
6100 6064                  if (flags & B_READ)
6101 6065                          lwp_stat_update(LWP_STAT_INBLK, 1);
6102 6066                  else
6103 6067                          lwp_stat_update(LWP_STAT_OUBLK, 1);
6104 6068                  /*
6105 6069                   * If the request is not B_ASYNC, wait for i/o to complete
6106 6070                   * and re-assemble the page list to return to the caller.
6107 6071                   * If it is B_ASYNC we leave the page list in pieces and
6108 6072                   * cleanup() will dispose of them.
6109 6073                   */
6110 6074                  if ((flags & B_ASYNC) == 0) {
6111 6075                          err = biowait(bp);
6112 6076                          pageio_done(bp);
6113 6077                          if (err)
6114 6078                                  break;
6115 6079                          page_list_concat(&opp, &cpp);
6116 6080                  }
6117 6081                  cpp = npp;
6118 6082                  npp = NULL;
6119 6083                  if (flags & B_READ)
6120 6084                          cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
6121 6085                  done_len += cur_len;
6122 6086          }
6123 6087          ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
6124 6088          if (err) {
6125 6089                  if (flags & B_ASYNC) {
6126 6090                          /* Cleanup unprocessed parts of list */
6127 6091                          page_list_concat(&cpp, &npp);
6128 6092                          if (flags & B_READ)
6129 6093                                  pvn_read_done(cpp, B_ERROR);
6130 6094                          else
6131 6095                                  pvn_write_done(cpp, B_ERROR);
6132 6096                  } else {
6133 6097                          /* Re-assemble list and let caller clean up */
6134 6098                          page_list_concat(&opp, &cpp);
6135 6099                          page_list_concat(&opp, &npp);
6136 6100                  }
6137 6101          }
6138 6102  
6139 6103          if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) &&
6140 6104              ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) {
6141 6105                  mutex_enter(&ip->i_tlock);
6142 6106                  ip->i_flag |= IACC;
6143 6107                  ITIMES_NOLOCK(ip);
6144 6108                  mutex_exit(&ip->i_tlock);
6145 6109          }
6146 6110  
6147 6111          if (dolock)
6148 6112                  rw_exit(&ip->i_contents);
6149 6113          if (vmpss && !atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6150 6114                  cv_broadcast(&ulp->ul_cv);
6151 6115          return (err);
6152 6116  }
6153 6117  
6154 6118  /*
6155 6119   * Called when the kernel is in a frozen state to dump data
6156 6120   * directly to the device. It uses a private dump data structure,
6157 6121   * set up by dump_ctl, to locate the correct disk block to which to dump.
6158 6122   */
6159 6123  /*ARGSUSED*/
6160 6124  static int
6161 6125  ufs_dump(vnode_t *vp, caddr_t addr, offset_t ldbn, offset_t dblks,
6162 6126      caller_context_t *ct)
6163 6127  {
6164 6128          u_offset_t      file_size;
6165 6129          struct inode    *ip = VTOI(vp);
6166 6130          struct fs       *fs = ip->i_fs;
6167 6131          daddr_t         dbn, lfsbn;
6168 6132          int             disk_blks = fs->fs_bsize >> DEV_BSHIFT;
6169 6133          int             error = 0;
6170 6134          int             ndbs, nfsbs;
6171 6135  
6172 6136          /*
6173 6137           * forced unmount case
6174 6138           */
6175 6139          if (ip->i_ufsvfs == NULL)
6176 6140                  return (EIO);
6177 6141          /*
6178 6142           * Validate the inode that it has not been modified since
6179 6143           * the dump structure is allocated.
6180 6144           */
6181 6145          mutex_enter(&ip->i_tlock);
6182 6146          if ((dump_info == NULL) ||
6183 6147              (dump_info->ip != ip) ||
6184 6148              (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
6185 6149              (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
6186 6150                  mutex_exit(&ip->i_tlock);
6187 6151                  return (-1);
6188 6152          }
6189 6153          mutex_exit(&ip->i_tlock);
6190 6154  
6191 6155          /*
6192 6156           * See that the file has room for this write
6193 6157           */
6194 6158          UFS_GET_ISIZE(&file_size, ip);
6195 6159  
6196 6160          if (ldbtob(ldbn + dblks) > file_size)
6197 6161                  return (ENOSPC);
6198 6162  
6199 6163          /*
6200 6164           * Find the physical disk block numbers from the dump
6201 6165           * private data structure directly and write out the data
6202 6166           * in contiguous block lumps
6203 6167           */
6204 6168          while (dblks > 0 && !error) {
6205 6169                  lfsbn = (daddr_t)lblkno(fs, ldbtob(ldbn));
6206 6170                  dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
6207 6171                  nfsbs = 1;
6208 6172                  ndbs = disk_blks - ldbn % disk_blks;
6209 6173                  while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
6210 6174                      nfsbs]) == dbn + ndbs) {
6211 6175                          nfsbs++;
6212 6176                          ndbs += disk_blks;
6213 6177                  }
6214 6178                  if (ndbs > dblks)
6215 6179                          ndbs = dblks;
6216 6180                  error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
6217 6181                  addr += ldbtob((offset_t)ndbs);
6218 6182                  dblks -= ndbs;
6219 6183                  ldbn += ndbs;
6220 6184          }
6221 6185          return (error);
6222 6186  
6223 6187  }
6224 6188  
6225 6189  /*
6226 6190   * Prepare the file system before and after the dump operation.
6227 6191   *
6228 6192   * action = DUMP_ALLOC:
6229 6193   * Preparation before dump, allocate dump private data structure
6230 6194   * to hold all the direct and indirect block info for dump.
6231 6195   *
6232 6196   * action = DUMP_FREE:
6233 6197   * Clean up after dump, deallocate the dump private data structure.
6234 6198   *
6235 6199   * action = DUMP_SCAN:
6236 6200   * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
6237 6201   * if found, the starting file-relative DEV_BSIZE lbn is written
6238 6202   * to *blkp; that lbn is intended for use with VOP_DUMP()
6239 6203   */
6240 6204  /*ARGSUSED*/
6241 6205  static int
6242 6206  ufs_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
6243 6207  {
6244 6208          struct inode    *ip = VTOI(vp);
6245 6209          ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
6246 6210          struct fs       *fs;
6247 6211          daddr32_t       *dblk, *storeblk;
6248 6212          daddr32_t       *nextblk, *endblk;
6249 6213          struct buf      *bp;
6250 6214          int             i, entry, entries;
6251 6215          int             n, ncontig;
6252 6216  
6253 6217          /*
6254 6218           * check for forced unmount
6255 6219           */
6256 6220          if (ufsvfsp == NULL)
6257 6221                  return (EIO);
6258 6222  
6259 6223          if (action == DUMP_ALLOC) {
6260 6224                  /*
6261 6225                   * alloc and record dump_info
6262 6226                   */
6263 6227                  if (dump_info != NULL)
6264 6228                          return (EINVAL);
6265 6229  
6266 6230                  ASSERT(vp->v_type == VREG);
6267 6231                  fs = ufsvfsp->vfs_fs;
6268 6232  
6269 6233                  rw_enter(&ip->i_contents, RW_READER);
6270 6234  
6271 6235                  if (bmap_has_holes(ip)) {
6272 6236                          rw_exit(&ip->i_contents);
6273 6237                          return (EFAULT);
6274 6238                  }
6275 6239  
6276 6240                  /*
6277 6241                   * calculate and allocate space needed according to i_size
6278 6242                   */
6279 6243                  entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
6280 6244                  dump_info = kmem_alloc(sizeof (struct dump) +
6281 6245                      (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP);
6282 6246                  if (dump_info == NULL) {
6283 6247                          rw_exit(&ip->i_contents);
6284 6248                          return (ENOMEM);
6285 6249                  }
6286 6250  
6287 6251                  /* Start saving the info */
6288 6252                  dump_info->fsbs = entries;
6289 6253                  dump_info->ip = ip;
6290 6254                  storeblk = &dump_info->dblk[0];
6291 6255  
6292 6256                  /* Direct Blocks */
6293 6257                  for (entry = 0; entry < NDADDR && entry < entries; entry++)
6294 6258                          *storeblk++ = ip->i_db[entry];
6295 6259  
6296 6260                  /* Indirect Blocks */
6297 6261                  for (i = 0; i < NIADDR; i++) {
6298 6262                          int error = 0;
6299 6263  
6300 6264                          bp = UFS_BREAD(ufsvfsp,
6301 6265                              ip->i_dev, fsbtodb(fs, ip->i_ib[i]), fs->fs_bsize);
6302 6266                          if (bp->b_flags & B_ERROR)
6303 6267                                  error = EIO;
6304 6268                          else {
6305 6269                                  dblk = bp->b_un.b_daddr;
6306 6270                                  if ((storeblk = save_dblks(ip, ufsvfsp,
6307 6271                                      storeblk, dblk, i, entries)) == NULL)
6308 6272                                          error = EIO;
6309 6273                          }
6310 6274  
6311 6275                          brelse(bp);
6312 6276  
6313 6277                          if (error != 0) {
6314 6278                                  kmem_free(dump_info, sizeof (struct dump) +
6315 6279                                      (entries - 1) * sizeof (daddr32_t));
6316 6280                                  rw_exit(&ip->i_contents);
6317 6281                                  dump_info = NULL;
6318 6282                                  return (error);
6319 6283                          }
6320 6284                  }
6321 6285                  /* and time stamp the information */
6322 6286                  mutex_enter(&ip->i_tlock);
6323 6287                  dump_info->time = ip->i_mtime;
6324 6288                  mutex_exit(&ip->i_tlock);
6325 6289  
6326 6290                  rw_exit(&ip->i_contents);
6327 6291          } else if (action == DUMP_FREE) {
6328 6292                  /*
6329 6293                   * free dump_info
6330 6294                   */
6331 6295                  if (dump_info == NULL)
6332 6296                          return (EINVAL);
6333 6297                  entries = dump_info->fsbs - 1;
6334 6298                  kmem_free(dump_info, sizeof (struct dump) +
6335 6299                      entries * sizeof (daddr32_t));
6336 6300                  dump_info = NULL;
6337 6301          } else if (action == DUMP_SCAN) {
6338 6302                  /*
6339 6303                   * scan dump_info
6340 6304                   */
6341 6305                  if (dump_info == NULL)
6342 6306                          return (EINVAL);
6343 6307  
6344 6308                  dblk = dump_info->dblk;
6345 6309                  nextblk = dblk + 1;
6346 6310                  endblk = dblk + dump_info->fsbs - 1;
6347 6311                  fs = ufsvfsp->vfs_fs;
6348 6312                  ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
6349 6313  
6350 6314                  /*
6351 6315                   * scan dblk[] entries; contig fs space is found when:
6352 6316                   * ((current blkno + frags per block) == next blkno)
6353 6317                   */
6354 6318                  n = 0;
6355 6319                  while (n < ncontig && dblk < endblk) {
6356 6320                          if ((*dblk + fs->fs_frag) == *nextblk)
6357 6321                                  n++;
6358 6322                          else
6359 6323                                  n = 0;
6360 6324                          dblk++;
6361 6325                          nextblk++;
6362 6326                  }
6363 6327  
6364 6328                  /*
6365 6329                   * index is where size bytes of contig space begins;
6366 6330                   * conversion from index to the file's DEV_BSIZE lbn
6367 6331                   * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
6368 6332                   */
6369 6333                  if (n == ncontig) {
6370 6334                          i = (dblk - dump_info->dblk) - ncontig;
6371 6335                          *blkp = i << (fs->fs_bshift - DEV_BSHIFT);
6372 6336                  } else
6373 6337                          return (EFAULT);
6374 6338          }
6375 6339          return (0);
6376 6340  }
6377 6341  
6378 6342  /*
6379 6343   * Recursive helper function for ufs_dumpctl().  It follows the indirect file
6380 6344   * system blocks until it reaches the disk block addresses, which are
6381 6345   * then stored into the given buffer, storeblk.
6382 6346   */
6383 6347  static daddr32_t *
6384 6348  save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
6385 6349      daddr32_t *dblk, int level, int entries)
6386 6350  {
6387 6351          struct fs       *fs = ufsvfsp->vfs_fs;
6388 6352          struct buf      *bp;
6389 6353          int             i;
6390 6354  
6391 6355          if (level == 0) {
6392 6356                  for (i = 0; i < NINDIR(fs); i++) {
6393 6357                          if (storeblk - dump_info->dblk >= entries)
6394 6358                                  break;
6395 6359                          *storeblk++ = dblk[i];
6396 6360                  }
6397 6361                  return (storeblk);
6398 6362          }
6399 6363          for (i = 0; i < NINDIR(fs); i++) {
6400 6364                  if (storeblk - dump_info->dblk >= entries)
6401 6365                          break;
6402 6366                  bp = UFS_BREAD(ufsvfsp,
6403 6367                      ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
6404 6368                  if (bp->b_flags & B_ERROR) {
6405 6369                          brelse(bp);
6406 6370                          return (NULL);
6407 6371                  }
6408 6372                  storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
6409 6373                      level - 1, entries);
6410 6374                  brelse(bp);

↓ open down ↓

460 lines elided

↑ open up ↑

6411 6375  
6412 6376                  if (storeblk == NULL)
6413 6377                          return (NULL);
6414 6378          }
6415 6379          return (storeblk);
6416 6380  }
6417 6381  
6418 6382  /* ARGSUSED */
6419 6383  static int
6420 6384  ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
6421      -        struct cred *cr, caller_context_t *ct)
     6385 +    struct cred *cr, caller_context_t *ct)
6422 6386  {
6423 6387          struct inode    *ip = VTOI(vp);
6424 6388          struct ulockfs  *ulp;
6425 6389          struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
6426 6390          ulong_t         vsa_mask = vsap->vsa_mask;
6427 6391          int             err = EINVAL;
6428 6392  
6429 6393          vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6430 6394  
6431 6395          /*

6432 6396           * Only grab locks if needed - they're not needed to check vsa_mask
6433 6397           * or if the mask contains no acl flags.
6434 6398           */
6435 6399          if (vsa_mask != 0) {
6436 6400                  if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
6437 6401                      ULOCKFS_GETATTR_MASK))
6438 6402                          return (err);
6439 6403  
6440 6404                  rw_enter(&ip->i_contents, RW_READER);
6441 6405                  err = ufs_acl_get(ip, vsap, flag, cr);
6442 6406                  rw_exit(&ip->i_contents);

↓ open down ↓

11 lines elided

↑ open up ↑

6443 6407  
6444 6408                  if (ulp)
6445 6409                          ufs_lockfs_end(ulp);
6446 6410          }
6447 6411          return (err);
6448 6412  }
6449 6413  
6450 6414  /* ARGSUSED */
6451 6415  static int
6452 6416  ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr,
6453      -        caller_context_t *ct)
     6417 +    caller_context_t *ct)
6454 6418  {
6455 6419          struct inode    *ip = VTOI(vp);
6456 6420          struct ulockfs  *ulp = NULL;
6457 6421          struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
6458 6422          ulong_t         vsa_mask = vsap->vsa_mask;
6459 6423          int             err;
6460 6424          int             haverwlock = 1;
6461 6425          int             trans_size;
6462 6426          int             donetrans = 0;
6463 6427          int             retry = 1;

6464 6428  
6465 6429          ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
6466 6430  
6467 6431          /* Abort now if the request is either empty or invalid. */
6468 6432          vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6469 6433          if ((vsa_mask == 0) ||
6470 6434              ((vsap->vsa_aclentp == NULL) &&
6471 6435              (vsap->vsa_dfaclentp == NULL))) {
6472 6436                  err = EINVAL;
6473 6437                  goto out;
6474 6438          }
6475 6439  
6476 6440          /*
6477 6441           * Following convention, if this is a directory then we acquire the
6478 6442           * inode's i_rwlock after starting a UFS logging transaction;
6479 6443           * otherwise, we acquire it beforehand. Since we were called (and
6480 6444           * must therefore return) with the lock held, we will have to drop it,
6481 6445           * and later reacquire it, if operating on a directory.
6482 6446           */
6483 6447          if (vp->v_type == VDIR) {
6484 6448                  rw_exit(&ip->i_rwlock);
6485 6449                  haverwlock = 0;
6486 6450          } else {
6487 6451                  /* Upgrade the lock if required. */
6488 6452                  if (!rw_write_held(&ip->i_rwlock)) {
6489 6453                          rw_exit(&ip->i_rwlock);
6490 6454                          rw_enter(&ip->i_rwlock, RW_WRITER);
6491 6455                  }
6492 6456          }
6493 6457  
6494 6458  again:
6495 6459          ASSERT(!(vp->v_type == VDIR && haverwlock));
6496 6460          if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
6497 6461                  ulp = NULL;
6498 6462                  retry = 0;
6499 6463                  goto out;
6500 6464          }
6501 6465  
6502 6466          /*
6503 6467           * Check that the file system supports this operation. Note that
6504 6468           * ufs_lockfs_begin() will have checked that the file system had
6505 6469           * not been forcibly unmounted.
6506 6470           */
6507 6471          if (ufsvfsp->vfs_fs->fs_ronly) {
6508 6472                  err = EROFS;
6509 6473                  goto out;
6510 6474          }
6511 6475          if (ufsvfsp->vfs_nosetsec) {
6512 6476                  err = ENOSYS;
6513 6477                  goto out;
6514 6478          }
6515 6479  
6516 6480          if (ulp) {
6517 6481                  TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
6518 6482                      trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
6519 6483                  donetrans = 1;
6520 6484          }
6521 6485  
6522 6486          if (vp->v_type == VDIR) {
6523 6487                  rw_enter(&ip->i_rwlock, RW_WRITER);
6524 6488                  haverwlock = 1;
6525 6489          }
6526 6490  
6527 6491          ASSERT(haverwlock);
6528 6492  
6529 6493          /* Do the actual work. */
6530 6494          rw_enter(&ip->i_contents, RW_WRITER);
6531 6495          /*
6532 6496           * Suppress out of inodes messages if we will retry.
6533 6497           */
6534 6498          if (retry)
6535 6499                  ip->i_flag |= IQUIET;
6536 6500          err = ufs_acl_set(ip, vsap, flag, cr);
6537 6501          ip->i_flag &= ~IQUIET;
6538 6502          rw_exit(&ip->i_contents);
6539 6503  
6540 6504  out:
6541 6505          if (ulp) {
6542 6506                  if (donetrans) {
6543 6507                          /*
6544 6508                           * top_end_async() can eventually call
6545 6509                           * top_end_sync(), which can block. We must
6546 6510                           * therefore observe the lock-ordering protocol
6547 6511                           * here as well.
6548 6512                           */
6549 6513                          if (vp->v_type == VDIR) {
6550 6514                                  rw_exit(&ip->i_rwlock);
6551 6515                                  haverwlock = 0;
6552 6516                          }
6553 6517                          TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
6554 6518                  }
6555 6519                  ufs_lockfs_end(ulp);
6556 6520          }
6557 6521          /*
6558 6522           * If no inodes available, try scaring a logically-
6559 6523           * free one out of the delete queue to someplace
6560 6524           * that we can find it.
6561 6525           */
6562 6526          if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
6563 6527                  ufs_delete_drain_wait(ufsvfsp, 1);
6564 6528                  retry = 0;
6565 6529                  if (vp->v_type == VDIR && haverwlock) {
6566 6530                          rw_exit(&ip->i_rwlock);
6567 6531                          haverwlock = 0;
6568 6532                  }
6569 6533                  goto again;
6570 6534          }
6571 6535          /*
6572 6536           * If we need to reacquire the lock then it is safe to do so
6573 6537           * as a reader. This is because ufs_rwunlock(), which will be
6574 6538           * called by our caller after we return, does not differentiate
6575 6539           * between shared and exclusive locks.
6576 6540           */
6577 6541          if (!haverwlock) {
6578 6542                  ASSERT(vp->v_type == VDIR);
6579 6543                  rw_enter(&ip->i_rwlock, RW_READER);
6580 6544          }
6581 6545  
6582 6546          return (err);
6583 6547  }
6584 6548  
6585 6549  /*
6586 6550   * Locate the vnode to be used for an event notification. As this will
6587 6551   * be called prior to the name space change, perform basic verification
6588 6552   * that the change will be allowed.
            *
            * dvp is the directory to search, nm the entry name, cr the caller's
            * credentials and vpp the location that receives the result. *vpp is
            * set to NULL on entry and is only filled in when 0 is returned.
            *
            * Returns EINVAL for an empty name or ".", EEXIST for "..", ENOENT
            * when dnlc_lookup() returns DNLC_NO_VNODE, and otherwise whatever
            * ufs_diraccess(), ufs_lockfs_begin() or ufs_dirlook() report. An
            * EAGAIN from ufs_dirlook() is never returned to the caller; the
            * lookup is retried instead.
6589 6553   */
6590 6554  
6591 6555  static int
6592 6556  ufs_eventlookup(struct vnode *dvp, char *nm, struct cred *cr,
6593 6557      struct vnode **vpp)
6594 6558  {
6595 6559          int     namlen;
6596 6560          int     error;
6597 6561          struct vnode    *vp;
6598 6562          struct inode    *ip;
6599 6563          struct inode    *xip;
6600 6564          struct ufsvfs   *ufsvfsp;
6601 6565          struct ulockfs  *ulp;
6602 6566  
6603 6567          ip = VTOI(dvp);
6604 6568          *vpp = NULL;
6605 6569  
                   /* Reject the empty name, "." and ".." before any lookup is made. */
6606 6570          if ((namlen = strlen(nm)) == 0)
6607 6571                  return (EINVAL);
6608 6572  
6609 6573          if (nm[0] == '.') {
6610 6574                  if (namlen == 1)
6611 6575                          return (EINVAL);
6612 6576                  else if ((namlen == 2) && nm[1] == '.') {
6613 6577                          return (EEXIST);
6614 6578                  }
6615 6579          }
6616 6580  
6617 6581          /*
6618 6582           * Check accessibility and write access of parent directory as we
6619 6583           * only want to post the event if we're able to make a change.
6620 6584           */
6621 6585          if (error = ufs_diraccess(ip, IEXEC|IWRITE, cr))
6622 6586                  return (error);
6623 6587  
                   /*
                    * Try the name cache first: any non-NULL result is answered here
                    * and the directory search below is skipped.
                    */
6624 6588          if (vp = dnlc_lookup(dvp, nm)) {
6625 6589                  if (vp == DNLC_NO_VNODE) {
                                   /*
                                    * NOTE(review): the release implies dnlc_lookup()
                                    * hands back DNLC_NO_VNODE with a hold -- confirm
                                    * against the DNLC interface.
                                    */
6626 6590                          VN_RELE(vp);
6627 6591                          return (ENOENT);
6628 6592                  }
6629 6593  
6630 6594                  *vpp = vp;
6631 6595                  return (0);
6632 6596          }
6633 6597  
6634 6598          /*
6635 6599           * Keep the idle queue from getting too long by idling two
6636 6600           * inodes before attempting to allocate another.
6637 6601           * This operation must be performed before entering lockfs
6638 6602           * or a transaction.
6639 6603           */
6640 6604          if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
6641 6605                  if ((curthread->t_flag & T_DONTBLOCK) == 0) {
6642 6606                          ins.in_lidles.value.ul += ufs_lookup_idle_count;
6643 6607                          ufs_idle_some(ufs_lookup_idle_count);
6644 6608                  }
6645 6609  
6646 6610          ufsvfsp = ip->i_ufsvfs;
6647 6611  
                   /*
                    * Each attempt is bracketed by its own ufs_lockfs_begin() and
                    * ufs_lockfs_end() pair; a retry re-enters from this label.
                    */
6648 6612  retry_lookup:
6649 6613          if (error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK))
6650 6614                  return (error);
6651 6615  
                   /* Only publish the result through vpp when the search succeeded. */
6652 6616          if ((error = ufs_dirlook(ip, nm, &xip, cr, 1, 1)) == 0) {
6653 6617                  vp = ITOV(xip);
6654 6618                  *vpp = vp;
6655 6619          }
6656 6620  
6657 6621          if (ulp) {
6658 6622                  ufs_lockfs_end(ulp);
6659 6623          }
6660 6624  
                   /* EAGAIN from ufs_dirlook() means "try again", not failure. */
6661 6625          if (error == EAGAIN)
6662 6626                  goto retry_lookup;
6663 6627  
6664 6628          return (error);
6665 6629  }

↓ open down ↓

202 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX