1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright 2017 Joyent, Inc.
  26  * Copyright (c) 2016 by Delphix. All rights reserved.
  27  */
  28 
  29 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  30 /*        All Rights Reserved   */
  31 
  32 /*
  33  * Portions of this source code were derived from Berkeley 4.3 BSD
  34  * under license from the Regents of the University of California.
  35  */
  36 
  37 #include <sys/types.h>
  38 #include <sys/t_lock.h>
  39 #include <sys/ksynch.h>
  40 #include <sys/param.h>
  41 #include <sys/time.h>
  42 #include <sys/systm.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/resource.h>
  45 #include <sys/signal.h>
  46 #include <sys/cred.h>
  47 #include <sys/user.h>
  48 #include <sys/buf.h>
  49 #include <sys/vfs.h>
  50 #include <sys/vfs_opreg.h>
  51 #include <sys/vnode.h>
  52 #include <sys/proc.h>
  53 #include <sys/disp.h>
  54 #include <sys/file.h>
  55 #include <sys/fcntl.h>
  56 #include <sys/flock.h>
  57 #include <sys/atomic.h>
  58 #include <sys/kmem.h>
  59 #include <sys/uio.h>
  60 #include <sys/dnlc.h>
  61 #include <sys/conf.h>
  62 #include <sys/mman.h>
  63 #include <sys/pathname.h>
  64 #include <sys/debug.h>
  65 #include <sys/vmsystm.h>
  66 #include <sys/cmn_err.h>
  67 #include <sys/filio.h>
  68 #include <sys/policy.h>
  69 
  70 #include <sys/fs/ufs_fs.h>
  71 #include <sys/fs/ufs_lockfs.h>
  72 #include <sys/fs/ufs_filio.h>
  73 #include <sys/fs/ufs_inode.h>
  74 #include <sys/fs/ufs_fsdir.h>
  75 #include <sys/fs/ufs_quota.h>
  76 #include <sys/fs/ufs_log.h>
  77 #include <sys/fs/ufs_snap.h>
  78 #include <sys/fs/ufs_trans.h>
  79 #include <sys/fs/ufs_panic.h>
  80 #include <sys/fs/ufs_bio.h>
  81 #include <sys/dirent.h>           /* must be AFTER <sys/fs/fsdir.h>! */
  82 #include <sys/errno.h>
  83 #include <sys/fssnap_if.h>
  84 #include <sys/unistd.h>
  85 #include <sys/sunddi.h>
  86 
  87 #include <sys/filio.h>            /* _FIOIO */
  88 
  89 #include <vm/hat.h>
  90 #include <vm/page.h>
  91 #include <vm/pvn.h>
  92 #include <vm/as.h>
  93 #include <vm/seg.h>
  94 #include <vm/seg_map.h>
  95 #include <vm/seg_vn.h>
  96 #include <vm/seg_kmem.h>
  97 #include <vm/rm.h>
  98 #include <sys/swap.h>
  99 
 100 #include <fs/fs_subr.h>
 101 
 102 #include <sys/fs/decomp.h>
 103 
 104 static struct instats ins;
 105 
 106 static  int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
 107 static  int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
 108                 caddr_t, struct page **, size_t, enum seg_rw, int);
 109 static  int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
 110 static  int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
 111                 caller_context_t *);
 112 static  int ufs_read(struct vnode *, struct uio *, int, struct cred *,
 113                 struct caller_context *);
 114 static  int ufs_write(struct vnode *, struct uio *, int, struct cred *,
 115                 struct caller_context *);
 116 static  int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
 117                 int *, caller_context_t *);
 118 static  int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
 119                 caller_context_t *);
 120 static  int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
 121                 caller_context_t *);
 122 static  int ufs_access(struct vnode *, int, int, struct cred *,
 123                 caller_context_t *);
 124 static  int ufs_lookup(struct vnode *, char *, struct vnode **,
 125                 struct pathname *, int, struct vnode *, struct cred *,
 126                 caller_context_t *, int *, pathname_t *);
 127 static  int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
 128                 int, struct vnode **, struct cred *, int,
 129                 caller_context_t *, vsecattr_t  *);
 130 static  int ufs_remove(struct vnode *, char *, struct cred *,
 131                 caller_context_t *, int);
 132 static  int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
 133                 caller_context_t *, int);
 134 static  int ufs_rename(struct vnode *, char *, struct vnode *, char *,
 135                 struct cred *, caller_context_t *, int);
 136 static  int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
 137                 struct cred *, caller_context_t *, int, vsecattr_t *);
 138 static  int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
 139                 caller_context_t *, int);
 140 static  int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
 141                 caller_context_t *, int);
 142 static  int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
 143                 struct cred *, caller_context_t *, int);
 144 static  int ufs_readlink(struct vnode *, struct uio *, struct cred *,
 145                 caller_context_t *);
 146 static  int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
 147 static  void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
 148 static  int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
 149 static  int ufs_rwlock(struct vnode *, int, caller_context_t *);
 150 static  void ufs_rwunlock(struct vnode *, int, caller_context_t *);
 151 static  int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
 152 static  int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
 153                 struct flk_callback *, struct cred *,
 154                 caller_context_t *);
 155 static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
 156                 cred_t *, caller_context_t *);
 157 static  int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
 158                 struct page **, size_t, struct seg *, caddr_t,
 159                 enum seg_rw, struct cred *, caller_context_t *);
 160 static  int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
 161                 caller_context_t *);
 162 static  int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
 163 static  int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
 164                 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 165 static  int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 166                 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 167 static  int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 168                 uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
 169 static  int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
 170                 caller_context_t *);
 171 static  int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
 172     caller_context_t *);
 173 static  int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
 174                 caller_context_t *);
 175 static  int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
 176                 struct cred *, caller_context_t *);
 177 static  int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
 178 static  daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
 179                 daddr32_t *, int, int);
 180 static  int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 181                 caller_context_t *);
 182 static  int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 183                 caller_context_t *);
 184 static  int ufs_priv_access(void *, int, struct cred *);
 185 static  int ufs_eventlookup(struct vnode *, char *, struct cred *,
 186     struct vnode **);
 187 extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
 188 
 189 /*
 190  * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 191  *
 192  * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 193  */
 194 struct vnodeops *ufs_vnodeops;
 195 
/* NOTE: "not blkd" below means that the operation isn't blocked by lockfs */
 197 const fs_operation_def_t ufs_vnodeops_template[] = {
 198         VOPNAME_OPEN,           { .vop_open = ufs_open },       /* not blkd */
 199         VOPNAME_CLOSE,          { .vop_close = ufs_close },     /* not blkd */
 200         VOPNAME_READ,           { .vop_read = ufs_read },
 201         VOPNAME_WRITE,          { .vop_write = ufs_write },
 202         VOPNAME_IOCTL,          { .vop_ioctl = ufs_ioctl },
 203         VOPNAME_GETATTR,        { .vop_getattr = ufs_getattr },
 204         VOPNAME_SETATTR,        { .vop_setattr = ufs_setattr },
 205         VOPNAME_ACCESS,         { .vop_access = ufs_access },
 206         VOPNAME_LOOKUP,         { .vop_lookup = ufs_lookup },
 207         VOPNAME_CREATE,         { .vop_create = ufs_create },
 208         VOPNAME_REMOVE,         { .vop_remove = ufs_remove },
 209         VOPNAME_LINK,           { .vop_link = ufs_link },
 210         VOPNAME_RENAME,         { .vop_rename = ufs_rename },
 211         VOPNAME_MKDIR,          { .vop_mkdir = ufs_mkdir },
 212         VOPNAME_RMDIR,          { .vop_rmdir = ufs_rmdir },
 213         VOPNAME_READDIR,        { .vop_readdir = ufs_readdir },
 214         VOPNAME_SYMLINK,        { .vop_symlink = ufs_symlink },
 215         VOPNAME_READLINK,       { .vop_readlink = ufs_readlink },
 216         VOPNAME_FSYNC,          { .vop_fsync = ufs_fsync },
 217         VOPNAME_INACTIVE,       { .vop_inactive = ufs_inactive }, /* not blkd */
 218         VOPNAME_FID,            { .vop_fid = ufs_fid },
 219         VOPNAME_RWLOCK,         { .vop_rwlock = ufs_rwlock },   /* not blkd */
 220         VOPNAME_RWUNLOCK,       { .vop_rwunlock = ufs_rwunlock }, /* not blkd */
 221         VOPNAME_SEEK,           { .vop_seek = ufs_seek },
 222         VOPNAME_FRLOCK,         { .vop_frlock = ufs_frlock },
 223         VOPNAME_SPACE,          { .vop_space = ufs_space },
 224         VOPNAME_GETPAGE,        { .vop_getpage = ufs_getpage },
 225         VOPNAME_PUTPAGE,        { .vop_putpage = ufs_putpage },
 226         VOPNAME_MAP,            { .vop_map = ufs_map },
 227         VOPNAME_ADDMAP,         { .vop_addmap = ufs_addmap },   /* not blkd */
 228         VOPNAME_DELMAP,         { .vop_delmap = ufs_delmap },   /* not blkd */
 229         VOPNAME_POLL,           { .vop_poll = ufs_poll },       /* not blkd */
 230         VOPNAME_DUMP,           { .vop_dump = ufs_dump },
 231         VOPNAME_PATHCONF,       { .vop_pathconf = ufs_l_pathconf },
 232         VOPNAME_PAGEIO,         { .vop_pageio = ufs_pageio },
 233         VOPNAME_DUMPCTL,        { .vop_dumpctl = ufs_dumpctl },
 234         VOPNAME_GETSECATTR,     { .vop_getsecattr = ufs_getsecattr },
 235         VOPNAME_SETSECATTR,     { .vop_setsecattr = ufs_setsecattr },
 236         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 237         NULL,                   NULL
 238 };
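
/*
 * Illustrative sketch, not part of the original file: a template such as
 * ufs_vnodeops_template above is normally converted into a vnodeops
 * structure at file system initialization time with vn_make_ops().  The
 * helper below only demonstrates that call; the real registration is done
 * elsewhere (at UFS init time), and this function is not referenced here.
 */
static int
ufs_vnodeops_register_sketch(void)
{
        /* On success, ufs_vnodeops points at the newly built vnodeops. */
        return (vn_make_ops("ufs", ufs_vnodeops_template, &ufs_vnodeops));
}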
 239 
 240 #define MAX_BACKFILE_COUNT      9999
 241 
 242 /*
 243  * Created by ufs_dumpctl() to store a file's disk block info into memory.
 244  * Used by ufs_dump() to dump data to disk directly.
 245  */
 246 struct dump {
 247         struct inode    *ip;            /* the file we contain */
 248         daddr_t         fsbs;           /* number of blocks stored */
 249         struct timeval32 time;          /* time stamp for the struct */
 250         daddr32_t       dblk[1];        /* place holder for block info */
 251 };
 252 
 253 static struct dump *dump_info = NULL;
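
/*
 * Illustrative sketch, not part of the original file: dblk[1] is a
 * C89-style variable-length trailer, so the dump structure is sized for
 * the number of blocks it has to hold.  A dump_info covering "nblks"
 * blocks would typically be allocated along these lines; the helper is
 * for illustration only and is not referenced here.
 */
static struct dump *
ufs_dump_info_alloc_sketch(int nblks)
{
        /* One daddr32_t is already part of the structure itself. */
        return (kmem_alloc(sizeof (struct dump) +
            (nblks - 1) * sizeof (daddr32_t), KM_SLEEP));
}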
 254 
/*
 * Previously, no special action was required for ordinary files.
 * (Devices are handled through the device file system.)
 * Now that we support large files, the large file API requires open
 * to fail if the file is large.
 * We could prevent data corruption by doing an atomic check of the
 * file size and truncating if the file is opened with the FTRUNC
 * flag set, but traditionally this has been done by the vfs/vnode
 * layers.  Handling truncation here would change the existing
 * semantics of VOP_OPEN, so we chose not to implement anything here.
 * The check for file size > 2GB is done at the vfs layer in
 * routine vn_open().
 */
 268 
 269 /* ARGSUSED */
 270 static int
 271 ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
 272 {
 273         return (0);
 274 }
 275 
 276 /*ARGSUSED*/
 277 static int
 278 ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
 279     struct cred *cr, caller_context_t *ct)
 280 {
 281         cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 282         cleanshares(vp, ttoproc(curthread)->p_pid);
 283 
 284         /*
 285          * Push partially filled cluster at last close.
 286          * ``last close'' is approximated because the dnlc
 287          * may have a hold on the vnode.
 288          * Checking for VBAD here will also act as a forced umount check.
 289          */
 290         if (vp->v_count <= 2 && vp->v_type != VBAD) {
 291                 struct inode *ip = VTOI(vp);
 292                 if (ip->i_delaylen) {
 293                         ins.in_poc.value.ul++;
 294                         (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
 295                             B_ASYNC | B_FREE, cr);
 296                         ip->i_delaylen = 0;
 297                 }
 298         }
 299 
 300         return (0);
 301 }
 302 
 303 /*ARGSUSED*/
 304 static int
 305 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
 306     struct caller_context *ct)
 307 {
 308         struct inode *ip = VTOI(vp);
 309         struct ufsvfs *ufsvfsp;
 310         struct ulockfs *ulp = NULL;
 311         int error = 0;
 312         int intrans = 0;
 313 
 314         ASSERT(RW_READ_HELD(&ip->i_rwlock));
 315 
 316         /*
 317          * Mandatory locking needs to be done before ufs_lockfs_begin()
 318          * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
 319          */
 320         if (MANDLOCK(vp, ip->i_mode)) {
 321                 /*
 322                  * ufs_getattr ends up being called by chklock
 323                  */
 324                 error = chklock(vp, FREAD, uiop->uio_loffset,
 325                     uiop->uio_resid, uiop->uio_fmode, ct);
 326                 if (error)
 327                         goto out;
 328         }
 329 
 330         ufsvfsp = ip->i_ufsvfs;
 331         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 332         if (error)
 333                 goto out;
 334 
        /*
         * When a directory is opened for reading as a file (e.g. "cat .")
         * with the O_RSYNC, O_SYNC or O_DSYNC flags set, the locking order
         * has to be changed to avoid a deadlock with an update taking
         * place on that directory at the same time.
         */
 341         if ((ip->i_mode & IFMT) == IFDIR) {
 342 
 343                 rw_enter(&ip->i_contents, RW_READER);
 344                 error = rdip(ip, uiop, ioflag, cr);
 345                 rw_exit(&ip->i_contents);
 346 
 347                 if (error) {
 348                         if (ulp)
 349                                 ufs_lockfs_end(ulp);
 350                         goto out;
 351                 }
 352 
 353                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 354                     TRANS_ISTRANS(ufsvfsp)) {
 355                         rw_exit(&ip->i_rwlock);
 356                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
 357                             error);
 358                         ASSERT(!error);
 359                         TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
 360                             TOP_READ_SIZE);
 361                         rw_enter(&ip->i_rwlock, RW_READER);
 362                 }
 363         } else {
                /*
                 * Only transact reads to files opened for sync-read and
                 * sync-write on a file system that is not write locked.
                 *
                 * The ``not write locked'' check prevents problems with
                 * enabling/disabling logging on a busy file system.  E.g.,
                 * logging exists at the beginning of the read but not at
                 * the end.
                 */
 374                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 375                     TRANS_ISTRANS(ufsvfsp)) {
 376                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
 377                             error);
 378                         ASSERT(!error);
 379                         intrans = 1;
 380                 }
 381 
 382                 rw_enter(&ip->i_contents, RW_READER);
 383                 error = rdip(ip, uiop, ioflag, cr);
 384                 rw_exit(&ip->i_contents);
 385 
 386                 if (intrans) {
 387                         TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
 388                             TOP_READ_SIZE);
 389                 }
 390         }
 391 
 392         if (ulp) {
 393                 ufs_lockfs_end(ulp);
 394         }
 395 out:
 396 
 397         return (error);
 398 }
 399 
 400 extern  volatile int    ufs_HW; /* high water mark */
 401 extern  volatile int    ufs_LW; /* low water mark */
 402 volatile int    ufs_WRITES = 1; /* XXX - enable/disable */
 403 int     ufs_throttles = 0;      /* throttling count */
 404 int     ufs_allow_shared_writes = 1;    /* directio shared writes */
 405 
 406 static int
 407 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
 408 {
 409         int     shared_write;
 410 
        /*
         * If the FDSYNC flag is set, ignore the global
         * ufs_allow_shared_writes setting.
         */
 415         shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
 416 
 417         /*
 418          * Filter to determine if this request is suitable as a
 419          * concurrent rewrite. This write must not allocate blocks
 420          * by extending the file or filling in holes. No use trying
 421          * through FSYNC descriptors as the inode will be synchronously
 422          * updated after the write. The uio structure has not yet been
 423          * checked for sanity, so assume nothing.
 424          */
 425         return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
 426             (uiop->uio_loffset >= (offset_t)0) &&
 427             (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
 428             ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
 429             !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
 430             shared_write);
 431 }
 432 
 433 /*ARGSUSED*/
 434 static int
 435 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
 436     caller_context_t *ct)
 437 {
 438         struct inode *ip = VTOI(vp);
 439         struct ufsvfs *ufsvfsp;
 440         struct ulockfs *ulp;
 441         int retry = 1;
 442         int error, resv, resid = 0;
 443         int directio_status;
 444         int exclusive;
 445         int rewriteflg;
 446         long start_resid = uiop->uio_resid;
 447 
 448         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
 449 
 450 retry_mandlock:
 451         /*
 452          * Mandatory locking needs to be done before ufs_lockfs_begin()
 453          * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
 454          * Check for forced unmounts normally done in ufs_lockfs_begin().
 455          */
 456         if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
 457                 error = EIO;
 458                 goto out;
 459         }
 460         if (MANDLOCK(vp, ip->i_mode)) {
 461 
 462                 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
 463 
 464                 /*
 465                  * ufs_getattr ends up being called by chklock
 466                  */
 467                 error = chklock(vp, FWRITE, uiop->uio_loffset,
 468                     uiop->uio_resid, uiop->uio_fmode, ct);
 469                 if (error)
 470                         goto out;
 471         }
 472 
 473         /* i_rwlock can change in chklock */
 474         exclusive = rw_write_held(&ip->i_rwlock);
 475         rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
 476 
 477         /*
 478          * Check for fast-path special case of directio re-writes.
 479          */
 480         if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
 481             !exclusive && rewriteflg) {
 482 
 483                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 484                 if (error)
 485                         goto out;
 486 
 487                 rw_enter(&ip->i_contents, RW_READER);
 488                 error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
 489                     &directio_status);
 490                 if (directio_status == DIRECTIO_SUCCESS) {
 491                         uint_t i_flag_save;
 492 
 493                         if (start_resid != uiop->uio_resid)
 494                                 error = 0;
 495                         /*
 496                          * Special treatment of access times for re-writes.
 497                          * If IMOD is not already set, then convert it
 498                          * to IMODACC for this operation. This defers
 499                          * entering a delta into the log until the inode
 500                          * is flushed. This mimics what is done for read
 501                          * operations and inode access time.
 502                          */
 503                         mutex_enter(&ip->i_tlock);
 504                         i_flag_save = ip->i_flag;
 505                         ip->i_flag |= IUPD | ICHG;
 506                         ip->i_seq++;
 507                         ITIMES_NOLOCK(ip);
 508                         if ((i_flag_save & IMOD) == 0) {
 509                                 ip->i_flag &= ~IMOD;
 510                                 ip->i_flag |= IMODACC;
 511                         }
 512                         mutex_exit(&ip->i_tlock);
 513                         rw_exit(&ip->i_contents);
 514                         if (ulp)
 515                                 ufs_lockfs_end(ulp);
 516                         goto out;
 517                 }
 518                 rw_exit(&ip->i_contents);
 519                 if (ulp)
 520                         ufs_lockfs_end(ulp);
 521         }
 522 
 523         if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
 524                 rw_exit(&ip->i_rwlock);
 525                 rw_enter(&ip->i_rwlock, RW_WRITER);
 526                 /*
 527                  * Mandatory locking could have been enabled
 528                  * after dropping the i_rwlock.
 529                  */
 530                 if (MANDLOCK(vp, ip->i_mode))
 531                         goto retry_mandlock;
 532         }
 533 
 534         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 535         if (error)
 536                 goto out;
 537 
 538         /*
 539          * Amount of log space needed for this write
 540          */
 541         if (!rewriteflg || !(ioflag & FDSYNC))
 542                 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
 543 
 544         /*
 545          * Throttle writes.
 546          */
 547         if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
 548                 mutex_enter(&ip->i_tlock);
 549                 while (ip->i_writes > ufs_HW) {
 550                         ufs_throttles++;
 551                         cv_wait(&ip->i_wrcv, &ip->i_tlock);
 552                 }
 553                 mutex_exit(&ip->i_tlock);
 554         }
 555 
        /*
         * Enter Transaction
         *
         * If the write is a rewrite and the FDSYNC flag is set (but not
         * FSYNC), there is no need to open a transaction.  In this case
         * just set the IMODACC flag so that the update is done at a later
         * time, thus avoiding the overhead of a logging transaction that
         * is not required.
         */
 565         if (ioflag & (FSYNC|FDSYNC)) {
 566                 if (ulp) {
 567                         if (rewriteflg) {
 568                                 uint_t i_flag_save;
 569 
 570                                 rw_enter(&ip->i_contents, RW_READER);
 571                                 mutex_enter(&ip->i_tlock);
 572                                 i_flag_save = ip->i_flag;
 573                                 ip->i_flag |= IUPD | ICHG;
 574                                 ip->i_seq++;
 575                                 ITIMES_NOLOCK(ip);
 576                                 if ((i_flag_save & IMOD) == 0) {
 577                                         ip->i_flag &= ~IMOD;
 578                                         ip->i_flag |= IMODACC;
 579                                 }
 580                                 mutex_exit(&ip->i_tlock);
 581                                 rw_exit(&ip->i_contents);
 582                         } else {
 583                                 int terr = 0;
 584                                 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
 585                                     terr);
 586                                 ASSERT(!terr);
 587                         }
 588                 }
 589         } else {
 590                 if (ulp)
 591                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
 592         }
 593 
 594         /*
 595          * Write the file
 596          */
 597         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
 598         rw_enter(&ip->i_contents, RW_WRITER);
 599         if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
 600                 /*
 601                  * In append mode start at end of file.
 602                  */
 603                 uiop->uio_loffset = ip->i_size;
 604         }
 605 
        /*
         * Mild optimisation: don't call ufs_trans_write() unless we have to.
         * Also, suppress file system full messages if we will retry.
         */
 609          */
 610         if (retry)
 611                 ip->i_flag |= IQUIET;
 612         if (resid) {
 613                 TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
 614         } else {
 615                 error = wrip(ip, uiop, ioflag, cr);
 616         }
 617         ip->i_flag &= ~IQUIET;
 618 
 619         rw_exit(&ip->i_contents);
 620         rw_exit(&ufsvfsp->vfs_dqrwlock);
 621 
 622         /*
 623          * Leave Transaction
 624          */
 625         if (ulp) {
 626                 if (ioflag & (FSYNC|FDSYNC)) {
 627                         if (!rewriteflg) {
 628                                 int terr = 0;
 629 
 630                                 TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
 631                                     resv);
 632                                 if (error == 0)
 633                                         error = terr;
 634                         }
 635                 } else {
 636                         TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
 637                 }
 638                 ufs_lockfs_end(ulp);
 639         }
 640 out:
 641         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
 642                 /*
 643                  * Any blocks tied up in pending deletes?
 644                  */
 645                 ufs_delete_drain_wait(ufsvfsp, 1);
 646                 retry = 0;
 647                 goto retry_mandlock;
 648         }
 649 
 650         if (error == ENOSPC && (start_resid != uiop->uio_resid))
 651                 error = 0;
 652 
 653         return (error);
 654 }
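
/*
 * Illustrative sketch, not part of the original file: the write throttle in
 * ufs_write() above sleeps on i_wrcv while i_writes is above the high-water
 * mark ufs_HW.  The counterpart runs as queued writes complete; roughly, the
 * finished byte count is subtracted from i_writes and waiters are woken once
 * the count drops back to the low-water mark ufs_LW.  The helper below only
 * demonstrates that idea and is not referenced here.
 */
static void
ufs_write_throttle_release_sketch(struct inode *ip, size_t bcount)
{
        mutex_enter(&ip->i_tlock);
        ip->i_writes -= bcount;
        if (ufs_WRITES && ip->i_writes <= ufs_LW)
                cv_broadcast(&ip->i_wrcv);
        mutex_exit(&ip->i_tlock);
}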
 655 
 656 /*
 657  * Don't cache write blocks to files with the sticky bit set.
 658  * Used to keep swap files from blowing the page cache on a server.
 659  */
 660 int stickyhack = 1;
 661 
 662 /*
 663  * Free behind hacks.  The pager is busted.
 664  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 665  * or B_FREE_IF_TIGHT_ON_MEMORY.
 666  */
 667 volatile int    freebehind = 1;
 668 volatile int    smallfile = 0;
 669 u_offset_t smallfile64 = 32 * 1024;
 670 
 671 /*
 672  * While we should, in most cases, cache the pages for write, we
 673  * may also want to cache the pages for read as long as they are
 674  * frequently re-usable.
 675  *
 676  * If cache_read_ahead = 1, the pages for read will go to the tail
 677  * of the cache list when they are released, otherwise go to the head.
 678  */
 679 int     cache_read_ahead = 0;
 680 
/*
 * Freebehind exists so that as we read large files sequentially we
 * don't consume most of memory with pages from a few files. It takes
 * longer to re-read multiple small files from disk than it does to
 * read one large one sequentially.  As system memory grows, customers
 * need to retain bigger chunks of files in memory.  The advent of the
 * cachelist opens up the possibility of freeing pages to the head or
 * the tail of the list.
 *
 * Not freeing a page is a bet that the page will be read again before
 * its segmap slot is needed for something else. If we lose the bet,
 * it means some other thread is burdened with the page free we did
 * not do. If we win, we save a free and reclaim.
 *
 * Freeing a page at the tail vs the head of the cachelist is a bet
 * that the page will survive until the next read.  It also says that
 * this page is more likely to be re-used than a page freed some time
 * ago and never reclaimed.
 *
 * Freebehind maintains a range of file offsets [smallfile1; smallfile2]:
 *
 *            0 < offset < smallfile1 : pages are not freed.
 *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 *   smallfile2 < offset              : pages freed to head of cachelist.
 *
 * The range is computed at most once per second and depends on
 * freemem and ncpus_online.  Both parameters are bounded to be
 * >= smallfile && >= smallfile64.
 *
 * smallfile1 = (free memory / ncpu) / 1000
 * smallfile2 = (free memory / ncpu) / 10
 *
 * A few example values:
 *
 *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
 *                                 ncpus_online = 4          ncpus_online = 64
 *       ------------------  -----------------------   -----------------------
 *             1G                   [256K;  25M]               [32K; 1.5M]
 *            10G                   [2.5M; 250M]              [156K; 15M]
 *           100G                    [25M; 2.5G]              [1.5M; 150M]
 */
 723 
 724 #define SMALLFILE1_D 1000
 725 #define SMALLFILE2_D 10
 726 static u_offset_t smallfile1 = 32 * 1024;
 727 static u_offset_t smallfile2 = 32 * 1024;
 728 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
 729 uint_t smallfile1_d = SMALLFILE1_D;
 730 uint_t smallfile2_d = SMALLFILE2_D;
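
/*
 * Illustrative sketch, not part of the original file: one plausible shape of
 * the once-per-second recomputation of [smallfile1; smallfile2] described in
 * the block comment above.  The real recomputation is done in the read path;
 * this helper only demonstrates the arithmetic and the lower bounds and is
 * not referenced here.
 */
static void
ufs_smallfile_recompute_sketch(void)
{
        u_offset_t percpufreeb;

        if (ddi_get_lbolt() < smallfile_update)
                return;         /* recompute at most once per second */

        percpufreeb = ptob((u_offset_t)freemem) / ncpus_online;
        smallfile1 = percpufreeb / smallfile1_d;   /* free mem per cpu / 1000 */
        smallfile2 = percpufreeb / smallfile2_d;   /* free mem per cpu / 10 */
        smallfile1 = MAX(smallfile1, smallfile);
        smallfile1 = MAX(smallfile1, smallfile64);
        smallfile2 = MAX(smallfile2, smallfile1);
        smallfile_update = ddi_get_lbolt() + hz;
}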
 731 
 732 /*
 733  * wrip does the real work of write requests for ufs.
 734  */
 735 int
 736 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
 737 {
 738         rlim64_t limit = uio->uio_llimit;
 739         u_offset_t off;
 740         u_offset_t old_i_size;
 741         struct fs *fs;
 742         struct vnode *vp;
 743         struct ufsvfs *ufsvfsp;
 744         caddr_t base;
 745         long start_resid = uio->uio_resid;   /* save starting resid */
 746         long premove_resid;                     /* resid before uiomove() */
 747         uint_t flags;
 748         int newpage;
 749         int iupdat_flag, directio_status;
 750         int n, on, mapon;
 751         int error, pagecreate;
 752         int do_dqrwlock;                /* drop/reacquire vfs_dqrwlock */
 753         int32_t iblocks;
 754         int     new_iblocks;
 755 
 756         /*
 757          * ip->i_size is incremented before the uiomove
 758          * is done on a write.  If the move fails (bad user
 759          * address) reset ip->i_size.
 760          * The better way would be to increment ip->i_size
 761          * only if the uiomove succeeds.
 762          */
 763         int i_size_changed = 0;
 764         o_mode_t type;
 765         int i_seq_needed = 0;
 766 
 767         vp = ITOV(ip);
 768 
 769         /*
 770          * check for forced unmount - should not happen as
 771          * the request passed the lockfs checks.
 772          */
 773         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
 774                 return (EIO);
 775 
 776         fs = ip->i_fs;
 777 
 778         ASSERT(RW_WRITE_HELD(&ip->i_contents));
 779 
 780         /* check for valid filetype */
 781         type = ip->i_mode & IFMT;
 782         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
 783             (type != IFLNK) && (type != IFSHAD)) {
 784                 return (EIO);
 785         }
 786 
 787         /*
 788          * the actual limit of UFS file size
 789          * is UFS_MAXOFFSET_T
 790          */
 791         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 792                 limit = MAXOFFSET_T;
 793 
 794         if (uio->uio_loffset >= limit) {
 795                 proc_t *p = ttoproc(curthread);
 796 
 797                 mutex_enter(&p->p_lock);
 798                 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
 799                     p, RCA_UNSAFE_SIGINFO);
 800                 mutex_exit(&p->p_lock);
 801                 return (EFBIG);
 802         }
 803 
 804         /*
 805          * if largefiles are disallowed, the limit is
 806          * the pre-largefiles value of 2GB
 807          */
 808         if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
 809                 limit = MIN(UFS_MAXOFFSET_T, limit);
 810         else
 811                 limit = MIN(MAXOFF32_T, limit);
 812 
 813         if (uio->uio_loffset < (offset_t)0) {
 814                 return (EINVAL);
 815         }
 816         if (uio->uio_resid == 0) {
 817                 return (0);
 818         }
 819 
 820         if (uio->uio_loffset >= limit)
 821                 return (EFBIG);
 822 
 823         ip->i_flag |= INOACC;        /* don't update ref time in getpage */
 824 
 825         if (ioflag & (FSYNC|FDSYNC)) {
 826                 ip->i_flag |= ISYNC;
 827                 iupdat_flag = 1;
 828         }
 829         /*
 830          * Try to go direct
 831          */
 832         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
 833                 uio->uio_llimit = limit;
 834                 error = ufs_directio_write(ip, uio, ioflag, 0, cr,
 835                     &directio_status);
 836                 /*
 837                  * If ufs_directio wrote to the file or set the flags,
 838                  * we need to update i_seq, but it may be deferred.
 839                  */
 840                 if (start_resid != uio->uio_resid ||
 841                     (ip->i_flag & (ICHG|IUPD))) {
 842                         i_seq_needed = 1;
 843                         ip->i_flag |= ISEQ;
 844                 }
 845                 if (directio_status == DIRECTIO_SUCCESS)
 846                         goto out;
 847         }
 848 
 849         /*
 850          * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
 851          *
 852          * o shadow inodes: vfs_dqrwlock is not held at all
 853          * o quota updates: vfs_dqrwlock is read or write held
 854          * o other updates: vfs_dqrwlock is read held
 855          *
 856          * The first case is the only one where we do not hold
 857          * vfs_dqrwlock at all while entering wrip().
 858          * We must make sure not to downgrade/drop vfs_dqrwlock if we
 859          * have it as writer, i.e. if we are updating the quota inode.
 860          * There is no potential deadlock scenario in this case as
 861          * ufs_getpage() takes care of this and avoids reacquiring
 862          * vfs_dqrwlock in that case.
 863          *
 864          * This check is done here since the above conditions do not change
 865          * and we possibly loop below, so save a few cycles.
 866          */
 867         if ((type == IFSHAD) ||
 868             (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
 869                 do_dqrwlock = 0;
 870         } else {
 871                 do_dqrwlock = 1;
 872         }
 873 
        /*
         * Large Files: We cast MAXBMASK to offset_t in order to mask
         * out the higher bits. Since offset_t is a signed value, the
         * high order bit set in the MAXBMASK value makes it do the
         * right thing by having all bits 1 in the higher word. May be
         * removed for _SOLARIS64_.
         */
 881 
 882         fs = ip->i_fs;
 883         do {
 884                 u_offset_t uoff = uio->uio_loffset;
 885                 off = uoff & (offset_t)MAXBMASK;
 886                 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
 887                 on = (int)blkoff(fs, uoff);
 888                 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
 889                 new_iblocks = 1;
 890 
 891                 if (type == IFREG && uoff + n >= limit) {
 892                         if (uoff >= limit) {
 893                                 error = EFBIG;
 894                                 goto out;
 895                         }
                        /*
                         * Since uoff + n >= limit, we know that
                         * limit - uoff <= n.  n is an int, so it is
                         * safe to cast limit - uoff to an int.
                         */
 901                         n = (int)(limit - (rlim64_t)uoff);
 902                 }
 903                 if (uoff + n > ip->i_size) {
 904                         /*
 905                          * We are extending the length of the file.
 906                          * bmap is used so that we are sure that
 907                          * if we need to allocate new blocks, that it
 908                          * is done here before we up the file size.
 909                          */
 910                         error = bmap_write(ip, uoff, (int)(on + n),
 911                             mapon == 0, NULL, cr);
 912                         /*
 913                          * bmap_write never drops i_contents so if
 914                          * the flags are set it changed the file.
 915                          */
 916                         if (ip->i_flag & (ICHG|IUPD)) {
 917                                 i_seq_needed = 1;
 918                                 ip->i_flag |= ISEQ;
 919                         }
 920                         if (error)
 921                                 break;
 922                         /*
 923                          * There is a window of vulnerability here.
 924                          * The sequence of operations: allocate file
 925                          * system blocks, uiomove the data into pages,
 926                          * and then update the size of the file in the
 927                          * inode, must happen atomically.  However, due
 928                          * to current locking constraints, this can not
 929                          * be done.
 930                          */
 931                         ASSERT(ip->i_writer == NULL);
 932                         ip->i_writer = curthread;
 933                         i_size_changed = 1;
 934                         /*
 935                          * If we are writing from the beginning of
 936                          * the mapping, we can just create the
 937                          * pages without having to read them.
 938                          */
 939                         pagecreate = (mapon == 0);
 940                 } else if (n == MAXBSIZE) {
                        /*
                         * Going to do a whole mapping's worth,
                         * so we can just create the pages w/o
                         * having to read them in.  But before
                         * we do that, we need to make sure any
                         * needed blocks are allocated first.
                         */
 948                         iblocks = ip->i_blocks;
 949                         error = bmap_write(ip, uoff, (int)(on + n),
 950                             BI_ALLOC_ONLY, NULL, cr);
 951                         /*
 952                          * bmap_write never drops i_contents so if
 953                          * the flags are set it changed the file.
 954                          */
 955                         if (ip->i_flag & (ICHG|IUPD)) {
 956                                 i_seq_needed = 1;
 957                                 ip->i_flag |= ISEQ;
 958                         }
 959                         if (error)
 960                                 break;
 961                         pagecreate = 1;
                        /*
                         * Check if the newly created page needed the
                         * allocation of new disk blocks.
                         */
 966                         if (iblocks == ip->i_blocks)
 967                                 new_iblocks = 0; /* no new blocks allocated */
 968                 } else {
 969                         pagecreate = 0;
                        /*
                         * In sync mode, flush the indirect blocks which
                         * may have been allocated and not written to
                         * disk. In the above cases bmap_write will
                         * allocate in sync mode.
                         */
 976                         if (ioflag & (FSYNC|FDSYNC)) {
 977                                 error = ufs_indirblk_sync(ip, uoff);
 978                                 if (error)
 979                                         break;
 980                         }
 981                 }
 982 
 983                 /*
 984                  * At this point we can enter ufs_getpage() in one
 985                  * of two ways:
 986                  * 1) segmap_getmapflt() calls ufs_getpage() when the
 987                  *    forcefault parameter is true (pagecreate == 0)
 988                  * 2) uiomove() causes a page fault.
 989                  *
 990                  * We have to drop the contents lock to prevent the VM
 991                  * system from trying to reacquire it in ufs_getpage()
 992                  * should the uiomove cause a pagefault.
 993                  *
 994                  * We have to drop the reader vfs_dqrwlock here as well.
 995                  */
 996                 rw_exit(&ip->i_contents);
 997                 if (do_dqrwlock) {
 998                         ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
 999                         ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
1000                         rw_exit(&ufsvfsp->vfs_dqrwlock);
1001                 }
1002 
1003                 newpage = 0;
1004                 premove_resid = uio->uio_resid;
1005 
                /*
                 * Touch the page and fault it in if it is not in core
                 * before segmap_getmapflt or vpm_data_copy can lock it.
                 * This avoids a deadlock when the user buffer is mmapped
                 * from the same file that we are writing to.
                 */
1012                 uio_prefaultpages((long)n, uio);
1013 
1014                 if (vpm_enable) {
                        /*
                         * Copy data. If new pages are created, the part of
                         * the page that is not written will be initialized
                         * with zeros.
                         */
1020                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1021                             uio, !pagecreate, &newpage, 0, S_WRITE);
1022                 } else {
1023 
1024                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1025                             (uint_t)n, !pagecreate, S_WRITE);
1026 
1027                         /*
1028                          * segmap_pagecreate() returns 1 if it calls
1029                          * page_create_va() to allocate any pages.
1030                          */
1031 
1032                         if (pagecreate)
1033                                 newpage = segmap_pagecreate(segkmap, base,
1034                                     (size_t)n, 0);
1035 
1036                         error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
1037                 }
1038 
1039                 /*
1040                  * If "newpage" is set, then a new page was created and it
1041                  * does not contain valid data, so it needs to be initialized
1042                  * at this point.
1043                  * Otherwise the page contains old data, which was overwritten
1044                  * partially or as a whole in uiomove.
1045                  * If there is only one iovec structure within uio, then
1046                  * on error uiomove will not be able to update uio->uio_loffset
1047                  * and we would zero the whole page here!
1048                  *
1049                  * If uiomove fails because of an error, the old valid data
                 * is kept instead of filling the rest of the page with zeros.
1051                  */
1052                 if (!vpm_enable && newpage &&
1053                     uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
1054                         /*
1055                          * We created pages w/o initializing them completely,
1056                          * thus we need to zero the part that wasn't set up.
1057                          * This happens on most EOF write cases and if
1058                          * we had some sort of error during the uiomove.
1059                          */
1060                         int nzero, nmoved;
1061 
1062                         nmoved = (int)(uio->uio_loffset - (off + mapon));
1063                         ASSERT(nmoved >= 0 && nmoved <= n);
1064                         nzero = roundup(on + n, PAGESIZE) - nmoved;
1065                         ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
1066                         (void) kzero(base + mapon + nmoved, (uint_t)nzero);
1067                 }
1068 
1069                 /*
1070                  * Unlock the pages allocated by page_create_va()
1071                  * in segmap_pagecreate()
1072                  */
1073                 if (!vpm_enable && newpage)
1074                         segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
1075 
1076                 /*
1077                  * If the size of the file changed, then update the
1078                  * size field in the inode now.  This can't be done
1079                  * before the call to segmap_pageunlock or there is
1080                  * a potential deadlock with callers to ufs_putpage().
1081                  * They will be holding i_contents and trying to lock
1082                  * a page, while this thread is holding a page locked
1083                  * and trying to acquire i_contents.
1084                  */
1085                 if (i_size_changed) {
1086                         rw_enter(&ip->i_contents, RW_WRITER);
1087                         old_i_size = ip->i_size;
1088                         UFS_SET_ISIZE(uoff + n, ip);
1089                         TRANS_INODE(ufsvfsp, ip);
                        /*
                         * The file has grown larger than 2GB. Set the
                         * flag in the superblock to indicate this, if
                         * it is not already set.
                         */
1095                         if ((ip->i_size > MAXOFF32_T) &&
1096                             !(fs->fs_flags & FSLARGEFILES)) {
1097                                 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1098                                 mutex_enter(&ufsvfsp->vfs_lock);
1099                                 fs->fs_flags |= FSLARGEFILES;
1100                                 ufs_sbwrite(ufsvfsp);
1101                                 mutex_exit(&ufsvfsp->vfs_lock);
1102                         }
1103                         mutex_enter(&ip->i_tlock);
1104                         ip->i_writer = NULL;
1105                         cv_broadcast(&ip->i_wrcv);
1106                         mutex_exit(&ip->i_tlock);
1107                         rw_exit(&ip->i_contents);
1108                 }
1109 
1110                 if (error) {
1111                         /*
1112                          * If we failed on a write, we may have already
1113                          * allocated file blocks as well as pages.  It's
1114                          * hard to undo the block allocation, but we must
1115                          * be sure to invalidate any pages that may have
1116                          * been allocated.
1117                          *
                         * If the page was created without initialization
                         * then we must check whether it is possible
                         * to destroy the new page and keep the old data
                         * on the disk.
                         *
                         * It is possible to destroy the page without
                         * having to write back its contents only when
                         * - the size of the file remains unchanged
                         * - bmap_write() did not allocate new disk blocks
                         *   (it is possible to create big files using "seek"
                         *   and writing to the end of the file; a "write" to
                         *   a position before the end of the file would not
                         *   change the size of the file but it would allocate
                         *   new disk blocks)
                         * - uiomove intended to overwrite the whole page.
                         * - a new page was created (newpage == 1).
1134                          */
1135 
1136                         if (i_size_changed == 0 && new_iblocks == 0 &&
1137                             newpage) {
1138 
                                /* unwind what uiomove last did */
1140                                 uio->uio_resid = premove_resid;
1141 
1142                                 /*
1143                                  * destroy the page, do not write ambiguous
1144                                  * data to the disk.
1145                                  */
1146                                 flags = SM_DESTROY;
1147                         } else {
1148                                 /*
1149                                  * write the page back to the disk, if dirty,
1150                                  * and remove the page from the cache.
1151                                  */
1152                                 flags = SM_INVAL;
1153                         }
1154 
1155                         if (vpm_enable) {
1156                                 /*
1157                                  *  Flush pages.
1158                                  */
1159                                 (void) vpm_sync_pages(vp, off, n, flags);
1160                         } else {
1161                                 (void) segmap_release(segkmap, base, flags);
1162                         }
1163                 } else {
1164                         flags = 0;
1165                         /*
1166                          * Force write back for synchronous write cases.
1167                          */
1168                         if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
1169                                 /*
1170                                  * If the sticky bit is set but the
1171                                  * execute bit is not set, we do a
1172                                  * synchronous write back and free
1173                                  * the page when done.  We set up swap
1174                                  * files to be handled this way to
1175                                  * prevent servers from keeping around
1176                                  * the client's swap pages too long.
1177                                  * XXX - there ought to be a better way.
1178                                  */
1179                                 if (IS_SWAPVP(vp)) {
1180                                         flags = SM_WRITE | SM_FREE |
1181                                             SM_DONTNEED;
1182                                         iupdat_flag = 0;
1183                                 } else {
1184                                         flags = SM_WRITE;
1185                                 }
1186                         } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
1187                                 /*
1188                                  * Have written a whole block.
1189                                  * Start an asynchronous write and
1190                                  * mark the buffer to indicate that
1191                                  * it won't be needed again soon.
1192                                  */
1193                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
1194                         }
1195                         if (vpm_enable) {
1196                                 /*
1197                                  * Flush pages.
1198                                  */
1199                                 error = vpm_sync_pages(vp, off, n, flags);
1200                         } else {
1201                                 error = segmap_release(segkmap, base, flags);
1202                         }
1203                         /*
1204                          * If the operation failed and is synchronous,
1205                          * then we need to unwind what uiomove() last
1206                          * did so we can potentially return an error to
1207                          * the caller.  If this write operation was
1208                          * done in two pieces and the first succeeded,
1209                          * then we won't return an error for the second
1210                          * piece that failed.  However, we only want to
1211                          * return a resid value that reflects what was
1212                          * really done.
1213                          *
1214                          * Failures for non-synchronous operations can
1215                          * be ignored since the page subsystem will
1216                          * retry the operation until it succeeds or the
1217                          * file system is unmounted.
1218                          */
1219                         if (error) {
1220                                 if ((ioflag & (FSYNC | FDSYNC)) ||
1221                                     type == IFDIR) {
1222                                         uio->uio_resid = premove_resid;
1223                                 } else {
1224                                         error = 0;
1225                                 }
1226                         }
1227                 }
1228 
1229                 /*
1230                  * Re-acquire contents lock.
1231                  * If it was dropped, reacquire reader vfs_dqrwlock as well.
1232                  */
1233                 if (do_dqrwlock)
1234                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1235                 rw_enter(&ip->i_contents, RW_WRITER);
1236 
1237                 /*
1238                  * If the uiomove() failed or if a synchronous
1239                  * page push failed, fix up i_size.
1240                  */
1241                 if (error) {
1242                         if (i_size_changed) {
1243                                 /*
1244                                  * The uiomove failed, and we
1245                                  * allocated blocks, so get rid
1246                                  * of them.
1247                                  */
1248                                 (void) ufs_itrunc(ip, old_i_size, 0, cr);
1249                         }
1250                 } else {
1251                         /*
1252                          * XXX - Can this be out of the loop?
1253                          */
1254                         ip->i_flag |= IUPD | ICHG;
1255                         /*
1256                          * Only do one increase of i_seq for multiple
1257                          * pieces.  Because we drop locks, record
1258                          * the fact that we changed the timestamp and
1259                          * are deferring the increase in case another thread
1260                          * pushes our timestamp update.
1261                          */
1262                         i_seq_needed = 1;
1263                         ip->i_flag |= ISEQ;
1264                         if (i_size_changed)
1265                                 ip->i_flag |= IATTCHG;
1266                         if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
1267                             (IEXEC >> 6))) != 0 &&
1268                             (ip->i_mode & (ISUID | ISGID)) != 0 &&
1269                             secpolicy_vnode_setid_retain(cr,
1270                             (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
1271                                 /*
1272                                  * Clear Set-UID & Set-GID bits on
1273                                  * successful write if not privileged
1274                                  * and at least one of the execute bits
1275                                  * is set.  If we always clear Set-GID,
1276                                  * mandatory file and record locking is
1277                                  * unusable.
1278                                  */
1279                                 ip->i_mode &= ~(ISUID | ISGID);
1280                         }
1281                 }
1282                 /*
1283                  * If the FDSYNC flag is set and this is a "rewrite",
1284                  * we won't log a delta.
1285                  * The FSYNC flag overrides all cases.
1286                  */
1287                 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
1288                         TRANS_INODE(ufsvfsp, ip);
1289                 }
1290         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1291 
1292 out:
1293         /*
1294          * Make sure i_seq is increased at least once per write
1295          */
1296         if (i_seq_needed) {
1297                 ip->i_seq++;
1298                 ip->i_flag &= ~ISEQ;     /* no longer deferred */
1299         }
1300 
1301         /*
1302          * Inode is updated according to this table -
1303          *
1304          *   FSYNC        FDSYNC(posix.4)
1305          *   --------------------------
1306          *   always@      IATTCHG|IBDWRITE
1307          *
1308          * @ -  If we are doing synchronous write the only time we should
1309          *      not be sync'ing the ip here is if we have the stickyhack
1310          *      activated, the file is marked with the sticky bit and
1311          *      no exec bit, the file length has not been changed and
1312          *      no new blocks have been allocated during this write.
1313          */
1314 
1315         if ((ip->i_flag & ISYNC) != 0) {
1316                 /*
1317                  * we have eliminated nosync
1318                  */
1319                 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
1320                     ((ioflag & FSYNC) && iupdat_flag)) {
1321                         ufs_iupdat(ip, 1);
1322                 }
1323         }
1324 
1325         /*
1326          * If we've already done a partial-write, terminate
1327          * the write but return no error unless the error is ENOSPC
1328          * because the caller can detect this and free resources and
1329          * try again.
1330          */
1331         if ((start_resid != uio->uio_resid) && (error != ENOSPC))
1332                 error = 0;
1333 
1334         ip->i_flag &= ~(INOACC | ISYNC);
1335         ITIMES_NOLOCK(ip);
1336         return (error);
1337 }
1338 
1339 /*
1340  * rdip does the real work of read requests for ufs.
1341  */
1342 int
1343 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
1344 {
1345         u_offset_t off;
1346         caddr_t base;
1347         struct fs *fs;
1348         struct ufsvfs *ufsvfsp;
1349         struct vnode *vp;
1350         long oresid = uio->uio_resid;
1351         u_offset_t n, on, mapon;
1352         int error = 0;
1353         int doupdate = 1;
1354         uint_t flags;
1355         int dofree, directio_status;
1356         krw_t rwtype;
1357         o_mode_t type;
1358         clock_t now;
1359 
1360         vp = ITOV(ip);
1361 
1362         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1363 
1364         ufsvfsp = ip->i_ufsvfs;
1365 
1366         if (ufsvfsp == NULL)
1367                 return (EIO);
1368 
1369         fs = ufsvfsp->vfs_fs;
1370 
1371         /* check for valid filetype */
1372         type = ip->i_mode & IFMT;
1373         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
1374             (type != IFLNK) && (type != IFSHAD)) {
1375                 return (EIO);
1376         }
1377 
1378         if (uio->uio_loffset > UFS_MAXOFFSET_T) {
1379                 error = 0;
1380                 goto out;
1381         }
1382         if (uio->uio_loffset < (offset_t)0) {
1383                 return (EINVAL);
1384         }
1385         if (uio->uio_resid == 0) {
1386                 return (0);
1387         }
1388 
1389         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
1390             (!ufsvfsp->vfs_noatime)) {
1391                 mutex_enter(&ip->i_tlock);
1392                 ip->i_flag |= IACC;
1393                 mutex_exit(&ip->i_tlock);
1394         }
1395         /*
1396          * Try to go direct
1397          */
1398         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
1399                 error = ufs_directio_read(ip, uio, cr, &directio_status);
1400                 if (directio_status == DIRECTIO_SUCCESS)
1401                         goto out;
1402         }
1403 
1404         rwtype = (rw_write_held(&ip->i_contents) ? RW_WRITER : RW_READER);
1405 
1406         do {
1407                 offset_t diff;
1408                 u_offset_t uoff = uio->uio_loffset;
1409                 off = uoff & (offset_t)MAXBMASK;
1410                 mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET);
1411                 on = (u_offset_t)blkoff(fs, uoff);
1412                 n = MIN((u_offset_t)fs->fs_bsize - on,
1413                     (u_offset_t)uio->uio_resid);
1414 
1415                 diff = ip->i_size - uoff;
1416 
1417                 if (diff <= (offset_t)0) {
1418                         error = 0;
1419                         goto out;
1420                 }
1421                 if (diff < (offset_t)n)
1422                         n = (int)diff;
1423 
1424                 /*
1425                  * We update smallfile2 and smallfile1 at most every second.
1426                  */
1427                 now = ddi_get_lbolt();
1428                 if (now >= smallfile_update) {
1429                         uint64_t percpufreeb;
1430                         if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
1431                         if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
1432                         percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
1433                         smallfile1 = percpufreeb / smallfile1_d;
1434                         smallfile2 = percpufreeb / smallfile2_d;
1435                         smallfile1 = MAX(smallfile1, smallfile);
1436                         smallfile1 = MAX(smallfile1, smallfile64);
1437                         smallfile2 = MAX(smallfile1, smallfile2);
1438                         smallfile_update = now + hz;
1439                 }
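                /*
                 * Worked example (illustrative numbers only): with 8 GB of
                 * free memory and 4 CPUs online, percpufreeb is 2 GB.
                 * Assuming hypothetical divisors smallfile1_d = 1000 and
                 * smallfile2_d = 10, that gives smallfile1 ~= 2 MB and
                 * smallfile2 ~= 200 MB, so free-behind starts once a
                 * sequential read moves past ~2 MB and pages are marked
                 * SM_DONTNEED once the offset passes ~200 MB (see the
                 * SM_FREE/SM_DONTNEED logic below).
                 */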
1440 
1441                 dofree = freebehind &&
1442                     ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
1443 
1444                 /*
1445                  * At this point we can enter ufs_getpage() in one of two
1446                  * ways:
1447                  * 1) segmap_getmapflt() calls ufs_getpage() when the
1448                  *    forcefault parameter is true (value of 1 is passed)
1449                  * 2) uiomove() causes a page fault.
1450                  *
1451                  * We cannot hold onto an i_contents reader lock without
1452                  * risking deadlock in ufs_getpage(), so we drop the
1453                  * reader lock here.  The ufs_getpage() dolock logic
1454                  * already allows a thread holding i_contents as writer
1455                  * to work properly, so we keep a writer lock.
1456                  */
1457                 if (rwtype == RW_READER)
1458                         rw_exit(&ip->i_contents);
1459 
1460                 if (vpm_enable) {
1461                         /*
1462                          * Copy data.
1463                          */
1464                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1465                             uio, 1, NULL, 0, S_READ);
1466                 } else {
1467                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1468                             (uint_t)n, 1, S_READ);
1469                         error = uiomove(base + mapon, (long)n, UIO_READ, uio);
1470                 }
1471 
1472                 flags = 0;
1473                 if (!error) {
1474                         /*
1475                          * If reading sequentially, we won't need this
1476                          * buffer again soon.  For offsets in the range
1477                          * [smallfile1, smallfile2], release the pages at
1478                          * the tail of the cache list; larger offsets are
1479                          * released at the head.
1480                          */
1481                         if (dofree) {
1482                                 flags = SM_FREE | SM_ASYNC;
1483                                 if ((cache_read_ahead == 0) &&
1484                                     (off > smallfile2))
1485                                         flags |= SM_DONTNEED;
1486                         }
1487                         /*
1488                          * In POSIX SYNC (FSYNC and FDSYNC) read mode,
1489                          * we want to make sure that the page which has
1490                          * been read is written to disk if it is dirty,
1491                          * and that the corresponding indirect blocks are
1492                          * flushed out as well.
1493                          */
1494                         if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
1495                                 flags &= ~SM_ASYNC;
1496                                 flags |= SM_WRITE;
1497                         }
1498                         if (vpm_enable) {
1499                                 error = vpm_sync_pages(vp, off, n, flags);
1500                         } else {
1501                                 error = segmap_release(segkmap, base, flags);
1502                         }
1503                 } else {
1504                         if (vpm_enable) {
1505                                 (void) vpm_sync_pages(vp, off, n, flags);
1506                         } else {
1507                                 (void) segmap_release(segkmap, base, flags);
1508                         }
1509                 }
1510 
1511                 if (rwtype == RW_READER)
1512                         rw_enter(&ip->i_contents, rwtype);
1513         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1514 out:
1515         /*
1516          * Inode is updated according to this table if FRSYNC is set.
1517          *
1518          *   FSYNC        FDSYNC(posix.4)
1519          *   --------------------------
1520          *   always       IATTCHG|IBDWRITE
1521          */
1522         /*
1523          * The inode is not updated if we're logging and the inode is a
1524          * directory with FRSYNC, FSYNC and FDSYNC flags set.
1525          */
1526         if (ioflag & FRSYNC) {
1527                 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
1528                         doupdate = 0;
1529                 }
1530                 if (doupdate) {
1531                         if ((ioflag & FSYNC) ||
1532                             ((ioflag & FDSYNC) &&
1533                             (ip->i_flag & (IATTCHG|IBDWRITE)))) {
1534                                 ufs_iupdat(ip, 1);
1535                         }
1536                 }
1537         }
1538         /*
1539          * If we've already done a partial read, terminate
1540          * the read but return no error.
1541          */
1542         if (oresid != uio->uio_resid)
1543                 error = 0;
1544         ITIMES(ip);
1545 
1546         return (error);
1547 }
1548 
1549 /* ARGSUSED */
1550 static int
1551 ufs_ioctl(
1552         struct vnode    *vp,
1553         int             cmd,
1554         intptr_t        arg,
1555         int             flag,
1556         struct cred     *cr,
1557         int             *rvalp,
1558         caller_context_t *ct)
1559 {
1560         struct lockfs   lockfs, lockfs_out;
1561         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
1562         char            *comment, *original_comment;
1563         struct fs       *fs;
1564         struct ulockfs  *ulp;
1565         offset_t        off;
1566         int             error;
1567         int             issync;
1568         int             trans_size;
1569 
1570 
1571         /*
1572          * forcibly unmounted
1573          */
1574         if (ufsvfsp == NULL || vp->v_vfsp == NULL ||
1575             vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
1576                 return (EIO);
1577         fs = ufsvfsp->vfs_fs;
1578 
1579         if (cmd == Q_QUOTACTL) {
1580                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
1581                 if (error)
1582                         return (error);
1583 
1584                 if (ulp) {
1585                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
1586                             TOP_SETQUOTA_SIZE(fs));
1587                 }
1588 
1589                 error = quotactl(vp, arg, flag, cr);
1590 
1591                 if (ulp) {
1592                         TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
1593                             TOP_SETQUOTA_SIZE(fs));
1594                         ufs_lockfs_end(ulp);
1595                 }
1596                 return (error);
1597         }
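        /*
         * Illustrative userland sketch (assumption, not taken from this
         * file): the Q_QUOTACTL path above is normally reached by an
         * application opening the filesystem's quotas file and issuing
         * the ioctl described in quotactl(7I), roughly:
         *
         *	struct dqblk dqb;
         *	struct quotctl qc;
         *	int qfd = open("/export/quotas", O_RDONLY);
         *
         *	qc.op = Q_GETQUOTA;
         *	qc.uid = getuid();
         *	qc.addr = (caddr_t)&dqb;
         *	if (qfd >= 0 && ioctl(qfd, Q_QUOTACTL, &qc) == 0)
         *		(void) printf("%lu blocks in use\n",
         *		    (ulong_t)dqb.dqb_curblocks);
         */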
1598 
1599         switch (cmd) {
1600                 case _FIOLFS:
1601                         /*
1602                          * file system locking
1603                          */
1604                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1605                                 return (EPERM);
1606 
1607                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1608                                 if (copyin((caddr_t)arg, &lockfs,
1609                                     sizeof (struct lockfs)))
1610                                         return (EFAULT);
1611                         }
1612 #ifdef _SYSCALL32_IMPL
1613                         else {
1614                                 struct lockfs32 lockfs32;
1615                                 /* Translate ILP32 lockfs to LP64 lockfs */
1616                                 if (copyin((caddr_t)arg, &lockfs32,
1617                                     sizeof (struct lockfs32)))
1618                                         return (EFAULT);
1619                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1620                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1621                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1622                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1623                                 lockfs.lf_comment =
1624                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1625                         }
1626 #endif /* _SYSCALL32_IMPL */
1627 
1628                         if (lockfs.lf_comlen) {
1629                                 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
1630                                         return (ENAMETOOLONG);
1631                                 comment =
1632                                     kmem_alloc(lockfs.lf_comlen, KM_SLEEP);
1633                                 if (copyin(lockfs.lf_comment, comment,
1634                                     lockfs.lf_comlen)) {
1635                                         kmem_free(comment, lockfs.lf_comlen);
1636                                         return (EFAULT);
1637                                 }
1638                                 original_comment = lockfs.lf_comment;
1639                                 lockfs.lf_comment = comment;
1640                         }
1641                         if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
1642                                 lockfs.lf_comment = original_comment;
1643 
1644                                 if ((flag & DATAMODEL_MASK) ==
1645                                     DATAMODEL_NATIVE) {
1646                                         (void) copyout(&lockfs, (caddr_t)arg,
1647                                             sizeof (struct lockfs));
1648                                 }
1649 #ifdef _SYSCALL32_IMPL
1650                                 else {
1651                                         struct lockfs32 lockfs32;
1652                                         /* Translate LP64 to ILP32 lockfs */
1653                                         lockfs32.lf_lock =
1654                                             (uint32_t)lockfs.lf_lock;
1655                                         lockfs32.lf_flags =
1656                                             (uint32_t)lockfs.lf_flags;
1657                                         lockfs32.lf_key =
1658                                             (uint32_t)lockfs.lf_key;
1659                                         lockfs32.lf_comlen =
1660                                             (uint32_t)lockfs.lf_comlen;
1661                                         lockfs32.lf_comment =
1662                                             (uint32_t)(uintptr_t)
1663                                             lockfs.lf_comment;
1664                                         (void) copyout(&lockfs32, (caddr_t)arg,
1665                                             sizeof (struct lockfs32));
1666                                 }
1667 #endif /* _SYSCALL32_IMPL */
1668 
1669                         } else {
1670                                 if (lockfs.lf_comlen)
1671                                         kmem_free(comment, lockfs.lf_comlen);
1672                         }
1673                         return (error);
1674 
1675                 case _FIOLFSS:
1676                         /*
1677                          * get file system locking status
1678                          */
1679 
1680                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1681                                 if (copyin((caddr_t)arg, &lockfs,
1682                                     sizeof (struct lockfs)))
1683                                         return (EFAULT);
1684                         }
1685 #ifdef _SYSCALL32_IMPL
1686                         else {
1687                                 struct lockfs32 lockfs32;
1688                                 /* Translate ILP32 lockfs to LP64 lockfs */
1689                                 if (copyin((caddr_t)arg, &lockfs32,
1690                                     sizeof (struct lockfs32)))
1691                                         return (EFAULT);
1692                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1693                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1694                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1695                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1696                                 lockfs.lf_comment =
1697                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1698                         }
1699 #endif /* _SYSCALL32_IMPL */
1700 
1701                         if (error = ufs_fiolfss(vp, &lockfs_out))
1702                                 return (error);
1703                         lockfs.lf_lock = lockfs_out.lf_lock;
1704                         lockfs.lf_key = lockfs_out.lf_key;
1705                         lockfs.lf_flags = lockfs_out.lf_flags;
1706                         lockfs.lf_comlen = MIN(lockfs.lf_comlen,
1707                             lockfs_out.lf_comlen);
1708 
1709                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1710                                 if (copyout(&lockfs, (caddr_t)arg,
1711                                     sizeof (struct lockfs)))
1712                                         return (EFAULT);
1713                         }
1714 #ifdef _SYSCALL32_IMPL
1715                         else {
1716                                 /* Translate LP64 to ILP32 lockfs */
1717                                 struct lockfs32 lockfs32;
1718                                 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
1719                                 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
1720                                 lockfs32.lf_key = (uint32_t)lockfs.lf_key;
1721                                 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
1722                                 lockfs32.lf_comment =
1723                                     (uint32_t)(uintptr_t)lockfs.lf_comment;
1724                                 if (copyout(&lockfs32, (caddr_t)arg,
1725                                     sizeof (struct lockfs32)))
1726                                         return (EFAULT);
1727                         }
1728 #endif /* _SYSCALL32_IMPL */
1729 
1730                         if (lockfs.lf_comlen &&
1731                             lockfs.lf_comment && lockfs_out.lf_comment)
1732                                 if (copyout(lockfs_out.lf_comment,
1733                                     lockfs.lf_comment, lockfs.lf_comlen))
1734                                         return (EFAULT);
1735                         return (0);
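                        /*
                         * Illustrative userland sketch (assumptions: the
                         * _FIOLFSS request code and struct lockfs are
                         * visible to applications, e.g. via <sys/filio.h>
                         * and <sys/lockfs.h>): querying the current lock
                         * state of a file on a mounted UFS filesystem
                         * might look like this, mirroring the
                         * copyin/copyout above:
                         *
                         *	struct lockfs lf;
                         *
                         *	bzero(&lf, sizeof (lf));
                         *	if (ioctl(fd, _FIOLFSS, &lf) == 0)
                         *		(void) printf("lock %lu flags %lx\n",
                         *		    lf.lf_lock, lf.lf_flags);
                         */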
1736 
1737                 case _FIOSATIME:
1738                         /*
1739                          * set access time
1740                          */
1741 
1742                         /*
1743                          * If mounted without atime, return quietly.
1744                          * I briefly thought about returning ENOSYS, but
1745                          * figured that most apps would consider it fatal;
1746                          * the idea is to make this as seamless as possible.
1747                          */
1748                         if (ufsvfsp->vfs_noatime)
1749                                 return (0);
1750 
1751                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1752                             ULOCKFS_SETATTR_MASK);
1753                         if (error)
1754                                 return (error);
1755 
1756                         if (ulp) {
1757                                 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
1758                                 TRANS_BEGIN_CSYNC(ufsvfsp, issync,
1759                                     TOP_SETATTR, trans_size);
1760                         }
1761 
1762                         error = ufs_fiosatime(vp, (struct timeval *)arg,
1763                             flag, cr);
1764 
1765                         if (ulp) {
1766                                 TRANS_END_CSYNC(ufsvfsp, error, issync,
1767                                     TOP_SETATTR, trans_size);
1768                                 ufs_lockfs_end(ulp);
1769                         }
1770                         return (error);
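                        /*
                         * Illustrative userland sketch (assuming _FIOSATIME
                         * is visible to applications via <sys/filio.h>):
                         * setting the access time of an open file to "now"
                         * without touching mtime might look like:
                         *
                         *	struct timeval tv;
                         *
                         *	(void) gettimeofday(&tv, NULL);
                         *	if (ioctl(fd, _FIOSATIME, &tv) != 0)
                         *		perror("_FIOSATIME");
                         */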
1771 
1772                 case _FIOSDIO:
1773                         /*
1774                          * set delayed-io
1775                          */
1776                         return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
1777 
1778                 case _FIOGDIO:
1779                         /*
1780                          * get delayed-io
1781                          */
1782                         return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
1783 
1784                 case _FIOIO:
1785                         /*
1786                          * inode open
1787                          */
1788                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1789                             ULOCKFS_VGET_MASK);
1790                         if (error)
1791                                 return (error);
1792 
1793                         error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
1794 
1795                         if (ulp) {
1796                                 ufs_lockfs_end(ulp);
1797                         }
1798                         return (error);
1799 
1800                 case _FIOFFS:
1801                         /*
1802                          * file system flush (push w/invalidate)
1803                          */
1804                         if ((caddr_t)arg != NULL)
1805                                 return (EINVAL);
1806                         return (ufs_fioffs(vp, NULL, cr));
1807 
1808                 case _FIOISBUSY:
1809                         /*
1810                          * Contract-private interface for Legato.
1811                          * Purge this vnode from the DNLC and decide
1812                          * whether this vnode is busy (*arg == 1) or not
1813                          * (*arg == 0).
1814                          */
1815                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1816                                 return (EPERM);
1817                         error = ufs_fioisbusy(vp, (int *)arg, cr);
1818                         return (error);
1819 
1820                 case _FIODIRECTIO:
1821                         return (ufs_fiodirectio(vp, (int)arg, cr));
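                        /*
                         * Illustrative note: this ioctl is the hook that the
                         * documented directio(3C) wrapper is assumed to use,
                         * so an application would normally just call:
                         *
                         *	if (directio(fd, DIRECTIO_ON) != 0)
                         *		perror("directio");
                         *
                         * with DIRECTIO_ON/DIRECTIO_OFF advising the
                         * filesystem to bypass or use the page cache.
                         */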
1822 
1823                 case _FIOTUNE:
1824                         /*
1825                          * Tune the file system (i.e., set fs attributes).
1826                          */
1827                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1828                             ULOCKFS_SETATTR_MASK);
1829                         if (error)
1830                                 return (error);
1831 
1832                         error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
1833 
1834                         if (ulp)
1835                                 ufs_lockfs_end(ulp);
1836                         return (error);
1837 
1838                 case _FIOLOGENABLE:
1839                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1840                                 return (EPERM);
1841                         return (ufs_fiologenable(vp, (void *)arg, cr, flag));
1842 
1843                 case _FIOLOGDISABLE:
1844                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1845                                 return (EPERM);
1846                         return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
1847 
1848                 case _FIOISLOG:
1849                         return (ufs_fioislog(vp, (void *)arg, cr, flag));
1850 
1851                 case _FIOSNAPSHOTCREATE_MULTI:
1852                 {
1853                         struct fiosnapcreate_multi      fc, *fcp;
1854                         size_t  fcm_size;
1855 
1856                         if (copyin((void *)arg, &fc, sizeof (fc)))
1857                                 return (EFAULT);
1858                         if (fc.backfilecount > MAX_BACKFILE_COUNT)
1859                                 return (EINVAL);
1860                         fcm_size = sizeof (struct fiosnapcreate_multi) +
1861                             (fc.backfilecount - 1) * sizeof (int);
1862                         fcp = (struct fiosnapcreate_multi *)
1863                             kmem_alloc(fcm_size, KM_SLEEP);
1864                         if (copyin((void *)arg, fcp, fcm_size)) {
1865                                 kmem_free(fcp, fcm_size);
1866                                 return (EFAULT);
1867                         }
1868                         error = ufs_snap_create(vp, fcp, cr);
1869                         /*
1870                          * Do the copyout even if there is an error,
1871                          * because the details of the error are stored in fcp.
1872                          */
1873                         if (copyout(fcp, (void *)arg, fcm_size))
1874                                 error = EFAULT;
1875                         kmem_free(fcp, fcm_size);
1876                         return (error);
1877                 }
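                /*
                 * Illustrative userland sketch (only backfilecount is taken
                 * from this file; the remaining fields are elided and
                 * nbackfiles below is a hypothetical count): a caller sizes
                 * the variable-length request the same way the copyin above
                 * does, with backfilecount - 1 extra ints appended to the
                 * base structure:
                 *
                 *	uint_t n = nbackfiles;
                 *	size_t sz = sizeof (struct fiosnapcreate_multi) +
                 *	    (n - 1) * sizeof (int);
                 *	struct fiosnapcreate_multi *fcp = calloc(1, sz);
                 *
                 *	fcp->backfilecount = n;
                 *	(fill in the remaining fields and the n backing
                 *	store descriptors here)
                 *	if (ioctl(fd, _FIOSNAPSHOTCREATE_MULTI, fcp) != 0)
                 *		perror("snapshot create");
                 */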
1878 
1879                 case _FIOSNAPSHOTDELETE:
1880                 {
1881                         struct fiosnapdelete    fc;
1882 
1883                         if (copyin((void *)arg, &fc, sizeof (fc)))
1884                                 return (EFAULT);
1885                         error = ufs_snap_delete(vp, &fc, cr);
1886                         if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
1887                                 error = EFAULT;
1888                         return (error);
1889                 }
1890 
1891                 case _FIOGETSUPERBLOCK:
1892                         if (copyout(fs, (void *)arg, SBSIZE))
1893                                 return (EFAULT);
1894                         return (0);
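                /*
                 * Illustrative userland sketch (assuming SBSIZE and struct
                 * fs are available via <sys/fs/ufs_fs.h>): a debugging tool
                 * could pull the superblock of the mounted filesystem out
                 * of any open file descriptor on it:
                 *
                 *	union {
                 *		struct fs sb;
                 *		char pad[SBSIZE];
                 *	} u;
                 *
                 *	if (ioctl(fd, _FIOGETSUPERBLOCK, &u) == 0)
                 *		(void) printf("fs_bsize %d\n", u.sb.fs_bsize);
                 */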
1895 
1896                 case _FIOGETMAXPHYS:
1897                         if (copyout((void *)&maxphys, (void *)arg,
1898                             sizeof (maxphys)))
1899                                 return (EFAULT);
1900                         return (0);
1901 
1902                 /*
1903                  * The following 3 ioctls are for TSufs support,
1904                  * although they could potentially be used elsewhere.
1905                  */
1906                 case _FIO_SET_LUFS_DEBUG:
1907                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1908                                 return (EPERM);
1909                         lufs_debug = (uint32_t)arg;
1910                         return (0);
1911 
1912                 case _FIO_SET_LUFS_ERROR:
1913                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1914                                 return (EPERM);
1915                         TRANS_SETERROR(ufsvfsp);
1916                         return (0);
1917 
1918                 case _FIO_GET_TOP_STATS:
1919                 {
1920                         fio_lufs_stats_t *ls;
1921                         ml_unit_t *ul = ufsvfsp->vfs_log;
1922 
1923                         ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1924                         ls->ls_debug = ul->un_debug; /* return debug value */
1925                         /* Copy the structure if statistics are being kept */
1926                         if (ul->un_logmap->mtm_tops) {
1927                                 ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1928                         }
1929                         error = 0;
1930                         if (copyout(ls, (void *)arg, sizeof (*ls)))
1931                                 error = EFAULT;
1932                         kmem_free(ls, sizeof (*ls));
1933                         return (error);
1934                 }
1935 
1936                 case _FIO_SEEK_DATA:
1937                 case _FIO_SEEK_HOLE:
1938                         if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1939                                 return (EFAULT);
1940                         /* offset parameter is in/out */
1941                         error = ufs_fio_holey(vp, cmd, &off);
1942                         if (error)
1943                                 return (error);
1944                         if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1945                                 return (EFAULT);
1946                         return (0);
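                /*
                 * Illustrative userland sketch: because the offset argument
                 * is in/out, a sparse-file copier could walk the data
                 * regions of a file like this (the request codes are
                 * assumed to be visible via <sys/filio.h>, and copy_range()
                 * is a hypothetical helper):
                 *
                 *	offset_t pos = 0;
                 *
                 *	while (ioctl(fd, _FIO_SEEK_DATA, &pos) == 0) {
                 *		offset_t hole = pos;
                 *		if (ioctl(fd, _FIO_SEEK_HOLE, &hole) != 0)
                 *			break;
                 *		copy_range(fd, pos, hole - pos);
                 *		pos = hole;
                 *	}
                 */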
1947 
1948                 case _FIO_COMPRESSED:
1949                 {
1950                         /*
1951                          * This is a project private ufs ioctl() to mark
1952                          * the inode as belonging to a compressed
1953                          * file. This is used to mark individual
1954                          * compressed files in a miniroot archive.
1955                          * The files compressed in this manner are
1956                          * automatically decompressed by the dcfs filesystem
1957                          * (via an interception in ufs_lookup - see decompvp())
1958                          * which is layered on top of ufs on a system running
1959                          * from the archive. See uts/common/fs/dcfs for details.
1960                          * This ioctl only marks the file as compressed - the
1961                          * actual compression is done by fiocompress (a
1962                          * userland utility) which invokes this ioctl().
1963                          */
1964                         struct inode *ip = VTOI(vp);
1965 
1966                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1967                             ULOCKFS_SETATTR_MASK);
1968                         if (error)
1969                                 return (error);
1970 
1971                         if (ulp) {
1972                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT,
1973                                     TOP_IUPDAT_SIZE(ip));
1974                         }
1975 
1976                         error = ufs_mark_compressed(vp);
1977 
1978                         if (ulp) {
1979                                 TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT,
1980                                     TOP_IUPDAT_SIZE(ip));
1981                                 ufs_lockfs_end(ulp);
1982                         }
1983 
1984                         return (error);
1985 
1986                 }
1987 
1988                 default:
1989                         return (ENOTTY);
1990         }
1991 }
1992 
1993 
1994 /* ARGSUSED */
1995 static int
1996 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1997     struct cred *cr, caller_context_t *ct)
1998 {
1999         struct inode *ip = VTOI(vp);
2000         struct ufsvfs *ufsvfsp;
2001         int err;
2002 
2003         if (vap->va_mask == AT_SIZE) {
2004                 /*
2005                  * For performance, if only the size is requested,
2006                  * don't bother with anything else.
2007                  */
2008                 UFS_GET_ISIZE(&vap->va_size, ip);
2009                 return (0);
2010         }
2011 
2012         /*
2013          * inlined lockfs checks
2014          */
2015         ufsvfsp = ip->i_ufsvfs;
2016         if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
2017                 err = EIO;
2018                 goto out;
2019         }
2020 
2021         rw_enter(&ip->i_contents, RW_READER);
2022         /*
2023          * Return all the attributes.  This should be refined so
2024          * that it only returns what's asked for.
2025          */
2026 
2027         /*
2028          * Copy from inode table.
2029          */
2030         vap->va_type = vp->v_type;
2031         vap->va_mode = ip->i_mode & MODEMASK;
2032         /*
2033          * If there is an ACL and there is a mask entry, then do the
2034          * extra work that completes the equivalent of an acltomode(3)
2035          * call.  According to POSIX P1003.1e, the acl mask should be
2036          * returned in the group permissions field.
2037          *
2038          * - start with the original permission and mode bits (from above)
2039          * - clear the group owner bits
2040          * - add in the mask bits.
2041          */
2042         if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
2043                 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
2044                 vap->va_mode |=
2045                     (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
2046         }
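        /*
         * Worked example: a file with mode 0750 (group r-x) whose ACL
         * mask is rw- is reported here with mode 0760; the group bits
         * 050 are cleared and the mask bits 060 are OR'ed back in.
         */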
2047         vap->va_uid = ip->i_uid;
2048         vap->va_gid = ip->i_gid;
2049         vap->va_fsid = ip->i_dev;
2050         vap->va_nodeid = (ino64_t)ip->i_number;
2051         vap->va_nlink = ip->i_nlink;
2052         vap->va_size = ip->i_size;
2053         if (vp->v_type == VCHR || vp->v_type == VBLK)
2054                 vap->va_rdev = ip->i_rdev;
2055         else
2056                 vap->va_rdev = 0;    /* not a block or char special */
2057         mutex_enter(&ip->i_tlock);
2058         ITIMES_NOLOCK(ip);      /* mark correct time in inode */
2059         vap->va_seq = ip->i_seq;
2060         vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
2061         vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
2062         vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
2063         vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
2064         vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
2065         vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
2066         mutex_exit(&ip->i_tlock);
2067 
2068         switch (ip->i_mode & IFMT) {
2069 
2070         case IFBLK:
2071                 vap->va_blksize = MAXBSIZE;          /* was BLKDEV_IOSIZE */
2072                 break;
2073 
2074         case IFCHR:
2075                 vap->va_blksize = MAXBSIZE;
2076                 break;
2077 
2078         default:
2079                 vap->va_blksize = ip->i_fs->fs_bsize;
2080                 break;
2081         }
2082         vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
2083         rw_exit(&ip->i_contents);
2084         err = 0;
2085 
2086 out:
2087         return (err);
2088 }
2089 
2090 /*
2091  * Special wrapper to provide a callback for secpolicy_vnode_setattr().
2092  * The i_contents lock is already held by the caller and we need to
2093  * take the inode as a 'void *' argument.
2094  */
2095 static int
2096 ufs_priv_access(void *vip, int mode, struct cred *cr)
2097 {
2098         struct inode *ip = vip;
2099 
2100         return (ufs_iaccess(ip, mode, cr, 0));
2101 }
2102 
2103 /*ARGSUSED4*/
2104 static int
2105 ufs_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
2106     caller_context_t *ct)
2107 {
2108         struct inode *ip = VTOI(vp);
2109         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2110         struct fs *fs;
2111         struct ulockfs *ulp;
2112         char *errmsg1;
2113         char *errmsg2;
2114         long blocks;
2115         long int mask = vap->va_mask;
2116         size_t len1, len2;
2117         int issync;
2118         int trans_size;
2119         int dotrans;
2120         int dorwlock;
2121         int error;
2122         int owner_change;
2123         int dodqlock;
2124         timestruc_t now;
2125         vattr_t oldva;
2126         int retry = 1;
2127         int indeadlock;
2128 
2129         /*
2130          * Cannot set these attributes.
2131          */
2132         if ((mask & AT_NOSET) || (mask & AT_XVATTR))
2133                 return (EINVAL);
2134 
2135         /*
2136          * check for forced unmount
2137          */
2138         if (ufsvfsp == NULL)
2139                 return (EIO);
2140 
2141         fs = ufsvfsp->vfs_fs;
2142         if (fs->fs_ronly != 0)
2143                 return (EROFS);
2144 
2145 again:
2146         errmsg1 = NULL;
2147         errmsg2 = NULL;
2148         dotrans = 0;
2149         dorwlock = 0;
2150         dodqlock = 0;
2151 
2152         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
2153         if (error)
2154                 goto out;
2155 
2156         /*
2157          * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
2158          * This follows the protocol for read()/write().
2159          */
2160         if (vp->v_type != VDIR) {
2161                 /*
2162                  * ufs_tryirwlock uses rw_tryenter and checks for SLOCK
2163                  * to avoid an i_rwlock vs. ufs_lockfs_begin deadlock.
2164                  * If a deadlock is possible, the operation is retried.
2165                  */
2166                 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_file);
2167                 if (indeadlock) {
2168                         if (ulp)
2169                                 ufs_lockfs_end(ulp);
2170                         goto again;
2171                 }
2172                 dorwlock = 1;
2173         }
2174 
2175         /*
2176          * Truncate file.  Must have write permission and not be a directory.
2177          */
2178         if (mask & AT_SIZE) {
2179                 rw_enter(&ip->i_contents, RW_WRITER);
2180                 if (vp->v_type == VDIR) {
2181                         error = EISDIR;
2182                         goto update_inode;
2183                 }
2184                 if (error = ufs_iaccess(ip, IWRITE, cr, 0))
2185                         goto update_inode;
2186 
2187                 rw_exit(&ip->i_contents);
2188                 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
2189                 if (error) {
2190                         rw_enter(&ip->i_contents, RW_WRITER);
2191                         goto update_inode;
2192                 }
2193 
2194                 if (error == 0 && vap->va_size)
2195                         vnevent_truncate(vp, ct);
2196         }
2197 
2198         if (ulp) {
2199                 trans_size = (int)TOP_SETATTR_SIZE(ip);
2200                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size);
2201                 ++dotrans;
2202         }
2203 
2204         /*
2205          * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
2206          * This follows the protocol established by
2207          * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
2208          */
2209         if (vp->v_type == VDIR) {
2210                 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_SETATTR,
2211                     retry_dir);
2212                 if (indeadlock)
2213                         goto again;
2214                 dorwlock = 1;
2215         }
2216 
2217         /*
2218          * Grab quota lock if we are changing the file's owner.
2219          */
2220         if (mask & AT_UID) {
2221                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2222                 dodqlock = 1;
2223         }
2224         rw_enter(&ip->i_contents, RW_WRITER);
2225 
2226         oldva.va_mode = ip->i_mode;
2227         oldva.va_uid = ip->i_uid;
2228         oldva.va_gid = ip->i_gid;
2229 
2230         vap->va_mask &= ~AT_SIZE;
2231 
2232         error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2233             ufs_priv_access, ip);
2234         if (error)
2235                 goto update_inode;
2236 
2237         mask = vap->va_mask;
2238 
2239         /*
2240          * Change file access modes.
2241          */
2242         if (mask & AT_MODE) {
2243                 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
2244                 TRANS_INODE(ufsvfsp, ip);
2245                 ip->i_flag |= ICHG;
2246                 if (stickyhack) {
2247                         mutex_enter(&vp->v_lock);
2248                         if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
2249                                 vp->v_flag |= VSWAPLIKE;
2250                         else
2251                                 vp->v_flag &= ~VSWAPLIKE;
2252                         mutex_exit(&vp->v_lock);
2253                 }
2254         }
2255         if (mask & (AT_UID|AT_GID)) {
2256                 if (mask & AT_UID) {
2257                         /*
2258                          * Don't change ownership of the quota inode.
2259                          */
2260                         if (ufsvfsp->vfs_qinod == ip) {
2261                                 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
2262                                 error = EINVAL;
2263                                 goto update_inode;
2264                         }
2265 
2266                         /*
2267                          * No real ownership change.
2268                          */
2269                         if (ip->i_uid == vap->va_uid) {
2270                                 blocks = 0;
2271                                 owner_change = 0;
2272                         }
2273                         /*
2274                          * Remove the blocks and the file from the old user's
2275                          * quota.
2276                          */
2277                         else {
2278                                 blocks = ip->i_blocks;
2279                                 owner_change = 1;
2280 
2281                                 (void) chkdq(ip, -blocks, /* force */ 1, cr,
2282                                     (char **)NULL, (size_t *)NULL);
2283                                 (void) chkiq(ufsvfsp, /* change */ -1, ip,
2284                                     (uid_t)ip->i_uid, /* force */ 1, cr,
2285                                     (char **)NULL, (size_t *)NULL);
2286                                 dqrele(ip->i_dquot);
2287                         }
2288 
2289                         ip->i_uid = vap->va_uid;
2290 
2291                         /*
2292                          * There is a real ownership change.
2293                          */
2294                         if (owner_change) {
2295                                 /*
2296                                  * Add the blocks and the file to the new
2297                                  * user's quota.
2298                                  */
2299                                 ip->i_dquot = getinoquota(ip);
2300                                 (void) chkdq(ip, blocks, /* force */ 1, cr,
2301                                     &errmsg1, &len1);
2302                                 (void) chkiq(ufsvfsp, /* change */ 1,
2303                                     (struct inode *)NULL, (uid_t)ip->i_uid,
2304                                     /* force */ 1, cr, &errmsg2, &len2);
2305                         }
2306                 }
2307                 if (mask & AT_GID) {
2308                         ip->i_gid = vap->va_gid;
2309                 }
2310                 TRANS_INODE(ufsvfsp, ip);
2311                 ip->i_flag |= ICHG;
2312         }
2313         /*
2314          * Change file access or modified times.
2315          */
2316         if (mask & (AT_ATIME|AT_MTIME)) {
2317                 /* Check that the time value is within ufs range */
2318                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2319                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2320                         error = EOVERFLOW;
2321                         goto update_inode;
2322                 }
2323 
2324                 /*
2325                  * If the "noatime" mount option is set and only an atime
2326                  * update is requested, do nothing.  No error is returned.
2327                  */
2328                 if ((ufsvfsp->vfs_noatime) &&
2329                     ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
2330                         goto skip_atime;
2331 
2332                 if (mask & AT_ATIME) {
2333                         ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2334                         ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2335                         ip->i_flag &= ~IACC;
2336                 }
2337                 if (mask & AT_MTIME) {
2338                         ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2339                         ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2340                         gethrestime(&now);
2341                         if (now.tv_sec > TIME32_MAX) {
2342                                 /*
2343                                  * In 2038, ctime sticks forever.
2344                                  */
2345                                 ip->i_ctime.tv_sec = TIME32_MAX;
2346                                 ip->i_ctime.tv_usec = 0;
2347                         } else {
2348                                 ip->i_ctime.tv_sec = now.tv_sec;
2349                                 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2350                         }
2351                         ip->i_flag &= ~(IUPD|ICHG);
2352                         ip->i_flag |= IMODTIME;
2353                 }
2354                 TRANS_INODE(ufsvfsp, ip);
2355                 ip->i_flag |= IMOD;
2356         }
2357 
2358 skip_atime:
2359         /*
2360          * The presence of a shadow inode may indicate an ACL, but does
2361          * not guarantee one.  Future FSD types should be handled here
2362          * too, checking for the presence of the attribute-specific data
2363          * before referencing it.
2364          */
2365         if (ip->i_shadow) {
2366                 /*
2367                  * XXX if ufs_iupdat is changed to sandbagged write fix
2368                  * ufs_acl_setattr to push ip to keep acls consistent
2369                  *
2370                  * Suppress out of inodes messages if we will retry.
2371                  */
2372                 if (retry)
2373                         ip->i_flag |= IQUIET;
2374                 error = ufs_acl_setattr(ip, vap, cr);
2375                 ip->i_flag &= ~IQUIET;
2376         }
2377 
2378 update_inode:
2379         /*
2380          * Setattr always increases the sequence number
2381          */
2382         ip->i_seq++;
2383 
2384         /*
2385          * If nfsd and not logging, push synchronously.
2386          */
2387         if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2388                 ufs_iupdat(ip, 1);
2389         } else {
2390                 ITIMES_NOLOCK(ip);
2391         }
2392 
2393         rw_exit(&ip->i_contents);
2394         if (dodqlock) {
2395                 rw_exit(&ufsvfsp->vfs_dqrwlock);
2396         }
2397         if (dorwlock)
2398                 rw_exit(&ip->i_rwlock);
2399 
2400         if (ulp) {
2401                 if (dotrans) {
2402                         int terr = 0;
2403                         TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR,
2404                             trans_size);
2405                         if (error == 0)
2406                                 error = terr;
2407                 }
2408                 ufs_lockfs_end(ulp);
2409         }
2410 out:
2411         /*
2412          * If out of inodes or blocks, see if we can free something
2413          * up from the delete queue.
2414          */
2415         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2416                 ufs_delete_drain_wait(ufsvfsp, 1);
2417                 retry = 0;
2418                 if (errmsg1 != NULL)
2419                         kmem_free(errmsg1, len1);
2420                 if (errmsg2 != NULL)
2421                         kmem_free(errmsg2, len2);
2422                 goto again;
2423         }
2424         if (errmsg1 != NULL) {
2425                 uprintf(errmsg1);
2426                 kmem_free(errmsg1, len1);
2427         }
2428         if (errmsg2 != NULL) {
2429                 uprintf(errmsg2);
2430                 kmem_free(errmsg2, len2);
2431         }
2432         return (error);
2433 }
2434 
2435 /*ARGSUSED*/
2436 static int
2437 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
2438     caller_context_t *ct)
2439 {
2440         struct inode *ip = VTOI(vp);
2441 
2442         if (ip->i_ufsvfs == NULL)
2443                 return (EIO);
2444 
2445         /*
2446          * The ufs_iaccess function wants to be called with
2447          * mode bits expressed as "ufs specific" bits.
2448          * I.e., VWRITE|VREAD|VEXEC do not make sense to
2449          * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
2450          * But since they're the same, we just pass the vnode mode
2451          * bits and verify that assumption at compile time.
2452          */
2453 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
2454 #error "ufs_access needs to map Vmodes to Imodes"
2455 #endif
2456         return (ufs_iaccess(ip, mode, cr, 1));
2457 }
2458 
2459 /* ARGSUSED */
2460 static int
2461 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr,
2462     caller_context_t *ct)
2463 {
2464         struct inode *ip = VTOI(vp);
2465         struct ufsvfs *ufsvfsp;
2466         struct ulockfs *ulp;
2467         int error;
2468         int fastsymlink;
2469 
2470         if (vp->v_type != VLNK) {
2471                 error = EINVAL;
2472                 goto nolockout;
2473         }
2474 
2475         /*
2476          * If the symbolic link is empty, there is nothing to read.
2477          * Fast-track these empty symbolic links.
2478          */
2479         if (ip->i_size == 0) {
2480                 error = 0;
2481                 goto nolockout;
2482         }
2483 
2484         ufsvfsp = ip->i_ufsvfs;
2485         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
2486         if (error)
2487                 goto nolockout;
2488         /*
2489          * The ip->i_rwlock protects the data blocks used for FASTSYMLINK
2490          */
2491 again:
2492         fastsymlink = 0;
2493         if (ip->i_flag & IFASTSYMLNK) {
2494                 rw_enter(&ip->i_rwlock, RW_READER);
2495                 rw_enter(&ip->i_contents, RW_READER);
2496                 if (ip->i_flag & IFASTSYMLNK) {
2497                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
2498                             (ip->i_fs->fs_ronly == 0) &&
2499                             (!ufsvfsp->vfs_noatime)) {
2500                                 mutex_enter(&ip->i_tlock);
2501                                 ip->i_flag |= IACC;
2502                                 mutex_exit(&ip->i_tlock);
2503                         }
2504                         error = uiomove((caddr_t)&ip->i_db[1],
2505                             MIN(ip->i_size, uiop->uio_resid),
2506                             UIO_READ, uiop);
2507                         ITIMES(ip);
2508                         ++fastsymlink;
2509                 }
2510                 rw_exit(&ip->i_contents);
2511                 rw_exit(&ip->i_rwlock);
2512         }
2513         if (!fastsymlink) {
2514                 ssize_t size;   /* number of bytes read  */
2515                 caddr_t basep;  /* pointer to input data */
2516                 ino_t ino;
2517                 long  igen;
2518                 struct uio tuio;        /* temp uio struct */
2519                 struct uio *tuiop;
2520                 iovec_t tiov;           /* temp iovec struct */
2521                 char kbuf[FSL_SIZE];    /* buffer to hold fast symlink */
2522                 int tflag = 0;          /* flag to indicate temp vars used */
2523 
2524                 ino = ip->i_number;
2525                 igen = ip->i_gen;
2526                 size = uiop->uio_resid;
2527                 basep = uiop->uio_iov->iov_base;
2528                 tuiop = uiop;
2529 
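                     /*
                      * Take both locks as writers since we may convert this
                      * link into a fast symlink below.  If another thread has
                      * already done so, retry via the fast path.
                      */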
2530                 rw_enter(&ip->i_rwlock, RW_WRITER);
2531                 rw_enter(&ip->i_contents, RW_WRITER);
2532                 if (ip->i_flag & IFASTSYMLNK) {
2533                         rw_exit(&ip->i_contents);
2534                         rw_exit(&ip->i_rwlock);
2535                         goto again;
2536                 }
2537 
2538                 /* can this be a fast symlink and is it a user buffer? */
2539                 if (ip->i_size <= FSL_SIZE &&
2540                     (uiop->uio_segflg == UIO_USERSPACE ||
2541                     uiop->uio_segflg == UIO_USERISPACE)) {
2542 
2543                         bzero(&tuio, sizeof (struct uio));
2544                         /*
2545                          * Set up a kernel buffer to read the link into.
2546                          * This avoids a race where the user buffer could
2547                          * be corrupted before we copy it into the inode.
2548                          */
2549                         size = ip->i_size;
2550                         tiov.iov_len = size;
2551                         tiov.iov_base = kbuf;
2552                         tuio.uio_iov = &tiov;
2553                         tuio.uio_iovcnt = 1;
2554                         tuio.uio_offset = uiop->uio_offset;
2555                         tuio.uio_segflg = UIO_SYSSPACE;
2556                         tuio.uio_fmode = uiop->uio_fmode;
2557                         tuio.uio_extflg = uiop->uio_extflg;
2558                         tuio.uio_limit = uiop->uio_limit;
2559                         tuio.uio_resid = size;
2560 
2561                         basep = tuio.uio_iov->iov_base;
2562                         tuiop = &tuio;
2563                         tflag = 1;
2564                 }
2565 
2566                 error = rdip(ip, tuiop, 0, cr);
2567                 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
2568                         rw_exit(&ip->i_contents);
2569                         rw_exit(&ip->i_rwlock);
2570                         goto out;
2571                 }
2572 
2573                 if (tflag == 0)
2574                         size -= uiop->uio_resid;
2575 
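                     /*
                      * If the entire link was read and it fits in the inode,
                      * cache it there as a fast symlink.
                      */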
2576                 if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
2577                     ip->i_size == size) || (tflag == 1 &&
2578                     tuio.uio_resid == 0)) {
2579                         error = kcopy(basep, &ip->i_db[1], ip->i_size);
2580                         if (error == 0) {
2581                                 ip->i_flag |= IFASTSYMLNK;
2582                                 /*
2583                                  * free page
2584                                  */
2585                                 (void) VOP_PUTPAGE(ITOV(ip),
2586                                     (offset_t)0, PAGESIZE,
2587                                     (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
2588                                     cr, ct);
2589                         } else {
2590                                 int i;
2591                                 /* error, clear garbage left behind */
2592                                 for (i = 1; i < NDADDR; i++)
2593                                         ip->i_db[i] = 0;
2594                                 for (i = 0; i < NIADDR; i++)
2595                                         ip->i_ib[i] = 0;
2596                         }
2597                 }
2598                 if (tflag == 1) {
2599                         /* now, copy it into the user buffer */
2600                         error = uiomove((caddr_t)kbuf,
2601                             MIN(size, uiop->uio_resid),
2602                             UIO_READ, uiop);
2603                 }
2604                 rw_exit(&ip->i_contents);
2605                 rw_exit(&ip->i_rwlock);
2606         }
2607 out:
2608         if (ulp) {
2609                 ufs_lockfs_end(ulp);
2610         }
2611 nolockout:
2612         return (error);
2613 }
2614 
2615 /* ARGSUSED */
2616 static int
2617 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr, caller_context_t *ct)
2618 {
2619         struct inode *ip = VTOI(vp);
2620         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2621         struct ulockfs *ulp;
2622         int error;
2623 
2624         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
2625         if (error)
2626                 return (error);
2627 
2628         if (TRANS_ISTRANS(ufsvfsp)) {
2629                 /*
2630                  * First push out any data pages
2631                  */
2632                 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2633                     (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
2634                         error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
2635                             0, CRED(), ct);
2636                         if (error)
2637                                 goto out;
2638                 }
2639 
2640                 /*
2641                  * Delta any delayed inode time updates
2642                  * and push inode to log.
2643                  * All other inode deltas will have already been delta'd
2644                  * and will be pushed during the commit.
2645                  */
2646                 if (!(syncflag & FDSYNC) &&
2647                     ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
2648                         if (ulp) {
2649                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
2650                                     TOP_SYNCIP_SIZE);
2651                         }
2652                         rw_enter(&ip->i_contents, RW_READER);
2653                         mutex_enter(&ip->i_tlock);
2654                         ip->i_flag &= ~IMODTIME;
2655                         mutex_exit(&ip->i_tlock);
2656                         ufs_iupdat(ip, I_SYNC);
2657                         rw_exit(&ip->i_contents);
2658                         if (ulp) {
2659                                 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
2660                                     TOP_SYNCIP_SIZE);
2661                         }
2662                 }
2663 
2664                 /*
2665                  * Commit the Moby transaction
2666                  *
2667                  * Deltas have already been made so we just need to
2668                  * commit them with a synchronous transaction.
2669                  * TRANS_BEGIN_SYNC() will return an error
2670                  * if there are no deltas to commit, for an
2671                  * empty transaction.
2672                  */
2673                 if (ulp) {
2674                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
2675                             error);
2676                         if (error) {
2677                                 error = 0; /* commit wasn't needed */
2678                                 goto out;
2679                         }
2680                         TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC,
2681                             TOP_COMMIT_SIZE);
2682                 }
2683         } else {        /* not logging */
2684                 if (!(IS_SWAPVP(vp)))
2685                         if (syncflag & FNODSYNC) {
2686                                 /* Just update the inode only */
2687                                 TRANS_IUPDAT(ip, 1);
2688                                 error = 0;
2689                         } else if (syncflag & FDSYNC)
2690                                 /* Do data-synchronous writes */
2691                                 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
2692                         else
2693                                 /* Do synchronous writes */
2694                                 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
2695 
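             /*
              * If that succeeded, also push out this inode's indirect
              * blocks via ufs_sync_indir().
              */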
2696                 rw_enter(&ip->i_contents, RW_WRITER);
2697                 if (!error)
2698                         error = ufs_sync_indir(ip);
2699                 rw_exit(&ip->i_contents);
2700         }
2701 out:
2702         if (ulp) {
2703                 ufs_lockfs_end(ulp);
2704         }
2705         return (error);
2706 }
2707 
2708 /*ARGSUSED*/
2709 static void
2710 ufs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
2711 {
2712         ufs_iinactive(VTOI(vp));
2713 }
2714 
2715 /*
2716  * Unix file system operations having to do with directory manipulation.
2717  */
2718 int ufs_lookup_idle_count = 2;  /* Number of inodes to idle each time */
2719 /* ARGSUSED */
2720 static int
2721 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
2722     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
2723     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
2724 {
2725         struct inode *ip;
2726         struct inode *sip;
2727         struct inode *xip;
2728         struct ufsvfs *ufsvfsp;
2729         struct ulockfs *ulp;
2730         struct vnode *vp;
2731         int error;
2732 
2733         /*
2734          * Check flags for type of lookup (regular file or attribute file)
2735          */
2736 
2737         ip = VTOI(dvp);
2738 
2739         if (flags & LOOKUP_XATTR) {
2740 
2741                 /*
2742                  * If not mounted with XATTR support then return EINVAL
2743                  */
2744 
2745                 if (!(ip->i_ufsvfs->vfs_vfs->vfs_flag & VFS_XATTR))
2746                         return (EINVAL);
2747                 /*
2748                  * We don't allow recursive attributes...
2749                  * Maybe someday we will.
2750                  */
2751                 if ((ip->i_cflags & IXATTR)) {
2752                         return (EINVAL);
2753                 }
2754 
2755                 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
2756                         error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
2757                         if (error) {
2758                                 *vpp = NULL;
2759                                 goto out;
2760                         }
2761 
2762                         vp = ITOV(sip);
2763                         dnlc_update(dvp, XATTR_DIR_NAME, vp);
2764                 }
2765 
2766                 /*
2767                  * Check accessibility of directory.
2768                  */
2769                 if (vp == DNLC_NO_VNODE) {
2770                         VN_RELE(vp);
2771                         error = ENOENT;
2772                         goto out;
2773                 }
2774                 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr, 1)) != 0) {
2775                         VN_RELE(vp);
2776                         goto out;
2777                 }
2778 
2779                 *vpp = vp;
2780                 return (0);
2781         }
2782 
2783         /*
2784          * Check for a null component, which we should treat as
2785          * looking at dvp from within its parent, so we don't
2786          * need a call to ufs_iaccess(), as it has already been
2787          * done.
2788          */
2789         if (nm[0] == 0) {
2790                 VN_HOLD(dvp);
2791                 error = 0;
2792                 *vpp = dvp;
2793                 goto out;
2794         }
2795 
2796         /*
2797          * Check for ".", i.e. the directory itself.  This is a quick
2798          * check and avoids adding "." into the dnlc (such entries have
2799          * been seen to occupy >10% of the cache).
2800          */
2801         if ((nm[0] == '.') && (nm[1] == 0)) {
2802                 /*
2803                  * Don't return without checking accessibility
2804                  * of the directory. We only need the lock if
2805                  * we are going to return it.
2806                  */
2807                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) == 0) {
2808                         VN_HOLD(dvp);
2809                         *vpp = dvp;
2810                 }
2811                 goto out;
2812         }
2813 
2814         /*
2815          * Fast path: Check the directory name lookup cache.
2816          */
2817         if (vp = dnlc_lookup(dvp, nm)) {
2818                 /*
2819                  * Check accessibility of directory.
2820                  */
2821                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) != 0) {
2822                         VN_RELE(vp);
2823                         goto out;
2824                 }
2825                 if (vp == DNLC_NO_VNODE) {
2826                         VN_RELE(vp);
2827                         error = ENOENT;
2828                         goto out;
2829                 }
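                     /*
                      * We hit in the DNLC, so ufs_lockfs_begin() was never
                      * called; leave ulp NULL so the fastpath code below
                      * skips ufs_lockfs_end().
                      */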
2830                 xip = VTOI(vp);
2831                 ulp = NULL;
2832                 goto fastpath;
2833         }
2834 
2835         /*
2836          * Keep the idle queue from getting too long by
2837          * idling two inodes before attempting to allocate another.
2838          * This operation must be performed before entering
2839          * lockfs or a transaction.
2840          */
2841         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
2842                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
2843                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
2844                         ufs_idle_some(ufs_lookup_idle_count);
2845                 }
2846 
2847 retry_lookup:
2848         /*
2849          * Check accessibility of directory.
2850          */
2851         if (error = ufs_diraccess(ip, IEXEC, cr))
2852                 goto out;
2853 
2854         ufsvfsp = ip->i_ufsvfs;
2855         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
2856         if (error)
2857                 goto out;
2858 
2859         error = ufs_dirlook(ip, nm, &xip, cr, 1, 0);
2860 
2861 fastpath:
2862         if (error == 0) {
2863                 ip = xip;
2864                 *vpp = ITOV(ip);
2865 
2866                 /*
2867                  * If vnode is a device return special vnode instead.
2868                  */
2869                 if (IS_DEVVP(*vpp)) {
2870                         struct vnode *newvp;
2871 
2872                         newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
2873                             cr);
2874                         VN_RELE(*vpp);
2875                         if (newvp == NULL)
2876                                 error = ENOSYS;
2877                         else
2878                                 *vpp = newvp;
2879                 } else if (ip->i_cflags & ICOMPRESS) {
2880                         struct vnode *newvp;
2881 
2882                         /*
2883                          * Compressed file, substitute dcfs vnode
2884                          */
2885                         newvp = decompvp(*vpp, cr, ct);
2886                         VN_RELE(*vpp);
2887                         if (newvp == NULL)
2888                                 error = ENOSYS;
2889                         else
2890                                 *vpp = newvp;
2891                 }
2892         }
2893         if (ulp) {
2894                 ufs_lockfs_end(ulp);
2895         }
2896 
2897         if (error == EAGAIN)
2898                 goto retry_lookup;
2899 
2900 out:
2901         return (error);
2902 }
2903 
2904 /*ARGSUSED*/
2905 static int
2906 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
2907     int mode, struct vnode **vpp, struct cred *cr, int flag,
2908     caller_context_t *ct, vsecattr_t *vsecp)
2909 {
2910         struct inode *ip;
2911         struct inode *xip;
2912         struct inode *dip;
2913         struct vnode *xvp;
2914         struct ufsvfs *ufsvfsp;
2915         struct ulockfs *ulp;
2916         int error;
2917         int issync;
2918         int truncflag;
2919         int trans_size;
2920         int noentry;
2921         int defer_dip_seq_update = 0;   /* need to defer update of dip->i_seq */
2922         int retry = 1;
2923         int indeadlock;
2924 
2925 again:
2926         ip = VTOI(dvp);
2927         ufsvfsp = ip->i_ufsvfs;
2928         truncflag = 0;
2929 
2930         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
2931         if (error)
2932                 goto out;
2933 
2934         if (ulp) {
2935                 trans_size = (int)TOP_CREATE_SIZE(ip);
2936                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size);
2937         }
2938 
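             /*
              * Strip the sticky bit unless the caller is privileged to
              * set it.
              */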
2939         if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
2940                 vap->va_mode &= ~VSVTX;
2941 
2942         if (*name == '\0') {
2943                 /*
2944                  * Null component name refers to the directory itself.
2945                  */
2946                 VN_HOLD(dvp);
2947                 /*
2948                  * Even though this is an error case, we need to grab the
2949                  * quota lock since the error handling code below is common.
2950                  */
2951                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2952                 rw_enter(&ip->i_contents, RW_WRITER);
2953                 error = EEXIST;
2954         } else {
2955                 xip = NULL;
2956                 noentry = 0;
2957                 /*
2958                  * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
2959                  * to avoid a deadlock between i_rwlock and ufs_lockfs_begin.
2960                  * If a deadlock is possible, the operation is retried.
2961                  */
2962                 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_CREATE,
2963                     retry_dir);
2964                 if (indeadlock)
2965                         goto again;
2966 
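                     /*
                      * Consult the DNLC; a negative entry (DNLC_NO_VNODE)
                      * means the name is known not to exist, so note that
                      * for ufs_direnter_cm() below.
                      */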
2967                 xvp = dnlc_lookup(dvp, name);
2968                 if (xvp == DNLC_NO_VNODE) {
2969                         noentry = 1;
2970                         VN_RELE(xvp);
2971                         xvp = NULL;
2972                 }
2973                 if (xvp) {
2974                         rw_exit(&ip->i_rwlock);
2975                         if (error = ufs_iaccess(ip, IEXEC, cr, 1)) {
2976                                 VN_RELE(xvp);
2977                         } else {
2978                                 error = EEXIST;
2979                                 xip = VTOI(xvp);
2980                         }
2981                 } else {
2982                         /*
2983                          * Suppress file system full message if we will retry
2984                          */
2985                         error = ufs_direnter_cm(ip, name, DE_CREATE,
2986                             vap, &xip, cr, (noentry | (retry ? IQUIET : 0)));
2987                         if (error == EAGAIN) {
2988                                 if (ulp) {
2989                                         TRANS_END_CSYNC(ufsvfsp, error, issync,
2990                                             TOP_CREATE, trans_size);
2991                                         ufs_lockfs_end(ulp);
2992                                 }
2993                                 goto again;
2994                         }
2995                         rw_exit(&ip->i_rwlock);
2996                 }
2997                 ip = xip;
2998                 if (ip != NULL) {
2999                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3000                         rw_enter(&ip->i_contents, RW_WRITER);
3001                 }
3002         }
3003 
3004         /*
3005          * If the file already exists and this is a non-exclusive create,
3006          * check permissions and allow access for non-directories.
3007          * Read-only create of an existing directory is also allowed.
3008          * We fail an exclusive create of anything which already exists.
3009          */
3010         if (error == EEXIST) {
3011                 dip = VTOI(dvp);
3012                 if (excl == NONEXCL) {
3013                         if ((((ip->i_mode & IFMT) == IFDIR) ||
3014                             ((ip->i_mode & IFMT) == IFATTRDIR)) &&
3015                             (mode & IWRITE))
3016                                 error = EISDIR;
3017                         else if (mode)
3018                                 error = ufs_iaccess(ip, mode, cr, 0);
3019                         else
3020                                 error = 0;
3021                 }
3022                 if (error) {
3023                         rw_exit(&ip->i_contents);
3024                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3025                         VN_RELE(ITOV(ip));
3026                         goto unlock;
3027                 }
3028                 /*
3029                  * If the error EEXIST was set, then i_seq can not
3030                  * have been updated. The sequence number interface
3031                  * is defined such that a non-error VOP_CREATE must
3032                  * increase the dir va_seq by at least one. If we
3033                  * have cleared the error, increase i_seq. Note that
3034                  * we are increasing the dir i_seq and in rare cases
3035                  * ip may actually be from the dvp, so we already have
3036                  * the locks and it will not be subject to truncation.
3037                  * In case we have to update i_seq of the parent
3038                  * directory dip, we have to defer it till we have
3039                  * released our locks on ip due to lock ordering requirements.
3040                  */
3041                 if (ip != dip)
3042                         defer_dip_seq_update = 1;
3043                 else
3044                         ip->i_seq++;
3045 
3046                 if (((ip->i_mode & IFMT) == IFREG) &&
3047                     (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
3048                         /*
3049                          * Truncate regular files, if requested by caller.
3050                          * Grab i_rwlock to make sure no one else is
3051                          * currently writing to the file (we promised
3052                          * bmap we would do this).
3053                          * Must get the locks in the correct order.
3054                          */
3055                         if (ip->i_size == 0) {
3056                                 ip->i_flag |= ICHG | IUPD;
3057                                 ip->i_seq++;
3058                                 TRANS_INODE(ufsvfsp, ip);
3059                         } else {
3060                                 /*
3061                                  * Large Files: Why this check here?
3062                                  * Though we do it in vn_create() we really
3063                                  * want to guarantee that we do not destroy
3064                                  * Large file data by atomically checking
3065                                  * the size while holding the contents
3066                                  * lock.
3067                                  */
3068                                 if (flag && !(flag & FOFFMAX) &&
3069                                     ((ip->i_mode & IFMT) == IFREG) &&
3070                                     (ip->i_size > (offset_t)MAXOFF32_T)) {
3071                                         rw_exit(&ip->i_contents);
3072                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3073                                         error = EOVERFLOW;
3074                                         goto unlock;
3075                                 }
3076                                 if (TRANS_ISTRANS(ufsvfsp))
3077                                         truncflag++;
3078                                 else {
3079                                         rw_exit(&ip->i_contents);
3080                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3081                                         ufs_tryirwlock_trans(&ip->i_rwlock,
3082                                             RW_WRITER, TOP_CREATE,
3083                                             retry_file);
3084                                         if (indeadlock) {
3085                                                 VN_RELE(ITOV(ip));
3086                                                 goto again;
3087                                         }
3088                                         rw_enter(&ufsvfsp->vfs_dqrwlock,
3089                                             RW_READER);
3090                                         rw_enter(&ip->i_contents, RW_WRITER);
3091                                         (void) ufs_itrunc(ip, (u_offset_t)0, 0,
3092                                             cr);
3093                                         rw_exit(&ip->i_rwlock);
3094                                 }
3095 
3096                         }
3097                         if (error == 0) {
3098                                 vnevent_create(ITOV(ip), ct);
3099                         }
3100                 }
3101         }
3102 
3103         if (error) {
3104                 if (ip != NULL) {
3105                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3106                         rw_exit(&ip->i_contents);
3107                 }
3108                 goto unlock;
3109         }
3110 
3111         *vpp = ITOV(ip);
3112         ITIMES(ip);
3113         rw_exit(&ip->i_contents);
3114         rw_exit(&ufsvfsp->vfs_dqrwlock);
3115 
3116         /*
3117          * If vnode is a device return special vnode instead.
3118          */
3119         if (!error && IS_DEVVP(*vpp)) {
3120                 struct vnode *newvp;
3121 
3122                 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
3123                 VN_RELE(*vpp);
3124                 if (newvp == NULL) {
3125                         error = ENOSYS;
3126                         goto unlock;
3127                 }
3128                 truncflag = 0;
3129                 *vpp = newvp;
3130         }
3131 unlock:
3132 
3133         /*
3134          * Do the deferred update of the parent directory's sequence
3135          * number now.
3136          */
3137         if (defer_dip_seq_update == 1) {
3138                 rw_enter(&dip->i_contents, RW_READER);
3139                 mutex_enter(&dip->i_tlock);
3140                 dip->i_seq++;
3141                 mutex_exit(&dip->i_tlock);
3142                 rw_exit(&dip->i_contents);
3143         }
3144 
3145         if (ulp) {
3146                 int terr = 0;
3147 
3148                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE,
3149                     trans_size);
3150 
3151                 /*
3152                  * If we haven't had a more interesting failure
3153                  * already, then anything that might've happened
3154                  * here should be reported.
3155                  */
3156                 if (error == 0)
3157                         error = terr;
3158         }
3159 
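             /*
              * A logging file system deferred the truncation above
              * (truncflag); perform it now that the create transaction
              * has ended.
              */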
3160         if (!error && truncflag) {
3161                 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_trunc);
3162                 if (indeadlock) {
3163                         if (ulp)
3164                                 ufs_lockfs_end(ulp);
3165                         VN_RELE(ITOV(ip));
3166                         goto again;
3167                 }
3168                 (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr);
3169                 rw_exit(&ip->i_rwlock);
3170         }
3171 
3172         if (ulp)
3173                 ufs_lockfs_end(ulp);
3174 
3175         /*
3176          * If no inodes available, try to free one up out of the
3177          * pending delete queue.
3178          */
3179         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3180                 ufs_delete_drain_wait(ufsvfsp, 1);
3181                 retry = 0;
3182                 goto again;
3183         }
3184 
3185 out:
3186         return (error);
3187 }
3188 
3189 extern int ufs_idle_max;
3190 /*ARGSUSED*/
3191 static int
3192 ufs_remove(struct vnode *vp, char *nm, struct cred *cr, caller_context_t *ct,
3193     int flags)
3194 {
3195         struct inode *ip = VTOI(vp);
3196         struct ufsvfs *ufsvfsp  = ip->i_ufsvfs;
3197         struct ulockfs *ulp;
3198         vnode_t *rmvp = NULL;   /* Vnode corresponding to name being removed */
3199         int indeadlock;
3200         int error;
3201         int issync;
3202         int trans_size;
3203 
3204         /*
3205          * don't let the delete queue get too long
3206          */
3207         if (ufsvfsp == NULL) {
3208                 error = EIO;
3209                 goto out;
3210         }
3211         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3212                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3213 
3214         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3215         if (rmvp != NULL) {
3216                 /* Only send the event if there were no errors */
3217                 if (error == 0)
3218                         vnevent_remove(rmvp, vp, nm, ct);
3219                 VN_RELE(rmvp);
3220         }
3221 
3222 retry_remove:
3223         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
3224         if (error)
3225                 goto out;
3226 
3227         if (ulp)
3228                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
3229                     trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
3230 
3231         /*
3232          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3233          * to avoid a deadlock between i_rwlock and ufs_lockfs_begin.
3234          * If a deadlock is possible, the operation is retried.
3235          */
3236         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_REMOVE, retry);
3237         if (indeadlock)
3238                 goto retry_remove;
3239         error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0,
3240             DR_REMOVE, cr);
3241         rw_exit(&ip->i_rwlock);
3242 
3243         if (ulp) {
3244                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size);
3245                 ufs_lockfs_end(ulp);
3246         }
3247 
3248 out:
3249         return (error);
3250 }
3251 
3252 /*
3253  * Link a file or a directory.  Only privileged processes are allowed to
3254  * make links to directories.
3255  */
3256 /*ARGSUSED*/
3257 static int
3258 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
3259     caller_context_t *ct, int flags)
3260 {
3261         struct inode *sip;
3262         struct inode *tdp = VTOI(tdvp);
3263         struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
3264         struct ulockfs *ulp;
3265         struct vnode *realvp;
3266         int error;
3267         int issync;
3268         int trans_size;
3269         int isdev;
3270         int indeadlock;
3271 
3272 retry_link:
3273         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
3274         if (error)
3275                 goto out;
3276 
3277         if (ulp)
3278                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK,
3279                     trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
3280 
3281         if (VOP_REALVP(svp, &realvp, ct) == 0)
3282                 svp = realvp;
3283 
3284         /*
3285          * Make sure link for extended attributes is valid
3286          * We only support hard linking of attr in ATTRDIR to ATTRDIR
3287          *
3288          * Make certain we don't attempt to look at a device node as
3289          * a ufs inode.
3290          */
3291 
3292         isdev = IS_DEVVP(svp);
3293         if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
3294             ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
3295             ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
3296             ((tdp->i_mode & IFMT) == IFDIR))) {
3297                 error = EINVAL;
3298                 goto unlock;
3299         }
3300 
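             /*
              * Linking a directory requires privilege, as does linking
              * a file owned by another user.
              */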
3301         sip = VTOI(svp);
3302         if ((svp->v_type == VDIR &&
3303             secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
3304             (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
3305                 error = EPERM;
3306                 goto unlock;
3307         }
3308 
3309         /*
3310          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3311          * to avoid a deadlock between i_rwlock and ufs_lockfs_begin.
3312          * If a deadlock is possible, the operation is retried.
3313          */
3314         ufs_tryirwlock_trans(&tdp->i_rwlock, RW_WRITER, TOP_LINK, retry);
3315         if (indeadlock)
3316                 goto retry_link;
3317         error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0,
3318             sip, cr);
3319         rw_exit(&tdp->i_rwlock);
3320 
3321 unlock:
3322         if (ulp) {
3323                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size);
3324                 ufs_lockfs_end(ulp);
3325         }
3326 
3327         if (!error) {
3328                 vnevent_link(svp, ct);
3329         }
3330 out:
3331         return (error);
3332 }
3333 
3334 uint64_t ufs_rename_retry_cnt;
3335 uint64_t ufs_rename_upgrade_retry_cnt;
3336 uint64_t ufs_rename_dircheck_retry_cnt;
3337 clock_t  ufs_rename_backoff_delay = 1;
3338 
3339 /*
3340  * Rename a file or directory.
3341  * We are given the vnode and entry string of the source and the
3342  * vnode and entry string of the place we want to move the source
3343  * to (the target). The essential operation is:
3344  *      unlink(target);
3345  *      link(source, target);
3346  *      unlink(source);
3347  * but "atomically".  Can't do full commit without saving state in
3348  * the inode on disk, which isn't feasible at this time.  Best we
3349  * can do is always guarantee that the TARGET exists.
3350  */
3351 
3352 /*ARGSUSED*/
3353 static int
3354 ufs_rename(struct vnode *sdvp, char *snm, struct vnode *tdvp, char *tnm,
3355     struct cred *cr, caller_context_t *ct, int flags)
3356 {
3357         struct inode *sip = NULL;       /* source inode */
3358         struct inode *ip = NULL;        /* check inode */
3359         struct inode *sdp;              /* old (source) parent inode */
3360         struct inode *tdp;              /* new (target) parent inode */
3361         struct vnode *svp = NULL;       /* source vnode */
3362         struct vnode *tvp = NULL;       /* target vnode, if it exists */
3363         struct vnode *realvp;
3364         struct ufsvfs *ufsvfsp;
3365         struct ulockfs *ulp = NULL;
3366         struct ufs_slot slot;
3367         timestruc_t now;
3368         int error;
3369         int issync;
3370         int trans_size;
3371         krwlock_t *first_lock;
3372         krwlock_t *second_lock;
3373         krwlock_t *reverse_lock;
3374         int serr, terr;
3375 
3376         sdp = VTOI(sdvp);
3377         slot.fbp = NULL;
3378         ufsvfsp = sdp->i_ufsvfs;
3379 
3380         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3381                 tdvp = realvp;
3382 
3383         /* Must do this before taking locks in case of DNLC miss */
3384         terr = ufs_eventlookup(tdvp, tnm, cr, &tvp);
3385         serr = ufs_eventlookup(sdvp, snm, cr, &svp);
3386 
3387         if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) {
3388                 if (tvp != NULL)
3389                         vnevent_pre_rename_dest(tvp, tdvp, tnm, ct);
3390 
3391                 /*
3392                  * Notify the target directory of the rename event
3393                  * if source and target directories are not the same.
3394                  */
3395                 if (sdvp != tdvp)
3396                         vnevent_pre_rename_dest_dir(tdvp, svp, tnm, ct);
3397 
3398                 if (svp != NULL)
3399                         vnevent_pre_rename_src(svp, sdvp, snm, ct);
3400         }
3401 
3402         if (svp != NULL)
3403                 VN_RELE(svp);
3404 
3405 retry_rename:
3406         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
3407         if (error)
3408                 goto unlock;
3409 
3410         if (ulp)
3411                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME,
3412                     trans_size = (int)TOP_RENAME_SIZE(sdp));
3413 
3414         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3415                 tdvp = realvp;
3416 
3417         tdp = VTOI(tdvp);
3418 
3419         /*
3420          * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
3421          */
3422         if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
3423                 error = EINVAL;
3424                 goto unlock;
3425         }
3426 
3427         /*
3428          * Check accessibility of directory.
3429          */
3430         if (error = ufs_diraccess(sdp, IEXEC, cr))
3431                 goto unlock;
3432 
3433         /*
3434          * Look up inode of file we're supposed to rename.
3435          */
3436         gethrestime(&now);
3437         if (error = ufs_dirlook(sdp, snm, &sip, cr, 0, 0)) {
3438                 if (error == EAGAIN) {
3439                         if (ulp) {
3440                                 TRANS_END_CSYNC(ufsvfsp, error, issync,
3441                                     TOP_RENAME, trans_size);
3442                                 ufs_lockfs_end(ulp);
3443                         }
3444                         goto retry_rename;
3445                 }
3446 
3447                 goto unlock;
3448         }
3449 
3450         /*
3451          * Lock both the source and target directories (they may be
3452          * the same) to provide the atomicity semantics that was
3453          * the same) to provide the atomicity semantics that were
3454          * previously provided by the per file system vfs_rename_lock.
3455          * With vfs_rename_lock removed to allow simultaneous renames
3456          * within a file system, ufs_dircheckpath can deadlock while
3457          * traversing back to ensure that source is not a parent directory
3458          * of target parent directory. This is because we get into
3459          * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
3460          * If the tdp and sdp of the simultaneous renames happen to be
3461          * in the path of each other, it can lead to a deadlock. This
3462          * can be avoided by getting the locks as RW_READER here and then
3463          * upgrading to RW_WRITER after completing the ufs_dircheckpath.
3464          *
3465          * We hold the target directory's i_rwlock after calling
3466          * ufs_lockfs_begin but in many other operations (like ufs_readdir)
3467          * VOP_RWLOCK is explicitly called by the filesystem independent code
3468          * before calling the file system operation. In these cases the order
3469          * is reversed (i.e i_rwlock is taken first and then ufs_lockfs_begin
3470          * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
3471          * counter but with ufs_quiesce setting the SLOCK bit this becomes a
3472          * synchronizing object which might lead to a deadlock. So we use
3473          * rw_tryenter instead of rw_enter. If we fail to get this lock and
3474          * find that SLOCK bit is set, we call ufs_lockfs_end and restart the
3475          * operation.
3476          */
3477 retry:
3478         first_lock = &tdp->i_rwlock;
3479         second_lock = &sdp->i_rwlock;
3480 retry_firstlock:
3481         if (!rw_tryenter(first_lock, RW_READER)) {
3482                 /*
3483                  * We didn't get the lock. Check if the SLOCK is set in the
3484                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3485                  * and wait for SLOCK to be cleared.
3486                  */
3487 
3488                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3489                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3490                             trans_size);
3491                         ufs_lockfs_end(ulp);
3492                         goto retry_rename;
3493 
3494                 } else {
3495                         /*
3496                          * SLOCK isn't set so this is a genuine synchronization
3497                          * case. Let's try again after giving them a breather.
3498                          */
3499                         delay(RETRY_LOCK_DELAY);
3500                         goto  retry_firstlock;
3501                 }
3502         }
3503         /*
3504          * Need to check whether tdp and sdp are the same.
3505          */
3506         if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
3507                 /*
3508                  * We didn't get the lock. Check if the SLOCK is set in the
3509                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3510                  * and wait for SLOCK to be cleared.
3511                  */
3512 
3513                 rw_exit(first_lock);
3514                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3515                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3516                             trans_size);
3517                         ufs_lockfs_end(ulp);
3518                         goto retry_rename;
3519 
3520                 } else {
3521                         /*
3522                          * So we couldn't get the second level peer lock *and*
3523                          * the SLOCK bit isn't set.  Too bad; we may be
3524                          * contending with someone who wants these locks the
3525                          * other way round.  Reverse the locks in case there
3526                          * is heavy contention for the second level lock.
3527                          */
3528                         reverse_lock = first_lock;
3529                         first_lock = second_lock;
3530                         second_lock = reverse_lock;
3531                         ufs_rename_retry_cnt++;
3532                         goto  retry_firstlock;
3533                 }
3534         }
3535 
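             /*
              * Disallow renaming a directory into itself, i.e. the source
              * being the target's parent directory.
              */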
3536         if (sip == tdp) {
3537                 error = EINVAL;
3538                 goto errout;
3539         }
3540         /*
3541          * Make sure we can delete the source entry.  This requires
3542          * write permission on the containing directory.
3543          * Check for sticky directories.
3544          */
3545         rw_enter(&sdp->i_contents, RW_READER);
3546         rw_enter(&sip->i_contents, RW_READER);
3547         if ((error = ufs_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
3548             (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
3549                 rw_exit(&sip->i_contents);
3550                 rw_exit(&sdp->i_contents);
3551                 goto errout;
3552         }
3553 
3554         /*
3555          * If this is a rename of a directory and the parent is
3556          * different (".." must be changed), then the source
3557          * directory must not be in the directory hierarchy
3558          * above the target, as this would orphan everything
3559          * below the source directory.  Also the user must have
3560          * write permission in the source so as to be able to
3561          * change "..".
3562          */
3563         if ((((sip->i_mode & IFMT) == IFDIR) ||
3564             ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
3565                 ino_t   inum;
3566 
3567                 if (error = ufs_iaccess(sip, IWRITE, cr, 0)) {
3568                         rw_exit(&sip->i_contents);
3569                         rw_exit(&sdp->i_contents);
3570                         goto errout;
3571                 }
3572                 inum = sip->i_number;
3573                 rw_exit(&sip->i_contents);
3574                 rw_exit(&sdp->i_contents);
3575                 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
3576                         /*
3577                          * If we got EAGAIN ufs_dircheckpath detected a
3578                          * potential deadlock and backed out. We need
3579                          * to retry the operation since sdp and tdp have
3580                          * to be released to avoid the deadlock.
3581                          */
3582                         if (error == EAGAIN) {
3583                                 rw_exit(&tdp->i_rwlock);
3584                                 if (tdp != sdp)
3585                                         rw_exit(&sdp->i_rwlock);
3586                                 delay(ufs_rename_backoff_delay);
3587                                 ufs_rename_dircheck_retry_cnt++;
3588                                 goto retry;
3589                         }
3590                         goto errout;
3591                 }
3592         } else {
3593                 rw_exit(&sip->i_contents);
3594                 rw_exit(&sdp->i_contents);
3595         }
3596 
3597 
3598         /*
3599          * Check for renaming '.' or '..' or alias of '.'
3600          */
3601         if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
3602                 error = EINVAL;
3603                 goto errout;
3604         }
3605 
3606         /*
3607          * Simultaneous renames can deadlock in ufs_dircheckpath since it
3608          * tries to traverse back the file tree with both tdp and sdp held
3609          * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks
3610          * as RW_READERS till ufs_dircheckpath is done.
3611          * Now that ufs_dircheckpath is done, we can upgrade the locks
3612          * to RW_WRITER.
3613          */
3614         if (!rw_tryupgrade(&tdp->i_rwlock)) {
3615                 /*
3616                  * The upgrade failed.  We have to give away the lock
3617                  * so as to avoid deadlocking with someone else who is
3618                  * waiting for the writer lock.  With the lock gone, we
3619                  * cannot be sure the checks done above will still hold
3620                  * when we eventually get the locks back as writer.
3621                  * So if we can't upgrade we drop the locks and retry
3622                  * everything again.
3623                  */
3624                 rw_exit(&tdp->i_rwlock);
3625                 if (tdp != sdp)
3626                         rw_exit(&sdp->i_rwlock);
3627                 delay(ufs_rename_backoff_delay);
3628                 ufs_rename_upgrade_retry_cnt++;
3629                 goto retry;
3630         }
3631         if (tdp != sdp) {
3632                 if (!rw_tryupgrade(&sdp->i_rwlock)) {
3633                         /*
3634                          * The upgrade failed.  We have to give away the lock
3635                          * so as to avoid deadlocking with someone else who is
3636                          * waiting for the writer lock.  With the lock gone, we
3637                          * cannot be sure the checks done above will still hold
3638                          * when we eventually get the locks back as writer.
3639                          * So if we can't upgrade we drop the locks and retry
3640                          * everything again.
3641                          */
3642                         rw_exit(&tdp->i_rwlock);
3643                         rw_exit(&sdp->i_rwlock);
3644                         delay(ufs_rename_backoff_delay);
3645                         ufs_rename_upgrade_retry_cnt++;
3646                         goto retry;
3647                 }
3648         }
3649 
3650         /*
3651          * Now that all the locks are held check to make sure another thread
3652          * didn't slip in and take out the sip.
3653          */
3654         slot.status = NONE;
3655         if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec ||
3656             sip->i_ctime.tv_sec > now.tv_sec) {
3657                 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
3658                 rw_enter(&sdp->i_contents, RW_WRITER);
3659                 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot,
3660                     &ip, cr, 0);
3661                 rw_exit(&sdp->i_contents);
3662                 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock);
3663                 if (error) {
3664                         goto errout;
3665                 }
3666                 if (ip == NULL) {
3667                         error = ENOENT;
3668                         goto errout;
3669                 } else {
3670                         /*
3671                          * If the inode was found, we need to drop the v_count
3672                          * so as not to keep the filesystem from being
3673                          * unmounted at a later time.
3674                          */
3675                         VN_RELE(ITOV(ip));
3676                 }
3677 
3678                 /*
3679                  * Release the slot.fbp that has the page mapped and
3680                  * locked SE_SHARED, and could be used in
3681                  * ufs_direnter_lr() which needs to get the SE_EXCL lock
3682                  * on said page.
3683                  */
3684                 if (slot.fbp) {
3685                         fbrelse(slot.fbp, S_OTHER);
3686                         slot.fbp = NULL;
3687                 }
3688         }
3689 
3690         /*
3691          * Link source to the target.
3692          */
3693         if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr)) {
3694                 /*
3695                  * ESAME isn't really an error; it indicates that the
3696                  * operation should not be done because the source and target
3697                  * are the same file, but that no error should be reported.
3698                  */
3699                 if (error == ESAME)
3700                         error = 0;
3701                 goto errout;
3702         }
3703 
3704         if (error == 0 && tvp != NULL)
3705                 vnevent_rename_dest(tvp, tdvp, tnm, ct);
3706 
3707         /*
3708          * Unlink the source.
3709          * Remove the source entry.  ufs_dirremove() checks that the entry
3710          * still reflects sip, and returns an error if it doesn't.
3711          * If the entry has changed just forget about it.  Release
3712          * the source inode.
3713          */
3714         if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0,
3715             DR_RENAME, cr)) == ENOENT)
3716                 error = 0;
3717 
3718         if (error == 0) {
3719                 vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
3720                 /*
3721                  * Notify the target directory of the rename event
3722                  * if source and target directories are not the same.
3723                  */
3724                 if (sdvp != tdvp)
3725                         vnevent_rename_dest_dir(tdvp, ct);
3726         }
3727 
3728 errout:
3729         if (slot.fbp)
3730                 fbrelse(slot.fbp, S_OTHER);
3731 
3732         rw_exit(&tdp->i_rwlock);
3733         if (sdp != tdp) {
3734                 rw_exit(&sdp->i_rwlock);
3735         }
3736 
3737 unlock:
3738         if (tvp != NULL)
3739                 VN_RELE(tvp);
3740         if (sip != NULL)
3741                 VN_RELE(ITOV(sip));
3742 
3743         if (ulp) {
3744                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size);
3745                 ufs_lockfs_end(ulp);
3746         }
3747 
3748         return (error);
3749 }
3750 
3751 /*ARGSUSED*/
3752 static int
3753 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
3754     struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
3755     vsecattr_t *vsecp)
3756 {
3757         struct inode *ip;
3758         struct inode *xip;
3759         struct ufsvfs *ufsvfsp;
3760         struct ulockfs *ulp;
3761         int error;
3762         int issync;
3763         int trans_size;
3764         int indeadlock;
3765         int retry = 1;
3766 
3767         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
3768 
3769         /*
3770          * Can't make directory in attr hidden dir
3771          */
3772         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3773                 return (EINVAL);
3774 
3775 again:
3776         ip = VTOI(dvp);
3777         ufsvfsp = ip->i_ufsvfs;
3778         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3779         if (error)
3780                 goto out;
3781         if (ulp)
3782                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR,
3783                     trans_size = (int)TOP_MKDIR_SIZE(ip));
3784 
3785         /*
3786          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3787          * to avoid a deadlock between i_rwlock and ufs_lockfs_begin.
3788          * If a deadlock is possible, the operation is retried.
3789          */
3790         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_MKDIR, retry);
3791         if (indeadlock)
3792                 goto again;
3793 
3794         error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
3795             (retry ? IQUIET : 0));
3796         if (error == EAGAIN) {
3797                 if (ulp) {
3798                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_MKDIR,
3799                             trans_size);
3800                         ufs_lockfs_end(ulp);
3801                 }
3802                 goto again;
3803         }
3804 
3805         rw_exit(&ip->i_rwlock);
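             /*
              * On success return the new directory's vnode.  On EEXIST,
              * ufs_direnter_cm() returned a hold on the existing inode,
              * so release it.
              */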
3806         if (error == 0) {
3807                 ip = xip;
3808                 *vpp = ITOV(ip);
3809         } else if (error == EEXIST)
3810                 VN_RELE(ITOV(xip));
3811 
3812         if (ulp) {
3813                 int terr = 0;
3814                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size);
3815                 ufs_lockfs_end(ulp);
3816                 if (error == 0)
3817                         error = terr;
3818         }
3819 out:
3820         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3821                 ufs_delete_drain_wait(ufsvfsp, 1);
3822                 retry = 0;
3823                 goto again;
3824         }
3825 
3826         return (error);
3827 }
3828 
3829 /*ARGSUSED*/
3830 static int
3831 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
3832     caller_context_t *ct, int flags)
3833 {
3834         struct inode *ip = VTOI(vp);
3835         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
3836         struct ulockfs *ulp;
3837         vnode_t *rmvp = NULL;   /* Vnode of removed directory */
3838         int error;
3839         int issync;
3840         int trans_size;
3841         int indeadlock;
3842 
3843         /*
3844          * don't let the delete queue get too long
3845          */
3846         if (ufsvfsp == NULL) {
3847                 error = EIO;
3848                 goto out;
3849         }
3850         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3851                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3852 
3853         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3854         if (rmvp != NULL) {
3855                 /* Only send the event if there were no errors */
3856                 if (error == 0)
3857                         vnevent_rmdir(rmvp, vp, nm, ct);
3858                 VN_RELE(rmvp);
3859         }
3860 
3861 retry_rmdir:
3862         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
3863         if (error)
3864                 goto out;
3865 
3866         if (ulp)
3867                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR,
3868                     trans_size = TOP_RMDIR_SIZE);
3869 
3870         /*
3871          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3872          * to avoid a deadlock between i_rwlock and ufs_lockfs_begin.
3873          * If a deadlock is possible, the operation is retried.
3874          */
3875         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_RMDIR, retry);
3876         if (indeadlock)
3877                 goto retry_rmdir;
3878         error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr);
3879 
3880         rw_exit(&ip->i_rwlock);
3881 
3882         if (ulp) {
3883                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR,
3884                     trans_size);
3885                 ufs_lockfs_end(ulp);
3886         }
3887 
3888 out:
3889         return (error);
3890 }
3891 
3892 /* ARGSUSED */
3893 static int
3894 ufs_readdir(struct vnode *vp, struct uio *uiop, struct cred *cr, int *eofp,
3895     caller_context_t *ct, int flags)
3896 {
3897         struct iovec *iovp;
3898         struct inode *ip;
3899         struct direct *idp;
3900         struct dirent64 *odp;
3901         struct fbuf *fbp;
3902         struct ufsvfs *ufsvfsp;
3903         struct ulockfs *ulp;
3904         caddr_t outbuf;
3905         size_t bufsize;
3906         uint_t offset;
3907         uint_t bytes_wanted, total_bytes_wanted;
3908         int incount = 0;
3909         int outcount = 0;
3910         int error;
3911 
3912         ip = VTOI(vp);
3913         ASSERT(RW_READ_HELD(&ip->i_rwlock));
3914 
3915         if (uiop->uio_loffset >= MAXOFF32_T) {
3916                 if (eofp)
3917                         *eofp = 1;
3918                 return (0);
3919         }
3920 
3921         /*
3922          * Check if we have been called with a valid iov_len
3923          * and bail out if not, otherwise we may potentially loop
3924          * forever further down.
3925          */
3926         if (uiop->uio_iov->iov_len <= 0) {
3927                 error = EINVAL;
3928                 goto out;
3929         }
3930 
3931         /*
3932          * Large Files: When we come here we are guaranteed that
3933          * uio_offset can be used safely. The high word is zero.
3934          */
3935 
3936         ufsvfsp = ip->i_ufsvfs;
3937         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
3938         if (error)
3939                 goto out;
3940 
3941         iovp = uiop->uio_iov;
3942         total_bytes_wanted = iovp->iov_len;
3943 
3944         /* Large Files: directory files should not be "large" */
3945 
3946         ASSERT(ip->i_size <= MAXOFF32_T);
3947 
3948         /* Force offset to be valid (to guard against bogus lseek() values) */
3949         offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
3950 
3951         /* Quit if at end of file or link count of zero (posix) */
3952         if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
3953                 if (eofp)
3954                         *eofp = 1;
3955                 error = 0;
3956                 goto unlock;
3957         }
3958 
3959         /*
3960          * Get space to change directory entries into fs independent format.
3961          * Do fast alloc for the most commonly used request size (filesystem
3962          * block size).
3963          */
3964         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
3965                 bufsize = total_bytes_wanted;
3966                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
3967                 odp = (struct dirent64 *)outbuf;
3968         } else {
3969                 bufsize = total_bytes_wanted;
3970                 odp = (struct dirent64 *)iovp->iov_base;
3971         }
3972 
3973 nextblk:
3974         bytes_wanted = total_bytes_wanted;
3975 
3976         /* Truncate request to file size */
3977         if (offset + bytes_wanted > (int)ip->i_size)
3978                 bytes_wanted = (int)(ip->i_size - offset);
3979 
3980         /* Comply with MAXBSIZE boundary restrictions of fbread() */
3981         if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
3982                 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
3983 
3984         /*
3985          * Read in the next chunk.
3986          * We are still holding the i_rwlock.
3987          */
3988         error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
3989 
3990         if (error)
3991                 goto update_inode;
3992         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
3993             (!ufsvfsp->vfs_noatime)) {
3994                 ip->i_flag |= IACC;
3995         }
3996         incount = 0;
3997         idp = (struct direct *)fbp->fb_addr;
3998         if (idp->d_ino == 0 && idp->d_reclen == 0 && idp->d_namlen == 0) {
3999                 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
4000                     "fs = %s\n",
4001                     (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
4002                 fbrelse(fbp, S_OTHER);
4003                 error = ENXIO;
4004                 goto update_inode;
4005         }
4006         /* Transform to file-system independent format */
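              /*
               * A rough note on this conversion: the on-disk UFS entry
               * (struct direct) carries {d_ino, d_reclen, d_namlen, d_name[]},
               * while the entry handed back to the caller (struct dirent64)
               * carries {d_ino, d_off, d_reclen, d_name[]}, and
               * DIRENT64_RECLEN() gives the 8-byte-aligned record size needed
               * for a name of the given length.  Each valid on-disk entry is
               * therefore emitted roughly as:
               *
               *      odp->d_ino    = idp->d_ino;
               *      odp->d_reclen = DIRENT64_RECLEN(idp->d_namlen);
               *      odp->d_off    = offset + idp->d_reclen;
               *
               * with the name copied via strncpy() and with empty or mangled
               * entries skipped, as the loop below does.
               */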
4007         while (incount < bytes_wanted) {
4008                 /*
4009                  * If the current directory entry is mangled, then skip
4010                  * to the next block.  It would be nice to set the FSBAD
4011                  * flag in the super-block so that a fsck is forced on
4012                  * next reboot, but locking is a problem.
4013                  */
4014                 if (idp->d_reclen & 0x3) {
4015                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4016                         break;
4017                 }
4018 
4019                 /* Skip to requested offset and skip empty entries */
4020                 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
4021                         ushort_t this_reclen =
4022                             DIRENT64_RECLEN(idp->d_namlen);
4023                         /* Buffer too small for any entries */
4024                         if (!outcount && this_reclen > bufsize) {
4025                                 fbrelse(fbp, S_OTHER);
4026                                 error = EINVAL;
4027                                 goto update_inode;
4028                         }
4029                         /* If this would overrun the buffer, quit */
4030                         if (outcount + this_reclen > bufsize) {
4031                                 break;
4032                         }
4033                         /* Take this entry */
4034                         odp->d_ino = (ino64_t)idp->d_ino;
4035                         odp->d_reclen = (ushort_t)this_reclen;
4036                         odp->d_off = (offset_t)(offset + idp->d_reclen);
4037 
4038                         /* use strncpy(9f) to zero out uninitialized bytes */
4039 
4040                         ASSERT(strlen(idp->d_name) + 1 <=
4041                             DIRENT64_NAMELEN(this_reclen));
4042                         (void) strncpy(odp->d_name, idp->d_name,
4043                             DIRENT64_NAMELEN(this_reclen));
4044                         outcount += odp->d_reclen;
4045                         odp = (struct dirent64 *)
4046                             ((intptr_t)odp + odp->d_reclen);
4047                         ASSERT(outcount <= bufsize);
4048                 }
4049                 if (idp->d_reclen) {
4050                         incount += idp->d_reclen;
4051                         offset += idp->d_reclen;
4052                         idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
4053                 } else {
4054                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4055                         break;
4056                 }
4057         }
4058         /* Release the chunk */
4059         fbrelse(fbp, S_OTHER);
4060 
4061         /* Read a whole block but got no entries; read another if not at EOF */
4062 
4063         /*
4064          * Large Files: casting i_size to int here is not a problem
4065          * because directory sizes are always less than MAXOFF32_T.
4066          * See assertion above.
4067          */
4068 
4069         if (offset < (int)ip->i_size && !outcount)
4070                 goto nextblk;
4071 
4072         /* Copy out the entry data */
4073         if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
4074                 iovp->iov_base += outcount;
4075                 iovp->iov_len -= outcount;
4076                 uiop->uio_resid -= outcount;
4077                 uiop->uio_offset = offset;
4078         } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
4079             uiop)) == 0)
4080                 uiop->uio_offset = offset;
4081 update_inode:
4082         ITIMES(ip);
4083         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
4084                 kmem_free(outbuf, bufsize);
4085 
4086         if (eofp && error == 0)
4087                 *eofp = (uiop->uio_offset >= (int)ip->i_size);
4088 unlock:
4089         if (ulp) {
4090                 ufs_lockfs_end(ulp);
4091         }
4092 out:
4093         return (error);
4094 }
4095 
4096 /*ARGSUSED*/
4097 static int
4098 ufs_symlink(struct vnode *dvp, char *linkname, struct vattr *vap, char *target,
4099     struct cred *cr, caller_context_t *ct, int flags)
4100 {
4101         struct inode *ip, *dip = VTOI(dvp);
4102         struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
4103         struct ulockfs *ulp;
4104         int error;
4105         int issync;
4106         int trans_size;
4107         int residual;
4108         int ioflag;
4109         int retry = 1;
4110 
4111         /*
4112          * No symlinks in attrdirs at this time
4113          */
4114         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
4115                 return (EINVAL);
4116 
4117 again:
4118         ip = (struct inode *)NULL;
4119         vap->va_type = VLNK;
4120         vap->va_rdev = 0;
4121 
4122         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
4123         if (error)
4124                 goto out;
4125 
4126         if (ulp)
4127                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK,
4128                     trans_size = (int)TOP_SYMLINK_SIZE(dip));
4129 
4130         /*
4131          * We must create the inode before the directory entry, to avoid
4132          * racing with readlink().  ufs_dirmakeinode requires that we
4133          * hold the quota lock as reader, and directory locks as writer.
4134          */
4135 
4136         rw_enter(&dip->i_rwlock, RW_WRITER);
4137         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4138         rw_enter(&dip->i_contents, RW_WRITER);
4139 
4140         /*
4141          * Suppress any out-of-inodes messages if we will retry on
4142          * ENOSPC.
4143          */
4144         if (retry)
4145                 dip->i_flag |= IQUIET;
4146 
4147         error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
4148 
4149         dip->i_flag &= ~IQUIET;
4150 
4151         rw_exit(&dip->i_contents);
4152         rw_exit(&ufsvfsp->vfs_dqrwlock);
4153         rw_exit(&dip->i_rwlock);
4154 
4155         if (error)
4156                 goto unlock;
4157 
4158         /*
4159          * OK.  The inode has been created.  Write out the data of the
4160          * symbolic link.  Since symbolic links are metadata, and should
4161          * remain consistent across a system crash, we need to force the
4162          * data out synchronously.
4163          *
4164          * (This is a change from the semantics in earlier releases, which
4165          * only created symbolic links synchronously if the semi-documented
4166          * 'syncdir' option was set, or if we were being invoked by the NFS
4167          * server, which requires symbolic links to be created synchronously.)
4168          *
4169          * We need to pass in a pointer for the residual length; otherwise
4170          * ufs_rdwri() will always return EIO if it can't write the data,
4171          * even if the error was really ENOSPC or EDQUOT.
4172          */
4173 
4174         ioflag = FWRITE | FDSYNC;
4175         residual = 0;
4176 
4177         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4178         rw_enter(&ip->i_contents, RW_WRITER);
4179 
4180         /*
4181          * Suppress file system full messages if we will retry
4182          */
4183         if (retry)
4184                 ip->i_flag |= IQUIET;
4185 
4186         error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
4187             (offset_t)0, UIO_SYSSPACE, &residual, cr);
4188 
4189         ip->i_flag &= ~IQUIET;
4190 
4191         if (error) {
4192                 rw_exit(&ip->i_contents);
4193                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4194                 goto remove;
4195         }
4196 
4197         /*
4198          * If the link's data is small enough, we can cache it in the inode.
4199          * This is a "fast symbolic link".  We don't use the first direct
4200          * block because that's actually used to point at the symbolic link's
4201          * contents on disk; but we know that none of the other direct or
4202          * indirect blocks can be used because symbolic links are restricted
4203          * to be smaller than a file system block.
4204          */
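              /*
               * (Once IFASTSYMLNK is set below, a later readlink of this
               * symlink can, in effect, serve the target straight from the
               * inode, roughly bcopy(&ip->i_db[1], buf, ip->i_size), where
               * buf stands for the reader's buffer, without ever touching
               * the data block on disk.)
               */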
4205 
4206         ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
4207 
4208         if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
4209                 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
4210                         ip->i_flag |= IFASTSYMLNK;
4211                 } else {
4212                         int i;
4213                         /* error, clear garbage left behind */
4214                         for (i = 1; i < NDADDR; i++)
4215                                 ip->i_db[i] = 0;
4216                         for (i = 0; i < NIADDR; i++)
4217                                 ip->i_ib[i] = 0;
4218                 }
4219         }
4220 
4221         rw_exit(&ip->i_contents);
4222         rw_exit(&ufsvfsp->vfs_dqrwlock);
4223 
4224         /*
4225          * OK.  We've successfully created the symbolic link.  All that
4226          * remains is to insert it into the appropriate directory.
4227          */
4228 
4229         rw_enter(&dip->i_rwlock, RW_WRITER);
4230         error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr);
4231         rw_exit(&dip->i_rwlock);
4232 
4233         /*
4234          * Fall through into remove-on-error code.  We're either done, or we
4235          * need to remove the inode (if we couldn't insert it).
4236          */
4237 
4238 remove:
4239         if (error && (ip != NULL)) {
4240                 rw_enter(&ip->i_contents, RW_WRITER);
4241                 ip->i_nlink--;
4242                 ip->i_flag |= ICHG;
4243                 ip->i_seq++;
4244                 ufs_setreclaim(ip);
4245                 rw_exit(&ip->i_contents);
4246         }
4247 
4248 unlock:
4249         if (ip != NULL)
4250                 VN_RELE(ITOV(ip));
4251 
4252         if (ulp) {
4253                 int terr = 0;
4254 
4255                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK,
4256                     trans_size);
4257                 ufs_lockfs_end(ulp);
4258                 if (error == 0)
4259                         error = terr;
4260         }
4261 
4262         /*
4263          * We may have failed due to lack of an inode or of a block to
4264          * store the target in.  Try flushing the delete queue to free up
4265          * space that is logically free but not yet reclaimed, and try again.
4266          */
4267         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
4268                 ufs_delete_drain_wait(ufsvfsp, 1);
4269                 retry = 0;
4270                 goto again;
4271         }
4272 
4273 out:
4274         return (error);
4275 }
4276 
4277 /*
4278  * Ufs specific routine used to do ufs io.
4279  */
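     /*
      * A typical call, sketched after the symlink path above (buf, len and
      * resid here are simply the caller's data pointer, length and residual
      * count):
      *
      *      err = ufs_rdwri(UIO_WRITE, FWRITE | FDSYNC, ip, buf, len,
      *          (offset_t)0, UIO_SYSSPACE, &resid, cr);
      *
      * The caller must already hold ip->i_contents; see the ASSERT below.
      */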
4280 int
4281 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
4282     ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
4283     struct cred *cr)
4284 {
4285         struct uio auio;
4286         struct iovec aiov;
4287         int error;
4288 
4289         ASSERT(RW_LOCK_HELD(&ip->i_contents));
4290 
4291         bzero((caddr_t)&auio, sizeof (uio_t));
4292         bzero((caddr_t)&aiov, sizeof (iovec_t));
4293 
4294         aiov.iov_base = base;
4295         aiov.iov_len = len;
4296         auio.uio_iov = &aiov;
4297         auio.uio_iovcnt = 1;
4298         auio.uio_loffset = offset;
4299         auio.uio_segflg = (short)seg;
4300         auio.uio_resid = len;
4301 
4302         if (rw == UIO_WRITE) {
4303                 auio.uio_fmode = FWRITE;
4304                 auio.uio_extflg = UIO_COPY_DEFAULT;
4305                 auio.uio_llimit = curproc->p_fsz_ctl;
4306                 error = wrip(ip, &auio, ioflag, cr);
4307         } else {
4308                 auio.uio_fmode = FREAD;
4309                 auio.uio_extflg = UIO_COPY_CACHED;
4310                 auio.uio_llimit = MAXOFFSET_T;
4311                 error = rdip(ip, &auio, ioflag, cr);
4312         }
4313 
4314         if (aresid) {
4315                 *aresid = auio.uio_resid;
4316         } else if (auio.uio_resid) {
4317                 error = EIO;
4318         }
4319         return (error);
4320 }
4321 
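     /*
      * Encode a persistent file identifier (inode number plus generation)
      * into the caller-supplied fid; consumers such as the NFS server use
      * this, via VOP_FID(), to build file handles.  If the caller's buffer
      * is too small, fid_len is set to the size required and ENOSPC is
      * returned.
      */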
4322 /*ARGSUSED*/
4323 static int
4324 ufs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
4325 {
4326         struct ufid *ufid;
4327         struct inode *ip = VTOI(vp);
4328 
4329         if (ip->i_ufsvfs == NULL)
4330                 return (EIO);
4331 
4332         if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
4333                 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
4334                 return (ENOSPC);
4335         }
4336 
4337         ufid = (struct ufid *)fidp;
4338         bzero((char *)ufid, sizeof (struct ufid));
4339         ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
4340         ufid->ufid_ino = ip->i_number;
4341         ufid->ufid_gen = ip->i_gen;
4342 
4343         return (0);
4344 }
4345 
4346 /* ARGSUSED2 */
4347 static int
4348 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4349 {
4350         struct inode    *ip = VTOI(vp);
4351         struct ufsvfs   *ufsvfsp;
4352         int             forcedirectio;
4353 
4354         /*
4355          * Read case is easy.
4356          */
4357         if (!write_lock) {
4358                 rw_enter(&ip->i_rwlock, RW_READER);
4359                 return (V_WRITELOCK_FALSE);
4360         }
4361 
4362         /*
4363          * Caller has requested a writer lock, but that inhibits any
4364          * concurrency in the VOPs that follow. Acquire the lock shared
4365          * and defer exclusive access until it is known to be needed in
4366          * other VOP handlers. Some cases can be determined here.
4367          */
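              /*
               * (The V_WRITELOCK_TRUE/FALSE return value reports whether
               * i_rwlock was actually acquired as a writer, so a caller that
               * asked for exclusive access knows what it really holds.)
               */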
4368 
4369         /*
4370          * If directio is not set, there is no chance of concurrency,
4371          * so just acquire the lock exclusive. Beware of a forced
4372          * unmount before looking at the mount option.
4373          */
4374         ufsvfsp = ip->i_ufsvfs;
4375         forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
4376         if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
4377             !ufs_allow_shared_writes) {
4378                 rw_enter(&ip->i_rwlock, RW_WRITER);
4379                 return (V_WRITELOCK_TRUE);
4380         }
4381 
4382         /*
4383          * Mandatory locking forces acquiring i_rwlock exclusive.
4384          */
4385         if (MANDLOCK(vp, ip->i_mode)) {
4386                 rw_enter(&ip->i_rwlock, RW_WRITER);
4387                 return (V_WRITELOCK_TRUE);
4388         }
4389 
4390         /*
4391          * Acquire the lock shared in case a concurrent write follows.
4392          * Mandatory locking could have become enabled before the lock
4393          * was acquired. Re-check and upgrade if needed.
4394          */
4395         rw_enter(&ip->i_rwlock, RW_READER);
4396         if (MANDLOCK(vp, ip->i_mode)) {
4397                 rw_exit(&ip->i_rwlock);
4398                 rw_enter(&ip->i_rwlock, RW_WRITER);
4399                 return (V_WRITELOCK_TRUE);
4400         }
4401         return (V_WRITELOCK_FALSE);
4402 }
4403 
4404 /*ARGSUSED*/
4405 static void
4406 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4407 {
4408         struct inode    *ip = VTOI(vp);
4409 
4410         rw_exit(&ip->i_rwlock);
4411 }
4412 
4413 /* ARGSUSED */
4414 static int
4415 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4416 {
4417         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4418 }
4419 
4420 /* ARGSUSED */
4421 static int
4422 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4423     offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
4424     caller_context_t *ct)
4425 {
4426         struct inode *ip = VTOI(vp);
4427 
4428         if (ip->i_ufsvfs == NULL)
4429                 return (EIO);
4430 
4431         /*
4432          * If file is being mapped, disallow frlock.
4433          * XXX I am not holding tlock while checking i_mapcnt because the
4434          * current locking strategy drops all locks before calling fs_frlock.
4435          * So, mapcnt could change before we enter fs_frlock, making it
4436          * meaningless to have held tlock in the first place.
4437          */
4438         if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
4439                 return (EAGAIN);
4440         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4441 }
4442 
4443 /* ARGSUSED */
4444 static int
4445 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4446     offset_t offset, cred_t *cr, caller_context_t *ct)
4447 {
4448         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
4449         struct ulockfs *ulp;
4450         int error;
4451 
4452         if ((error = convoff(vp, bfp, 0, offset)) == 0) {
4453                 if (cmd == F_FREESP) {
4454                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4455                             ULOCKFS_SPACE_MASK);
4456                         if (error)
4457                                 return (error);
4458                         error = ufs_freesp(vp, bfp, flag, cr);
4459 
4460                         if (error == 0 && bfp->l_start == 0)
4461                                 vnevent_truncate(vp, ct);
4462                 } else if (cmd == F_ALLOCSP) {
4463                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4464                             ULOCKFS_FALLOCATE_MASK);
4465                         if (error)
4466                                 return (error);
4467                         error = ufs_allocsp(vp, bfp, cr);
4468                 } else
4469                         return (EINVAL); /* Command not handled here */
4470 
4471                 if (ulp)
4472                         ufs_lockfs_end(ulp);
4473 
4474         }
4475         return (error);
4476 }
4477 
4478 /*
4479  * Used to determine if read ahead should be done. Also used to
4480  * Used to determine if read ahead should be done. Also used to
4481  * determine when write back occurs.
4482 #define CLUSTSZ(ip)             ((ip)->i_ufsvfs->vfs_ioclustsz)
4483 
4484 /*
4485  * A faster version of ufs_getpage.
4486  *
4487  * We optimize by inlining the pvn_getpages iterator, eliminating
4488  * calls to bmap_read if file doesn't have UFS holes, and avoiding
4489  * the overhead of page_exists().
4490  *
4491  * When a file has UFS holes and ufs_getpage is called with S_READ,
4492  * we set *protp to PROT_READ to avoid calling bmap_read. This approach
4493  * penalizes performance when a file with UFS holes is faulted
4494  * first in the S_READ mode, and then in the S_WRITE mode. We will get
4495  * two MMU faults in this case.
4496  *
4497  * XXX - the inode fields which control the sequential mode are not
4498  *       protected by any mutex. The read ahead will act wild if
4499  *       multiple processes will access the file concurrently and
4500  *       some of them in sequential mode. One particularly bad case
4501  *       is if another thread will change the value of i_nextrio between
4502  *       the time this thread tests the i_nextrio value and then reads it
4503  *       again to use it as the offset for the read ahead.
4504  */
4505 /*ARGSUSED*/
4506 static int
4507 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
4508     page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
4509     enum seg_rw rw, struct cred *cr, caller_context_t *ct)
4510 {
4511         u_offset_t      uoff = (u_offset_t)off; /* type conversion */
4512         u_offset_t      pgoff;
4513         u_offset_t      eoff;
4514         struct inode    *ip = VTOI(vp);
4515         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
4516         struct fs       *fs;
4517         struct ulockfs  *ulp;
4518         page_t          **pl;
4519         caddr_t         pgaddr;
4520         krw_t           rwtype;
4521         int             err;
4522         int             has_holes;
4523         int             beyond_eof;
4524         int             seqmode;
4525         int             pgsize = PAGESIZE;
4526         int             dolock;
4527         int             do_qlock;
4528         int             trans_size;
4529 
4530         ASSERT((uoff & PAGEOFFSET) == 0);
4531 
4532         if (protp)
4533                 *protp = PROT_ALL;
4534 
4535         /*
4536          * Obey the lockfs protocol
4537          */
4538         err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
4539             rw == S_READ || rw == S_EXEC, protp);
4540         if (err)
4541                 goto out;
4542 
4543         fs = ufsvfsp->vfs_fs;
4544 
4545         if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
4546             !(vp->v_flag & VISSWAP)) {
4547                 /*
4548                  * Try to start a transaction, will return if blocking is
4549                  * expected to occur and the address space is not the
4550                  * kernel address space.
4551                  */
4552                 trans_size = TOP_GETPAGE_SIZE(ip);
4553                 if (seg->s_as != &kas) {
4554                         TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
4555                             trans_size, err)
4556                         if (err == EWOULDBLOCK) {
4557                                 /*
4558                                  * Use EDEADLK here because the VM code
4559                                  * can normally never see this error.
4560                                  */
4561                                 err = EDEADLK;
4562                                 ufs_lockfs_end(ulp);
4563                                 goto out;
4564                         }
4565                 } else {
4566                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4567                 }
4568         }
4569 
4570         if (vp->v_flag & VNOMAP) {
4571                 err = ENOSYS;
4572                 goto unlock;
4573         }
4574 
4575         seqmode = ip->i_nextr == uoff && rw != S_CREATE;
4576 
4577         rwtype = RW_READER;             /* start as a reader */
4578         dolock = (rw_owner(&ip->i_contents) != curthread);
4579         /*
4580          * If this thread owns the lock, i.e., this thread grabbed it
4581          * as writer somewhere above, then we don't need to grab the
4582          * lock as reader in this routine.
4583          */
4584         do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);
4585 
4586 retrylock:
4587         if (dolock) {
4588                 /*
4589                  * Grab the quota lock if we need to call
4590                  * bmap_write() below (with i_contents as writer).
4591                  */
4592                 if (do_qlock && rwtype == RW_WRITER)
4593                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4594                 rw_enter(&ip->i_contents, rwtype);
4595         }
4596 
4597         /*
4598          * We may be getting called as a side effect of a bmap using
4599          * fbread() when the blocks might be being allocated and the
4600          * size has not yet been updated.  In this case we want to be
4601          * able to return zero pages if we get back UFS_HOLE from
4602          * calling bmap for a non write case here.  We also might have
4603          * to read some frags from the disk into a page if we are
4604          * extending the number of frags for a given lbn in bmap().
4605          * Large Files: The read of i_size here is atomic because
4606          * i_contents is held here. If dolock is zero, the lock
4607          * is held in bmap routines.
4608          */
4609         beyond_eof = uoff + len >
4610             P2ROUNDUP_TYPED(ip->i_size, PAGESIZE, u_offset_t);
4611         if (beyond_eof && seg != segkmap) {
4612                 if (dolock) {
4613                         rw_exit(&ip->i_contents);
4614                         if (do_qlock && rwtype == RW_WRITER)
4615                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4616                 }
4617                 err = EFAULT;
4618                 goto unlock;
4619         }
4620 
4621         /*
4622          * Must hold i_contents lock throughout the call to pvn_getpages
4623          * since locked pages are returned from each call to ufs_getapage.
4624          * Must *not* return locked pages and then try for contents lock
4625          * due to lock ordering requirements (inode > page)
4626          */
4627 
4628         has_holes = bmap_has_holes(ip);
4629 
4630         if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
4631                 int     blk_size;
4632                 u_offset_t offset;
4633 
4634                 /*
4635                  * We must acquire the RW_WRITER lock in order to
4636                  * call bmap_write().
4637                  */
4638                 if (dolock && rwtype == RW_READER) {
4639                         rwtype = RW_WRITER;
4640 
4641                         /*
4642                          * Grab the quota lock before
4643                          * upgrading i_contents, but if we can't grab it
4644                          * don't wait here due to lock order:
4645                          * vfs_dqrwlock > i_contents.
4646                          */
4647                         if (do_qlock &&
4648                             rw_tryenter(&ufsvfsp->vfs_dqrwlock, RW_READER)
4649                             == 0) {
4650                                 rw_exit(&ip->i_contents);
4651                                 goto retrylock;
4652                         }
4653                         if (!rw_tryupgrade(&ip->i_contents)) {
4654                                 rw_exit(&ip->i_contents);
4655                                 if (do_qlock)
4656                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4657                                 goto retrylock;
4658                         }
4659                 }
4660 
4661                 /*
4662                  * May be allocating disk blocks for holes here as
4663                  * a result of mmap faults. write(2) does the bmap_write
4664                  * in rdip/wrip, not here. We are not dealing with frags
4665                  * in this case.
4666                  */
4667                 /*
4668                  * Large Files: We cast fs_bmask field to offset_t
4669                  * just as we do for MAXBMASK because uoff is a 64-bit
4670                  * data type. fs_bmask will still be a 32-bit type
4671                  * as we cannot change any ondisk data structures.
4672                  */
4673 
4674                 offset = uoff & (offset_t)fs->fs_bmask;
4675                 while (offset < uoff + len) {
4676                         blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
4677                         err = bmap_write(ip, offset, blk_size,
4678                             BI_NORMAL, NULL, cr);
4679                         if (ip->i_flag & (ICHG|IUPD))
4680                                 ip->i_seq++;
4681                         if (err)
4682                                 goto update_inode;
4683                         offset += blk_size; /* XXX - make this contig */
4684                 }
4685         }
4686 
4687         /*
4688          * Can be a reader from now on.
4689          */
4690         if (dolock && rwtype == RW_WRITER) {
4691                 rw_downgrade(&ip->i_contents);
4692                 /*
4693                  * We can release vfs_dqrwlock early so do it, but make
4694                  * sure we don't try to release it again at the bottom.
4695                  */
4696                 if (do_qlock) {
4697                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4698                         do_qlock = 0;
4699                 }
4700         }
4701 
4702         /*
4703          * We remove PROT_WRITE in cases when the file has UFS holes
4704          * because we don't want to call bmap_read() to check each
4705          * page if it is backed with a disk block.
4706          */
4707         if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
4708                 *protp &= ~PROT_WRITE;
4709 
4710         err = 0;
4711 
4712         /*
4713          * The loop looks up pages in the range [off, off + len).
4714          * For each page, we first check if we should initiate an asynchronous
4715          * read ahead before we call page_lookup (we may sleep in page_lookup
4716          * for a previously initiated disk read).
4717          */
4718         eoff = (uoff + len);
4719         for (pgoff = uoff, pgaddr = addr, pl = plarr;
4720             pgoff < eoff; /* empty */) {
4721                 page_t  *pp;
4722                 u_offset_t      nextrio;
4723                 se_t    se;
4724                 int retval;
4725 
4726                 se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);
4727 
4728                 /* Handle async getpage (faultahead) */
4729                 if (plarr == NULL) {
4730                         ip->i_nextrio = pgoff;
4731                         (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4732                         pgoff += pgsize;
4733                         pgaddr += pgsize;
4734                         continue;
4735                 }
4736                 /*
4737                  * Check if we should initiate read ahead of next cluster.
4738                  * We call page_exists only when we need to confirm that
4739                  * we have the current page before we initiate the read ahead.
4740                  */
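                      /*
                       * (As a concrete illustration, assuming CLUSTSZ(ip) is
                       * 64K and i_nextrio currently sits at 192K: faults at
                       * offsets in [128K, 192K] fall inside the window the
                       * test below checks; the remaining conditions, i.e.
                       * sequential mode and the current page being resident,
                       * must of course also hold.)
                       */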
4741                 nextrio = ip->i_nextrio;
4742                 if (seqmode &&
4743                     pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
4744                     nextrio < ip->i_size && page_exists(vp, pgoff)) {
4745                         retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4746                         /*
4747                          * We always read ahead the next cluster of data
4748                          * starting from i_nextrio. If the page (vp,nextrio)
4749                          * is actually in core at this point, the routine
4750                          * ufs_getpage_ra() will stop pre-fetching data
4751                          * until we read that page in a synchronized manner
4752                          * through ufs_getpage_miss(). So, we should increase
4753                          * i_nextrio if the page (vp, nextrio) exists.
4754                          */
4755                         if ((retval == 0) && page_exists(vp, nextrio)) {
4756                                 ip->i_nextrio = nextrio + pgsize;
4757                         }
4758                 }
4759 
4760                 if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
4761                         /*
4762                          * We found the page in the page cache.
4763                          */
4764                         *pl++ = pp;
4765                         pgoff += pgsize;
4766                         pgaddr += pgsize;
4767                         len -= pgsize;
4768                         plsz -= pgsize;
4769                 } else  {
4770                         /*
4771                          * We have to create the page, or read it from disk.
4772                          */
4773                         if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
4774                             pl, plsz, rw, seqmode))
4775                                 goto error;
4776 
4777                         while (*pl != NULL) {
4778                                 pl++;
4779                                 pgoff += pgsize;
4780                                 pgaddr += pgsize;
4781                                 len -= pgsize;
4782                                 plsz -= pgsize;
4783                         }
4784                 }
4785         }
4786 
4787         /*
4788          * Return pages up to plsz if they are in the page cache.
4789          * We cannot return pages if there is a chance that they are
4790          * backed with a UFS hole and rw is S_WRITE or S_CREATE.
4791          */
4792         if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
4793 
4794                 ASSERT((protp == NULL) ||
4795                     !(has_holes && (*protp & PROT_WRITE)));
4796 
4797                 eoff = pgoff + plsz;
4798                 while (pgoff < eoff) {
4799                         page_t          *pp;
4800 
4801                         if ((pp = page_lookup_nowait(vp, pgoff,
4802                             SE_SHARED)) == NULL)
4803                                 break;
4804 
4805                         *pl++ = pp;
4806                         pgoff += pgsize;
4807                         plsz -= pgsize;
4808                 }
4809         }
4810 
4811         if (plarr)
4812                 *pl = NULL;                     /* Terminate page list */
4813         ip->i_nextr = pgoff;
4814 
4815 error:
4816         if (err && plarr) {
4817                 /*
4818                  * Release any pages we have locked.
4819                  */
4820                 while (pl > &plarr[0])
4821                         page_unlock(*--pl);
4822 
4823                 plarr[0] = NULL;
4824         }
4825 
4826 update_inode:
4827         /*
4828          * If the inode is not already marked for IACC (in rdip() for read)
4829          * and the inode is not marked for no access time update (in wrip()
4830          * for write) then update the inode access time and mod time now.
4831          */
4832         if ((ip->i_flag & (IACC | INOACC)) == 0) {
4833                 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
4834                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
4835                             (fs->fs_ronly == 0) &&
4836                             (!ufsvfsp->vfs_noatime)) {
4837                                 mutex_enter(&ip->i_tlock);
4838                                 ip->i_flag |= IACC;
4839                                 ITIMES_NOLOCK(ip);
4840                                 mutex_exit(&ip->i_tlock);
4841                         }
4842                 }
4843         }
4844 
4845         if (dolock) {
4846                 rw_exit(&ip->i_contents);
4847                 if (do_qlock && rwtype == RW_WRITER)
4848                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4849         }
4850 
4851 unlock:
4852         if (ulp) {
4853                 if ((rw == S_CREATE || rw == S_WRITE) &&
4854                     !(vp->v_flag & VISSWAP)) {
4855                         TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4856                 }
4857                 ufs_lockfs_end(ulp);
4858         }
4859 out:
4860         return (err);
4861 }
4862 
4863 /*
4864  * ufs_getpage_miss is called when ufs_getpage missed the page in the page
4865  * cache. The page is either read from the disk, or it's created.
4866  * A page is created (without disk read) if rw == S_CREATE, or if
4867  * the page is not backed with a real disk block (UFS hole).
4868  */
4869 /* ARGSUSED */
4870 static int
4871 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
4872     caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
4873 {
4874         struct inode    *ip = VTOI(vp);
4875         page_t          *pp;
4876         daddr_t         bn;
4877         size_t          io_len;
4878         int             crpage = 0;
4879         int             err;
4880         int             contig;
4881         int             bsize = ip->i_fs->fs_bsize;
4882 
4883         /*
4884          * Figure out whether the page can be created, or must be
4885          * read from the disk.
4886          */
4887         if (rw == S_CREATE)
4888                 crpage = 1;
4889         else {
4890                 contig = 0;
4891                 if (err = bmap_read(ip, off, &bn, &contig))
4892                         return (err);
4893 
4894                 crpage = (bn == UFS_HOLE);
4895 
4896                 /*
4897                  * If it's also a fallocated block that hasn't been written to
4898                  * yet, we will treat it just like a UFS_HOLE and create
4899                  * a zero page for it.
4900                  */
4901                 if (ISFALLOCBLK(ip, bn))
4902                         crpage = 1;
4903         }
4904 
4905         if (crpage) {
4906                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg,
4907                     addr)) == NULL) {
4908                         return (ufs_fault(vp,
4909                             "ufs_getpage_miss: page_create == NULL"));
4910                 }
4911 
4912                 if (rw != S_CREATE)
4913                         pagezero(pp, 0, PAGESIZE);
4914 
4915                 io_len = PAGESIZE;
4916         } else {
4917                 u_offset_t      io_off;
4918                 uint_t  xlen;
4919                 struct buf      *bp;
4920                 ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
4921 
4922                 /*
4923                  * If access is not in sequential order, we read from disk
4924                  * in bsize units.
4925                  *
4926                  * We limit the size of the transfer to bsize if we are reading
4927                  * from the beginning of the file. Note in this situation we
4928                  * will hedge our bets and initiate an async read ahead of
4929                  * the second block.
4930                  */
4931                 if (!seq || off == 0)
4932                         contig = MIN(contig, bsize);
4933 
4934                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4935                     &io_len, off, contig, 0);
4936 
4937                 /*
4938                  * Some other thread has entered the page.
4939                  * ufs_getpage will retry page_lookup.
4940                  */
4941                 if (pp == NULL) {
4942                         pl[0] = NULL;
4943                         return (0);
4944                 }
4945 
4946                 /*
4947                  * Zero part of the page which we are not
4948                  * going to read from the disk.
4949                  */
4950                 xlen = io_len & PAGEOFFSET;
4951                 if (xlen != 0)
4952                         pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4953 
4954                 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
4955                 bp->b_edev = ip->i_dev;
4956                 bp->b_dev = cmpdev(ip->i_dev);
4957                 bp->b_blkno = bn;
4958                 bp->b_un.b_addr = (caddr_t)0;
4959                 bp->b_file = ip->i_vnode;
4960                 bp->b_offset = off;
4961 
4962                 if (ufsvfsp->vfs_log) {
4963                         lufs_read_strategy(ufsvfsp->vfs_log, bp);
4964                 } else if (ufsvfsp->vfs_snapshot) {
4965                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
4966                 } else {
4967                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
4968                         ub.ub_getpages.value.ul++;
4969                         (void) bdev_strategy(bp);
4970                         lwp_stat_update(LWP_STAT_INBLK, 1);
4971                 }
4972 
4973                 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
4974 
4975                 /*
4976                  * If the file access is sequential, initiate read ahead
4977                  * of the next cluster.
4978                  */
4979                 if (seq && ip->i_nextrio < ip->i_size)
4980                         (void) ufs_getpage_ra(vp, off, seg, addr);
4981                 err = biowait(bp);
4982                 pageio_done(bp);
4983 
4984                 if (err) {
4985                         pvn_read_done(pp, B_ERROR);
4986                         return (err);
4987                 }
4988         }
4989 
4990         pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4991         return (0);
4992 }
4993 
4994 /*
4995  * Read ahead a cluster from the disk. Returns the length in bytes.
4996  */
4997 static int
4998 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
4999 {
5000         struct inode    *ip = VTOI(vp);
5001         page_t          *pp;
5002         u_offset_t      io_off = ip->i_nextrio;
5003         ufsvfs_t        *ufsvfsp;
5004         caddr_t         addr2 = addr + (io_off - off);
5005         struct buf      *bp;
5006         daddr_t         bn;
5007         size_t          io_len;
5008         int             err;
5009         int             contig;
5010         int             xlen;
5011         int             bsize = ip->i_fs->fs_bsize;
5012 
5013         /*
5014          * If the directio advisory is in effect on this file,
5015          * then do not do buffered read ahead. Read ahead makes
5016          * it more difficult for threads using directio, as they
5017          * will be forced to flush the pages from this vnode.
5018          */
5019         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5020                 return (0);
5021         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
5022                 return (0);
5023 
5024         /*
5025          * Is this test needed?
5026          */
5027         if (addr2 >= seg->s_base + seg->s_size)
5028                 return (0);
5029 
5030         contig = 0;
5031         err = bmap_read(ip, io_off, &bn, &contig);
5032         /*
5033          * If it's a UFS_HOLE or a fallocated block, do not perform
5034          * any read aheads since there probably is nothing to read ahead.
5035          */
5036         if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
5037                 return (0);
5038 
5039         /*
5040          * Limit the transfer size to bsize if this is the 2nd block.
5041          */
5042         if (io_off == (u_offset_t)bsize)
5043                 contig = MIN(contig, bsize);
5044 
5045         if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
5046             &io_len, io_off, contig, 1)) == NULL)
5047                 return (0);
5048 
5049         /*
5050          * Zero part of page which we are not going to read from disk
5051          */
5052         if ((xlen = (io_len & PAGEOFFSET)) > 0)
5053                 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
5054 
5055         ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
5056 
5057         bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
5058         bp->b_edev = ip->i_dev;
5059         bp->b_dev = cmpdev(ip->i_dev);
5060         bp->b_blkno = bn;
5061         bp->b_un.b_addr = (caddr_t)0;
5062         bp->b_file = ip->i_vnode;
5063         bp->b_offset = off;
5064 
5065         if (ufsvfsp->vfs_log) {
5066                 lufs_read_strategy(ufsvfsp->vfs_log, bp);
5067         } else if (ufsvfsp->vfs_snapshot) {
5068                 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5069         } else {
5070                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5071                 ub.ub_getras.value.ul++;
5072                 (void) bdev_strategy(bp);
5073                 lwp_stat_update(LWP_STAT_INBLK, 1);
5074         }
5075 
5076         return (io_len);
5077 }
5078 
5079 int     ufs_delay = 1;
5080 /*
5081  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
5082  *
5083  * LMXXX - the inode really ought to contain a pointer to one of these
5084  * async args.  Stuff gunk in there and just hand the whole mess off.
5085  * This would replace i_delaylen, i_delayoff.
5086  */
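     /*
      * In rough terms, the B_ASYNC clustering below works like this:
      * contiguous async putpage requests are accumulated in i_delayoff and
      * i_delaylen until either a full CLUSTSZ(ip) worth has been collected
      * or a non-contiguous request arrives; at that point the saved cluster
      * is pushed via ufs_putpages() and the new request starts a fresh one.
      */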
5087 /*ARGSUSED*/
5088 static int
5089 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
5090     struct cred *cr, caller_context_t *ct)
5091 {
5092         struct inode *ip = VTOI(vp);
5093         int err = 0;
5094 
5095         if (vp->v_count == 0) {
5096                 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
5097         }
5098 
5099         /*
5100          * XXX - Why should this check be made here?
5101          */
5102         if (vp->v_flag & VNOMAP) {
5103                 err = ENOSYS;
5104                 goto errout;
5105         }
5106 
5107         if (ip->i_ufsvfs == NULL) {
5108                 err = EIO;
5109                 goto errout;
5110         }
5111 
5112         if (flags & B_ASYNC) {
5113                 if (ufs_delay && len &&
5114                     (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
5115                         mutex_enter(&ip->i_tlock);
5116                         /*
5117                          * If nobody stalled, start a new cluster.
5118                          */
5119                         if (ip->i_delaylen == 0) {
5120                                 ip->i_delayoff = off;
5121                                 ip->i_delaylen = len;
5122                                 mutex_exit(&ip->i_tlock);
5123                                 goto errout;
5124                         }
5125                         /*
5126                          * If we have a full cluster or they are not contig,
5127                          * then push last cluster and start over.
5128                          */
5129                         if (ip->i_delaylen >= CLUSTSZ(ip) ||
5130                             ip->i_delayoff + ip->i_delaylen != off) {
5131                                 u_offset_t doff;
5132                                 size_t dlen;
5133 
5134                                 doff = ip->i_delayoff;
5135                                 dlen = ip->i_delaylen;
5136                                 ip->i_delayoff = off;
5137                                 ip->i_delaylen = len;
5138                                 mutex_exit(&ip->i_tlock);
5139                                 err = ufs_putpages(vp, doff, dlen,
5140                                     flags, cr);
5141                                 /* LMXXX - flags are new val, not old */
5142                                 goto errout;
5143                         }
5144                         /*
5145                          * There is something there, it's not full, and
5146                          * it is contig.
5147                          */
5148                         ip->i_delaylen += len;
5149                         mutex_exit(&ip->i_tlock);
5150                         goto errout;
5151                 }
5152                 /*
5153                  * Must have weird flags or we are not clustering.
5154                  */
5155         }
5156 
5157         err = ufs_putpages(vp, off, len, flags, cr);
5158 
5159 errout:
5160         return (err);
5161 }
5162 
5163 /*
5164  * If len == 0, do from off to EOF.
5165  *
5166  * The normal cases should be len == 0 & off == 0 (entire vp list),
5167  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
5168  * (from pageout).
5169  */
5170 /*ARGSUSED*/
5171 static int
5172 ufs_putpages(struct vnode *vp, offset_t off, size_t len, int flags,
5173     struct cred *cr)
5174 {
5175         u_offset_t io_off;
5176         u_offset_t eoff;
5177         struct inode *ip = VTOI(vp);
5178         page_t *pp;
5179         size_t io_len;
5180         int err = 0;
5181         int dolock;
5182 
5183         if (vp->v_count == 0)
5184                 return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
5185         /*
5186          * Acquire the readers/writer inode lock before locking
5187          * any pages in this inode.
5188          * The inode lock is held during i/o.
5189          */
5190         if (len == 0) {
5191                 mutex_enter(&ip->i_tlock);
5192                 ip->i_delayoff = ip->i_delaylen = 0;
5193                 mutex_exit(&ip->i_tlock);
5194         }
5195         dolock = (rw_owner(&ip->i_contents) != curthread);
5196         if (dolock) {
5197                 /*
5198                  * Must synchronize this thread and any possible thread
5199                  * operating in the window of vulnerability in wrip().
5200                  * It is dangerous to allow both a thread doing a putpage
5201                  * and a thread writing, so serialize them.  The exception
5202                  * is when the thread in wrip() does something which causes
5203                  * a putpage operation.  Then, the thread must be allowed
5204                  * to continue.  It may encounter a bmap_read problem in
5205                  * ufs_putapage, but that is handled in ufs_putapage.
5206                  * Allow async writers to proceed, we don't want to block
5207                  * the pageout daemon.
5208                  */
5209                 if (ip->i_writer == curthread)
5210                         rw_enter(&ip->i_contents, RW_READER);
5211                 else {
5212                         for (;;) {
5213                                 rw_enter(&ip->i_contents, RW_READER);
5214                                 mutex_enter(&ip->i_tlock);
5215                                 /*
5216                                  * If there is no thread in the critical
5217                                  * section of wrip(), then proceed.
5218                                  * Otherwise, wait until there isn't one.
5219                                  */
5220                                 if (ip->i_writer == NULL) {
5221                                         mutex_exit(&ip->i_tlock);
5222                                         break;
5223                                 }
5224                                 rw_exit(&ip->i_contents);
5225                                 /*
5226                                  * Bounce async writers when we have a writer
5227                                  * working on this file so we don't deadlock
5228                                  * the pageout daemon.
5229                                  */
5230                                 if (flags & B_ASYNC) {
5231                                         mutex_exit(&ip->i_tlock);
5232                                         return (0);
5233                                 }
5234                                 cv_wait(&ip->i_wrcv, &ip->i_tlock);
5235                                 mutex_exit(&ip->i_tlock);
5236                         }
5237                 }
5238         }
5239 
5240         if (!vn_has_cached_data(vp)) {
5241                 if (dolock)
5242                         rw_exit(&ip->i_contents);
5243                 return (0);
5244         }
5245 
5246         if (len == 0) {
5247                 /*
5248                  * Search the entire vp list for pages >= off.
5249                  */
5250                 err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage,
5251                     flags, cr);
5252         } else {
5253                 /*
5254                  * Loop over all offsets in the range looking for
5255                  * pages to deal with.
5256                  */
5257                 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
5258                         eoff = MIN(off + len, eoff);
5259                 else
5260                         eoff = off + len;
5261 
5262                 for (io_off = off; io_off < eoff; io_off += io_len) {
5263                         /*
5264                          * If we are not invalidating, synchronously
5265                          * freeing or writing pages, use the routine
5266                          * page_lookup_nowait() to prevent reclaiming
5267                          * them from the free list.
5268                          */
5269                         if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
5270                                 pp = page_lookup(vp, io_off,
5271                                     (flags & (B_INVAL | B_FREE)) ?
5272                                     SE_EXCL : SE_SHARED);
5273                         } else {
5274                                 pp = page_lookup_nowait(vp, io_off,
5275                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
5276                         }
5277 
5278                         if (pp == NULL || pvn_getdirty(pp, flags) == 0)
5279                                 io_len = PAGESIZE;
5280                         else {
5281                                 u_offset_t *io_offp = &io_off;
5282 
5283                                 err = ufs_putapage(vp, pp, io_offp, &io_len,
5284                                     flags, cr);
5285                                 if (err != 0)
5286                                         break;
5287                                 /*
5288                                  * "io_off" and "io_len" are returned as
5289                                  * the range of pages we actually wrote.
5290                                  * This allows us to skip ahead more quickly
5291                                  * since several pages may have been dealt
5292                                  * with by this iteration of the loop.
5293                                  */
5294                         }
5295                 }
5296         }
5297         if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
5298                 /*
5299                  * We have just sync'ed back all the pages on
5300                  * the inode; turn off the IMODTIME flag.
5301                  */
5302                 mutex_enter(&ip->i_tlock);
5303                 ip->i_flag &= ~IMODTIME;
5304                 mutex_exit(&ip->i_tlock);
5305         }
5306         if (dolock)
5307                 rw_exit(&ip->i_contents);
5308         return (err);
5309 }
5310 
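     /*
      * I/O completion handler for page writes issued by ufs_putapage().
      * Credit the completed bytes back against the inode's count of
      * outstanding write bytes and, once we drop back to the low-water
      * mark (ufs_LW), wake any writers blocked by the write throttle.
      */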
5311 static void
5312 ufs_iodone(buf_t *bp)
5313 {
5314         struct inode *ip;
5315 
5316         ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
5317 
5318         bp->b_iodone = NULL;
5319 
5320         ip = VTOI(bp->b_pages->p_vnode);
5321 
5322         mutex_enter(&ip->i_tlock);
5323         if (ip->i_writes >= ufs_LW) {
5324                 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW)
5325                         if (ufs_WRITES)
5326                                 cv_broadcast(&ip->i_wrcv); /* wake all up */
5327         } else {
5328                 ip->i_writes -= bp->b_bcount;
5329         }
5330 
5331         mutex_exit(&ip->i_tlock);
5332         iodone(bp);
5333 }
5334 
5335 /*
5336  * Write out a single page, possibly klustering adjacent
5337  * dirty pages.  The inode lock must be held.
5338  *
5339  * LMXXX - bsize < pagesize not done.
5340  */
5341 /*ARGSUSED*/
5342 int
5343 ufs_putapage(struct vnode *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
5344     int flags, struct cred *cr)
5345 {
5346         u_offset_t io_off;
5347         u_offset_t off;
5348         struct inode *ip = VTOI(vp);
5349         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
5350         struct fs *fs;
5351         struct buf *bp;
5352         size_t io_len;
5353         daddr_t bn;
5354         int err;
5355         int contig;
5356         int dotrans;
5357 
5358         ASSERT(RW_LOCK_HELD(&ip->i_contents));
5359 
5360         if (ufsvfsp == NULL) {
5361                 err = EIO;
5362                 goto out_trace;
5363         }
5364 
5365         fs = ip->i_fs;
5366         ASSERT(fs->fs_ronly == 0);
5367 
5368         /*
5369          * If the modified time on the inode has not already been
5370          * set elsewhere (e.g. for write/setattr) we set the time now.
5371          * This gives us approximate modified times for mmap'ed files
5372          * which are modified via stores in the user address space.
5373          */
5374         if ((ip->i_flag & IMODTIME) == 0) {
5375                 mutex_enter(&ip->i_tlock);
5376                 ip->i_flag |= IUPD;
5377                 ip->i_seq++;
5378                 ITIMES_NOLOCK(ip);
5379                 mutex_exit(&ip->i_tlock);
5380         }
5381 
5382         /*
5383          * Align the request to a block boundary (for old file systems),
5384          * and go ask bmap() how contiguous things are for this file.
5385          */
5386         off = pp->p_offset & (offset_t)fs->fs_bmask;  /* block align it */
5387         contig = 0;
5388         err = bmap_read(ip, off, &bn, &contig);
5389         if (err)
5390                 goto out;
5391         if (bn == UFS_HOLE) {                   /* putpage never allocates */
5392                 /*
5393                  * logging device is in error mode; simply return EIO
5394                  */
5395                 if (TRANS_ISERROR(ufsvfsp)) {
5396                         err = EIO;
5397                         goto out;
5398                 }
5399                 /*
5400                  * Oops, the thread in the window in wrip() did some
5401                  * sort of operation which caused a putpage in the bad
5402                  * range.  In this case, just return an error which will
5403                  * cause the software modified bit on the page to be set
5404                  * so that the page will get written out again later.
5405                  */
5406                 if (ip->i_writer == curthread) {
5407                         err = EIO;
5408                         goto out;
5409                 }
5410                 /*
5411                  * If the pager is trying to push a page in the bad range
5412                  * just tell it to try again later when things are better.
5413                  */
5414                 if (flags & B_ASYNC) {
5415                         err = EAGAIN;
5416                         goto out;
5417                 }
5418                 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
5419                 goto out;
5420         }
5421 
5422         /*
5423          * If it is a fallocate'd block, flip the stored (negative) block
5424          * number back to positive since we are now writing to it.
5425          */
5426         if (ISFALLOCBLK(ip, bn)) {
5427                 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
5428                 if (err)
5429                         goto out;
5430 
5431                 bn = -bn;
5432         }
5433 
5434         /*
5435          * Take the length (of contiguous bytes) passed back from bmap()
5436          * and _try_ to get a set of pages covering that extent.
5437          */
5438         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
5439 
5440         /*
5441          * May have run out of memory and not clustered backwards.
5442          * off          p_offset
5443          * [  pp - 1  ][   pp   ]
5444          * [    block           ]
5445          * We told bmap off, so we have to adjust the bn accordingly.
5446          */
5447         if (io_off > off) {
5448                 bn += btod(io_off - off);
5449                 contig -= (io_off - off);
5450         }
5451 
5452         /*
5453          * bmap was careful to tell us the right size, so use that.
5454          * There might be unallocated frags at the end.
5455          * LMXXX - bzero the end of the page?  We must be writing after EOF.
5456          */
5457         if (io_len > contig) {
5458                 ASSERT(io_len - contig < fs->fs_bsize);
5459                 io_len -= (io_len - contig);
5460         }
5461 
5462         /*
5463          * Handle the case where we are writing the last page after EOF.
5464          *
5465          * XXX - just a patch for i-mt3.
5466          */
5467         if (io_len == 0) {
5468                 ASSERT(pp->p_offset >=
5469                     (u_offset_t)(roundup(ip->i_size, PAGESIZE)));
5470                 io_len = PAGESIZE;
5471         }
5472 
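             /*
              * Set up a buf describing this page list for raw page I/O and
              * note, via the per-filesystem lockfs structure, that the file
              * system has been modified.
              */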
5473         bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
5474 
5475         ULOCKFS_SET_MOD(ITOUL(ip));
5476 
5477         bp->b_edev = ip->i_dev;
5478         bp->b_dev = cmpdev(ip->i_dev);
5479         bp->b_blkno = bn;
5480         bp->b_un.b_addr = (caddr_t)0;
5481         bp->b_file = ip->i_vnode;
5482 
5483         /*
5484          * File contents of shadow or quota inodes are metadata, and updates
5485          * to these need to be put into a logging transaction. All direct
5486          * callers in UFS do that, but fsflush can come here _before_ the
5487          * normal codepath. An example would be updating ACL information, for
5488          * which the normal codepath would be:
5489          *      ufs_si_store()
5490          *      ufs_rdwri()
5491          *      wrip()
5492          *      segmap_release()
5493          *      VOP_PUTPAGE()
5494          * Here, fsflush can pick up the dirty page before segmap_release()
5495          * forces it out. If that happens, there's no transaction.
5496          * We therefore need to test whether a transaction exists, and if not
5497          * create one - for fsflush.
5498          */
5499         dotrans =
5500             (((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip) &&
5501             ((curthread->t_flag & T_DONTBLOCK) == 0) &&
5502             (TRANS_ISTRANS(ufsvfsp)));
5503 
5504         if (dotrans) {
5505                 curthread->t_flag |= T_DONTBLOCK;
5506                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5507         }
5508         if (TRANS_ISTRANS(ufsvfsp)) {
5509                 if ((ip->i_mode & IFMT) == IFSHAD) {
5510                         TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
5511                 } else if (ufsvfsp->vfs_qinod == ip) {
5512                         TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
5513                             0, 0);
5514                 }
5515         }
5516         if (dotrans) {
5517                 TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5518                 curthread->t_flag &= ~T_DONTBLOCK;
5519         }
5520 
5521         /* write throttle: ufs_iodone() credits these bytes back */
5522 
5523         ASSERT(bp->b_iodone == NULL);
5524         bp->b_iodone = (int (*)())ufs_iodone;
5525         mutex_enter(&ip->i_tlock);
5526         ip->i_writes += bp->b_bcount;
5527         mutex_exit(&ip->i_tlock);
5528 
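             /*
              * Issue the write: through the UFS log if logging is enabled,
              * through the snapshot driver if a snapshot is active, and
              * otherwise straight to the device.  Synchronous requests also
              * wait for completion and release the pages here.
              */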
5529         if (bp->b_flags & B_ASYNC) {
5530                 if (ufsvfsp->vfs_log) {
5531                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5532                 } else if (ufsvfsp->vfs_snapshot) {
5533                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5534                 } else {
5535                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5536                         ub.ub_putasyncs.value.ul++;
5537                         (void) bdev_strategy(bp);
5538                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5539                 }
5540         } else {
5541                 if (ufsvfsp->vfs_log) {
5542                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5543                 } else if (ufsvfsp->vfs_snapshot) {
5544                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5545                 } else {
5546                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5547                         ub.ub_putsyncs.value.ul++;
5548                         (void) bdev_strategy(bp);
5549                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5550                 }
5551                 err = biowait(bp);
5552                 pageio_done(bp);
5553                 pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
5554         }
5555 
5556         pp = NULL;
5557 
5558 out:
5559         if (err != 0 && pp != NULL)
5560                 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
5561 
5562         if (offp)
5563                 *offp = io_off;
5564         if (lenp)
5565                 *lenp = io_len;
5566 out_trace:
5567         return (err);
5568 }
5569 
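     /*
      * Counters recording how often ufs_map() had to retry taking the
      * address space lock or re-entering the lockfs protocol.
      */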
5570 uint64_t ufs_map_alock_retry_cnt;
5571 uint64_t ufs_map_lockfs_retry_cnt;
5572 
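     /*
      * mmap(2) entry point: validate the request, reserve a virtual address
      * range, and create a segvn segment backed by this vnode.  The address
      * space lock and the lockfs protocol are entered with retries to avoid
      * deadlocking against pagefaults and a pending quiesce; see the
      * comments in the body below.
      */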
5573 /* ARGSUSED */
5574 static int
5575 ufs_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
5576     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
5577     caller_context_t *ct)
5578 {
5579         struct segvn_crargs vn_a;
5580         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
5581         struct ulockfs *ulp;
5582         int error, sig;
5583         k_sigset_t smask;
5584         caddr_t hint = *addrp;
5585 
5586         if (vp->v_flag & VNOMAP) {
5587                 error = ENOSYS;
5588                 goto out;
5589         }
5590 
5591         if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) {
5592                 error = ENXIO;
5593                 goto out;
5594         }
5595 
5596         if (vp->v_type != VREG) {
5597                 error = ENODEV;
5598                 goto out;
5599         }
5600 
5601 retry_map:
5602         *addrp = hint;
5603         /*
5604          * If file is being locked, disallow mapping.
5605          */
5606         if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
5607                 error = EAGAIN;
5608                 goto out;
5609         }
5610 
5611         as_rangelock(as);
5612         /*
5613          * Note that if we are retrying (because ufs_lockfs_trybegin failed in
5614          * the previous attempt), some other thread could have grabbed
5615          * the same VA range if MAP_FIXED is set. In that case, choose_addr
5616          * would unmap the valid VA range; that is OK.
5617          */
5618         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5619         if (error != 0) {
5620                 as_rangeunlock(as);
5621                 goto out;
5622         }
5623 
5624         /*
5625          * a_lock has to be acquired before entering the lockfs protocol
5626          * because that is the order in which pagefault works. Also we cannot
5627          * block on a_lock here because this waiting writer will prevent
5628          * further readers like ufs_read from progressing and could cause
5629          * deadlock between ufs_read/ufs_map/pagefault when a quiesce is
5630          * pending.
5631          */
5632         while (!AS_LOCK_TRYENTER(as, RW_WRITER)) {
5633                 ufs_map_alock_retry_cnt++;
5634                 delay(RETRY_LOCK_DELAY);
5635         }
5636 
5637         /*
5638          * We can't hold as->a_lock and wait for lockfs to succeed because
5639          * the proc tools might hang on a_lock, so call ufs_lockfs_trybegin()
5640          * instead.
5641          */
5642         if (error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK)) {
5643                 /*
5644                  * ufs_lockfs_trybegin() did not succeed. It is safer to give up
5645                  * as->a_lock and wait for ulp->ul_fs_lock status to change.
5646                  */
5647                 ufs_map_lockfs_retry_cnt++;
5648                 AS_LOCK_EXIT(as);
5649                 as_rangeunlock(as);
5650                 if (error == EIO)
5651                         goto out;
5652 
5653                 mutex_enter(&ulp->ul_lock);
5654                 while (ulp->ul_fs_lock & ULOCKFS_MAP_MASK) {
5655                         if (ULOCKFS_IS_SLOCK(ulp) || ufsvfsp->vfs_nointr) {
5656                                 cv_wait(&ulp->ul_cv, &ulp->ul_lock);
5657                         } else {
5658                                 sigintr(&smask, 1);
5659                                 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
5660                                 sigunintr(&smask);
5661                                 if (((ulp->ul_fs_lock & ULOCKFS_MAP_MASK) &&
5662                                     !sig) || ufsvfsp->vfs_dontblock) {
5663                                         mutex_exit(&ulp->ul_lock);
5664                                         return (EINTR);
5665                                 }
5666                         }
5667                 }
5668                 mutex_exit(&ulp->ul_lock);
5669                 goto retry_map;
5670         }
5671 
5672         vn_a.vp = vp;
5673         vn_a.offset = (u_offset_t)off;
5674         vn_a.type = flags & MAP_TYPE;
5675         vn_a.prot = prot;
5676         vn_a.maxprot = maxprot;
5677         vn_a.cred = cr;
5678         vn_a.amp = NULL;
5679         vn_a.flags = flags & ~MAP_TYPE;
5680         vn_a.szc = 0;
5681         vn_a.lgrp_mem_policy_flags = 0;
5682 
5683         error = as_map_locked(as, *addrp, len, segvn_create, &vn_a);
5684         if (ulp)
5685                 ufs_lockfs_end(ulp);
5686         as_rangeunlock(as);
5687 out:
5688         return (error);
5689 }
5690 
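     /*
      * ufs_addmap() and ufs_delmap() maintain i_mapcnt, a count of the
      * pages currently mapped from this file.
      */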
5691 /* ARGSUSED */
5692 static int
5693 ufs_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5694     size_t len, uchar_t  prot, uchar_t  maxprot, uint_t    flags,
5695     struct cred *cr, caller_context_t *ct)
5696 {
5697         struct inode *ip = VTOI(vp);
5698 
5699         if (vp->v_flag & VNOMAP) {
5700                 return (ENOSYS);
5701         }
5702 
5703         mutex_enter(&ip->i_tlock);
5704         ip->i_mapcnt += btopr(len);
5705         mutex_exit(&ip->i_tlock);
5706         return (0);
5707 }
5708 
5709 /*ARGSUSED*/
5710 static int
5711 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5712     size_t len, uint_t prot,  uint_t maxprot,  uint_t flags, struct cred *cr,
5713     caller_context_t *ct)
5714 {
5715         struct inode *ip = VTOI(vp);
5716 
5717         if (vp->v_flag & VNOMAP) {
5718                 return (ENOSYS);
5719         }
5720 
5721         mutex_enter(&ip->i_tlock);
5722         ip->i_mapcnt -= btopr(len);  /* Count released mappings */
5723         ASSERT(ip->i_mapcnt >= 0);
5724         mutex_exit(&ip->i_tlock);
5725         return (0);
5726 }
5727 /*
5728  * Return the events requested by poll() for non-device files.
5729  */
5730 struct pollhead ufs_pollhd;
5731 
5732 /* ARGSUSED */
5733 int
5734 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
5735     caller_context_t *ct)
5736 {
5737         struct ufsvfs   *ufsvfsp;
5738 
5739         /*
5740          * Regular files reject edge-triggered pollers.
5741          * See the comment in fs_poll() for a more detailed explanation.
5742          */
5743         if (ev & POLLET) {
5744                 return (EPERM);
5745         }
5746 
5747         *revp = 0;
5748         ufsvfsp = VTOI(vp)->i_ufsvfs;
5749 
5750         if (!ufsvfsp) {
5751                 *revp = POLLHUP;
5752                 goto out;
5753         }
5754 
5755         if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) ||
5756             ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) {
5757                 *revp |= POLLERR;
5758 
5759         } else {
5760                 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
5761                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5762                         *revp |= POLLOUT;
5763 
5764                 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly &&
5765                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5766                         *revp |= POLLWRBAND;
5767 
5768                 if (ev & POLLIN)
5769                         *revp |= POLLIN;
5770 
5771                 if (ev & POLLRDNORM)
5772                         *revp |= POLLRDNORM;
5773 
5774                 if (ev & POLLRDBAND)
5775                         *revp |= POLLRDBAND;
5776         }
5777 
5778         if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP)))
5779                 *revp |= POLLPRI;
5780 out:
5781         if (*revp == 0 && ! any) {
5782                 *phpp = &ufs_pollhd;
5783         }
5784 
5785         return (0);
5786 }
5787 
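     /*
      * pathconf(2)/fpathconf(2) entry point.  Queries we do not handle
      * explicitly are passed through to fs_pathconf().
      */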
5788 /* ARGSUSED */
5789 static int
5790 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
5791     caller_context_t *ct)
5792 {
5793         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
5794         struct ulockfs  *ulp = NULL;
5795         struct inode    *sip = NULL;
5796         int             error;
5797         struct inode    *ip = VTOI(vp);
5798         int             issync;
5799 
5800         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
5801         if (error)
5802                 return (error);
5803 
5804         switch (cmd) {
5805                 /*
5806                  * Have to handle _PC_NAME_MAX here, because the normal way
5807                  * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
5808                  * results in a lock ordering reversal between
5809                  * ufs_lockfs_{begin,end}() and
5810                  * ufs_thread_{suspend,continue}().
5811                  *
5812                  * Keep in sync with ufs_statvfs().
5813                  */
5814         case _PC_NAME_MAX:
5815                 *valp = MAXNAMLEN;
5816                 break;
5817 
5818         case _PC_FILESIZEBITS:
5819                 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
5820                         *valp = UFS_FILESIZE_BITS;
5821                 else
5822                         *valp = 32;
5823                 break;
5824 
5825         case _PC_XATTR_EXISTS:
5826                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5827 
5828                         error =
5829                             ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, cr);
5830                         if (error ==  0 && sip != NULL) {
5831                                 /* Start transaction */
5832                                 if (ulp) {
5833                                         TRANS_BEGIN_CSYNC(ufsvfsp, issync,
5834                                             TOP_RMDIR, TOP_RMDIR_SIZE);
5835                                 }
5836                                 /*
5837                                  * Is the directory empty?
5838                                  */
5839                                 rw_enter(&sip->i_rwlock, RW_WRITER);
5840                                 rw_enter(&sip->i_contents, RW_WRITER);
5841                                 if (ufs_xattrdirempty(sip,
5842                                     sip->i_number, CRED())) {
5843                                         rw_enter(&ip->i_contents, RW_WRITER);
5844                                         ufs_unhook_shadow(ip, sip);
5845                                         rw_exit(&ip->i_contents);
5846 
5847                                         *valp = 0;
5848 
5849                                 } else
5850                                         *valp = 1;
5851                                 rw_exit(&sip->i_contents);
5852                                 rw_exit(&sip->i_rwlock);
5853                                 if (ulp) {
5854                                         TRANS_END_CSYNC(ufsvfsp, error, issync,
5855                                             TOP_RMDIR, TOP_RMDIR_SIZE);
5856                                 }
5857                                 VN_RELE(ITOV(sip));
5858                         } else if (error == ENOENT) {
5859                                 *valp = 0;
5860                                 error = 0;
5861                         }
5862                 } else {
5863                         error = fs_pathconf(vp, cmd, valp, cr, ct);
5864                 }
5865                 break;
5866 
5867         case _PC_ACL_ENABLED:
5868                 *valp = _ACL_ACLENT_ENABLED;
5869                 break;
5870 
5871         case _PC_MIN_HOLE_SIZE:
5872                 *valp = (ulong_t)ip->i_fs->fs_bsize;
5873                 break;
5874 
5875         case _PC_SATTR_ENABLED:
5876         case _PC_SATTR_EXISTS:
5877                 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5878                     (vp->v_type == VREG || vp->v_type == VDIR);
5879                 break;
5880 
5881         case _PC_TIMESTAMP_RESOLUTION:
5882                 /*
5883                  * UFS keeps only microsecond timestamp resolution.
5884                  * This is historical and will probably never change.
5885                  */
5886                 *valp = 1000L;
5887                 break;
5888 
5889         default:
5890                 error = fs_pathconf(vp, cmd, valp, cr, ct);
5891                 break;
5892         }
5893 
5894         if (ulp != NULL) {
5895                 ufs_lockfs_end(ulp);
5896         }
5897         return (error);
5898 }
5899 
5900 int ufs_pageio_writes, ufs_pageio_reads;
5901 
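     /*
      * Perform I/O directly on the supplied page list, bypassing segmap.
      * This path is used for swap files (note the UFS_HOLE check below) and
      * by segvn ("vmpss") to handle faults against segments that map vnode
      * pages with large pages.
      */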
5902 /*ARGSUSED*/
5903 static int
5904 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5905     int flags, struct cred *cr, caller_context_t *ct)
5906 {
5907         struct inode *ip = VTOI(vp);
5908         struct ufsvfs *ufsvfsp;
5909         page_t *npp = NULL, *opp = NULL, *cpp = pp;
5910         struct buf *bp;
5911         daddr_t bn;
5912         size_t done_len = 0, cur_len = 0;
5913         int err = 0;
5914         int contig = 0;
5915         int dolock;
5916         int vmpss = 0;
5917         struct ulockfs *ulp;
5918 
5919         if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
5920             vp->v_mpssdata != NULL) {
5921                 vmpss = 1;
5922         }
5923 
5924         dolock = (rw_owner(&ip->i_contents) != curthread);
5925         /*
5926          * We need a better check.  Ideally, we would use another
5927          * vnodeops so that hlocked and forcibly unmounted file
5928          * systems would return EIO where appropriate and w/o the
5929          * need for these checks.
5930          */
5931         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5932                 return (EIO);
5933 
5934         /*
5935          * For vmpss (pp can be NULL) case respect the quiesce protocol.
5936          * ul_lock must be taken before locking pages so we can't use it here
5937          * if pp is non NULL because segvn already locked pages
5938          * SE_EXCL. Instead we rely on the fact that a forced umount or
5939          * applying a filesystem lock via ufs_fiolfs() will block in the
5940          * implicit call to ufs_flush() until we unlock the pages after the
5941          * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend
5942          * above 0 until they are done. We have to be careful not to increment
5943          * ul_vnops_cnt here after forceful unmount hlocks the file system.
5944          *
5945          * If pp is NULL use ul_lock to make sure we don't increment
5946          * ul_vnops_cnt after forceful unmount hlocks the file system.
5947          */
5948         if (vmpss || pp == NULL) {
5949                 ulp = &ufsvfsp->vfs_ulockfs;
5950                 if (pp == NULL)
5951                         mutex_enter(&ulp->ul_lock);
5952                 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) {
5953                         if (pp == NULL) {
5954                                 mutex_exit(&ulp->ul_lock);
5955                         }
5956                         return (vmpss ? EIO : EINVAL);
5957                 }
5958                 atomic_inc_ulong(&ulp->ul_vnops_cnt);
5959                 if (pp == NULL)
5960                         mutex_exit(&ulp->ul_lock);
5961                 if (ufs_quiesce_pend) {
5962                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5963                                 cv_broadcast(&ulp->ul_cv);
5964                         return (vmpss ? EIO : EINVAL);
5965                 }
5966         }
5967 
5968         if (dolock) {
5969                 /*
5970                  * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to
5971                  * handle a fault against a segment that maps vnode pages with
5972                  * large mappings.  Segvn creates pages and holds them locked
5973                  * SE_EXCL during VOP_PAGEIO() call. In this case we have to
5974                  * use rw_tryenter() to avoid a potential deadlock since in
5975                  * lock order i_contents needs to be taken first.
5976                  * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails.
5977                  */
5978                 if (!vmpss) {
5979                         rw_enter(&ip->i_contents, RW_READER);
5980                 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
5981                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5982                                 cv_broadcast(&ulp->ul_cv);
5983                         return (EDEADLK);
5984                 }
5985         }
5986 
5987         /*
5988          * Return an error to segvn because the pagefault request is beyond
5989          * PAGESIZE rounded EOF.
5990          */
5991         if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) {
5992                 if (dolock)
5993                         rw_exit(&ip->i_contents);
5994                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5995                         cv_broadcast(&ulp->ul_cv);
5996                 return (EFAULT);
5997         }
5998 
5999         if (pp == NULL) {
6000                 if (bmap_has_holes(ip)) {
6001                         err = ENOSYS;
6002                 } else {
6003                         err = EINVAL;
6004                 }
6005                 if (dolock)
6006                         rw_exit(&ip->i_contents);
6007                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6008                         cv_broadcast(&ulp->ul_cv);
6009                 return (err);
6010         }
6011 
6012         /*
6013          * Break the io request into chunks, one for each contiguous
6014          * stretch of disk blocks in the target file.
6015          */
6016         while (done_len < io_len) {
6017                 ASSERT(cpp);
6018                 contig = 0;
6019                 if (err = bmap_read(ip, (u_offset_t)(io_off + done_len),
6020                     &bn, &contig))
6021                         break;
6022 
6023                 if (bn == UFS_HOLE) {   /* No holey swapfiles */
6024                         if (vmpss) {
6025                                 err = EFAULT;
6026                                 break;
6027                         }
6028                         err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
6029                         break;
6030                 }
6031 
6032                 cur_len = MIN(io_len - done_len, contig);
6033                 /*
6034                  * Zero out a page beyond EOF, when the last block of
6035                  * a file is a UFS fragment so that ufs_pageio() can be used
6036                  * instead of ufs_getpage() to handle faults against
6037                  * segvn segments that use large pages.
6038                  */
6039                 page_list_break(&cpp, &npp, btopr(cur_len));
6040                 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
6041                         size_t xlen = cur_len & PAGEOFFSET;
6042                         pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
6043                 }
6044 
6045                 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
6046                 ASSERT(bp != NULL);
6047 
6048                 bp->b_edev = ip->i_dev;
6049                 bp->b_dev = cmpdev(ip->i_dev);
6050                 bp->b_blkno = bn;
6051                 bp->b_un.b_addr = (caddr_t)0;
6052                 bp->b_file = ip->i_vnode;
6053 
6054                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
6055                 ub.ub_pageios.value.ul++;
6056                 if (ufsvfsp->vfs_snapshot)
6057                         fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
6058                 else
6059                         (void) bdev_strategy(bp);
6060 
6061                 if (flags & B_READ)
6062                         ufs_pageio_reads++;
6063                 else
6064                         ufs_pageio_writes++;
6065                 if (flags & B_READ)
6066                         lwp_stat_update(LWP_STAT_INBLK, 1);
6067                 else
6068                         lwp_stat_update(LWP_STAT_OUBLK, 1);
6069                 /*
6070                  * If the request is not B_ASYNC, wait for i/o to complete
6071                  * and re-assemble the page list to return to the caller.
6072                  * If it is B_ASYNC we leave the page list in pieces and
6073                  * cleanup() will dispose of them.
6074                  */
6075                 if ((flags & B_ASYNC) == 0) {
6076                         err = biowait(bp);
6077                         pageio_done(bp);
6078                         if (err)
6079                                 break;
6080                         page_list_concat(&opp, &cpp);
6081                 }
6082                 cpp = npp;
6083                 npp = NULL;
6084                 if (flags & B_READ)
6085                         cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
6086                 done_len += cur_len;
6087         }
6088         ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
6089         if (err) {
6090                 if (flags & B_ASYNC) {
6091                         /* Cleanup unprocessed parts of list */
6092                         page_list_concat(&cpp, &npp);
6093                         if (flags & B_READ)
6094                                 pvn_read_done(cpp, B_ERROR);
6095                         else
6096                                 pvn_write_done(cpp, B_ERROR);
6097                 } else {
6098                         /* Re-assemble list and let caller clean up */
6099                         page_list_concat(&opp, &cpp);
6100                         page_list_concat(&opp, &npp);
6101                 }
6102         }
6103 
6104         if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) &&
6105             ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) {
6106                 mutex_enter(&ip->i_tlock);
6107                 ip->i_flag |= IACC;
6108                 ITIMES_NOLOCK(ip);
6109                 mutex_exit(&ip->i_tlock);
6110         }
6111 
6112         if (dolock)
6113                 rw_exit(&ip->i_contents);
6114         if (vmpss && !atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6115                 cv_broadcast(&ulp->ul_cv);
6116         return (err);
6117 }
6118 
6119 /*
6120  * Called when the kernel is in a frozen state to dump data
6121  * directly to the device. It uses a private dump data structure,
6122  * set up by ufs_dumpctl(), to locate the correct disk block to dump to.
6123  */
6124 /*ARGSUSED*/
6125 static int
6126 ufs_dump(vnode_t *vp, caddr_t addr, offset_t ldbn, offset_t dblks,
6127     caller_context_t *ct)
6128 {
6129         u_offset_t      file_size;
6130         struct inode    *ip = VTOI(vp);
6131         struct fs       *fs = ip->i_fs;
6132         daddr_t         dbn, lfsbn;
6133         int             disk_blks = fs->fs_bsize >> DEV_BSHIFT;
6134         int             error = 0;
6135         int             ndbs, nfsbs;
6136 
6137         /*
6138          * forced unmount case
6139          */
6140         if (ip->i_ufsvfs == NULL)
6141                 return (EIO);
6142         /*
6143          * Validate that the inode has not been modified since
6144          * the dump structure was allocated.
6145          */
6146         mutex_enter(&ip->i_tlock);
6147         if ((dump_info == NULL) ||
6148             (dump_info->ip != ip) ||
6149             (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
6150             (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
6151                 mutex_exit(&ip->i_tlock);
6152                 return (-1);
6153         }
6154         mutex_exit(&ip->i_tlock);
6155 
6156         /*
6157          * See that the file has room for this write
6158          */
6159         UFS_GET_ISIZE(&file_size, ip);
6160 
6161         if (ldbtob(ldbn + dblks) > file_size)
6162                 return (ENOSPC);
6163 
6164         /*
6165          * Find the physical disk block numbers from the dump
6166          * private data structure directly and write out the data
6167          * in contiguous block lumps
6168          */
6169         while (dblks > 0 && !error) {
6170                 lfsbn = (daddr_t)lblkno(fs, ldbtob(ldbn));
6171                 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
6172                 nfsbs = 1;
6173                 ndbs = disk_blks - ldbn % disk_blks;
6174                 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
6175                     nfsbs]) == dbn + ndbs) {
6176                         nfsbs++;
6177                         ndbs += disk_blks;
6178                 }
6179                 if (ndbs > dblks)
6180                         ndbs = dblks;
6181                 error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
6182                 addr += ldbtob((offset_t)ndbs);
6183                 dblks -= ndbs;
6184                 ldbn += ndbs;
6185         }
6186         return (error);
6187 
6188 }
6189 
6190 /*
6191  * Prepare the file system before and after the dump operation.
6192  *
6193  * action = DUMP_ALLOC:
6194  * Preparation before dump, allocate dump private data structure
6195  * to hold all the direct and indirect block info for dump.
6196  *
6197  * action = DUMP_FREE:
6198  * Clean up after dump, deallocate the dump private data structure.
6199  *
6200  * action = DUMP_SCAN:
6201  * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
6202  * if found, the starting file-relative DEV_BSIZE lbn is written
6203  * to *blkp; that lbn is intended for use with VOP_DUMP()
6204  */
6205 /*ARGSUSED*/
6206 static int
6207 ufs_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
6208 {
6209         struct inode    *ip = VTOI(vp);
6210         ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
6211         struct fs       *fs;
6212         daddr32_t       *dblk, *storeblk;
6213         daddr32_t       *nextblk, *endblk;
6214         struct buf      *bp;
6215         int             i, entry, entries;
6216         int             n, ncontig;
6217 
6218         /*
6219          * check for forced unmount
6220          */
6221         if (ufsvfsp == NULL)
6222                 return (EIO);
6223 
6224         if (action == DUMP_ALLOC) {
6225                 /*
6226                  * alloc and record dump_info
6227                  */
6228                 if (dump_info != NULL)
6229                         return (EINVAL);
6230 
6231                 ASSERT(vp->v_type == VREG);
6232                 fs = ufsvfsp->vfs_fs;
6233 
6234                 rw_enter(&ip->i_contents, RW_READER);
6235 
6236                 if (bmap_has_holes(ip)) {
6237                         rw_exit(&ip->i_contents);
6238                         return (EFAULT);
6239                 }
6240 
6241                 /*
6242                  * calculate and allocate space needed according to i_size
6243                  */
6244                 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
6245                 dump_info = kmem_alloc(sizeof (struct dump) +
6246                     (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP);
6247                 if (dump_info == NULL) {
6248                         rw_exit(&ip->i_contents);
6249                         return (ENOMEM);
6250                 }
6251 
6252                 /* Start saving the info */
6253                 dump_info->fsbs = entries;
6254                 dump_info->ip = ip;
6255                 storeblk = &dump_info->dblk[0];
6256 
6257                 /* Direct Blocks */
6258                 for (entry = 0; entry < NDADDR && entry < entries; entry++)
6259                         *storeblk++ = ip->i_db[entry];
6260 
6261                 /* Indirect Blocks */
6262                 for (i = 0; i < NIADDR; i++) {
6263                         int error = 0;
6264 
6265                         bp = UFS_BREAD(ufsvfsp,
6266                             ip->i_dev, fsbtodb(fs, ip->i_ib[i]), fs->fs_bsize);
6267                         if (bp->b_flags & B_ERROR)
6268                                 error = EIO;
6269                         else {
6270                                 dblk = bp->b_un.b_daddr;
6271                                 if ((storeblk = save_dblks(ip, ufsvfsp,
6272                                     storeblk, dblk, i, entries)) == NULL)
6273                                         error = EIO;
6274                         }
6275 
6276                         brelse(bp);
6277 
6278                         if (error != 0) {
6279                                 kmem_free(dump_info, sizeof (struct dump) +
6280                                     (entries - 1) * sizeof (daddr32_t));
6281                                 rw_exit(&ip->i_contents);
6282                                 dump_info = NULL;
6283                                 return (error);
6284                         }
6285                 }
6286                 /* and time stamp the information */
6287                 mutex_enter(&ip->i_tlock);
6288                 dump_info->time = ip->i_mtime;
6289                 mutex_exit(&ip->i_tlock);
6290 
6291                 rw_exit(&ip->i_contents);
6292         } else if (action == DUMP_FREE) {
6293                 /*
6294                  * free dump_info
6295                  */
6296                 if (dump_info == NULL)
6297                         return (EINVAL);
6298                 entries = dump_info->fsbs - 1;
6299                 kmem_free(dump_info, sizeof (struct dump) +
6300                     entries * sizeof (daddr32_t));
6301                 dump_info = NULL;
6302         } else if (action == DUMP_SCAN) {
6303                 /*
6304                  * scan dump_info
6305                  */
6306                 if (dump_info == NULL)
6307                         return (EINVAL);
6308 
6309                 dblk = dump_info->dblk;
6310                 nextblk = dblk + 1;
6311                 endblk = dblk + dump_info->fsbs - 1;
6312                 fs = ufsvfsp->vfs_fs;
6313                 ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
6314 
6315                 /*
6316                  * scan dblk[] entries; contig fs space is found when:
6317                  * ((current blkno + frags per block) == next blkno)
6318                  */
6319                 n = 0;
6320                 while (n < ncontig && dblk < endblk) {
6321                         if ((*dblk + fs->fs_frag) == *nextblk)
6322                                 n++;
6323                         else
6324                                 n = 0;
6325                         dblk++;
6326                         nextblk++;
6327                 }
6328 
6329                 /*
6330                  * index is where size bytes of contig space begins;
6331                  * conversion from index to the file's DEV_BSIZE lbn
6332                  * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
6333                  */
6334                 if (n == ncontig) {
6335                         i = (dblk - dump_info->dblk) - ncontig;
6336                         *blkp = i << (fs->fs_bshift - DEV_BSHIFT);
6337                 } else
6338                         return (EFAULT);
6339         }
6340         return (0);
6341 }
6342 
6343 /*
6344  * Recursive helper function for ufs_dumpctl().  It follows the indirect file
6345  * system blocks until it reaches the disk block addresses, which are
6346  * then stored into the given buffer, storeblk.
6347  */
6348 static daddr32_t *
6349 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
6350     daddr32_t *dblk, int level, int entries)
6351 {
6352         struct fs       *fs = ufsvfsp->vfs_fs;
6353         struct buf      *bp;
6354         int             i;
6355 
6356         if (level == 0) {
6357                 for (i = 0; i < NINDIR(fs); i++) {
6358                         if (storeblk - dump_info->dblk >= entries)
6359                                 break;
6360                         *storeblk++ = dblk[i];
6361                 }
6362                 return (storeblk);
6363         }
6364         for (i = 0; i < NINDIR(fs); i++) {
6365                 if (storeblk - dump_info->dblk >= entries)
6366                         break;
6367                 bp = UFS_BREAD(ufsvfsp,
6368                     ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
6369                 if (bp->b_flags & B_ERROR) {
6370                         brelse(bp);
6371                         return (NULL);
6372                 }
6373                 storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
6374                     level - 1, entries);
6375                 brelse(bp);
6376 
6377                 if (storeblk == NULL)
6378                         return (NULL);
6379         }
6380         return (storeblk);
6381 }
6382 
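     /*
      * Retrieve a file's ACL information (VOP_GETSECATTR).  The real work
      * is done by ufs_acl_get() with i_contents held as a reader.
      */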
6383 /* ARGSUSED */
6384 static int
6385 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
6386     struct cred *cr, caller_context_t *ct)
6387 {
6388         struct inode    *ip = VTOI(vp);
6389         struct ulockfs  *ulp;
6390         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
6391         ulong_t         vsa_mask = vsap->vsa_mask;
6392         int             err = EINVAL;
6393 
6394         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6395 
6396         /*
6397          * Only grab locks if needed - they're not needed to check vsa_mask
6398          * or if the mask contains no acl flags.
6399          */
6400         if (vsa_mask != 0) {
6401                 if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
6402                     ULOCKFS_GETATTR_MASK))
6403                         return (err);
6404 
6405                 rw_enter(&ip->i_contents, RW_READER);
6406                 err = ufs_acl_get(ip, vsap, flag, cr);
6407                 rw_exit(&ip->i_contents);
6408 
6409                 if (ulp)
6410                         ufs_lockfs_end(ulp);
6411         }
6412         return (err);
6413 }
6414 
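     /*
      * Set a file's ACL information (VOP_SETSECATTR).  Called with i_rwlock
      * held; for directories that lock is dropped and reacquired around the
      * logging transaction to preserve the lock-ordering protocol described
      * in the comments below.
      */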
6415 /* ARGSUSED */
6416 static int
6417 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr,
6418     caller_context_t *ct)
6419 {
6420         struct inode    *ip = VTOI(vp);
6421         struct ulockfs  *ulp = NULL;
6422         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
6423         ulong_t         vsa_mask = vsap->vsa_mask;
6424         int             err;
6425         int             haverwlock = 1;
6426         int             trans_size;
6427         int             donetrans = 0;
6428         int             retry = 1;
6429 
6430         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
6431 
6432         /* Abort now if the request is either empty or invalid. */
6433         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6434         if ((vsa_mask == 0) ||
6435             ((vsap->vsa_aclentp == NULL) &&
6436             (vsap->vsa_dfaclentp == NULL))) {
6437                 err = EINVAL;
6438                 goto out;
6439         }
6440 
6441         /*
6442          * Following convention, if this is a directory then we acquire the
6443          * inode's i_rwlock after starting a UFS logging transaction;
6444          * otherwise, we acquire it beforehand. Since we were called (and
6445          * must therefore return) with the lock held, we will have to drop it,
6446          * and later reacquire it, if operating on a directory.
6447          */
6448         if (vp->v_type == VDIR) {
6449                 rw_exit(&ip->i_rwlock);
6450                 haverwlock = 0;
6451         } else {
6452                 /* Upgrade the lock if required. */
6453                 if (!rw_write_held(&ip->i_rwlock)) {
6454                         rw_exit(&ip->i_rwlock);
6455                         rw_enter(&ip->i_rwlock, RW_WRITER);
6456                 }
6457         }
6458 
6459 again:
6460         ASSERT(!(vp->v_type == VDIR && haverwlock));
6461         if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
6462                 ulp = NULL;
6463                 retry = 0;
6464                 goto out;
6465         }
6466 
6467         /*
6468          * Check that the file system supports this operation. Note that
6469          * ufs_lockfs_begin() will have checked that the file system had
6470          * not been forcibly unmounted.
6471          */
6472         if (ufsvfsp->vfs_fs->fs_ronly) {
6473                 err = EROFS;
6474                 goto out;
6475         }
6476         if (ufsvfsp->vfs_nosetsec) {
6477                 err = ENOSYS;
6478                 goto out;
6479         }
6480 
6481         if (ulp) {
6482                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
6483                     trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
6484                 donetrans = 1;
6485         }
6486 
6487         if (vp->v_type == VDIR) {
6488                 rw_enter(&ip->i_rwlock, RW_WRITER);
6489                 haverwlock = 1;
6490         }
6491 
6492         ASSERT(haverwlock);
6493 
6494         /* Do the actual work. */
6495         rw_enter(&ip->i_contents, RW_WRITER);
6496         /*
6497          * Suppress out of inodes messages if we will retry.
6498          */
6499         if (retry)
6500                 ip->i_flag |= IQUIET;
6501         err = ufs_acl_set(ip, vsap, flag, cr);
6502         ip->i_flag &= ~IQUIET;
6503         rw_exit(&ip->i_contents);
6504 
6505 out:
6506         if (ulp) {
6507                 if (donetrans) {
6508                         /*
6509                          * top_end_async() can eventually call
6510                          * top_end_sync(), which can block. We must
6511                          * therefore observe the lock-ordering protocol
6512                          * here as well.
6513                          */
6514                         if (vp->v_type == VDIR) {
6515                                 rw_exit(&ip->i_rwlock);
6516                                 haverwlock = 0;
6517                         }
6518                         TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
6519                 }
6520                 ufs_lockfs_end(ulp);
6521         }
6522         /*
6523          * If no inodes available, try scaring a logically-
6524          * free one out of the delete queue to someplace
6525          * that we can find it.
6526          */
6527         if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
6528                 ufs_delete_drain_wait(ufsvfsp, 1);
6529                 retry = 0;
6530                 if (vp->v_type == VDIR && haverwlock) {
6531                         rw_exit(&ip->i_rwlock);
6532                         haverwlock = 0;
6533                 }
6534                 goto again;
6535         }
6536         /*
6537          * If we need to reacquire the lock then it is safe to do so
6538          * as a reader. This is because ufs_rwunlock(), which will be
6539          * called by our caller after we return, does not differentiate
6540          * between shared and exclusive locks.
6541          */
6542         if (!haverwlock) {
6543                 ASSERT(vp->v_type == VDIR);
6544                 rw_enter(&ip->i_rwlock, RW_READER);
6545         }
6546 
6547         return (err);
6548 }
6549 
6550 /*
6551  * Locate the vnode to be used for an event notification. As this will
6552  * be called prior to the name space change perform basic verification
6553  * that the change will be allowed.
6554  */
6555 
6556 static int
6557 ufs_eventlookup(struct vnode *dvp, char *nm, struct cred *cr,
6558     struct vnode **vpp)
6559 {
6560         int     namlen;
6561         int     error;
6562         struct vnode    *vp;
6563         struct inode    *ip;
6564         struct inode    *xip;
6565         struct ufsvfs   *ufsvfsp;
6566         struct ulockfs  *ulp;
6567 
6568         ip = VTOI(dvp);
6569         *vpp = NULL;
6570 
6571         if ((namlen = strlen(nm)) == 0)
6572                 return (EINVAL);
6573 
6574         if (nm[0] == '.') {
6575                 if (namlen == 1)
6576                         return (EINVAL);
6577                 else if ((namlen == 2) && nm[1] == '.') {
6578                         return (EEXIST);
6579                 }
6580         }
6581 
6582         /*
6583          * Check accessibility and write access of parent directory as we
6584          * only want to post the event if we're able to make a change.
6585          */
6586         if (error = ufs_diraccess(ip, IEXEC|IWRITE, cr))
6587                 return (error);
6588 
6589         if (vp = dnlc_lookup(dvp, nm)) {
6590                 if (vp == DNLC_NO_VNODE) {
6591                         VN_RELE(vp);
6592                         return (ENOENT);
6593                 }
6594 
6595                 *vpp = vp;
6596                 return (0);
6597         }
6598 
6599         /*
6600          * Keep the idle queue from getting too long by idling two
6601          * inodes before attempting to allocate another.
6602          * This operation must be performed before entering lockfs
6603          * or a transaction.
6604          */
6605         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
6606                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
6607                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
6608                         ufs_idle_some(ufs_lookup_idle_count);
6609                 }
6610 
6611         ufsvfsp = ip->i_ufsvfs;
6612 
6613 retry_lookup:
6614         if (error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK))
6615                 return (error);
6616 
6617         if ((error = ufs_dirlook(ip, nm, &xip, cr, 1, 1)) == 0) {
6618                 vp = ITOV(xip);
6619                 *vpp = vp;
6620         }
6621 
6622         if (ulp) {
6623                 ufs_lockfs_end(ulp);
6624         }
6625 
6626         if (error == EAGAIN)
6627                 goto retry_lookup;
6628 
6629         return (error);
6630 }