1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent, Inc.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  29 /*        All Rights Reserved   */
  30 
  31 /*
  32  * Portions of this source code were derived from Berkeley 4.3 BSD
  33  * under license from the Regents of the University of California.
  34  */
  35 
  36 #include <sys/types.h>
  37 #include <sys/t_lock.h>
  38 #include <sys/ksynch.h>
  39 #include <sys/param.h>
  40 #include <sys/time.h>
  41 #include <sys/systm.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/resource.h>
  44 #include <sys/signal.h>
  45 #include <sys/cred.h>
  46 #include <sys/user.h>
  47 #include <sys/buf.h>
  48 #include <sys/vfs.h>
  49 #include <sys/vfs_opreg.h>
  50 #include <sys/vnode.h>
  51 #include <sys/proc.h>
  52 #include <sys/disp.h>
  53 #include <sys/file.h>
  54 #include <sys/fcntl.h>
  55 #include <sys/flock.h>
  56 #include <sys/atomic.h>
  57 #include <sys/kmem.h>
  58 #include <sys/uio.h>
  59 #include <sys/dnlc.h>
  60 #include <sys/conf.h>
  61 #include <sys/mman.h>
  62 #include <sys/pathname.h>
  63 #include <sys/debug.h>
  64 #include <sys/vmsystm.h>
  65 #include <sys/cmn_err.h>
  66 #include <sys/filio.h>
  67 #include <sys/policy.h>
  68 
  69 #include <sys/fs/ufs_fs.h>
  70 #include <sys/fs/ufs_lockfs.h>
  71 #include <sys/fs/ufs_filio.h>
  72 #include <sys/fs/ufs_inode.h>
  73 #include <sys/fs/ufs_fsdir.h>
  74 #include <sys/fs/ufs_quota.h>
  75 #include <sys/fs/ufs_log.h>
  76 #include <sys/fs/ufs_snap.h>
  77 #include <sys/fs/ufs_trans.h>
  78 #include <sys/fs/ufs_panic.h>
  79 #include <sys/fs/ufs_bio.h>
  80 #include <sys/dirent.h>           /* must be AFTER <sys/fs/fsdir.h>! */
  81 #include <sys/errno.h>
  82 #include <sys/fssnap_if.h>
  83 #include <sys/unistd.h>
  84 #include <sys/sunddi.h>
  85 
  86 #include <sys/filio.h>            /* _FIOIO */
  87 
  88 #include <vm/hat.h>
  89 #include <vm/page.h>
  90 #include <vm/pvn.h>
  91 #include <vm/as.h>
  92 #include <vm/seg.h>
  93 #include <vm/seg_map.h>
  94 #include <vm/seg_vn.h>
  95 #include <vm/seg_kmem.h>
  96 #include <vm/rm.h>
  97 #include <sys/swap.h>
  98 
  99 #include <fs/fs_subr.h>
 100 
 101 #include <sys/fs/decomp.h>
 102 
 103 static struct instats ins;
 104 
 105 static  int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
 106 static  int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
 107                 caddr_t, struct page **, size_t, enum seg_rw, int);
 108 static  int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
 109 static  int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
 110                 caller_context_t *);
 111 static  int ufs_read(struct vnode *, struct uio *, int, struct cred *,
 112                 struct caller_context *);
 113 static  int ufs_write(struct vnode *, struct uio *, int, struct cred *,
 114                 struct caller_context *);
 115 static  int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
 116                 int *, caller_context_t *);
 117 static  int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
 118                 caller_context_t *);
 119 static  int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
 120                 caller_context_t *);
 121 static  int ufs_access(struct vnode *, int, int, struct cred *,
 122                 caller_context_t *);
 123 static  int ufs_lookup(struct vnode *, char *, struct vnode **,
 124                 struct pathname *, int, struct vnode *, struct cred *,
 125                 caller_context_t *, int *, pathname_t *);
 126 static  int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
 127                 int, struct vnode **, struct cred *, int,
 128                 caller_context_t *, vsecattr_t  *);
 129 static  int ufs_remove(struct vnode *, char *, struct cred *,
 130                 caller_context_t *, int);
 131 static  int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
 132                 caller_context_t *, int);
 133 static  int ufs_rename(struct vnode *, char *, struct vnode *, char *,
 134                 struct cred *, caller_context_t *, int);
 135 static  int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
 136                 struct cred *, caller_context_t *, int, vsecattr_t *);
 137 static  int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
 138                 caller_context_t *, int);
 139 static  int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
 140                 caller_context_t *, int);
 141 static  int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
 142                 struct cred *, caller_context_t *, int);
 143 static  int ufs_readlink(struct vnode *, struct uio *, struct cred *,
 144                 caller_context_t *);
 145 static  int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
 146 static  void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
 147 static  int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
 148 static  int ufs_rwlock(struct vnode *, int, caller_context_t *);
 149 static  void ufs_rwunlock(struct vnode *, int, caller_context_t *);
 150 static  int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
 151 static  int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
 152                 struct flk_callback *, struct cred *,
 153                 caller_context_t *);
 154 static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
 155                 cred_t *, caller_context_t *);
 156 static  int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
 157                 struct page **, size_t, struct seg *, caddr_t,
 158                 enum seg_rw, struct cred *, caller_context_t *);
 159 static  int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
 160                 caller_context_t *);
 161 static  int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
 162 static  int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
 163                 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 164 static  int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 165                 uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
 166 static  int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
 167                 uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
 168 static  int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
 169                 caller_context_t *);
 170 static  int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
 171     caller_context_t *);
 172 static  int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
 173                 caller_context_t *);
 174 static  int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
 175                 struct cred *, caller_context_t *);
 176 static  int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
 177 static  daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
 178                 daddr32_t *, int, int);
 179 static  int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 180                 caller_context_t *);
 181 static  int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
 182                 caller_context_t *);
 183 static  int ufs_priv_access(void *, int, struct cred *);
 184 static  int ufs_eventlookup(struct vnode *, char *, struct cred *,
 185     struct vnode **);
 186 extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);
 187 
 188 /*
 189  * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 190  *
 191  * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 192  */
 193 struct vnodeops *ufs_vnodeops;
 194 
 195 /* NOTE: "not blkd" below  means that the operation isn't blocked by lockfs */
 196 const fs_operation_def_t ufs_vnodeops_template[] = {
 197         VOPNAME_OPEN,           { .vop_open = ufs_open },       /* not blkd */
 198         VOPNAME_CLOSE,          { .vop_close = ufs_close },     /* not blkd */
 199         VOPNAME_READ,           { .vop_read = ufs_read },
 200         VOPNAME_WRITE,          { .vop_write = ufs_write },
 201         VOPNAME_IOCTL,          { .vop_ioctl = ufs_ioctl },
 202         VOPNAME_GETATTR,        { .vop_getattr = ufs_getattr },
 203         VOPNAME_SETATTR,        { .vop_setattr = ufs_setattr },
 204         VOPNAME_ACCESS,         { .vop_access = ufs_access },
 205         VOPNAME_LOOKUP,         { .vop_lookup = ufs_lookup },
 206         VOPNAME_CREATE,         { .vop_create = ufs_create },
 207         VOPNAME_REMOVE,         { .vop_remove = ufs_remove },
 208         VOPNAME_LINK,           { .vop_link = ufs_link },
 209         VOPNAME_RENAME,         { .vop_rename = ufs_rename },
 210         VOPNAME_MKDIR,          { .vop_mkdir = ufs_mkdir },
 211         VOPNAME_RMDIR,          { .vop_rmdir = ufs_rmdir },
 212         VOPNAME_READDIR,        { .vop_readdir = ufs_readdir },
 213         VOPNAME_SYMLINK,        { .vop_symlink = ufs_symlink },
 214         VOPNAME_READLINK,       { .vop_readlink = ufs_readlink },
 215         VOPNAME_FSYNC,          { .vop_fsync = ufs_fsync },
 216         VOPNAME_INACTIVE,       { .vop_inactive = ufs_inactive }, /* not blkd */
 217         VOPNAME_FID,            { .vop_fid = ufs_fid },
 218         VOPNAME_RWLOCK,         { .vop_rwlock = ufs_rwlock },   /* not blkd */
 219         VOPNAME_RWUNLOCK,       { .vop_rwunlock = ufs_rwunlock }, /* not blkd */
 220         VOPNAME_SEEK,           { .vop_seek = ufs_seek },
 221         VOPNAME_FRLOCK,         { .vop_frlock = ufs_frlock },
 222         VOPNAME_SPACE,          { .vop_space = ufs_space },
 223         VOPNAME_GETPAGE,        { .vop_getpage = ufs_getpage },
 224         VOPNAME_PUTPAGE,        { .vop_putpage = ufs_putpage },
 225         VOPNAME_MAP,            { .vop_map = ufs_map },
 226         VOPNAME_ADDMAP,         { .vop_addmap = ufs_addmap },   /* not blkd */
 227         VOPNAME_DELMAP,         { .vop_delmap = ufs_delmap },   /* not blkd */
 228         VOPNAME_POLL,           { .vop_poll = ufs_poll },       /* not blkd */
 229         VOPNAME_DUMP,           { .vop_dump = ufs_dump },
 230         VOPNAME_PATHCONF,       { .vop_pathconf = ufs_l_pathconf },
 231         VOPNAME_PAGEIO,         { .vop_pageio = ufs_pageio },
 232         VOPNAME_DUMPCTL,        { .vop_dumpctl = ufs_dumpctl },
 233         VOPNAME_GETSECATTR,     { .vop_getsecattr = ufs_getsecattr },
 234         VOPNAME_SETSECATTR,     { .vop_setsecattr = ufs_setsecattr },
 235         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 236         NULL,                   NULL
 237 };
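
/*
 * Editorial note (not from the original source): a template such as the
 * one above is typically turned into the live operations vector at file
 * system initialization time, presumably along the lines of
 *
 *        error = vn_make_ops("ufs", ufs_vnodeops_template, &ufs_vnodeops);
 *
 * in ufsinit(); this sketch is shown only for orientation and may differ
 * from the actual call.
 */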
 238 
 239 #define MAX_BACKFILE_COUNT      9999
 240 
 241 /*
 242  * Created by ufs_dumpctl() to store a file's disk block info into memory.
 243  * Used by ufs_dump() to dump data to disk directly.
 244  */
 245 struct dump {
 246         struct inode    *ip;            /* the file we contain */
 247         daddr_t         fsbs;           /* number of blocks stored */
 248         struct timeval32 time;          /* time stamp for the struct */
 249         daddr32_t       dblk[1];        /* place holder for block info */
 250 };
 251 
 252 static struct dump *dump_info = NULL;
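
/*
 * Editorial sketch (an assumption, not from the original source): given
 * the dblk[1] place-holder above, a dump structure with room for nblks
 * block numbers would typically be sized with the classic trailing-array
 * idiom, e.g.
 *
 *        dump_info = kmem_alloc(sizeof (struct dump) +
 *            (nblks - 1) * sizeof (daddr32_t), KM_SLEEP);
 *
 * The exact allocation performed by ufs_dumpctl() may differ.
 */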
 253 
 254 /*
 255  * Previously there was no special action required for ordinary files.
 256  * (Devices are handled through the device file system.)
 257  * Now that we support Large Files, the Large File API requires open to
 258  * fail if the file is large.
 259  * We could take care to prevent data corruption
 260  * by doing an atomic check of the size and truncating if the file is
 261  * opened with the FTRUNC flag set, but traditionally this is done by the
 262  * vfs/vnode layers. Taking care of truncation here would change the
 263  * existing semantics of VOP_OPEN, and therefore we chose not to
 264  * implement anything here. The check for file size > 2GB is done at the
 265  * vfs layer in routine vn_open().
 266  */
 267 
 268 /* ARGSUSED */
 269 static int
 270 ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
 271 {
 272         return (0);
 273 }
 274 
 275 /*ARGSUSED*/
 276 static int
 277 ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
 278         struct cred *cr, caller_context_t *ct)
 279 {
 280         cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 281         cleanshares(vp, ttoproc(curthread)->p_pid);
 282 
 283         /*
 284          * Push partially filled cluster at last close.
 285          * ``last close'' is approximated because the dnlc
 286          * may have a hold on the vnode.
 287          * Checking for VBAD here will also act as a forced umount check.
 288          */
 289         if (vp->v_count <= 2 && vp->v_type != VBAD) {
 290                 struct inode *ip = VTOI(vp);
 291                 if (ip->i_delaylen) {
 292                         ins.in_poc.value.ul++;
 293                         (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
 294                             B_ASYNC | B_FREE, cr);
 295                         ip->i_delaylen = 0;
 296                 }
 297         }
 298 
 299         return (0);
 300 }
 301 
 302 /*ARGSUSED*/
 303 static int
 304 ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
 305         struct caller_context *ct)
 306 {
 307         struct inode *ip = VTOI(vp);
 308         struct ufsvfs *ufsvfsp;
 309         struct ulockfs *ulp = NULL;
 310         int error = 0;
 311         int intrans = 0;
 312 
 313         ASSERT(RW_READ_HELD(&ip->i_rwlock));
 314 
 315         /*
 316          * Mandatory locking needs to be done before ufs_lockfs_begin()
 317          * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
 318          */
 319         if (MANDLOCK(vp, ip->i_mode)) {
 320                 /*
 321                  * ufs_getattr ends up being called by chklock
 322                  */
 323                 error = chklock(vp, FREAD, uiop->uio_loffset,
 324                     uiop->uio_resid, uiop->uio_fmode, ct);
 325                 if (error)
 326                         goto out;
 327         }
 328 
 329         ufsvfsp = ip->i_ufsvfs;
 330         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
 331         if (error)
 332                 goto out;
 333 
 334         /*
 335          * In the case that a directory is opened for reading as a file
 336          * (e.g. "cat .") with the O_RSYNC, O_SYNC and O_DSYNC flags set,
 337          * the locking order had to be changed to avoid a deadlock with
 338          * an update taking place on that directory at the same time.
 339          */
 340         if ((ip->i_mode & IFMT) == IFDIR) {
 341 
 342                 rw_enter(&ip->i_contents, RW_READER);
 343                 error = rdip(ip, uiop, ioflag, cr);
 344                 rw_exit(&ip->i_contents);
 345 
 346                 if (error) {
 347                         if (ulp)
 348                                 ufs_lockfs_end(ulp);
 349                         goto out;
 350                 }
 351 
 352                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 353                     TRANS_ISTRANS(ufsvfsp)) {
 354                         rw_exit(&ip->i_rwlock);
 355                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
 356                             error);
 357                         ASSERT(!error);
 358                         TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
 359                             TOP_READ_SIZE);
 360                         rw_enter(&ip->i_rwlock, RW_READER);
 361                 }
 362         } else {
 363                 /*
 364                  * Only transact reads to files opened for sync-read and
 365                  * sync-write on a file system that is not write locked.
 366                  *
 367                  * The ``not write locked'' check prevents problems with
 368                  * enabling/disabling logging on a busy file system.  E.g.,
 369                  * logging exists at the beginning of the read but not
 370                  * at the end.
 371                  *
 372                  */
 373                 if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
 374                     TRANS_ISTRANS(ufsvfsp)) {
 375                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
 376                             error);
 377                         ASSERT(!error);
 378                         intrans = 1;
 379                 }
 380 
 381                 rw_enter(&ip->i_contents, RW_READER);
 382                 error = rdip(ip, uiop, ioflag, cr);
 383                 rw_exit(&ip->i_contents);
 384 
 385                 if (intrans) {
 386                         TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
 387                             TOP_READ_SIZE);
 388                 }
 389         }
 390 
 391         if (ulp) {
 392                 ufs_lockfs_end(ulp);
 393         }
 394 out:
 395 
 396         return (error);
 397 }
 398 
 399 extern  int     ufs_HW;         /* high water mark */
 400 extern  int     ufs_LW;         /* low water mark */
 401 int     ufs_WRITES = 1;         /* XXX - enable/disable */
 402 int     ufs_throttles = 0;      /* throttling count */
 403 int     ufs_allow_shared_writes = 1;    /* directio shared writes */
 404 
 405 static int
 406 ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
 407 {
 408         int     shared_write;
 409 
 410         /*
 411          * If the FDSYNC flag is set, then the global
 412          * ufs_allow_shared_writes setting is ignored.
 413          */
 414         shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;
 415 
 416         /*
 417          * Filter to determine if this request is suitable as a
 418          * concurrent rewrite. This write must not allocate blocks
 419          * by extending the file or filling in holes. There is no point
 420          * trying this for FSYNC descriptors, as the inode will be
 421          * synchronously updated after the write. The uio structure has
 422          * not yet been checked for sanity, so assume nothing.
 423          */
 424         return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
 425             (uiop->uio_loffset >= (offset_t)0) &&
 426             (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
 427             ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
 428             !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
 429             shared_write);
 430 }
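
/*
 * Editorial example (hypothetical, not from the original source): a
 * request that passes the filter above would look, from userland, like a
 * pwrite() that falls entirely within the existing, hole-free allocation
 * of a regular file opened with O_DSYNC (and without O_APPEND or O_SYNC)
 * on a file using directio.
 */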
 431 
 432 /*ARGSUSED*/
 433 static int
 434 ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
 435         caller_context_t *ct)
 436 {
 437         struct inode *ip = VTOI(vp);
 438         struct ufsvfs *ufsvfsp;
 439         struct ulockfs *ulp;
 440         int retry = 1;
 441         int error, resv, resid = 0;
 442         int directio_status;
 443         int exclusive;
 444         int rewriteflg;
 445         long start_resid = uiop->uio_resid;
 446 
 447         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
 448 
 449 retry_mandlock:
 450         /*
 451          * Mandatory locking needs to be done before ufs_lockfs_begin()
 452          * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
 453          * Check for forced unmounts normally done in ufs_lockfs_begin().
 454          */
 455         if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
 456                 error = EIO;
 457                 goto out;
 458         }
 459         if (MANDLOCK(vp, ip->i_mode)) {
 460 
 461                 ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
 462 
 463                 /*
 464                  * ufs_getattr ends up being called by chklock
 465                  */
 466                 error = chklock(vp, FWRITE, uiop->uio_loffset,
 467                     uiop->uio_resid, uiop->uio_fmode, ct);
 468                 if (error)
 469                         goto out;
 470         }
 471 
 472         /* i_rwlock can change in chklock */
 473         exclusive = rw_write_held(&ip->i_rwlock);
 474         rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);
 475 
 476         /*
 477          * Check for fast-path special case of directio re-writes.
 478          */
 479         if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
 480             !exclusive && rewriteflg) {
 481 
 482                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 483                 if (error)
 484                         goto out;
 485 
 486                 rw_enter(&ip->i_contents, RW_READER);
 487                 error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
 488                     &directio_status);
 489                 if (directio_status == DIRECTIO_SUCCESS) {
 490                         uint_t i_flag_save;
 491 
 492                         if (start_resid != uiop->uio_resid)
 493                                 error = 0;
 494                         /*
 495                          * Special treatment of access times for re-writes.
 496                          * If IMOD is not already set, then convert it
 497                          * to IMODACC for this operation. This defers
 498                          * entering a delta into the log until the inode
 499                          * is flushed. This mimics what is done for read
 500                          * operations and inode access time.
 501                          */
 502                         mutex_enter(&ip->i_tlock);
 503                         i_flag_save = ip->i_flag;
 504                         ip->i_flag |= IUPD | ICHG;
 505                         ip->i_seq++;
 506                         ITIMES_NOLOCK(ip);
 507                         if ((i_flag_save & IMOD) == 0) {
 508                                 ip->i_flag &= ~IMOD;
 509                                 ip->i_flag |= IMODACC;
 510                         }
 511                         mutex_exit(&ip->i_tlock);
 512                         rw_exit(&ip->i_contents);
 513                         if (ulp)
 514                                 ufs_lockfs_end(ulp);
 515                         goto out;
 516                 }
 517                 rw_exit(&ip->i_contents);
 518                 if (ulp)
 519                         ufs_lockfs_end(ulp);
 520         }
 521 
 522         if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
 523                 rw_exit(&ip->i_rwlock);
 524                 rw_enter(&ip->i_rwlock, RW_WRITER);
 525                 /*
 526                  * Mandatory locking could have been enabled
 527                  * after dropping the i_rwlock.
 528                  */
 529                 if (MANDLOCK(vp, ip->i_mode))
 530                         goto retry_mandlock;
 531         }
 532 
 533         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 534         if (error)
 535                 goto out;
 536 
 537         /*
 538          * Amount of log space needed for this write
 539          */
 540         if (!rewriteflg || !(ioflag & FDSYNC))
 541                 TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);
 542 
 543         /*
 544          * Throttle writes.
 545          */
 546         if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
 547                 mutex_enter(&ip->i_tlock);
 548                 while (ip->i_writes > ufs_HW) {
 549                         ufs_throttles++;
 550                         cv_wait(&ip->i_wrcv, &ip->i_tlock);
 551                 }
 552                 mutex_exit(&ip->i_tlock);
 553         }
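
        /*
         * Editorial note (an assumption): the matching wakeup is
         * presumably issued from the write-completion path once
         * ip->i_writes drains back below the low-water mark, e.g.
         *
         *        if (ip->i_writes <= ufs_LW)
         *                cv_broadcast(&ip->i_wrcv);
         *
         * See the use of ufs_LW elsewhere in UFS; details may differ.
         */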
 554 
 555         /*
 556          * Enter Transaction
 557          *
 558          * If the write is a rewrite there is no need to open a transaction
 559          * If the write is a rewrite there is no need to open a transaction
 560          * if the FDSYNC flag is set and FSYNC is not.  In this case just
 561          * set the IMODACC flag so that the inode update is done at a
 562          * later time, thus avoiding the overhead of a logging transaction
 563          * that is not required.
 564         if (ioflag & (FSYNC|FDSYNC)) {
 565                 if (ulp) {
 566                         if (rewriteflg) {
 567                                 uint_t i_flag_save;
 568 
 569                                 rw_enter(&ip->i_contents, RW_READER);
 570                                 mutex_enter(&ip->i_tlock);
 571                                 i_flag_save = ip->i_flag;
 572                                 ip->i_flag |= IUPD | ICHG;
 573                                 ip->i_seq++;
 574                                 ITIMES_NOLOCK(ip);
 575                                 if ((i_flag_save & IMOD) == 0) {
 576                                         ip->i_flag &= ~IMOD;
 577                                         ip->i_flag |= IMODACC;
 578                                 }
 579                                 mutex_exit(&ip->i_tlock);
 580                                 rw_exit(&ip->i_contents);
 581                         } else {
 582                                 int terr = 0;
 583                                 TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
 584                                     terr);
 585                                 ASSERT(!terr);
 586                         }
 587                 }
 588         } else {
 589                 if (ulp)
 590                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
 591         }
 592 
 593         /*
 594          * Write the file
 595          */
 596         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
 597         rw_enter(&ip->i_contents, RW_WRITER);
 598         if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
 599                 /*
 600                  * In append mode start at end of file.
 601                  */
 602                 uiop->uio_loffset = ip->i_size;
 603         }
 604 
 605         /*
 606          * Mild optimisation: don't call ufs_trans_write() unless we have to.
 607          * Also, suppress file system full messages if we will retry.
 608          */
 609         if (retry)
 610                 ip->i_flag |= IQUIET;
 611         if (resid) {
 612                 TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
 613         } else {
 614                 error = wrip(ip, uiop, ioflag, cr);
 615         }
 616         ip->i_flag &= ~IQUIET;
 617 
 618         rw_exit(&ip->i_contents);
 619         rw_exit(&ufsvfsp->vfs_dqrwlock);
 620 
 621         /*
 622          * Leave Transaction
 623          */
 624         if (ulp) {
 625                 if (ioflag & (FSYNC|FDSYNC)) {
 626                         if (!rewriteflg) {
 627                                 int terr = 0;
 628 
 629                                 TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
 630                                     resv);
 631                                 if (error == 0)
 632                                         error = terr;
 633                         }
 634                 } else {
 635                         TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
 636                 }
 637                 ufs_lockfs_end(ulp);
 638         }
 639 out:
 640         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
 641                 /*
 642                  * Any blocks tied up in pending deletes?
 643                  */
 644                 ufs_delete_drain_wait(ufsvfsp, 1);
 645                 retry = 0;
 646                 goto retry_mandlock;
 647         }
 648 
 649         if (error == ENOSPC && (start_resid != uiop->uio_resid))
 650                 error = 0;
 651 
 652         return (error);
 653 }
 654 
 655 /*
 656  * Don't cache write blocks to files with the sticky bit set.
 657  * Used to keep swap files from blowing the page cache on a server.
 658  */
 659 int stickyhack = 1;
 660 
 661 /*
 662  * Free behind hacks.  The pager is busted.
 663  * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 664  * or B_FREE_IF_TIGHT_ON_MEMORY.
 665  */
 666 int     freebehind = 1;
 667 int     smallfile = 0;
 668 u_offset_t smallfile64 = 32 * 1024;
 669 
 670 /*
 671  * While we should, in most cases, cache the pages for write, we
 672  * may also want to cache the pages for read as long as they are
 673  * frequently re-usable.
 674  *
 675  * If cache_read_ahead = 1, the pages for read will go to the tail
 676  * of the cache list when they are released, otherwise go to the head.
 677  */
 678 int     cache_read_ahead = 0;
 679 
 680 /*
 681  * Freebehind exists so that as we read large files sequentially we
 682  * don't consume most of memory with pages from a few files. It takes
 683  * longer to re-read multiple small files from disk than it does to read
 684  * one large one sequentially.  As system memory grows, customers need
 685  * to retain bigger chunks of files in memory.  The advent of the
 686  * cachelist opens up the possibility of freeing pages to the head or
 687  * tail of the list.
 688  *
 689  * Not freeing a page is a bet that the page will be read again before
 690  * its segmap slot is needed for something else. If we lose the bet,
 691  * it means some other thread is burdened with the page free we did
 692  * not do. If we win, we save a free and a reclaim.
 693  *
 694  * Freeing it at the tail vs the head of the cachelist is a bet that the
 695  * page will survive until the next read.  It's also saying that this
 696  * page is more likely to be re-used than a page freed some time ago
 697  * and never reclaimed.
 698  *
 699  * Freebehind maintains a range of file offsets [smallfile1; smallfile2]:
 700  *
 701  *            0 < offset < smallfile1 : pages are not freed.
 702  *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 703  *   smallfile2 < offset              : pages freed to head of cachelist.
 704  *
 705  * The range is computed at most once per second and depends on
 706  * freemem and ncpus_online.  Both values are bounded below by
 707  * smallfile and by smallfile64.
 708  *
 709  * smallfile1 = (free memory / ncpu) / 1000
 710  * smallfile2 = (free memory / ncpu) / 10
 711  *
 712  * A few examples values:
 713  *
 714  *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
 715  *                                 ncpus_online = 4          ncpus_online = 64
 716  *       ------------------  -----------------------   -----------------------
 717  *             1G                   [256K;  25M]               [32K; 1.5M]
 718  *            10G                   [2.5M; 250M]              [156K; 15M]
 719  *           100G                    [25M; 2.5G]              [1.5M; 150M]
 720  *
 721  */
 722 
 723 #define SMALLFILE1_D 1000
 724 #define SMALLFILE2_D 10
 725 static u_offset_t smallfile1 = 32 * 1024;
 726 static u_offset_t smallfile2 = 32 * 1024;
 727 static clock_t smallfile_update = 0; /* lbolt value of when to recompute */
 728 uint_t smallfile1_d = SMALLFILE1_D;
 729 uint_t smallfile2_d = SMALLFILE2_D;
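
/*
 * Editorial sketch (an assumption, not from the original source): per the
 * block comment above, the [smallfile1; smallfile2] range would be
 * refreshed at most once per second roughly as follows, assuming freemem
 * (in pages), ptob() and ncpus_online as used elsewhere in the kernel:
 *
 *        if (ddi_get_lbolt() >= smallfile_update) {
 *                u_offset_t percpu = ptob((uint64_t)freemem) / ncpus_online;
 *                smallfile1 = MAX(percpu / smallfile1_d, smallfile64);
 *                smallfile2 = MAX(percpu / smallfile2_d, smallfile64);
 *                smallfile_update = ddi_get_lbolt() + hz;
 *        }
 *
 * The actual recomputation in the UFS read path may differ in detail.
 */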
 730 
 731 /*
 732  * wrip does the real work of write requests for ufs.
 733  */
 734 int
 735 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
 736 {
 737         rlim64_t limit = uio->uio_llimit;
 738         u_offset_t off;
 739         u_offset_t old_i_size;
 740         struct fs *fs;
 741         struct vnode *vp;
 742         struct ufsvfs *ufsvfsp;
 743         caddr_t base;
 744         long start_resid = uio->uio_resid;   /* save starting resid */
 745         long premove_resid;                     /* resid before uiomove() */
 746         uint_t flags;
 747         int newpage;
 748         int iupdat_flag, directio_status;
 749         int n, on, mapon;
 750         int error, pagecreate;
 751         int do_dqrwlock;                /* drop/reacquire vfs_dqrwlock */
 752         int32_t iblocks;
 753         int     new_iblocks;
 754 
 755         /*
 756          * ip->i_size is incremented before the uiomove
 757          * is done on a write.  If the move fails (bad user
 758          * address) reset ip->i_size.
 759          * The better way would be to increment ip->i_size
 760          * only if the uiomove succeeds.
 761          */
 762         int i_size_changed = 0;
 763         o_mode_t type;
 764         int i_seq_needed = 0;
 765 
 766         vp = ITOV(ip);
 767 
 768         /*
 769          * check for forced unmount - should not happen as
 770          * the request passed the lockfs checks.
 771          */
 772         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
 773                 return (EIO);
 774 
 775         fs = ip->i_fs;
 776 
 777         ASSERT(RW_WRITE_HELD(&ip->i_contents));
 778 
 779         /* check for valid filetype */
 780         type = ip->i_mode & IFMT;
 781         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
 782             (type != IFLNK) && (type != IFSHAD)) {
 783                 return (EIO);
 784         }
 785 
 786         /*
 787          * the actual limit of UFS file size
 788          * is UFS_MAXOFFSET_T
 789          */
 790         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 791                 limit = MAXOFFSET_T;
 792 
 793         if (uio->uio_loffset >= limit) {
 794                 proc_t *p = ttoproc(curthread);
 795 
 796                 mutex_enter(&p->p_lock);
 797                 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
 798                     p, RCA_UNSAFE_SIGINFO);
 799                 mutex_exit(&p->p_lock);
 800                 return (EFBIG);
 801         }
 802 
 803         /*
 804          * if largefiles are disallowed, the limit is
 805          * the pre-largefiles value of 2GB
 806          */
 807         if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
 808                 limit = MIN(UFS_MAXOFFSET_T, limit);
 809         else
 810                 limit = MIN(MAXOFF32_T, limit);
 811 
 812         if (uio->uio_loffset < (offset_t)0) {
 813                 return (EINVAL);
 814         }
 815         if (uio->uio_resid == 0) {
 816                 return (0);
 817         }
 818 
 819         if (uio->uio_loffset >= limit)
 820                 return (EFBIG);
 821 
 822         ip->i_flag |= INOACC;        /* don't update ref time in getpage */
 823 
 824         if (ioflag & (FSYNC|FDSYNC)) {
 825                 ip->i_flag |= ISYNC;
 826                 iupdat_flag = 1;
 827         }
 828         /*
 829          * Try to go direct
 830          */
 831         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
 832                 uio->uio_llimit = limit;
 833                 error = ufs_directio_write(ip, uio, ioflag, 0, cr,
 834                     &directio_status);
 835                 /*
 836                  * If ufs_directio wrote to the file or set the flags,
 837                  * we need to update i_seq, but it may be deferred.
 838                  */
 839                 if (start_resid != uio->uio_resid ||
 840                     (ip->i_flag & (ICHG|IUPD))) {
 841                         i_seq_needed = 1;
 842                         ip->i_flag |= ISEQ;
 843                 }
 844                 if (directio_status == DIRECTIO_SUCCESS)
 845                         goto out;
 846         }
 847 
 848         /*
 849          * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
 850          *
 851          * o shadow inodes: vfs_dqrwlock is not held at all
 852          * o quota updates: vfs_dqrwlock is read or write held
 853          * o other updates: vfs_dqrwlock is read held
 854          *
 855          * The first case is the only one where we do not hold
 856          * vfs_dqrwlock at all while entering wrip().
 857          * We must make sure not to downgrade/drop vfs_dqrwlock if we
 858          * have it as writer, i.e. if we are updating the quota inode.
 859          * There is no potential deadlock scenario in this case as
 860          * ufs_getpage() takes care of this and avoids reacquiring
 861          * vfs_dqrwlock in that case.
 862          *
 863          * This check is done here since the above conditions do not change
 864          * and we possibly loop below, so save a few cycles.
 865          */
 866         if ((type == IFSHAD) ||
 867             (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
 868                 do_dqrwlock = 0;
 869         } else {
 870                 do_dqrwlock = 1;
 871         }
 872 
 873         /*
 874          * Large Files: We cast MAXBMASK to offset_t
 875          * in order to mask out the higher bits. Since offset_t
 876          * is a signed value, the high order bit set in the MAXBMASK
 877          * value makes it do the right thing by having all bits 1
 878          * in the higher word. May be removed for _SOLARIS64_.
 879          */
 880 
 881         fs = ip->i_fs;
 882         do {
 883                 u_offset_t uoff = uio->uio_loffset;
 884                 off = uoff & (offset_t)MAXBMASK;
 885                 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
 886                 on = (int)blkoff(fs, uoff);
 887                 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
 888                 new_iblocks = 1;
 889 
 890                 if (type == IFREG && uoff + n >= limit) {
 891                         if (uoff >= limit) {
 892                                 error = EFBIG;
 893                                 goto out;
 894                         }
 895                         /*
 896                          * since uoff + n >= limit and uoff < limit,
 897                          * limit - uoff <= n, and n is an int,
 898                          * so it is safe to cast limit - uoff to an int
 899                          */
 900                         n = (int)(limit - (rlim64_t)uoff);
 901                 }
 902                 if (uoff + n > ip->i_size) {
 903                         /*
 904                          * We are extending the length of the file.
 905                          * bmap is used so that we are sure that,
 906                          * if we need to allocate new blocks, it
 907                          * is done here before we up the file size.
 908                          */
 909                         error = bmap_write(ip, uoff, (int)(on + n),
 910                             mapon == 0, NULL, cr);
 911                         /*
 912                          * bmap_write never drops i_contents so if
 913                          * the flags are set it changed the file.
 914                          */
 915                         if (ip->i_flag & (ICHG|IUPD)) {
 916                                 i_seq_needed = 1;
 917                                 ip->i_flag |= ISEQ;
 918                         }
 919                         if (error)
 920                                 break;
 921                         /*
 922                          * There is a window of vulnerability here.
 923                          * The sequence of operations: allocate file
 924                          * system blocks, uiomove the data into pages,
 925                          * and then update the size of the file in the
 926                          * inode, must happen atomically.  However, due
 927                          * to current locking constraints, this can not
 928                          * be done.
 929                          */
 930                         ASSERT(ip->i_writer == NULL);
 931                         ip->i_writer = curthread;
 932                         i_size_changed = 1;
 933                         /*
 934                          * If we are writing from the beginning of
 935                          * the mapping, we can just create the
 936                          * pages without having to read them.
 937                          */
 938                         pagecreate = (mapon == 0);
 939                 } else if (n == MAXBSIZE) {
 940                         /*
 941                          * Going to do a whole mapping's worth,
 942                          * so we can just create the pages w/o
 943                          * having to read them in.  But before
 944                          * we do that, we need to make sure any
 945                          * needed blocks are allocated first.
 946                          */
 947                         iblocks = ip->i_blocks;
 948                         error = bmap_write(ip, uoff, (int)(on + n),
 949                             BI_ALLOC_ONLY, NULL, cr);
 950                         /*
 951                          * bmap_write never drops i_contents so if
 952                          * the flags are set it changed the file.
 953                          */
 954                         if (ip->i_flag & (ICHG|IUPD)) {
 955                                 i_seq_needed = 1;
 956                                 ip->i_flag |= ISEQ;
 957                         }
 958                         if (error)
 959                                 break;
 960                         pagecreate = 1;
 961                         /*
 962                          * check whether the newly created page needed
 963                          * new disk blocks to be allocated.
 964                          */
 965                         if (iblocks == ip->i_blocks)
 966                                 new_iblocks = 0; /* no new blocks allocated */
 967                 } else {
 968                         pagecreate = 0;
 969                         /*
 970                          * In sync mode flush the indirect blocks which
 971                          * may have been allocated and not written on
 972                          * disk. In above cases bmap_write will allocate
 973                          * in sync mode.
 974                          */
 975                         if (ioflag & (FSYNC|FDSYNC)) {
 976                                 error = ufs_indirblk_sync(ip, uoff);
 977                                 if (error)
 978                                         break;
 979                         }
 980                 }
 981 
 982                 /*
 983                  * At this point we can enter ufs_getpage() in one
 984                  * of two ways:
 985                  * 1) segmap_getmapflt() calls ufs_getpage() when the
 986                  *    forcefault parameter is true (pagecreate == 0)
 987                  * 2) uiomove() causes a page fault.
 988                  *
 989                  * We have to drop the contents lock to prevent the VM
 990                  * system from trying to reacquire it in ufs_getpage()
 991                  * should the uiomove cause a pagefault.
 992                  *
 993                  * We have to drop the reader vfs_dqrwlock here as well.
 994                  */
 995                 rw_exit(&ip->i_contents);
 996                 if (do_dqrwlock) {
 997                         ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
 998                         ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
 999                         rw_exit(&ufsvfsp->vfs_dqrwlock);
1000                 }
1001 
1002                 newpage = 0;
1003                 premove_resid = uio->uio_resid;
1004 
1005                 /*
1006                  * Touch the page and fault it in if it is not in core
1007                  * before segmap_getmapflt or vpm_data_copy can lock it.
1008                  * This avoids a deadlock when the user buffer is mmapped
1009                  * from the same file that we want to write to.
1010                  */
1011                 uio_prefaultpages((long)n, uio);
1012 
1013                 if (vpm_enable) {
1014                         /*
1015                          * Copy data. If new pages are created, part of
1016                          * the page that is not written will be initialized
1017                          * with zeros.
1018                          */
1019                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1020                             uio, !pagecreate, &newpage, 0, S_WRITE);
1021                 } else {
1022 
1023                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1024                             (uint_t)n, !pagecreate, S_WRITE);
1025 
1026                         /*
1027                          * segmap_pagecreate() returns 1 if it calls
1028                          * page_create_va() to allocate any pages.
1029                          */
1030 
1031                         if (pagecreate)
1032                                 newpage = segmap_pagecreate(segkmap, base,
1033                                     (size_t)n, 0);
1034 
1035                         error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
1036                 }
1037 
1038                 /*
1039                  * If "newpage" is set, then a new page was created and it
1040                  * does not contain valid data, so it needs to be initialized
1041                  * at this point.
1042                  * Otherwise the page contains old data, which was overwritten
1043                  * partially or as a whole in uiomove.
1044                  * If there is only one iovec structure within uio, then
1045                  * on error uiomove will not be able to update uio->uio_loffset
1046                  * and we would zero the whole page here!
1047                  *
1048                  * If uiomove fails because of an error, the old valid data
1049                  * is kept instead of filling the rest of the page with zero's.
1050                  */
1051                 if (!vpm_enable && newpage &&
1052                     uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
1053                         /*
1054                          * We created pages w/o initializing them completely,
1055                          * thus we need to zero the part that wasn't set up.
1056                          * This happens on most EOF write cases and if
1057                          * we had some sort of error during the uiomove.
1058                          */
1059                         int nzero, nmoved;
1060 
1061                         nmoved = (int)(uio->uio_loffset - (off + mapon));
1062                         ASSERT(nmoved >= 0 && nmoved <= n);
1063                         nzero = roundup(on + n, PAGESIZE) - nmoved;
1064                         ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
1065                         (void) kzero(base + mapon + nmoved, (uint_t)nzero);
1066                 }
1067 
1068                 /*
1069                  * Unlock the pages allocated by page_create_va()
1070                  * in segmap_pagecreate()
1071                  */
1072                 if (!vpm_enable && newpage)
1073                         segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
1074 
1075                 /*
1076                  * If the size of the file changed, then update the
1077                  * size field in the inode now.  This can't be done
1078                  * before the call to segmap_pageunlock or there is
1079                  * a potential deadlock with callers to ufs_putpage().
1080                  * They will be holding i_contents and trying to lock
1081                  * a page, while this thread is holding a page locked
1082                  * and trying to acquire i_contents.
1083                  */
1084                 if (i_size_changed) {
1085                         rw_enter(&ip->i_contents, RW_WRITER);
1086                         old_i_size = ip->i_size;
1087                         UFS_SET_ISIZE(uoff + n, ip);
1088                         TRANS_INODE(ufsvfsp, ip);
1089                         /*
1090                          * file has grown larger than 2GB. Set flag
1091                          * in superblock to indicate this, if it
1092                          * is not already set.
1093                          */
1094                         if ((ip->i_size > MAXOFF32_T) &&
1095                             !(fs->fs_flags & FSLARGEFILES)) {
1096                                 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1097                                 mutex_enter(&ufsvfsp->vfs_lock);
1098                                 fs->fs_flags |= FSLARGEFILES;
1099                                 ufs_sbwrite(ufsvfsp);
1100                                 mutex_exit(&ufsvfsp->vfs_lock);
1101                         }
1102                         mutex_enter(&ip->i_tlock);
1103                         ip->i_writer = NULL;
1104                         cv_broadcast(&ip->i_wrcv);
1105                         mutex_exit(&ip->i_tlock);
1106                         rw_exit(&ip->i_contents);
1107                 }
1108 
1109                 if (error) {
1110                         /*
1111                          * If we failed on a write, we may have already
1112                          * allocated file blocks as well as pages.  It's
1113                          * hard to undo the block allocation, but we must
1114                          * be sure to invalidate any pages that may have
1115                          * been allocated.
1116                          *
1117                          * If the page was created without initialization
1118                          * then we must check whether it is possible
1119                          * to destroy the new page and to keep the old data
1120                          * on the disk.
1121                          *
1122                          * It is possible to destroy the page without
1123                          * having to write back its contents only when
1124                          * - the size of the file remained unchanged
1125                          * - bmap_write() did not allocate new disk blocks
1126                          *   it is possible to create big files using "seek" and
1127                          *   write to the end of the file. A "write" to a
1128                          *   position before the end of the file would not
1129                          *   change the size of the file but it would allocate
1130                          *   new disk blocks.
1131                          * - uiomove intended to overwrite the whole page.
1132                          * - a new page was created (newpage == 1).
1133                          */
1134 
1135                         if (i_size_changed == 0 && new_iblocks == 0 &&
1136                             newpage) {
1137 
1138                                 /* unwind whatever uiomove ended up doing */
1139                                 uio->uio_resid = premove_resid;
1140 
1141                                 /*
1142                                  * destroy the page, do not write ambiguous
1143                                  * data to the disk.
1144                                  */
1145                                 flags = SM_DESTROY;
1146                         } else {
1147                                 /*
1148                                  * write the page back to the disk, if dirty,
1149                                  * and remove the page from the cache.
1150                                  */
1151                                 flags = SM_INVAL;
1152                         }
1153 
1154                         if (vpm_enable) {
1155                                 /*
1156                                  *  Flush pages.
1157                                  */
1158                                 (void) vpm_sync_pages(vp, off, n, flags);
1159                         } else {
1160                                 (void) segmap_release(segkmap, base, flags);
1161                         }
1162                 } else {
1163                         flags = 0;
1164                         /*
1165                          * Force write back for synchronous write cases.
1166                          */
1167                         if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
1168                                 /*
1169                                  * If the sticky bit is set but the
1170                                  * execute bit is not set, we do a
1171                                  * synchronous write back and free
1172                                  * the page when done.  We set up swap
1173                                  * files to be handled this way to
1174                                  * prevent servers from keeping around
1175                                  * the client's swap pages too long.
1176                                  * XXX - there ought to be a better way.
1177                                  */
1178                                 if (IS_SWAPVP(vp)) {
1179                                         flags = SM_WRITE | SM_FREE |
1180                                             SM_DONTNEED;
1181                                         iupdat_flag = 0;
1182                                 } else {
1183                                         flags = SM_WRITE;
1184                                 }
1185                         } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
1186                                 /*
1187                                  * Have written a whole block.
1188                                  * Start an asynchronous write and
1189                                  * mark the buffer to indicate that
1190                                  * it won't be needed again soon.
1191                                  */
1192                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
1193                         }
1194                         if (vpm_enable) {
1195                                 /*
1196                                  * Flush pages.
1197                                  */
1198                                 error = vpm_sync_pages(vp, off, n, flags);
1199                         } else {
1200                                 error = segmap_release(segkmap, base, flags);
1201                         }
1202                         /*
1203                          * If the operation failed and is synchronous,
1204                          * then we need to unwind what uiomove() last
1205                          * did so we can potentially return an error to
1206                          * the caller.  If this write operation was
1207                          * done in two pieces and the first succeeded,
1208                          * then we won't return an error for the second
1209                          * piece that failed.  However, we only want to
1210                          * return a resid value that reflects what was
1211                          * really done.
1212                          *
1213                          * Failures for non-synchronous operations can
1214                          * be ignored since the page subsystem will
1215                          * retry the operation until it succeeds or the
1216                          * file system is unmounted.
1217                          */
1218                         if (error) {
1219                                 if ((ioflag & (FSYNC | FDSYNC)) ||
1220                                     type == IFDIR) {
1221                                         uio->uio_resid = premove_resid;
1222                                 } else {
1223                                         error = 0;
1224                                 }
1225                         }
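                        /*
                         * Illustrative example with hypothetical sizes:
                         * suppose a 16K write with FSYNC set is handled by
                         * this loop as two 8K pieces (fs_bsize of 8K).  If
                         * the first piece's synchronous push succeeds and
                         * the second one's fails, uio_resid is restored to
                         * premove_resid (8K), so the caller sees that only
                         * the first piece was written; the code at "out:"
                         * below then suppresses the error (unless it is
                         * ENOSPC) because a partial write completed.  For
                         * a non-synchronous write to a regular file the
                         * same failure is simply ignored here and retried
                         * later by the page subsystem.
                         */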
1226                 }
1227 
1228                 /*
1229                  * Re-acquire contents lock.
1230                  * If it was dropped, reacquire reader vfs_dqrwlock as well.
1231                  */
1232                 if (do_dqrwlock)
1233                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1234                 rw_enter(&ip->i_contents, RW_WRITER);
1235 
1236                 /*
1237                  * If the uiomove() failed or if a synchronous
1238                  * page push failed, fix up i_size.
1239                  */
1240                 if (error) {
1241                         if (i_size_changed) {
1242                                 /*
1243                                  * The uiomove failed, and we
1244                                  * allocated blocks, so get rid
1245                                  * of them.
1246                                  */
1247                                 (void) ufs_itrunc(ip, old_i_size, 0, cr);
1248                         }
1249                 } else {
1250                         /*
1251                          * XXX - Can this be out of the loop?
1252                          */
1253                         ip->i_flag |= IUPD | ICHG;
1254                         /*
1255                          * Only do one increase of i_seq for multiple
1256                          * pieces.  Because we drop locks, record
1257                          * the fact that we changed the timestamp and
1258                          * are deferring the increase in case another thread
1259                          * pushes our timestamp update.
1260                          */
1261                         i_seq_needed = 1;
1262                         ip->i_flag |= ISEQ;
1263                         if (i_size_changed)
1264                                 ip->i_flag |= IATTCHG;
1265                         if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
1266                             (IEXEC >> 6))) != 0 &&
1267                             (ip->i_mode & (ISUID | ISGID)) != 0 &&
1268                             secpolicy_vnode_setid_retain(cr,
1269                             (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
1270                                 /*
1271                                  * Clear Set-UID & Set-GID bits on
1272                                  * successful write if not privileged
1273                                  * and at least one of the execute bits
1274                                  * is set.  If we always clear Set-GID,
1275                                  * mandatory file and record locking is
1276                                  * unusable.
1277                                  */
1278                                 ip->i_mode &= ~(ISUID | ISGID);
1279                         }
1280                 }
1281                 /*
1282                  * If the FDSYNC flag is set and this is a
1283                  * "rewrite", we won't log a delta.
1284                  * The FSYNC flag overrides all cases.
1285                  */
1286                 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
1287                         TRANS_INODE(ufsvfsp, ip);
1288                 }
1289         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1290 
1291 out:
1292         /*
1293          * Make sure i_seq is increased at least once per write
1294          */
1295         if (i_seq_needed) {
1296                 ip->i_seq++;
1297                 ip->i_flag &= ~ISEQ;     /* no longer deferred */
1298         }
1299 
1300         /*
1301          * Inode is updated according to this table -
1302          *
1303          *   FSYNC        FDSYNC(posix.4)
1304          *   --------------------------
1305          *   always@      IATTCHG|IBDWRITE
1306          *
1307          * @ -  If we are doing a synchronous write, the only time we should
1308          *      not be sync'ing the ip here is if we have the stickyhack
1309          *      activated, the file is marked with the sticky bit and
1310          *      no exec bit, the file length has not been changed and
1311          *      no new blocks have been allocated during this write.
1312          */
1313 
1314         if ((ip->i_flag & ISYNC) != 0) {
1315                 /*
1316                  * we have eliminated nosync
1317                  */
1318                 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
1319                     ((ioflag & FSYNC) && iupdat_flag)) {
1320                         ufs_iupdat(ip, 1);
1321                 }
1322         }
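        /*
         * A concrete reading of the table above (illustrative only): a
         * synchronous write that grew the file set IATTCHG via
         * i_size_changed, so ufs_iupdat() is called here; a synchronous
         * rewrite of existing blocks in a swap-like (sticky, no-exec)
         * file cleared iupdat_flag earlier and left IATTCHG/IBDWRITE
         * clear, so the inode push is skipped (the "@" footnote case).
         */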
1323 
1324         /*
1325          * If we've already done a partial-write, terminate
1326          * the write but return no error unless the error is ENOSPC
1327          * because the caller can detect this and free resources and
1328          * try again.
1329          */
1330         if ((start_resid != uio->uio_resid) && (error != ENOSPC))
1331                 error = 0;
1332 
1333         ip->i_flag &= ~(INOACC | ISYNC);
1334         ITIMES_NOLOCK(ip);
1335         return (error);
1336 }
1337 
1338 /*
1339  * rdip does the real work of read requests for ufs.
1340  */
1341 int
1342 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
1343 {
1344         u_offset_t off;
1345         caddr_t base;
1346         struct fs *fs;
1347         struct ufsvfs *ufsvfsp;
1348         struct vnode *vp;
1349         long oresid = uio->uio_resid;
1350         u_offset_t n, on, mapon;
1351         int error = 0;
1352         int doupdate = 1;
1353         uint_t flags;
1354         int dofree, directio_status;
1355         krw_t rwtype;
1356         o_mode_t type;
1357         clock_t now;
1358 
1359         vp = ITOV(ip);
1360 
1361         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1362 
1363         ufsvfsp = ip->i_ufsvfs;
1364 
1365         if (ufsvfsp == NULL)
1366                 return (EIO);
1367 
1368         fs = ufsvfsp->vfs_fs;
1369 
1370         /* check for valid filetype */
1371         type = ip->i_mode & IFMT;
1372         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
1373             (type != IFLNK) && (type != IFSHAD)) {
1374                 return (EIO);
1375         }
1376 
1377         if (uio->uio_loffset > UFS_MAXOFFSET_T) {
1378                 error = 0;
1379                 goto out;
1380         }
1381         if (uio->uio_loffset < (offset_t)0) {
1382                 return (EINVAL);
1383         }
1384         if (uio->uio_resid == 0) {
1385                 return (0);
1386         }
1387 
1388         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
1389             (!ufsvfsp->vfs_noatime)) {
1390                 mutex_enter(&ip->i_tlock);
1391                 ip->i_flag |= IACC;
1392                 mutex_exit(&ip->i_tlock);
1393         }
1394         /*
1395          * Try to go direct
1396          */
1397         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
1398                 error = ufs_directio_read(ip, uio, cr, &directio_status);
1399                 if (directio_status == DIRECTIO_SUCCESS)
1400                         goto out;
1401         }
1402 
1403         rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);
1404 
1405         do {
1406                 offset_t diff;
1407                 u_offset_t uoff = uio->uio_loffset;
1408                 off = uoff & (offset_t)MAXBMASK;
1409                 mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET);
1410                 on = (u_offset_t)blkoff(fs, uoff);
1411                 n = MIN((u_offset_t)fs->fs_bsize - on,
1412                     (u_offset_t)uio->uio_resid);
1413 
1414                 diff = ip->i_size - uoff;
1415 
1416                 if (diff <= (offset_t)0) {
1417                         error = 0;
1418                         goto out;
1419                 }
1420                 if (diff < (offset_t)n)
1421                         n = (int)diff;
1422 
1423                 /*
1424                  * We update smallfile2 and smallfile1 at most every second.
1425                  */
1426                 now = ddi_get_lbolt();
1427                 if (now >= smallfile_update) {
1428                         uint64_t percpufreeb;
1429                         if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
1430                         if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
1431                         percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
1432                         smallfile1 = percpufreeb / smallfile1_d;
1433                         smallfile2 = percpufreeb / smallfile2_d;
1434                         smallfile1 = MAX(smallfile1, smallfile);
1435                         smallfile1 = MAX(smallfile1, smallfile64);
1436                         smallfile2 = MAX(smallfile1, smallfile2);
1437                         smallfile_update = now + hz;
1438                 }
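                /*
                 * Worked example with hypothetical numbers: with 8GB of
                 * free memory and 4 CPUs online, percpufreeb is 2GB.  If
                 * the divisors were, say, smallfile1_d = 1000 and
                 * smallfile2_d = 10 (the real defaults come from
                 * SMALLFILE1_D/SMALLFILE2_D), smallfile1 would be about
                 * 2MB and smallfile2 about 200MB, subject to the floors
                 * applied by the MAX() calls above.  Free-behind below
                 * then applies only to sequential reads past ~2MB, and
                 * SM_DONTNEED is added (when cache_read_ahead is 0) only
                 * past ~200MB.
                 */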
1439 
1440                 dofree = freebehind &&
1441                     ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
1442 
1443                 /*
1444                  * At this point we can enter ufs_getpage() in one of two
1445                  * ways:
1446                  * 1) segmap_getmapflt() calls ufs_getpage() when the
1447                  *    forcefault parameter is true (value of 1 is passed)
1448                  * 2) uiomove() causes a page fault.
1449                  *
1450                  * We cannot hold onto an i_contents reader lock without
1451                  * risking deadlock in ufs_getpage() so drop a reader lock.
1452                  * The ufs_getpage() dolock logic already allows for a
1453                  * thread holding i_contents as writer to work properly
1454                  * so we keep a writer lock.
1455                  */
1456                 if (rwtype == RW_READER)
1457                         rw_exit(&ip->i_contents);
1458 
1459                 if (vpm_enable) {
1460                         /*
1461                          * Copy data.
1462                          */
1463                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1464                             uio, 1, NULL, 0, S_READ);
1465                 } else {
1466                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1467                             (uint_t)n, 1, S_READ);
1468                         error = uiomove(base + mapon, (long)n, UIO_READ, uio);
1469                 }
1470 
1471                 flags = 0;
1472                 if (!error) {
1473                         /*
1474                          * If reading sequentially we won't need this
1475                          * buffer again soon.  For offsets in the range
1476                          * [smallfile1, smallfile2] release the pages at
1477                          * the tail of the cache list; larger offsets
1478                          * are released at the head.
1479                          */
1480                         if (dofree) {
1481                                 flags = SM_FREE | SM_ASYNC;
1482                                 if ((cache_read_ahead == 0) &&
1483                                     (off > smallfile2))
1484                                         flags |=  SM_DONTNEED;
1485                         }
1486                         /*
1487                          * In POSIX SYNC (FSYNC and FDSYNC) read mode,
1488                          * we want to make sure that the page which has
1489                          * been read is written to disk if it is dirty,
1490                          * and that the corresponding indirect blocks
1491                          * are also flushed out.
1492                          */
1493                         if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
1494                                 flags &= ~SM_ASYNC;
1495                                 flags |= SM_WRITE;
1496                         }
1497                         if (vpm_enable) {
1498                                 error = vpm_sync_pages(vp, off, n, flags);
1499                         } else {
1500                                 error = segmap_release(segkmap, base, flags);
1501                         }
1502                 } else {
1503                         if (vpm_enable) {
1504                                 (void) vpm_sync_pages(vp, off, n, flags);
1505                         } else {
1506                                 (void) segmap_release(segkmap, base, flags);
1507                         }
1508                 }
1509 
1510                 if (rwtype == RW_READER)
1511                         rw_enter(&ip->i_contents, rwtype);
1512         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1513 out:
1514         /*
1515          * Inode is updated according to this table if FRSYNC is set.
1516          *
1517          *   FSYNC        FDSYNC(posix.4)
1518          *   --------------------------
1519          *   always       IATTCHG|IBDWRITE
1520          */
1521         /*
1522          * The inode is not updated if we're logging and the inode is a
1523          * directory with FRSYNC, FSYNC and FDSYNC flags set.
1524          */
1525         if (ioflag & FRSYNC) {
1526                 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
1527                         doupdate = 0;
1528                 }
1529                 if (doupdate) {
1530                         if ((ioflag & FSYNC) ||
1531                             ((ioflag & FDSYNC) &&
1532                             (ip->i_flag & (IATTCHG|IBDWRITE)))) {
1533                                 ufs_iupdat(ip, 1);
1534                         }
1535                 }
1536         }
1537         /*
1538          * If we've already done a partial read, terminate
1539          * the read but return no error.
1540          */
1541         if (oresid != uio->uio_resid)
1542                 error = 0;
1543         ITIMES(ip);
1544 
1545         return (error);
1546 }
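/*
 * Calling-convention sketch (illustrative; ufs_readlink() below is one
 * in-tree caller): rdip() expects the caller to hold i_contents, e.g.
 *
 *      rw_enter(&ip->i_contents, RW_READER);
 *      error = rdip(ip, uiop, ioflag, cr);
 *      rw_exit(&ip->i_contents);
 *
 * Because rdip() may temporarily drop and re-acquire a reader hold
 * around the segmap/vpm copy (see the rwtype handling above), only
 * RW_LOCK_HELD() is asserted on entry rather than a specific lock mode.
 */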
1547 
1548 /* ARGSUSED */
1549 static int
1550 ufs_ioctl(
1551         struct vnode    *vp,
1552         int             cmd,
1553         intptr_t        arg,
1554         int             flag,
1555         struct cred     *cr,
1556         int             *rvalp,
1557         caller_context_t *ct)
1558 {
1559         struct lockfs   lockfs, lockfs_out;
1560         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
1561         char            *comment, *original_comment;
1562         struct fs       *fs;
1563         struct ulockfs  *ulp;
1564         offset_t        off;
1565         extern int      maxphys;
1566         int             error;
1567         int             issync;
1568         int             trans_size;
1569 
1570 
1571         /*
1572          * forcibly unmounted
1573          */
1574         if (ufsvfsp == NULL || vp->v_vfsp == NULL ||
1575             vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
1576                 return (EIO);
1577         fs = ufsvfsp->vfs_fs;
1578 
1579         if (cmd == Q_QUOTACTL) {
1580                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
1581                 if (error)
1582                         return (error);
1583 
1584                 if (ulp) {
1585                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
1586                             TOP_SETQUOTA_SIZE(fs));
1587                 }
1588 
1589                 error = quotactl(vp, arg, flag, cr);
1590 
1591                 if (ulp) {
1592                         TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
1593                             TOP_SETQUOTA_SIZE(fs));
1594                         ufs_lockfs_end(ulp);
1595                 }
1596                 return (error);
1597         }
1598 
1599         switch (cmd) {
1600                 case _FIOLFS:
1601                         /*
1602                          * file system locking
1603                          */
1604                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1605                                 return (EPERM);
1606 
1607                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1608                                 if (copyin((caddr_t)arg, &lockfs,
1609                                     sizeof (struct lockfs)))
1610                                         return (EFAULT);
1611                         }
1612 #ifdef _SYSCALL32_IMPL
1613                         else {
1614                                 struct lockfs32 lockfs32;
1615                                 /* Translate ILP32 lockfs to LP64 lockfs */
1616                                 if (copyin((caddr_t)arg, &lockfs32,
1617                                     sizeof (struct lockfs32)))
1618                                         return (EFAULT);
1619                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1620                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1621                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1622                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1623                                 lockfs.lf_comment =
1624                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1625                         }
1626 #endif /* _SYSCALL32_IMPL */
1627 
1628                         if (lockfs.lf_comlen) {
1629                                 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
1630                                         return (ENAMETOOLONG);
1631                                 comment =
1632                                     kmem_alloc(lockfs.lf_comlen, KM_SLEEP);
1633                                 if (copyin(lockfs.lf_comment, comment,
1634                                     lockfs.lf_comlen)) {
1635                                         kmem_free(comment, lockfs.lf_comlen);
1636                                         return (EFAULT);
1637                                 }
1638                                 original_comment = lockfs.lf_comment;
1639                                 lockfs.lf_comment = comment;
1640                         }
1641                         if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
1642                                 lockfs.lf_comment = original_comment;
1643 
1644                                 if ((flag & DATAMODEL_MASK) ==
1645                                     DATAMODEL_NATIVE) {
1646                                         (void) copyout(&lockfs, (caddr_t)arg,
1647                                             sizeof (struct lockfs));
1648                                 }
1649 #ifdef _SYSCALL32_IMPL
1650                                 else {
1651                                         struct lockfs32 lockfs32;
1652                                         /* Translate LP64 to ILP32 lockfs */
1653                                         lockfs32.lf_lock =
1654                                             (uint32_t)lockfs.lf_lock;
1655                                         lockfs32.lf_flags =
1656                                             (uint32_t)lockfs.lf_flags;
1657                                         lockfs32.lf_key =
1658                                             (uint32_t)lockfs.lf_key;
1659                                         lockfs32.lf_comlen =
1660                                             (uint32_t)lockfs.lf_comlen;
1661                                         lockfs32.lf_comment =
1662                                             (uint32_t)(uintptr_t)
1663                                             lockfs.lf_comment;
1664                                         (void) copyout(&lockfs32, (caddr_t)arg,
1665                                             sizeof (struct lockfs32));
1666                                 }
1667 #endif /* _SYSCALL32_IMPL */
1668 
1669                         } else {
1670                                 if (lockfs.lf_comlen)
1671                                         kmem_free(comment, lockfs.lf_comlen);
1672                         }
1673                         return (error);
1674 
1675                 case _FIOLFSS:
1676                         /*
1677                          * get file system locking status
1678                          */
1679 
1680                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1681                                 if (copyin((caddr_t)arg, &lockfs,
1682                                     sizeof (struct lockfs)))
1683                                         return (EFAULT);
1684                         }
1685 #ifdef _SYSCALL32_IMPL
1686                         else {
1687                                 struct lockfs32 lockfs32;
1688                                 /* Translate ILP32 lockfs to LP64 lockfs */
1689                                 if (copyin((caddr_t)arg, &lockfs32,
1690                                     sizeof (struct lockfs32)))
1691                                         return (EFAULT);
1692                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1693                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1694                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1695                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1696                                 lockfs.lf_comment =
1697                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1698                         }
1699 #endif /* _SYSCALL32_IMPL */
1700 
1701                         if (error = ufs_fiolfss(vp, &lockfs_out))
1702                                 return (error);
1703                         lockfs.lf_lock = lockfs_out.lf_lock;
1704                         lockfs.lf_key = lockfs_out.lf_key;
1705                         lockfs.lf_flags = lockfs_out.lf_flags;
1706                         lockfs.lf_comlen = MIN(lockfs.lf_comlen,
1707                             lockfs_out.lf_comlen);
1708 
1709                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1710                                 if (copyout(&lockfs, (caddr_t)arg,
1711                                     sizeof (struct lockfs)))
1712                                         return (EFAULT);
1713                         }
1714 #ifdef _SYSCALL32_IMPL
1715                         else {
1716                                 /* Translate LP64 to ILP32 lockfs */
1717                                 struct lockfs32 lockfs32;
1718                                 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
1719                                 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
1720                                 lockfs32.lf_key = (uint32_t)lockfs.lf_key;
1721                                 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
1722                                 lockfs32.lf_comment =
1723                                     (uint32_t)(uintptr_t)lockfs.lf_comment;
1724                                 if (copyout(&lockfs32, (caddr_t)arg,
1725                                     sizeof (struct lockfs32)))
1726                                         return (EFAULT);
1727                         }
1728 #endif /* _SYSCALL32_IMPL */
1729 
1730                         if (lockfs.lf_comlen &&
1731                             lockfs.lf_comment && lockfs_out.lf_comment)
1732                                 if (copyout(lockfs_out.lf_comment,
1733                                     lockfs.lf_comment, lockfs.lf_comlen))
1734                                         return (EFAULT);
1735                         return (0);
1736 
1737                 case _FIOSATIME:
1738                         /*
1739                          * set access time
1740                          */
1741 
1742                         /*
1743                          * if mounted w/o atime, return quietly.
1744                          * I briefly thought about returning ENOSYS, but
1745                          * figured that most apps would consider this fatal;
1746                          * the idea is to make this as seamless as possible.
1747                          */
1748                         if (ufsvfsp->vfs_noatime)
1749                                 return (0);
1750 
1751                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1752                             ULOCKFS_SETATTR_MASK);
1753                         if (error)
1754                                 return (error);
1755 
1756                         if (ulp) {
1757                                 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
1758                                 TRANS_BEGIN_CSYNC(ufsvfsp, issync,
1759                                     TOP_SETATTR, trans_size);
1760                         }
1761 
1762                         error = ufs_fiosatime(vp, (struct timeval *)arg,
1763                             flag, cr);
1764 
1765                         if (ulp) {
1766                                 TRANS_END_CSYNC(ufsvfsp, error, issync,
1767                                     TOP_SETATTR, trans_size);
1768                                 ufs_lockfs_end(ulp);
1769                         }
1770                         return (error);
1771 
1772                 case _FIOSDIO:
1773                         /*
1774                          * set delayed-io
1775                          */
1776                         return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
1777 
1778                 case _FIOGDIO:
1779                         /*
1780                          * get delayed-io
1781                          */
1782                         return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
1783 
1784                 case _FIOIO:
1785                         /*
1786                          * inode open
1787                          */
1788                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1789                             ULOCKFS_VGET_MASK);
1790                         if (error)
1791                                 return (error);
1792 
1793                         error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
1794 
1795                         if (ulp) {
1796                                 ufs_lockfs_end(ulp);
1797                         }
1798                         return (error);
1799 
1800                 case _FIOFFS:
1801                         /*
1802                          * file system flush (push w/invalidate)
1803                          */
1804                         if ((caddr_t)arg != NULL)
1805                                 return (EINVAL);
1806                         return (ufs_fioffs(vp, NULL, cr));
1807 
1808                 case _FIOISBUSY:
1809                         /*
1810                          * Contract-private interface for Legato.
1811                          * Purge this vnode from the DNLC and decide
1812                          * whether this vnode is busy (*arg == 1) or not
1813                          * (*arg == 0).
1814                          */
1815                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1816                                 return (EPERM);
1817                         error = ufs_fioisbusy(vp, (int *)arg, cr);
1818                         return (error);
1819 
1820                 case _FIODIRECTIO:
1821                         return (ufs_fiodirectio(vp, (int)arg, cr));
1822 
1823                 case _FIOTUNE:
1824                         /*
1825                          * Tune the file system (aka setting fs attributes)
1826                          */
1827                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1828                             ULOCKFS_SETATTR_MASK);
1829                         if (error)
1830                                 return (error);
1831 
1832                         error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
1833 
1834                         if (ulp)
1835                                 ufs_lockfs_end(ulp);
1836                         return (error);
1837 
1838                 case _FIOLOGENABLE:
1839                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1840                                 return (EPERM);
1841                         return (ufs_fiologenable(vp, (void *)arg, cr, flag));
1842 
1843                 case _FIOLOGDISABLE:
1844                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1845                                 return (EPERM);
1846                         return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
1847 
1848                 case _FIOISLOG:
1849                         return (ufs_fioislog(vp, (void *)arg, cr, flag));
1850 
1851                 case _FIOSNAPSHOTCREATE_MULTI:
1852                 {
1853                         struct fiosnapcreate_multi      fc, *fcp;
1854                         size_t  fcm_size;
1855 
1856                         if (copyin((void *)arg, &fc, sizeof (fc)))
1857                                 return (EFAULT);
1858                         if (fc.backfilecount > MAX_BACKFILE_COUNT)
1859                                 return (EINVAL);
1860                         fcm_size = sizeof (struct fiosnapcreate_multi) +
1861                             (fc.backfilecount - 1) * sizeof (int);
1862                         fcp = (struct fiosnapcreate_multi *)
1863                             kmem_alloc(fcm_size, KM_SLEEP);
1864                         if (copyin((void *)arg, fcp, fcm_size)) {
1865                                 kmem_free(fcp, fcm_size);
1866                                 return (EFAULT);
1867                         }
1868                         error = ufs_snap_create(vp, fcp, cr);
1869                         /*
1870                          * Do the copyout even if there is an error because
1871                          * the details of the error are stored in fcp.
1872                          */
1873                         if (copyout(fcp, (void *)arg, fcm_size))
1874                                 error = EFAULT;
1875                         kmem_free(fcp, fcm_size);
1876                         return (error);
1877                 }
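                /*
                 * Sizing note (illustrative): the copyin length computed
                 * above follows the usual one-element trailing array
                 * idiom, so for a request with backfilecount == 3 it is
                 *
                 *      sizeof (struct fiosnapcreate_multi) + 2 * sizeof (int)
                 *
                 * i.e. the base structure already accounts for the first
                 * backing-file slot and only the two extra ints are added.
                 */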
1878 
1879                 case _FIOSNAPSHOTDELETE:
1880                 {
1881                         struct fiosnapdelete    fc;
1882 
1883                         if (copyin((void *)arg, &fc, sizeof (fc)))
1884                                 return (EFAULT);
1885                         error = ufs_snap_delete(vp, &fc, cr);
1886                         if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
1887                                 error = EFAULT;
1888                         return (error);
1889                 }
1890 
1891                 case _FIOGETSUPERBLOCK:
1892                         if (copyout(fs, (void *)arg, SBSIZE))
1893                                 return (EFAULT);
1894                         return (0);
1895 
1896                 case _FIOGETMAXPHYS:
1897                         if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
1898                                 return (EFAULT);
1899                         return (0);
1900 
1901                 /*
1902                  * The following 3 ioctls are for TSufs support,
1903                  * although they could potentially be used elsewhere.
1904                  */
1905                 case _FIO_SET_LUFS_DEBUG:
1906                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1907                                 return (EPERM);
1908                         lufs_debug = (uint32_t)arg;
1909                         return (0);
1910 
1911                 case _FIO_SET_LUFS_ERROR:
1912                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1913                                 return (EPERM);
1914                         TRANS_SETERROR(ufsvfsp);
1915                         return (0);
1916 
1917                 case _FIO_GET_TOP_STATS:
1918                 {
1919                         fio_lufs_stats_t *ls;
1920                         ml_unit_t *ul = ufsvfsp->vfs_log;
1921 
1922                         ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1923                         ls->ls_debug = ul->un_debug; /* return debug value */
1924                         /* Copy structure if statistics are being kept */
1925                         if (ul->un_logmap->mtm_tops) {
1926                                 ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1927                         }
1928                         error = 0;
1929                         if (copyout(ls, (void *)arg, sizeof (*ls)))
1930                                 error = EFAULT;
1931                         kmem_free(ls, sizeof (*ls));
1932                         return (error);
1933                 }
1934 
1935                 case _FIO_SEEK_DATA:
1936                 case _FIO_SEEK_HOLE:
1937                         if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1938                                 return (EFAULT);
1939                         /* offset parameter is in/out */
1940                         error = ufs_fio_holey(vp, cmd, &off);
1941                         if (error)
1942                                 return (error);
1943                         if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1944                                 return (EFAULT);
1945                         return (0);
1946 
1947                 case _FIO_COMPRESSED:
1948                 {
1949                         /*
1950                          * This is a project-private ufs ioctl() to mark
1951                          * the inode as belonging to a compressed
1952                          * file. This is used to mark individual
1953                          * compressed files in a miniroot archive.
1954                          * The files compressed in this manner are
1955                          * automatically decompressed by the dcfs filesystem
1956                          * (via an interception in ufs_lookup - see decompvp())
1957                          * which is layered on top of ufs on a system running
1958                          * from the archive. See uts/common/fs/dcfs for details.
1959                          * This ioctl only marks the file as compressed - the
1960                          * actual compression is done by fiocompress (a
1961                          * userland utility) which invokes this ioctl().
1962                          */
1963                         struct inode *ip = VTOI(vp);
1964 
1965                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1966                             ULOCKFS_SETATTR_MASK);
1967                         if (error)
1968                                 return (error);
1969 
1970                         if (ulp) {
1971                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT,
1972                                     TOP_IUPDAT_SIZE(ip));
1973                         }
1974 
1975                         error = ufs_mark_compressed(vp);
1976 
1977                         if (ulp) {
1978                                 TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT,
1979                                     TOP_IUPDAT_SIZE(ip));
1980                                 ufs_lockfs_end(ulp);
1981                         }
1982 
1983                         return (error);
1984 
1985                 }
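                /*
                 * Illustrative userland usage (hypothetical sketch; the
                 * real client is the fiocompress utility, whose exact
                 * invocation is not shown here).  The handler above does
                 * not use the ioctl argument, so zero can be passed:
                 *
                 *      int fd = open(path, O_RDWR);
                 *      if (fd >= 0 && ioctl(fd, _FIO_COMPRESSED, 0) != 0)
                 *              perror("_FIO_COMPRESSED");
                 */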
1986 
1987                 default:
1988                         return (ENOTTY);
1989         }
1990 }
1991 
1992 
1993 /* ARGSUSED */
1994 static int
1995 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1996         struct cred *cr, caller_context_t *ct)
1997 {
1998         struct inode *ip = VTOI(vp);
1999         struct ufsvfs *ufsvfsp;
2000         int err;
2001 
2002         if (vap->va_mask == AT_SIZE) {
2003                 /*
2004                  * For performance, if only the size is requested, don't bother
2005                  * with anything else.
2006                  */
2007                 UFS_GET_ISIZE(&vap->va_size, ip);
2008                 return (0);
2009         }
2010 
2011         /*
2012          * inlined lockfs checks
2013          */
2014         ufsvfsp = ip->i_ufsvfs;
2015         if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
2016                 err = EIO;
2017                 goto out;
2018         }
2019 
2020         rw_enter(&ip->i_contents, RW_READER);
2021         /*
2022          * Return all the attributes.  This should be refined so
2023          * that it only returns what's asked for.
2024          */
2025 
2026         /*
2027          * Copy from inode table.
2028          */
2029         vap->va_type = vp->v_type;
2030         vap->va_mode = ip->i_mode & MODEMASK;
2031         /*
2032          * If there is an ACL and there is a mask entry, then do the
2033          * extra work that completes the equivalent of an acltomode(3)
2034          * call.  According to POSIX P1003.1e, the acl mask should be
2035          * returned in the group permissions field.
2036          *
2037          * - start with the original permission and mode bits (from above)
2038          * - clear the group owner bits
2039          * - add in the mask bits.
2040          */
2041         if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
2042                 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
2043                 vap->va_mode |=
2044                     (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
2045         }
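        /*
         * Worked example (illustrative): with an on-disk mode of 0754
         * and an ACL mask of r-- (04), the group bits (05) are cleared
         * and replaced by the mask, so the reported va_mode is 0744;
         * the mask, not the group-owner entry, appears in the middle
         * permission triple, as POSIX P1003.1e requires.
         */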
2046         vap->va_uid = ip->i_uid;
2047         vap->va_gid = ip->i_gid;
2048         vap->va_fsid = ip->i_dev;
2049         vap->va_nodeid = (ino64_t)ip->i_number;
2050         vap->va_nlink = ip->i_nlink;
2051         vap->va_size = ip->i_size;
2052         if (vp->v_type == VCHR || vp->v_type == VBLK)
2053                 vap->va_rdev = ip->i_rdev;
2054         else
2055                 vap->va_rdev = 0;    /* not a block or char special */
2056         mutex_enter(&ip->i_tlock);
2057         ITIMES_NOLOCK(ip);      /* mark correct time in inode */
2058         vap->va_seq = ip->i_seq;
2059         vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
2060         vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
2061         vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
2062         vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
2063         vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
2064         vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
2065         mutex_exit(&ip->i_tlock);
2066 
2067         switch (ip->i_mode & IFMT) {
2068 
2069         case IFBLK:
2070                 vap->va_blksize = MAXBSIZE;          /* was BLKDEV_IOSIZE */
2071                 break;
2072 
2073         case IFCHR:
2074                 vap->va_blksize = MAXBSIZE;
2075                 break;
2076 
2077         default:
2078                 vap->va_blksize = ip->i_fs->fs_bsize;
2079                 break;
2080         }
2081         vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
2082         rw_exit(&ip->i_contents);
2083         err = 0;
2084 
2085 out:
2086         return (err);
2087 }
2088 
2089 /*
2090  * Special wrapper to provide a callback for secpolicy_vnode_setattr().
2091  * The i_contents lock is already held by the caller and we need to
2092  * declare the inode as a 'void *' argument.
2093  */
2094 static int
2095 ufs_priv_access(void *vip, int mode, struct cred *cr)
2096 {
2097         struct inode *ip = vip;
2098 
2099         return (ufs_iaccess(ip, mode, cr, 0));
2100 }
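/*
 * For illustration, ufs_setattr() below hands this wrapper to the policy
 * code roughly as:
 *
 *      error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 *          ufs_priv_access, ip);
 *
 * so secpolicy_vnode_setattr() can perform its access check through
 * ufs_iaccess() without knowing about struct inode.
 */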
2101 
2102 /*ARGSUSED4*/
2103 static int
2104 ufs_setattr(
2105         struct vnode *vp,
2106         struct vattr *vap,
2107         int flags,
2108         struct cred *cr,
2109         caller_context_t *ct)
2110 {
2111         struct inode *ip = VTOI(vp);
2112         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2113         struct fs *fs;
2114         struct ulockfs *ulp;
2115         char *errmsg1;
2116         char *errmsg2;
2117         long blocks;
2118         long int mask = vap->va_mask;
2119         size_t len1, len2;
2120         int issync;
2121         int trans_size;
2122         int dotrans;
2123         int dorwlock;
2124         int error;
2125         int owner_change;
2126         int dodqlock;
2127         timestruc_t now;
2128         vattr_t oldva;
2129         int retry = 1;
2130         int indeadlock;
2131 
2132         /*
2133          * Cannot set these attributes.
2134          */
2135         if ((mask & AT_NOSET) || (mask & AT_XVATTR))
2136                 return (EINVAL);
2137 
2138         /*
2139          * check for forced unmount
2140          */
2141         if (ufsvfsp == NULL)
2142                 return (EIO);
2143 
2144         fs = ufsvfsp->vfs_fs;
2145         if (fs->fs_ronly != 0)
2146                 return (EROFS);
2147 
2148 again:
2149         errmsg1 = NULL;
2150         errmsg2 = NULL;
2151         dotrans = 0;
2152         dorwlock = 0;
2153         dodqlock = 0;
2154 
2155         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
2156         if (error)
2157                 goto out;
2158 
2159         /*
2160          * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
2161          * This follows the protocol for read()/write().
2162          */
2163         if (vp->v_type != VDIR) {
2164                 /*
2165                  * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to
2166                  * avoid a deadlock between i_rwlock and ufs_lockfs_begin.
2167                  * If a deadlock is possible, the operation is retried.
2168                  */
2169                 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_file);
2170                 if (indeadlock) {
2171                         if (ulp)
2172                                 ufs_lockfs_end(ulp);
2173                         goto again;
2174                 }
2175                 dorwlock = 1;
2176         }
2177 
2178         /*
2179          * Truncate file.  Must have write permission and not be a directory.
2180          */
2181         if (mask & AT_SIZE) {
2182                 rw_enter(&ip->i_contents, RW_WRITER);
2183                 if (vp->v_type == VDIR) {
2184                         error = EISDIR;
2185                         goto update_inode;
2186                 }
2187                 if (error = ufs_iaccess(ip, IWRITE, cr, 0))
2188                         goto update_inode;
2189 
2190                 rw_exit(&ip->i_contents);
2191                 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
2192                 if (error) {
2193                         rw_enter(&ip->i_contents, RW_WRITER);
2194                         goto update_inode;
2195                 }
2196 
2197                 if (error == 0 && vap->va_size)
2198                         vnevent_truncate(vp, ct);
2199         }
2200 
2201         if (ulp) {
2202                 trans_size = (int)TOP_SETATTR_SIZE(ip);
2203                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size);
2204                 ++dotrans;
2205         }
2206 
2207         /*
2208          * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
2209          * This follows the protocol established by
2210          * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
2211          */
2212         if (vp->v_type == VDIR) {
2213                 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_SETATTR,
2214                     retry_dir);
2215                 if (indeadlock)
2216                         goto again;
2217                 dorwlock = 1;
2218         }
2219 
2220         /*
2221          * Grab quota lock if we are changing the file's owner.
2222          */
2223         if (mask & AT_UID) {
2224                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2225                 dodqlock = 1;
2226         }
2227         rw_enter(&ip->i_contents, RW_WRITER);
2228 
2229         oldva.va_mode = ip->i_mode;
2230         oldva.va_uid = ip->i_uid;
2231         oldva.va_gid = ip->i_gid;
2232 
2233         vap->va_mask &= ~AT_SIZE;
2234 
2235         error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2236             ufs_priv_access, ip);
2237         if (error)
2238                 goto update_inode;
2239 
2240         mask = vap->va_mask;
2241 
2242         /*
2243          * Change file access modes.
2244          */
2245         if (mask & AT_MODE) {
2246                 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
2247                 TRANS_INODE(ufsvfsp, ip);
2248                 ip->i_flag |= ICHG;
2249                 if (stickyhack) {
2250                         mutex_enter(&vp->v_lock);
2251                         if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
2252                                 vp->v_flag |= VSWAPLIKE;
2253                         else
2254                                 vp->v_flag &= ~VSWAPLIKE;
2255                         mutex_exit(&vp->v_lock);
2256                 }
2257         }
2258         if (mask & (AT_UID|AT_GID)) {
2259                 if (mask & AT_UID) {
2260                         /*
2261                          * Don't change ownership of the quota inode.
2262                          */
2263                         if (ufsvfsp->vfs_qinod == ip) {
2264                                 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
2265                                 error = EINVAL;
2266                                 goto update_inode;
2267                         }
2268 
2269                         /*
2270                          * No real ownership change.
2271                          */
2272                         if (ip->i_uid == vap->va_uid) {
2273                                 blocks = 0;
2274                                 owner_change = 0;
2275                         }
2276                         /*
2277                          * Remove the blocks and the file from the old user's
2278                          * quota.
2279                          */
2280                         else {
2281                                 blocks = ip->i_blocks;
2282                                 owner_change = 1;
2283 
2284                                 (void) chkdq(ip, -blocks, /* force */ 1, cr,
2285                                     (char **)NULL, (size_t *)NULL);
2286                                 (void) chkiq(ufsvfsp, /* change */ -1, ip,
2287                                     (uid_t)ip->i_uid, /* force */ 1, cr,
2288                                     (char **)NULL, (size_t *)NULL);
2289                                 dqrele(ip->i_dquot);
2290                         }
2291 
2292                         ip->i_uid = vap->va_uid;
2293 
2294                         /*
2295                          * There is a real ownership change.
2296                          */
2297                         if (owner_change) {
2298                                 /*
2299                                  * Add the blocks and the file to the new
2300                                  * user's quota.
2301                                  */
2302                                 ip->i_dquot = getinoquota(ip);
2303                                 (void) chkdq(ip, blocks, /* force */ 1, cr,
2304                                     &errmsg1, &len1);
2305                                 (void) chkiq(ufsvfsp, /* change */ 1,
2306                                     (struct inode *)NULL, (uid_t)ip->i_uid,
2307                                     /* force */ 1, cr, &errmsg2, &len2);
2308                         }
2309                 }
2310                 if (mask & AT_GID) {
2311                         ip->i_gid = vap->va_gid;
2312                 }
2313                 TRANS_INODE(ufsvfsp, ip);
2314                 ip->i_flag |= ICHG;
2315         }
2316         /*
2317          * Change file access or modified times.
2318          */
2319         if (mask & (AT_ATIME|AT_MTIME)) {
2320                 /* Check that the time value is within ufs range */
2321                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2322                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2323                         error = EOVERFLOW;
2324                         goto update_inode;
2325                 }
2326 
2327                 /*
2328                  * If the "noatime" mount option is set and only an atime
2329                  * update is requested, do nothing.  No error is returned.
2330                  */
2331                 if ((ufsvfsp->vfs_noatime) &&
2332                     ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
2333                         goto skip_atime;
2334 
2335                 if (mask & AT_ATIME) {
2336                         ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2337                         ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2338                         ip->i_flag &= ~IACC;
2339                 }
2340                 if (mask & AT_MTIME) {
2341                         ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2342                         ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2343                         gethrestime(&now);
2344                         if (now.tv_sec > TIME32_MAX) {
2345                                 /*
2346                                  * In 2038, ctime sticks forever.
2347                                  */
2348                                 ip->i_ctime.tv_sec = TIME32_MAX;
2349                                 ip->i_ctime.tv_usec = 0;
2350                         } else {
2351                                 ip->i_ctime.tv_sec = now.tv_sec;
2352                                 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2353                         }
2354                         ip->i_flag &= ~(IUPD|ICHG);
2355                         ip->i_flag |= IMODTIME;
2356                 }
2357                 TRANS_INODE(ufsvfsp, ip);
2358                 ip->i_flag |= IMOD;
2359         }
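        /*
         * Illustrative example of the conversion above: a requested
         * va_mtime of { tv_sec = 1234567890, tv_nsec = 987654321 } is
         * stored as i_mtime = { 1234567890, 987654 }, nanoseconds being
         * truncated to microseconds; and once the current time exceeds
         * TIME32_MAX, the recorded ctime is pinned at { TIME32_MAX, 0 }
         * rather than wrapping.
         */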
2360 
2361 skip_atime:
2362         /*
2363          * The presence of a shadow inode may indicate an ACL, but does
2364          * not imply an ACL.  Future FSD types should be handled here too
2365          * and check for the presence of the attribute-specific data
2366          * before referencing it.
2367          */
2368         if (ip->i_shadow) {
2369                 /*
2370                  * XXX if ufs_iupdat is changed to sandbagged write fix
2371                  * ufs_acl_setattr to push ip to keep acls consistent
2372                  *
2373                  * Suppress out of inodes messages if we will retry.
2374                  */
2375                 if (retry)
2376                         ip->i_flag |= IQUIET;
2377                 error = ufs_acl_setattr(ip, vap, cr);
2378                 ip->i_flag &= ~IQUIET;
2379         }
2380 
2381 update_inode:
2382         /*
2383          * Setattr always increases the sequence number
2384          */
2385         ip->i_seq++;
2386 
2387         /*
2388          * if nfsd and not logging, push synchronously
2389          */
2390         if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2391                 ufs_iupdat(ip, 1);
2392         } else {
2393                 ITIMES_NOLOCK(ip);
2394         }
2395 
2396         rw_exit(&ip->i_contents);
2397         if (dodqlock) {
2398                 rw_exit(&ufsvfsp->vfs_dqrwlock);
2399         }
2400         if (dorwlock)
2401                 rw_exit(&ip->i_rwlock);
2402 
2403         if (ulp) {
2404                 if (dotrans) {
2405                         int terr = 0;
2406                         TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR,
2407                             trans_size);
2408                         if (error == 0)
2409                                 error = terr;
2410                 }
2411                 ufs_lockfs_end(ulp);
2412         }
2413 out:
2414         /*
2415          * If out of inodes or blocks, see if we can free something
2416          * up from the delete queue.
2417          */
2418         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2419                 ufs_delete_drain_wait(ufsvfsp, 1);
2420                 retry = 0;
2421                 if (errmsg1 != NULL)
2422                         kmem_free(errmsg1, len1);
2423                 if (errmsg2 != NULL)
2424                         kmem_free(errmsg2, len2);
2425                 goto again;
2426         }
2427         if (errmsg1 != NULL) {
2428                 uprintf(errmsg1);
2429                 kmem_free(errmsg1, len1);
2430         }
2431         if (errmsg2 != NULL) {
2432                 uprintf(errmsg2);
2433                 kmem_free(errmsg2, len2);
2434         }
2435         return (error);
2436 }
2437 
2438 /*ARGSUSED*/
2439 static int
2440 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
2441         caller_context_t *ct)
2442 {
2443         struct inode *ip = VTOI(vp);
2444 
2445         if (ip->i_ufsvfs == NULL)
2446                 return (EIO);
2447 
2448         /*
2449          * The ufs_iaccess function wants to be called with
2450          * mode bits expressed as "ufs specific" bits.
2451          * I.e., VWRITE|VREAD|VEXEC do not make sense to
2452          * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
2453          * But since they're the same, we just pass the vnode mode
2454          * bits and simply verify that assumption at compile time.
2455          */
2456 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
2457 #error "ufs_access needs to map Vmodes to Imodes"
2458 #endif
2459         return (ufs_iaccess(ip, mode, cr, 1));
2460 }
2461 
2462 /* ARGSUSED */
2463 static int
2464 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr,
2465         caller_context_t *ct)
2466 {
2467         struct inode *ip = VTOI(vp);
2468         struct ufsvfs *ufsvfsp;
2469         struct ulockfs *ulp;
2470         int error;
2471         int fastsymlink;
2472 
2473         if (vp->v_type != VLNK) {
2474                 error = EINVAL;
2475                 goto nolockout;
2476         }
2477 
2478         /*
2479          * If the symbolic link is empty there is nothing to read.
2480          * Fast-track these empty symbolic links
2481          */
2482         if (ip->i_size == 0) {
2483                 error = 0;
2484                 goto nolockout;
2485         }
2486 
2487         ufsvfsp = ip->i_ufsvfs;
2488         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
2489         if (error)
2490                 goto nolockout;
2491         /*
2492          * The ip->i_rwlock protects the data blocks used for FASTSYMLINK
2493          */
2494 again:
2495         fastsymlink = 0;
2496         if (ip->i_flag & IFASTSYMLNK) {
2497                 rw_enter(&ip->i_rwlock, RW_READER);
2498                 rw_enter(&ip->i_contents, RW_READER);
2499                 if (ip->i_flag & IFASTSYMLNK) {
2500                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
2501                             (ip->i_fs->fs_ronly == 0) &&
2502                             (!ufsvfsp->vfs_noatime)) {
2503                                 mutex_enter(&ip->i_tlock);
2504                                 ip->i_flag |= IACC;
2505                                 mutex_exit(&ip->i_tlock);
2506                         }
2507                         error = uiomove((caddr_t)&ip->i_db[1],
2508                             MIN(ip->i_size, uiop->uio_resid),
2509                             UIO_READ, uiop);
2510                         ITIMES(ip);
2511                         ++fastsymlink;
2512                 }
2513                 rw_exit(&ip->i_contents);
2514                 rw_exit(&ip->i_rwlock);
2515         }
2516         if (!fastsymlink) {
2517                 ssize_t size;   /* number of bytes read  */
2518                 caddr_t basep;  /* pointer to input data */
2519                 ino_t ino;
2520                 long  igen;
2521                 struct uio tuio;        /* temp uio struct */
2522                 struct uio *tuiop;
2523                 iovec_t tiov;           /* temp iovec struct */
2524                 char kbuf[FSL_SIZE];    /* buffer to hold fast symlink */
2525                 int tflag = 0;          /* flag to indicate temp vars used */
2526 
2527                 ino = ip->i_number;
2528                 igen = ip->i_gen;
2529                 size = uiop->uio_resid;
2530                 basep = uiop->uio_iov->iov_base;
2531                 tuiop = uiop;
2532 
2533                 rw_enter(&ip->i_rwlock, RW_WRITER);
2534                 rw_enter(&ip->i_contents, RW_WRITER);
2535                 if (ip->i_flag & IFASTSYMLNK) {
2536                         rw_exit(&ip->i_contents);
2537                         rw_exit(&ip->i_rwlock);
2538                         goto again;
2539                 }
2540 
2541                 /* can this be a fast symlink and is it a user buffer? */
2542                 if (ip->i_size <= FSL_SIZE &&
2543                     (uiop->uio_segflg == UIO_USERSPACE ||
2544                     uiop->uio_segflg == UIO_USERISPACE)) {
2545 
2546                         bzero(&tuio, sizeof (struct uio));
2547                         /*
2548                          * Set up a kernel buffer to read the link into.  This
2549                          * avoids a race condition where the user buffer could
2550                          * be corrupted before we copy it into the inode.
2551                          */
2552                         size = ip->i_size;
2553                         tiov.iov_len = size;
2554                         tiov.iov_base = kbuf;
2555                         tuio.uio_iov = &tiov;
2556                         tuio.uio_iovcnt = 1;
2557                         tuio.uio_offset = uiop->uio_offset;
2558                         tuio.uio_segflg = UIO_SYSSPACE;
2559                         tuio.uio_fmode = uiop->uio_fmode;
2560                         tuio.uio_extflg = uiop->uio_extflg;
2561                         tuio.uio_limit = uiop->uio_limit;
2562                         tuio.uio_resid = size;
2563 
2564                         basep = tuio.uio_iov->iov_base;
2565                         tuiop = &tuio;
2566                         tflag = 1;
2567                 }
2568 
2569                 error = rdip(ip, tuiop, 0, cr);
2570                 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
2571                         rw_exit(&ip->i_contents);
2572                         rw_exit(&ip->i_rwlock);
2573                         goto out;
2574                 }
2575 
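                     /*
                      * Cache the link text in the inode (converting it to a
                      * fast symlink) only if the entire link was read in.
                      */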
2576                 if (tflag == 0)
2577                         size -= uiop->uio_resid;
2578 
2579                 if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
2580                     ip->i_size == size) || (tflag == 1 &&
2581                     tuio.uio_resid == 0)) {
2582                         error = kcopy(basep, &ip->i_db[1], ip->i_size);
2583                         if (error == 0) {
2584                                 ip->i_flag |= IFASTSYMLNK;
2585                                 /*
2586                                  * free page
2587                                  */
2588                                 (void) VOP_PUTPAGE(ITOV(ip),
2589                                     (offset_t)0, PAGESIZE,
2590                                     (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
2591                                     cr, ct);
2592                         } else {
2593                                 int i;
2594                                 /* error, clear garbage left behind */
2595                                 for (i = 1; i < NDADDR; i++)
2596                                         ip->i_db[i] = 0;
2597                                 for (i = 0; i < NIADDR; i++)
2598                                         ip->i_ib[i] = 0;
2599                         }
2600                 }
2601                 if (tflag == 1) {
2602                         /* now, copy it into the user buffer */
2603                         error = uiomove((caddr_t)kbuf,
2604                             MIN(size, uiop->uio_resid),
2605                             UIO_READ, uiop);
2606                 }
2607                 rw_exit(&ip->i_contents);
2608                 rw_exit(&ip->i_rwlock);
2609         }
2610 out:
2611         if (ulp) {
2612                 ufs_lockfs_end(ulp);
2613         }
2614 nolockout:
2615         return (error);
2616 }
2617 
2618 /* ARGSUSED */
2619 static int
2620 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr,
2621         caller_context_t *ct)
2622 {
2623         struct inode *ip = VTOI(vp);
2624         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2625         struct ulockfs *ulp;
2626         int error;
2627 
2628         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
2629         if (error)
2630                 return (error);
2631 
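             /*
              * For logging file systems, push any cached data pages and
              * commit the inode deltas through the log.  Otherwise fall
              * back to the synchronous inode/data writes below.
              */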
2632         if (TRANS_ISTRANS(ufsvfsp)) {
2633                 /*
2634                  * First push out any data pages
2635                  */
2636                 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2637                     (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
2638                         error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
2639                             0, CRED(), ct);
2640                         if (error)
2641                                 goto out;
2642                 }
2643 
2644                 /*
2645                  * Delta any delayed inode time updates
2646                  * and push the inode to the log.
2647                  * All other inode deltas will have already been delta'd
2648                  * and will be pushed during the commit.
2649                  */
2650                 if (!(syncflag & FDSYNC) &&
2651                     ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
2652                         if (ulp) {
2653                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
2654                                     TOP_SYNCIP_SIZE);
2655                         }
2656                         rw_enter(&ip->i_contents, RW_READER);
2657                         mutex_enter(&ip->i_tlock);
2658                         ip->i_flag &= ~IMODTIME;
2659                         mutex_exit(&ip->i_tlock);
2660                         ufs_iupdat(ip, I_SYNC);
2661                         rw_exit(&ip->i_contents);
2662                         if (ulp) {
2663                                 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
2664                                     TOP_SYNCIP_SIZE);
2665                         }
2666                 }
2667 
2668                 /*
2669                  * Commit the Moby transaction
2670                  *
2671                  * Deltas have already been made so we just need to
2672                  * commit them with a synchronous transaction.
2673                  * TRANS_BEGIN_SYNC() will return an error
2674                  * if there are no deltas to commit, for an
2675                  * empty transaction.
2676                  */
2677                 if (ulp) {
2678                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
2679                             error);
2680                         if (error) {
2681                                 error = 0; /* commit wasn't needed */
2682                                 goto out;
2683                         }
2684                         TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC,
2685                             TOP_COMMIT_SIZE);
2686                 }
2687         } else {        /* not logging */
2688                 if (!(IS_SWAPVP(vp)))
2689                         if (syncflag & FNODSYNC) {
2690                                 /* Just update the inode only */
2691                                 TRANS_IUPDAT(ip, 1);
2692                                 error = 0;
2693                         } else if (syncflag & FDSYNC)
2694                                 /* Do data-synchronous writes */
2695                                 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
2696                         else
2697                                 /* Do synchronous writes */
2698                                 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
2699 
2700                 rw_enter(&ip->i_contents, RW_WRITER);
2701                 if (!error)
2702                         error = ufs_sync_indir(ip);
2703                 rw_exit(&ip->i_contents);
2704         }
2705 out:
2706         if (ulp) {
2707                 ufs_lockfs_end(ulp);
2708         }
2709         return (error);
2710 }
2711 
2712 /*ARGSUSED*/
2713 static void
2714 ufs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
2715 {
2716         ufs_iinactive(VTOI(vp));
2717 }
2718 
2719 /*
2720  * Unix file system operations having to do with directory manipulation.
2721  */
2722 int ufs_lookup_idle_count = 2;  /* Number of inodes to idle each time */
2723 /* ARGSUSED */
2724 static int
2725 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
2726         struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
2727         caller_context_t *ct, int *direntflags, pathname_t *realpnp)
2728 {
2729         struct inode *ip;
2730         struct inode *sip;
2731         struct inode *xip;
2732         struct ufsvfs *ufsvfsp;
2733         struct ulockfs *ulp;
2734         struct vnode *vp;
2735         int error;
2736 
2737         /*
2738          * Check flags for type of lookup (regular file or attribute file)
2739          */
2740 
2741         ip = VTOI(dvp);
2742 
2743         if (flags & LOOKUP_XATTR) {
2744 
2745                 /*
2746                  * If not mounted with XATTR support then return EINVAL
2747                  */
2748 
2749                 if (!(ip->i_ufsvfs->vfs_vfs->vfs_flag & VFS_XATTR))
2750                         return (EINVAL);
2751                 /*
2752                  * We don't allow recursive attributes...
2753                  * Maybe someday we will.
2754                  */
2755                 if ((ip->i_cflags & IXATTR)) {
2756                         return (EINVAL);
2757                 }
2758 
2759                 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
2760                         error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
2761                         if (error) {
2762                                 *vpp = NULL;
2763                                 goto out;
2764                         }
2765 
2766                         vp = ITOV(sip);
2767                         dnlc_update(dvp, XATTR_DIR_NAME, vp);
2768                 }
2769 
2770                 /*
2771                  * Check accessibility of directory.
2772                  */
2773                 if (vp == DNLC_NO_VNODE) {
2774                         VN_RELE(vp);
2775                         error = ENOENT;
2776                         goto out;
2777                 }
2778                 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr, 1)) != 0) {
2779                         VN_RELE(vp);
2780                         goto out;
2781                 }
2782 
2783                 *vpp = vp;
2784                 return (0);
2785         }
2786 
2787         /*
2788          * Check for a null component, which we should treat as
2789          * looking at dvp from within its parent, so we don't
2790          * need a call to ufs_iaccess(), as it has already been
2791          * done.
2792          */
2793         if (nm[0] == 0) {
2794                 VN_HOLD(dvp);
2795                 error = 0;
2796                 *vpp = dvp;
2797                 goto out;
2798         }
2799 
2800         /*
2801          * Check for ".", i.e. the directory itself.  This is a quick check
2802          * that avoids adding "." into the dnlc (which has been seen
2803          * to occupy >10% of the cache).
2804          */
2805         if ((nm[0] == '.') && (nm[1] == 0)) {
2806                 /*
2807                  * Don't return without checking accessibility
2808                  * of the directory. We only need the lock if
2809                  * we are going to return it.
2810                  */
2811                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) == 0) {
2812                         VN_HOLD(dvp);
2813                         *vpp = dvp;
2814                 }
2815                 goto out;
2816         }
2817 
2818         /*
2819          * Fast path: Check the directory name lookup cache.
2820          */
2821         if (vp = dnlc_lookup(dvp, nm)) {
2822                 /*
2823                  * Check accessibility of directory.
2824                  */
2825                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) != 0) {
2826                         VN_RELE(vp);
2827                         goto out;
2828                 }
2829                 if (vp == DNLC_NO_VNODE) {
2830                         VN_RELE(vp);
2831                         error = ENOENT;
2832                         goto out;
2833                 }
2834                 xip = VTOI(vp);
2835                 ulp = NULL;
2836                 goto fastpath;
2837         }
2838 
2839         /*
2840          * Keep the idle queue from getting too long by
2841          * idling two inodes before attempting to allocate another.
2842          * This operation must be performed before entering
2843          * lockfs or a transaction.
2844          */
2845         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
2846                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
2847                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
2848                         ufs_idle_some(ufs_lookup_idle_count);
2849                 }
2850 
2851 retry_lookup:
2852         /*
2853          * Check accessibility of directory.
2854          */
2855         if (error = ufs_diraccess(ip, IEXEC, cr))
2856                 goto out;
2857 
2858         ufsvfsp = ip->i_ufsvfs;
2859         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
2860         if (error)
2861                 goto out;
2862 
2863         error = ufs_dirlook(ip, nm, &xip, cr, 1, 0);
2864 
2865 fastpath:
2866         if (error == 0) {
2867                 ip = xip;
2868                 *vpp = ITOV(ip);
2869 
2870                 /*
2871                  * If vnode is a device return special vnode instead.
2872                  */
2873                 if (IS_DEVVP(*vpp)) {
2874                         struct vnode *newvp;
2875 
2876                         newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
2877                             cr);
2878                         VN_RELE(*vpp);
2879                         if (newvp == NULL)
2880                                 error = ENOSYS;
2881                         else
2882                                 *vpp = newvp;
2883                 } else if (ip->i_cflags & ICOMPRESS) {
2884                         struct vnode *newvp;
2885 
2886                         /*
2887                          * Compressed file, substitute dcfs vnode
2888                          */
2889                         newvp = decompvp(*vpp, cr, ct);
2890                         VN_RELE(*vpp);
2891                         if (newvp == NULL)
2892                                 error = ENOSYS;
2893                         else
2894                                 *vpp = newvp;
2895                 }
2896         }
2897         if (ulp) {
2898                 ufs_lockfs_end(ulp);
2899         }
2900 
2901         if (error == EAGAIN)
2902                 goto retry_lookup;
2903 
2904 out:
2905         return (error);
2906 }
2907 
2908 /*ARGSUSED*/
2909 static int
2910 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
2911         int mode, struct vnode **vpp, struct cred *cr, int flag,
2912         caller_context_t *ct, vsecattr_t *vsecp)
2913 {
2914         struct inode *ip;
2915         struct inode *xip;
2916         struct inode *dip;
2917         struct vnode *xvp;
2918         struct ufsvfs *ufsvfsp;
2919         struct ulockfs *ulp;
2920         int error;
2921         int issync;
2922         int truncflag;
2923         int trans_size;
2924         int noentry;
2925         int defer_dip_seq_update = 0;   /* need to defer update of dip->i_seq */
2926         int retry = 1;
2927         int indeadlock;
2928 
2929 again:
2930         ip = VTOI(dvp);
2931         ufsvfsp = ip->i_ufsvfs;
2932         truncflag = 0;
2933 
2934         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
2935         if (error)
2936                 goto out;
2937 
2938         if (ulp) {
2939                 trans_size = (int)TOP_CREATE_SIZE(ip);
2940                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size);
2941         }
2942 
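             /*
              * Strip the sticky bit from the requested mode if the caller
              * lacks the privilege to set it.
              */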
2943         if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
2944                 vap->va_mode &= ~VSVTX;
2945 
2946         if (*name == '\0') {
2947                 /*
2948                  * Null component name refers to the directory itself.
2949                  */
2950                 VN_HOLD(dvp);
2951                 /*
2952                  * Even though this is an error case, we need to grab the
2953                  * quota lock since the error handling code below is common.
2954                  */
2955                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2956                 rw_enter(&ip->i_contents, RW_WRITER);
2957                 error = EEXIST;
2958         } else {
2959                 xip = NULL;
2960                 noentry = 0;
2961                 /*
2962                  * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
2963                  * to avoid an i_rwlock/ufs_lockfs_begin deadlock.  If a
2964                  * deadlock is possible, it retries the operation.
2965                  */
2966                 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_CREATE,
2967                     retry_dir);
2968                 if (indeadlock)
2969                         goto again;
2970 
2971                 xvp = dnlc_lookup(dvp, name);
2972                 if (xvp == DNLC_NO_VNODE) {
2973                         noentry = 1;
2974                         VN_RELE(xvp);
2975                         xvp = NULL;
2976                 }
2977                 if (xvp) {
2978                         rw_exit(&ip->i_rwlock);
2979                         if (error = ufs_iaccess(ip, IEXEC, cr, 1)) {
2980                                 VN_RELE(xvp);
2981                         } else {
2982                                 error = EEXIST;
2983                                 xip = VTOI(xvp);
2984                         }
2985                 } else {
2986                         /*
2987                          * Suppress file system full message if we will retry
2988                          */
2989                         error = ufs_direnter_cm(ip, name, DE_CREATE,
2990                             vap, &xip, cr, (noentry | (retry ? IQUIET : 0)));
2991                         if (error == EAGAIN) {
2992                                 if (ulp) {
2993                                         TRANS_END_CSYNC(ufsvfsp, error, issync,
2994                                             TOP_CREATE, trans_size);
2995                                         ufs_lockfs_end(ulp);
2996                                 }
2997                                 goto again;
2998                         }
2999                         rw_exit(&ip->i_rwlock);
3000                 }
3001                 ip = xip;
3002                 if (ip != NULL) {
3003                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
3004                         rw_enter(&ip->i_contents, RW_WRITER);
3005                 }
3006         }
3007 
3008         /*
3009          * If the file already exists and this is a non-exclusive create,
3010          * check permissions and allow access for non-directories.
3011          * Read-only create of an existing directory is also allowed.
3012          * We fail an exclusive create of anything which already exists.
3013          */
3014         if (error == EEXIST) {
3015                 dip = VTOI(dvp);
3016                 if (excl == NONEXCL) {
3017                         if ((((ip->i_mode & IFMT) == IFDIR) ||
3018                             ((ip->i_mode & IFMT) == IFATTRDIR)) &&
3019                             (mode & IWRITE))
3020                                 error = EISDIR;
3021                         else if (mode)
3022                                 error = ufs_iaccess(ip, mode, cr, 0);
3023                         else
3024                                 error = 0;
3025                 }
3026                 if (error) {
3027                         rw_exit(&ip->i_contents);
3028                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3029                         VN_RELE(ITOV(ip));
3030                         goto unlock;
3031                 }
3032                 /*
3033                  * If the error EEXIST was set, then i_seq can not
3034                  * have been updated. The sequence number interface
3035                  * is defined such that a non-error VOP_CREATE must
3036                  * increase the dir va_seq by at least one.  If we
3037                  * have cleared the error, increase i_seq. Note that
3038                  * we are increasing the dir i_seq and in rare cases
3039                  * ip may actually be from the dvp, so we already have
3040                  * the locks and it will not be subject to truncation.
3041                  * In case we have to update i_seq of the parent
3042                  * directory dip, we have to defer it till we have
3043                  * released our locks on ip due to lock ordering requirements.
3044                  */
3045                 if (ip != dip)
3046                         defer_dip_seq_update = 1;
3047                 else
3048                         ip->i_seq++;
3049 
3050                 if (((ip->i_mode & IFMT) == IFREG) &&
3051                     (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
3052                         /*
3053                          * Truncate regular files, if requested by caller.
3054                          * Grab i_rwlock to make sure no one else is
3055                          * currently writing to the file (we promised
3056                          * bmap we would do this).
3057                          * Must get the locks in the correct order.
3058                          */
3059                         if (ip->i_size == 0) {
3060                                 ip->i_flag |= ICHG | IUPD;
3061                                 ip->i_seq++;
3062                                 TRANS_INODE(ufsvfsp, ip);
3063                         } else {
3064                                 /*
3065                                  * Large Files: Why this check here?
3066                                  * Though we do it in vn_create(), we really
3067                                  * want to guarantee that we do not destroy
3068                                  * large file data by atomically checking
3069                                  * the size while holding the contents
3070                                  * lock.
3071                                  */
3072                                 if (flag && !(flag & FOFFMAX) &&
3073                                     ((ip->i_mode & IFMT) == IFREG) &&
3074                                     (ip->i_size > (offset_t)MAXOFF32_T)) {
3075                                         rw_exit(&ip->i_contents);
3076                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3077                                         error = EOVERFLOW;
3078                                         goto unlock;
3079                                 }
3080                                 if (TRANS_ISTRANS(ufsvfsp))
3081                                         truncflag++;
3082                                 else {
3083                                         rw_exit(&ip->i_contents);
3084                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3085                                         ufs_tryirwlock_trans(&ip->i_rwlock,
3086                                             RW_WRITER, TOP_CREATE,
3087                                             retry_file);
3088                                         if (indeadlock) {
3089                                                 VN_RELE(ITOV(ip));
3090                                                 goto again;
3091                                         }
3092                                         rw_enter(&ufsvfsp->vfs_dqrwlock,
3093                                             RW_READER);
3094                                         rw_enter(&ip->i_contents, RW_WRITER);
3095                                         (void) ufs_itrunc(ip, (u_offset_t)0, 0,
3096                                             cr);
3097                                         rw_exit(&ip->i_rwlock);
3098                                 }
3099 
3100                         }
3101                         if (error == 0) {
3102                                 vnevent_create(ITOV(ip), ct);
3103                         }
3104                 }
3105         }
3106 
3107         if (error) {
3108                 if (ip != NULL) {
3109                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3110                         rw_exit(&ip->i_contents);
3111                 }
3112                 goto unlock;
3113         }
3114 
3115         *vpp = ITOV(ip);
3116         ITIMES(ip);
3117         rw_exit(&ip->i_contents);
3118         rw_exit(&ufsvfsp->vfs_dqrwlock);
3119 
3120         /*
3121          * If vnode is a device return special vnode instead.
3122          */
3123         if (!error && IS_DEVVP(*vpp)) {
3124                 struct vnode *newvp;
3125 
3126                 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
3127                 VN_RELE(*vpp);
3128                 if (newvp == NULL) {
3129                         error = ENOSYS;
3130                         goto unlock;
3131                 }
3132                 truncflag = 0;
3133                 *vpp = newvp;
3134         }
3135 unlock:
3136 
3137         /*
3138          * Do the deferred update of the parent directory's sequence
3139          * number now.
3140          */
3141         if (defer_dip_seq_update == 1) {
3142                 rw_enter(&dip->i_contents, RW_READER);
3143                 mutex_enter(&dip->i_tlock);
3144                 dip->i_seq++;
3145                 mutex_exit(&dip->i_tlock);
3146                 rw_exit(&dip->i_contents);
3147         }
3148 
3149         if (ulp) {
3150                 int terr = 0;
3151 
3152                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE,
3153                     trans_size);
3154 
3155                 /*
3156                  * If we haven't had a more interesting failure
3157                  * already, then anything that might've happened
3158                  * here should be reported.
3159                  */
3160                 if (error == 0)
3161                         error = terr;
3162         }
3163 
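             /*
              * For logging file systems the truncation of an existing file
              * was deferred above (truncflag); perform it now that the
              * create transaction has ended.
              */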
3164         if (!error && truncflag) {
3165                 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_trunc);
3166                 if (indeadlock) {
3167                         if (ulp)
3168                                 ufs_lockfs_end(ulp);
3169                         VN_RELE(ITOV(ip));
3170                         goto again;
3171                 }
3172                 (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr);
3173                 rw_exit(&ip->i_rwlock);
3174         }
3175 
3176         if (ulp)
3177                 ufs_lockfs_end(ulp);
3178 
3179         /*
3180          * If no inodes are available, try to free one up out of the
3181          * pending delete queue.
3182          */
3183         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3184                 ufs_delete_drain_wait(ufsvfsp, 1);
3185                 retry = 0;
3186                 goto again;
3187         }
3188 
3189 out:
3190         return (error);
3191 }
3192 
3193 extern int ufs_idle_max;
3194 /*ARGSUSED*/
3195 static int
3196 ufs_remove(struct vnode *vp, char *nm, struct cred *cr,
3197         caller_context_t *ct, int flags)
3198 {
3199         struct inode *ip = VTOI(vp);
3200         struct ufsvfs *ufsvfsp  = ip->i_ufsvfs;
3201         struct ulockfs *ulp;
3202         vnode_t *rmvp = NULL;   /* Vnode corresponding to name being removed */
3203         int indeadlock;
3204         int error;
3205         int issync;
3206         int trans_size;
3207 
3208         /*
3209          * don't let the delete queue get too long
3210          */
3211         if (ufsvfsp == NULL) {
3212                 error = EIO;
3213                 goto out;
3214         }
3215         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3216                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3217 
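             /*
              * Look up the entry being removed so that a remove event can
              * be generated for it before any locks are taken.
              */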
3218         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3219         if (rmvp != NULL) {
3220                 /* Only send the event if there were no errors */
3221                 if (error == 0)
3222                         vnevent_remove(rmvp, vp, nm, ct);
3223                 VN_RELE(rmvp);
3224         }
3225 
3226 retry_remove:
3227         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
3228         if (error)
3229                 goto out;
3230 
3231         if (ulp)
3232                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
3233                     trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
3234 
3235         /*
3236          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3237          * to avoid an i_rwlock/ufs_lockfs_begin deadlock.  If a
3238          * deadlock is possible, it retries the operation.
3239          */
3240         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_REMOVE, retry);
3241         if (indeadlock)
3242                 goto retry_remove;
3243         error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0,
3244             DR_REMOVE, cr);
3245         rw_exit(&ip->i_rwlock);
3246 
3247         if (ulp) {
3248                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size);
3249                 ufs_lockfs_end(ulp);
3250         }
3251 
3252 out:
3253         return (error);
3254 }
3255 
3256 /*
3257  * Link a file or a directory.  Only privileged processes are allowed to
3258  * make links to directories.
3259  */
3260 /*ARGSUSED*/
3261 static int
3262 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
3263         caller_context_t *ct, int flags)
3264 {
3265         struct inode *sip;
3266         struct inode *tdp = VTOI(tdvp);
3267         struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
3268         struct ulockfs *ulp;
3269         struct vnode *realvp;
3270         int error;
3271         int issync;
3272         int trans_size;
3273         int isdev;
3274         int indeadlock;
3275 
3276 retry_link:
3277         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
3278         if (error)
3279                 goto out;
3280 
3281         if (ulp)
3282                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK,
3283                     trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
3284 
3285         if (VOP_REALVP(svp, &realvp, ct) == 0)
3286                 svp = realvp;
3287 
3288         /*
3289          * Make sure the link for extended attributes is valid.
3290          * We only support hard linking of an attr in an ATTRDIR to an ATTRDIR.
3291          *
3292          * Make certain we don't attempt to look at a device node as
3293          * a ufs inode.
3294          */
3295 
3296         isdev = IS_DEVVP(svp);
3297         if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
3298             ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
3299             ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
3300             ((tdp->i_mode & IFMT) == IFDIR))) {
3301                 error = EINVAL;
3302                 goto unlock;
3303         }
3304 
3305         sip = VTOI(svp);
3306         if ((svp->v_type == VDIR &&
3307             secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
3308             (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
3309                 error = EPERM;
3310                 goto unlock;
3311         }
3312 
3313         /*
3314          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3315          * to avoid an i_rwlock/ufs_lockfs_begin deadlock.  If a
3316          * deadlock is possible, it retries the operation.
3317          */
3318         ufs_tryirwlock_trans(&tdp->i_rwlock, RW_WRITER, TOP_LINK, retry);
3319         if (indeadlock)
3320                 goto retry_link;
3321         error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0,
3322             sip, cr);
3323         rw_exit(&tdp->i_rwlock);
3324 
3325 unlock:
3326         if (ulp) {
3327                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size);
3328                 ufs_lockfs_end(ulp);
3329         }
3330 
3331         if (!error) {
3332                 vnevent_link(svp, ct);
3333         }
3334 out:
3335         return (error);
3336 }
3337 
3338 uint64_t ufs_rename_retry_cnt;
3339 uint64_t ufs_rename_upgrade_retry_cnt;
3340 uint64_t ufs_rename_dircheck_retry_cnt;
3341 clock_t  ufs_rename_backoff_delay = 1;
3342 
3343 /*
3344  * Rename a file or directory.
3345  * We are given the vnode and entry string of the source and the
3346  * vnode and entry string of the place we want to move the source
3347  * to (the target). The essential operation is:
3348  *      unlink(target);
3349  *      link(source, target);
3350  *      unlink(source);
3351  * but "atomically".  Can't do full commit without saving state in
3352  * the inode on disk, which isn't feasible at this time.  Best we
3353  * can do is always guarantee that the TARGET exists.
3354  */
3355 
3356 /*ARGSUSED*/
3357 static int
3358 ufs_rename(
3359         struct vnode *sdvp,             /* old (source) parent vnode */
3360         char *snm,                      /* old (source) entry name */
3361         struct vnode *tdvp,             /* new (target) parent vnode */
3362         char *tnm,                      /* new (target) entry name */
3363         struct cred *cr,
3364         caller_context_t *ct,
3365         int flags)
3366 {
3367         struct inode *sip = NULL;       /* source inode */
3368         struct inode *ip = NULL;        /* check inode */
3369         struct inode *sdp;              /* old (source) parent inode */
3370         struct inode *tdp;              /* new (target) parent inode */
3371         struct vnode *svp = NULL;       /* source vnode */
3372         struct vnode *tvp = NULL;       /* target vnode, if it exists */
3373         struct vnode *realvp;
3374         struct ufsvfs *ufsvfsp;
3375         struct ulockfs *ulp = NULL;
3376         struct ufs_slot slot;
3377         timestruc_t now;
3378         int error;
3379         int issync;
3380         int trans_size;
3381         krwlock_t *first_lock;
3382         krwlock_t *second_lock;
3383         krwlock_t *reverse_lock;
3384         int serr, terr;
3385 
3386         sdp = VTOI(sdvp);
3387         slot.fbp = NULL;
3388         ufsvfsp = sdp->i_ufsvfs;
3389 
3390         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3391                 tdvp = realvp;
3392 
3393         /* Must do this before taking locks in case of DNLC miss */
3394         terr = ufs_eventlookup(tdvp, tnm, cr, &tvp);
3395         serr = ufs_eventlookup(sdvp, snm, cr, &svp);
3396 
3397         if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) {
3398                 if (tvp != NULL)
3399                         vnevent_pre_rename_dest(tvp, tdvp, tnm, ct);
3400 
3401                 /*
3402                  * Notify the target directory of the rename event
3403                  * if source and target directories are not the same.
3404                  */
3405                 if (sdvp != tdvp)
3406                         vnevent_pre_rename_dest_dir(tdvp, svp, tnm, ct);
3407 
3408                 if (svp != NULL)
3409                         vnevent_pre_rename_src(svp, sdvp, snm, ct);
3410         }
3411 
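             /* Drop the hold on the source vnode taken by ufs_eventlookup(). */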
3412         if (svp != NULL)
3413                 VN_RELE(svp);
3414 
3415 retry_rename:
3416         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
3417         if (error)
3418                 goto unlock;
3419 
3420         if (ulp)
3421                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME,
3422                     trans_size = (int)TOP_RENAME_SIZE(sdp));
3423 
3424         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3425                 tdvp = realvp;
3426 
3427         tdp = VTOI(tdvp);
3428 
3429         /*
3430          * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
3431          */
3432         if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
3433                 error = EINVAL;
3434                 goto unlock;
3435         }
3436 
3437         /*
3438          * Check accessibility of directory.
3439          */
3440         if (error = ufs_diraccess(sdp, IEXEC, cr))
3441                 goto unlock;
3442 
3443         /*
3444          * Look up inode of file we're supposed to rename.
3445          */
3446         gethrestime(&now);
3447         if (error = ufs_dirlook(sdp, snm, &sip, cr, 0, 0)) {
3448                 if (error == EAGAIN) {
3449                         if (ulp) {
3450                                 TRANS_END_CSYNC(ufsvfsp, error, issync,
3451                                     TOP_RENAME, trans_size);
3452                                 ufs_lockfs_end(ulp);
3453                         }
3454                         goto retry_rename;
3455                 }
3456 
3457                 goto unlock;
3458         }
3459 
3460         /*
3461          * Lock both the source and target directories (they may be
3462          * the same) to provide the atomicity semantics that were
3463          * previously provided by the per-file-system vfs_rename_lock.
3464          *
3465          * With vfs_rename_lock removed to allow simultaneous renames
3466          * within a file system, ufs_dircheckpath can deadlock while
3467          * traversing back to ensure that source is not a parent directory
3468          * of target parent directory. This is because we get into
3469          * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
3470          * If the tdp and sdp of the simultaneous renames happen to be
3471          * in the path of each other, it can lead to a deadlock. This
3472          * can be avoided by getting the locks as RW_READER here and then
3473          * upgrading to RW_WRITER after completing the ufs_dircheckpath.
3474          *
3475          * We hold the target directory's i_rwlock after calling
3476          * ufs_lockfs_begin but in many other operations (like ufs_readdir)
3477          * VOP_RWLOCK is explicitly called by the filesystem independent code
3478          * before calling the file system operation. In these cases the order
3479          * is reversed (i.e i_rwlock is taken first and then ufs_lockfs_begin
3480          * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
3481          * counter, but with ufs_quiesce setting the SLOCK bit this becomes a
3482          * synchronizing object which might lead to a deadlock.  So we use
3483          * rw_tryenter instead of rw_enter.  If we fail to get this lock and
3484          * find that the SLOCK bit is set, we call ufs_lockfs_end and restart
3485          * the operation.
3486          */
3487 retry:
3488         first_lock = &tdp->i_rwlock;
3489         second_lock = &sdp->i_rwlock;
3490 retry_firstlock:
3491         if (!rw_tryenter(first_lock, RW_READER)) {
3492                 /*
3493                  * We didn't get the lock. Check if the SLOCK is set in the
3494                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3495                  * and wait for SLOCK to be cleared.
3496                  */
3497 
3498                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3499                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3500                             trans_size);
3501                         ufs_lockfs_end(ulp);
3502                         goto retry_rename;
3503 
3504                 } else {
3505                         /*
3506                          * SLOCK isn't set so this is a genuine synchronization
3507                          * case. Let's try again after giving them a breather.
3508                          */
3509                         delay(RETRY_LOCK_DELAY);
3510                         goto  retry_firstlock;
3511                 }
3512         }
3513         /*
3514          * Need to check whether tdp and sdp are the same.
3515          */
3516         if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
3517                 /*
3518                  * We didn't get the lock. Check if the SLOCK is set in the
3519                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3520                  * and wait for SLOCK to be cleared.
3521                  */
3522 
3523                 rw_exit(first_lock);
3524                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3525                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3526                             trans_size);
3527                         ufs_lockfs_end(ulp);
3528                         goto retry_rename;
3529 
3530                 } else {
3531                         /*
3532                          * So we couldn't get the second level peer lock *and*
3533                          * the SLOCK bit isn't set.  Unfortunately we may be
3534                          * contending with someone wanting these locks the other
3535                          * way round.  Reverse the locks in case there is heavy
3536                          * contention for the second level lock.
3537                          */
3538                         reverse_lock = first_lock;
3539                         first_lock = second_lock;
3540                         second_lock = reverse_lock;
3541                         ufs_rename_retry_cnt++;
3542                         goto  retry_firstlock;
3543                 }
3544         }
3545 
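         /*
          * The source must not be the target's parent directory; that
          * would be an attempt to move a directory underneath itself.
          */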
3546         if (sip == tdp) {
3547                 error = EINVAL;
3548                 goto errout;
3549         }
3550         /*
3551          * Make sure we can delete the source entry.  This requires
3552          * write permission on the containing directory.
3553          * Check for sticky directories.
3554          */
3555         rw_enter(&sdp->i_contents, RW_READER);
3556         rw_enter(&sip->i_contents, RW_READER);
3557         if ((error = ufs_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
3558             (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
3559                 rw_exit(&sip->i_contents);
3560                 rw_exit(&sdp->i_contents);
3561                 goto errout;
3562         }
3563 
3564         /*
3565          * If this is a rename of a directory and the parent is
3566          * different (".." must be changed), then the source
3567          * directory must not be in the directory hierarchy
3568          * above the target, as this would orphan everything
3569          * below the source directory.  Also the user must have
3570          * write permission in the source so as to be able to
3571          * change "..".
3572          */
3573         if ((((sip->i_mode & IFMT) == IFDIR) ||
3574             ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
3575                 ino_t   inum;
3576 
3577                 if (error = ufs_iaccess(sip, IWRITE, cr, 0)) {
3578                         rw_exit(&sip->i_contents);
3579                         rw_exit(&sdp->i_contents);
3580                         goto errout;
3581                 }
3582                 inum = sip->i_number;
3583                 rw_exit(&sip->i_contents);
3584                 rw_exit(&sdp->i_contents);
3585                 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
3586                         /*
3587                          * If we got EAGAIN ufs_dircheckpath detected a
3588                          * potential deadlock and backed out. We need
3589                          * to retry the operation since sdp and tdp have
3590                          * to be released to avoid the deadlock.
3591                          */
3592                         if (error == EAGAIN) {
3593                                 rw_exit(&tdp->i_rwlock);
3594                                 if (tdp != sdp)
3595                                         rw_exit(&sdp->i_rwlock);
3596                                 delay(ufs_rename_backoff_delay);
3597                                 ufs_rename_dircheck_retry_cnt++;
3598                                 goto retry;
3599                         }
3600                         goto errout;
3601                 }
3602         } else {
3603                 rw_exit(&sip->i_contents);
3604                 rw_exit(&sdp->i_contents);
3605         }
3606 
3607 
3608         /*
3609          * Check for renaming '.' or '..' or alias of '.'
3610          */
3611         if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
3612                 error = EINVAL;
3613                 goto errout;
3614         }
3615 
3616         /*
3617          * Simultaneous renames can deadlock in ufs_dircheckpath since it
3618          * tries to traverse back the file tree with both tdp and sdp held
3619          * as RW_WRITER.  To avoid that we have to hold the tdp and sdp locks
3620          * as RW_READER until ufs_dircheckpath is done.
3621          * Now that ufs_dircheckpath is done, we can upgrade the locks
3622          * to RW_WRITER.
3623          */
3624         if (!rw_tryupgrade(&tdp->i_rwlock)) {
3625                 /*
3626                  * The upgrade failed.  We have to give away the lock
3627                  * so as to avoid deadlocking with someone else who is
3628                  * waiting for the writer lock.  With the lock gone, we
3629                  * cannot be sure the checks done above will hold
3630                  * good when we eventually get them back as writer.
3631                  * So if we can't upgrade we drop the locks and retry
3632                  * everything again.
3633                  */
3634                 rw_exit(&tdp->i_rwlock);
3635                 if (tdp != sdp)
3636                         rw_exit(&sdp->i_rwlock);
3637                 delay(ufs_rename_backoff_delay);
3638                 ufs_rename_upgrade_retry_cnt++;
3639                 goto retry;
3640         }
3641         if (tdp != sdp) {
3642                 if (!rw_tryupgrade(&sdp->i_rwlock)) {
3643                         /*
3644                          * The upgrade failed.  We have to give away the lock
3645                          * so as to avoid deadlocking with someone else who is
3646                          * waiting for the writer lock.  With the lock gone, we
3647                          * cannot be sure the checks done above will hold
3648                          * good when we eventually get them back as writer.
3649                          * So if we can't upgrade we drop the locks and retry
3650                          * everything again.
3651                          */
3652                         rw_exit(&tdp->i_rwlock);
3653                         rw_exit(&sdp->i_rwlock);
3654                         delay(ufs_rename_backoff_delay);
3655                         ufs_rename_upgrade_retry_cnt++;
3656                         goto retry;
3657                 }
3658         }
3659 
3660         /*
3661          * Now that all the locks are held, check to make sure another thread
3662          * didn't slip in and take out the sip.
3663          */
3664         slot.status = NONE;
3665         if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec ||
3666             sip->i_ctime.tv_sec > now.tv_sec) {
3667                 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
3668                 rw_enter(&sdp->i_contents, RW_WRITER);
3669                 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot,
3670                     &ip, cr, 0);
3671                 rw_exit(&sdp->i_contents);
3672                 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock);
3673                 if (error) {
3674                         goto errout;
3675                 }
3676                 if (ip == NULL) {
3677                         error = ENOENT;
3678                         goto errout;
3679                 } else {
3680                         /*
3681                          * If the inode was found, we need to drop the v_count
3682                          * so that we do not keep the filesystem from being
3683                          * unmounted at a later time.
3684                          */
3685                         VN_RELE(ITOV(ip));
3686                 }
3687 
3688                 /*
3689                  * Release the slot.fbp that has the page mapped and
3690                  * locked SE_SHARED, and could be used in
3691                  * ufs_direnter_lr() which needs to get the SE_EXCL lock
3692                  * on said page.
3693                  */
3694                 if (slot.fbp) {
3695                         fbrelse(slot.fbp, S_OTHER);
3696                         slot.fbp = NULL;
3697                 }
3698         }
3699 
3700         /*
3701          * Link source to the target.
3702          */
3703         if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr)) {
3704                 /*
3705                  * ESAME isn't really an error; it indicates that the
3706                  * operation should not be done because the source and target
3707                  * are the same file, but that no error should be reported.
3708                  */
3709                 if (error == ESAME)
3710                         error = 0;
3711                 goto errout;
3712         }
3713 
3714         if (error == 0 && tvp != NULL)
3715                 vnevent_rename_dest(tvp, tdvp, tnm, ct);
3716 
3717         /*
3718          * Unlink the source.
3719          * Remove the source entry.  ufs_dirremove() checks that the entry
3720          * still reflects sip, and returns an error if it doesn't.
3721          * If the entry has changed just forget about it.  Release
3722          * the source inode.
3723          */
3724         if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0,
3725             DR_RENAME, cr)) == ENOENT)
3726                 error = 0;
3727 
3728         if (error == 0) {
3729                 vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
3730                 /*
3731                  * Notify the target directory of the rename event
3732                  * if source and target directories are not the same.
3733                  */
3734                 if (sdvp != tdvp)
3735                         vnevent_rename_dest_dir(tdvp, ct);
3736         }
3737 
3738 errout:
3739         if (slot.fbp)
3740                 fbrelse(slot.fbp, S_OTHER);
3741 
3742         rw_exit(&tdp->i_rwlock);
3743         if (sdp != tdp) {
3744                 rw_exit(&sdp->i_rwlock);
3745         }
3746 
3747 unlock:
3748         if (tvp != NULL)
3749                 VN_RELE(tvp);
3750         if (sip != NULL)
3751                 VN_RELE(ITOV(sip));
3752 
3753         if (ulp) {
3754                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size);
3755                 ufs_lockfs_end(ulp);
3756         }
3757 
3758         return (error);
3759 }
3760 
3761 /*ARGSUSED*/
3762 static int
3763 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
3764         struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
3765         vsecattr_t *vsecp)
3766 {
3767         struct inode *ip;
3768         struct inode *xip;
3769         struct ufsvfs *ufsvfsp;
3770         struct ulockfs *ulp;
3771         int error;
3772         int issync;
3773         int trans_size;
3774         int indeadlock;
3775         int retry = 1;
3776 
3777         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
3778 
3779         /*
3780          * Can't make directory in attr hidden dir
3781          */
3782         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3783                 return (EINVAL);
3784 
3785 again:
3786         ip = VTOI(dvp);
3787         ufsvfsp = ip->i_ufsvfs;
3788         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3789         if (error)
3790                 goto out;
3791         if (ulp)
3792                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR,
3793                     trans_size = (int)TOP_MKDIR_SIZE(ip));
3794 
3795         /*
3796          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3797          * to avoid an i_rwlock/ufs_lockfs_begin deadlock.  If a
3798          * deadlock is possible, it retries the operation.
3799          */
3800         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_MKDIR, retry);
3801         if (indeadlock)
3802                 goto again;
3803 
3804         error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
3805             (retry ? IQUIET : 0));
3806         if (error == EAGAIN) {
3807                 if (ulp) {
3808                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_MKDIR,
3809                             trans_size);
3810                         ufs_lockfs_end(ulp);
3811                 }
3812                 goto again;
3813         }
3814 
3815         rw_exit(&ip->i_rwlock);
3816         if (error == 0) {
3817                 ip = xip;
3818                 *vpp = ITOV(ip);
3819         } else if (error == EEXIST)
3820                 VN_RELE(ITOV(xip));
3821 
3822         if (ulp) {
3823                 int terr = 0;
3824                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size);
3825                 ufs_lockfs_end(ulp);
3826                 if (error == 0)
3827                         error = terr;
3828         }
3829 out:
3830         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3831                 ufs_delete_drain_wait(ufsvfsp, 1);
3832                 retry = 0;
3833                 goto again;
3834         }
3835 
3836         return (error);
3837 }
3838 
3839 /*ARGSUSED*/
3840 static int
3841 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
3842         caller_context_t *ct, int flags)
3843 {
3844         struct inode *ip = VTOI(vp);
3845         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
3846         struct ulockfs *ulp;
3847         vnode_t *rmvp = NULL;   /* Vnode of removed directory */
3848         int error;
3849         int issync;
3850         int trans_size;
3851         int indeadlock;
3852 
3853         /*
3854          * don't let the delete queue get too long
3855          */
3856         if (ufsvfsp == NULL) {
3857                 error = EIO;
3858                 goto out;
3859         }
3860         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3861                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3862 
3863         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3864         if (rmvp != NULL) {
3865                 /* Only send the event if there were no errors */
3866                 if (error == 0)
3867                         vnevent_rmdir(rmvp, vp, nm, ct);
3868                 VN_RELE(rmvp);
3869         }
3870 
3871 retry_rmdir:
3872         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
3873         if (error)
3874                 goto out;
3875 
3876         if (ulp)
3877                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR,
3878                     trans_size = TOP_RMDIR_SIZE);
3879 
3880         /*
3881          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3882          * to avoid an i_rwlock/ufs_lockfs_begin deadlock.  If a
3883          * deadlock is possible, it retries the operation.
3884          */
3885         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_RMDIR, retry);
3886         if (indeadlock)
3887                 goto retry_rmdir;
3888         error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr);
3889 
3890         rw_exit(&ip->i_rwlock);
3891 
3892         if (ulp) {
3893                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR,
3894                     trans_size);
3895                 ufs_lockfs_end(ulp);
3896         }
3897 
3898 out:
3899         return (error);
3900 }
3901 
3902 /* ARGSUSED */
3903 static int
3904 ufs_readdir(
3905         struct vnode *vp,
3906         struct uio *uiop,
3907         struct cred *cr,
3908         int *eofp,
3909         caller_context_t *ct,
3910         int flags)
3911 {
3912         struct iovec *iovp;
3913         struct inode *ip;
3914         struct direct *idp;
3915         struct dirent64 *odp;
3916         struct fbuf *fbp;
3917         struct ufsvfs *ufsvfsp;
3918         struct ulockfs *ulp;
3919         caddr_t outbuf;
3920         size_t bufsize;
3921         uint_t offset;
3922         uint_t bytes_wanted, total_bytes_wanted;
3923         int incount = 0;
3924         int outcount = 0;
3925         int error;
3926 
3927         ip = VTOI(vp);
3928         ASSERT(RW_READ_HELD(&ip->i_rwlock));
3929 
3930         if (uiop->uio_loffset >= MAXOFF32_T) {
3931                 if (eofp)
3932                         *eofp = 1;
3933                 return (0);
3934         }
3935 
3936         /*
3937          * Check if we have been called with a valid iov_len
3938          * and bail out if not, otherwise we may potentially loop
3939          * forever further down.
3940          */
3941         if (uiop->uio_iov->iov_len <= 0) {
3942                 error = EINVAL;
3943                 goto out;
3944         }
3945 
3946         /*
3947          * Large Files: When we come here we are guaranteed that
3948          * uio_offset can be used safely. The high word is zero.
3949          */
3950 
3951         ufsvfsp = ip->i_ufsvfs;
3952         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
3953         if (error)
3954                 goto out;
3955 
3956         iovp = uiop->uio_iov;
3957         total_bytes_wanted = iovp->iov_len;
3958 
3959         /* Large Files: directory files should not be "large" */
3960 
3961         ASSERT(ip->i_size <= MAXOFF32_T);
3962 
3963         /* Force offset to be valid (to guard against bogus lseek() values) */
3964         offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
3965 
3966         /* Quit if at end of file or link count of zero (posix) */
3967         if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
3968                 if (eofp)
3969                         *eofp = 1;
3970                 error = 0;
3971                 goto unlock;
3972         }
3973 
3974         /*
3975          * Get space to change directory entries into fs independent format.
3976          * Do fast alloc for the most commonly used request size (filesystem
3977          * block size).
3978          */
3979         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
3980                 bufsize = total_bytes_wanted;
3981                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
3982                 odp = (struct dirent64 *)outbuf;
3983         } else {
3984                 bufsize = total_bytes_wanted;
3985                 odp = (struct dirent64 *)iovp->iov_base;
3986         }
3987 
3988 nextblk:
3989         bytes_wanted = total_bytes_wanted;
3990 
3991         /* Truncate request to file size */
3992         if (offset + bytes_wanted > (int)ip->i_size)
3993                 bytes_wanted = (int)(ip->i_size - offset);
3994 
3995         /* Comply with MAXBSIZE boundary restrictions of fbread() */
3996         if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
3997                 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
3998 
3999         /*
4000          * Read in the next chunk.
4001          * We are still holding the i_rwlock.
4002          */
4003         error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
4004 
4005         if (error)
4006                 goto update_inode;
4007         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
4008             (!ufsvfsp->vfs_noatime)) {
4009                 ip->i_flag |= IACC;
4010         }
4011         incount = 0;
4012         idp = (struct direct *)fbp->fb_addr;
4013         if (idp->d_ino == 0 && idp->d_reclen == 0 && idp->d_namlen == 0) {
4014                 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
4015                     "fs = %s\n",
4016                     (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
4017                 fbrelse(fbp, S_OTHER);
4018                 error = ENXIO;
4019                 goto update_inode;
4020         }
4021         /* Transform to file-system independent format */
4022         while (incount < bytes_wanted) {
4023                 /*
4024                  * If the current directory entry is mangled, then skip
4025                  * to the next block.  It would be nice to set the FSBAD
4026                  * flag in the super-block so that a fsck is forced on
4027                  * next reboot, but locking is a problem.
4028                  */
4029                 if (idp->d_reclen & 0x3) {
4030                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4031                         break;
4032                 }
4033 
4034                 /* Skip to requested offset and skip empty entries */
4035                 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
4036                         ushort_t this_reclen =
4037                             DIRENT64_RECLEN(idp->d_namlen);
4038                         /* Buffer too small for any entries */
4039                         if (!outcount && this_reclen > bufsize) {
4040                                 fbrelse(fbp, S_OTHER);
4041                                 error = EINVAL;
4042                                 goto update_inode;
4043                         }
4044                         /* If this entry would overrun the buffer, quit */
4045                         if (outcount + this_reclen > bufsize) {
4046                                 break;
4047                         }
4048                         /* Take this entry */
4049                         odp->d_ino = (ino64_t)idp->d_ino;
4050                         odp->d_reclen = (ushort_t)this_reclen;
4051                         odp->d_off = (offset_t)(offset + idp->d_reclen);
4052 
4053                         /* use strncpy(9f) to zero out uninitialized bytes */
4054 
4055                         ASSERT(strlen(idp->d_name) + 1 <=
4056                             DIRENT64_NAMELEN(this_reclen));
4057                         (void) strncpy(odp->d_name, idp->d_name,
4058                             DIRENT64_NAMELEN(this_reclen));
4059                         outcount += odp->d_reclen;
4060                         odp = (struct dirent64 *)
4061                             ((intptr_t)odp + odp->d_reclen);
4062                         ASSERT(outcount <= bufsize);
4063                 }
4064                 if (idp->d_reclen) {
4065                         incount += idp->d_reclen;
4066                         offset += idp->d_reclen;
4067                         idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
4068                 } else {
4069                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4070                         break;
4071                 }
4072         }
4073         /* Release the chunk */
4074         fbrelse(fbp, S_OTHER);
4075 
4076         /* Read whole block, but got no entries, read another if not eof */
4077 
4078         /*
4079          * Large Files: casting i_size to int here is not a problem
4080          * because directory sizes are always less than MAXOFF32_T.
4081          * See assertion above.
4082          */
4083 
4084         if (offset < (int)ip->i_size && !outcount)
4085                 goto nextblk;
4086 
4087         /* Copy out the entry data */
4088         if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
4089                 iovp->iov_base += outcount;
4090                 iovp->iov_len -= outcount;
4091                 uiop->uio_resid -= outcount;
4092                 uiop->uio_offset = offset;
4093         } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
4094             uiop)) == 0)
4095                 uiop->uio_offset = offset;
4096 update_inode:
4097         ITIMES(ip);
4098         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
4099                 kmem_free(outbuf, bufsize);
4100 
4101         if (eofp && error == 0)
4102                 *eofp = (uiop->uio_offset >= (int)ip->i_size);
4103 unlock:
4104         if (ulp) {
4105                 ufs_lockfs_end(ulp);
4106         }
4107 out:
4108         return (error);
4109 }
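
/*
 * For reference, the per-entry conversion performed by the loop in
 * ufs_readdir() above, shown in isolation: an on-disk struct direct is
 * reshaped into a dirent64 record whose length is derived from the name
 * length.  Illustrative excerpt only; idp, odp and offset are the loop
 * variables from the function above.
 */
#if 0
        ushort_t this_reclen = DIRENT64_RECLEN(idp->d_namlen);

        odp->d_ino = (ino64_t)idp->d_ino;
        odp->d_reclen = this_reclen;
        odp->d_off = (offset_t)(offset + idp->d_reclen);
        /* strncpy() also zeroes the uninitialized tail of d_name */
        (void) strncpy(odp->d_name, idp->d_name,
            DIRENT64_NAMELEN(this_reclen));
        odp = (struct dirent64 *)((intptr_t)odp + this_reclen);
#endif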
4110 
4111 /*ARGSUSED*/
4112 static int
4113 ufs_symlink(
4114         struct vnode *dvp,              /* ptr to parent dir vnode */
4115         char *linkname,                 /* name of symbolic link */
4116         struct vattr *vap,              /* attributes */
4117         char *target,                   /* target path */
4118         struct cred *cr,                /* user credentials */
4119         caller_context_t *ct,
4120         int flags)
4121 {
4122         struct inode *ip, *dip = VTOI(dvp);
4123         struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
4124         struct ulockfs *ulp;
4125         int error;
4126         int issync;
4127         int trans_size;
4128         int residual;
4129         int ioflag;
4130         int retry = 1;
4131 
4132         /*
4133          * No symlinks in attrdirs at this time
4134          */
4135         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
4136                 return (EINVAL);
4137 
4138 again:
4139         ip = (struct inode *)NULL;
4140         vap->va_type = VLNK;
4141         vap->va_rdev = 0;
4142 
4143         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
4144         if (error)
4145                 goto out;
4146 
4147         if (ulp)
4148                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK,
4149                     trans_size = (int)TOP_SYMLINK_SIZE(dip));
4150 
4151         /*
4152          * We must create the inode before the directory entry, to avoid
4153          * racing with readlink().  ufs_dirmakeinode requires that we
4154          * hold the quota lock as reader, and directory locks as writer.
4155          */
4156 
4157         rw_enter(&dip->i_rwlock, RW_WRITER);
4158         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4159         rw_enter(&dip->i_contents, RW_WRITER);
4160 
4161         /*
4162          * Suppress any out-of-inodes messages if we will retry on
4163          * ENOSPC.
4164          */
4165         if (retry)
4166                 dip->i_flag |= IQUIET;
4167 
4168         error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
4169 
4170         dip->i_flag &= ~IQUIET;
4171 
4172         rw_exit(&dip->i_contents);
4173         rw_exit(&ufsvfsp->vfs_dqrwlock);
4174         rw_exit(&dip->i_rwlock);
4175 
4176         if (error)
4177                 goto unlock;
4178 
4179         /*
4180          * OK.  The inode has been created.  Write out the data of the
4181          * symbolic link.  Since symbolic links are metadata, and should
4182          * remain consistent across a system crash, we need to force the
4183          * data out synchronously.
4184          *
4185          * (This is a change from the semantics in earlier releases, which
4186          * only created symbolic links synchronously if the semi-documented
4187          * 'syncdir' option was set, or if we were being invoked by the NFS
4188          * server, which requires symbolic links to be created synchronously.)
4189          *
4190          * We need to pass in a pointer for the residual length; otherwise
4191          * ufs_rdwri() will always return EIO if it can't write the data,
4192          * even if the error was really ENOSPC or EDQUOT.
4193          */
4194 
4195         ioflag = FWRITE | FDSYNC;
4196         residual = 0;
4197 
4198         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4199         rw_enter(&ip->i_contents, RW_WRITER);
4200 
4201         /*
4202          * Suppress file system full messages if we will retry
4203          */
4204         if (retry)
4205                 ip->i_flag |= IQUIET;
4206 
4207         error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
4208             (offset_t)0, UIO_SYSSPACE, &residual, cr);
4209 
4210         ip->i_flag &= ~IQUIET;
4211 
4212         if (error) {
4213                 rw_exit(&ip->i_contents);
4214                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4215                 goto remove;
4216         }
4217 
4218         /*
4219          * If the link's data is small enough, we can cache it in the inode.
4220          * This is a "fast symbolic link".  We don't use the first direct
4221          * block because that's actually used to point at the symbolic link's
4222          * contents on disk; but we know that none of the other direct or
4223          * indirect blocks can be used because symbolic links are restricted
4224          * to be smaller than a file system block.
4225          */
4226 
4227         ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
4228 
4229         if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
4230                 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
4231                         ip->i_flag |= IFASTSYMLNK;
4232                 } else {
4233                         int i;
4234                         /* error, clear garbage left behind */
4235                         for (i = 1; i < NDADDR; i++)
4236                                 ip->i_db[i] = 0;
4237                         for (i = 0; i < NIADDR; i++)
4238                                 ip->i_ib[i] = 0;
4239                 }
4240         }
4241 
4242         rw_exit(&ip->i_contents);
4243         rw_exit(&ufsvfsp->vfs_dqrwlock);
4244 
4245         /*
4246          * OK.  We've successfully created the symbolic link.  All that
4247          * remains is to insert it into the appropriate directory.
4248          */
4249 
4250         rw_enter(&dip->i_rwlock, RW_WRITER);
4251         error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr);
4252         rw_exit(&dip->i_rwlock);
4253 
4254         /*
4255          * Fall through into remove-on-error code.  We're either done, or we
4256          * need to remove the inode (if we couldn't insert it).
4257          */
4258 
4259 remove:
4260         if (error && (ip != NULL)) {
4261                 rw_enter(&ip->i_contents, RW_WRITER);
4262                 ip->i_nlink--;
4263                 ip->i_flag |= ICHG;
4264                 ip->i_seq++;
4265                 ufs_setreclaim(ip);
4266                 rw_exit(&ip->i_contents);
4267         }
4268 
4269 unlock:
4270         if (ip != NULL)
4271                 VN_RELE(ITOV(ip));
4272 
4273         if (ulp) {
4274                 int terr = 0;
4275 
4276                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK,
4277                     trans_size);
4278                 ufs_lockfs_end(ulp);
4279                 if (error == 0)
4280                         error = terr;
4281         }
4282 
4283         /*
4284          * We may have failed due to lack of an inode or of a block to
4285          * store the target in.  Try flushing the delete queue to free
4286          * logically-available things up and try again.
4287          */
4288         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
4289                 ufs_delete_drain_wait(ufsvfsp, 1);
4290                 retry = 0;
4291                 goto again;
4292         }
4293 
4294 out:
4295         return (error);
4296 }
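
/*
 * When ufs_symlink() above sets IFASTSYMLNK, the link target lives in the
 * inode itself, starting at &ip->i_db[1], for i_size bytes.  A minimal
 * sketch of how a readlink-style consumer would copy it out, assuming
 * i_contents is held and the flag has already been verified (illustrative
 * only; the real readlink path lives elsewhere in this file):
 */
#if 0
        if (ip->i_flag & IFASTSYMLNK) {
                error = uiomove((caddr_t)&ip->i_db[1],
                    MIN(ip->i_size, uiop->uio_resid), UIO_READ, uiop);
        }
#endif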
4297 
4298 /*
4299  * UFS-specific routine used to do UFS I/O.
4300  */
4301 int
4302 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
4303         ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
4304         struct cred *cr)
4305 {
4306         struct uio auio;
4307         struct iovec aiov;
4308         int error;
4309 
4310         ASSERT(RW_LOCK_HELD(&ip->i_contents));
4311 
4312         bzero((caddr_t)&auio, sizeof (uio_t));
4313         bzero((caddr_t)&aiov, sizeof (iovec_t));
4314 
4315         aiov.iov_base = base;
4316         aiov.iov_len = len;
4317         auio.uio_iov = &aiov;
4318         auio.uio_iovcnt = 1;
4319         auio.uio_loffset = offset;
4320         auio.uio_segflg = (short)seg;
4321         auio.uio_resid = len;
4322 
4323         if (rw == UIO_WRITE) {
4324                 auio.uio_fmode = FWRITE;
4325                 auio.uio_extflg = UIO_COPY_DEFAULT;
4326                 auio.uio_llimit = curproc->p_fsz_ctl;
4327                 error = wrip(ip, &auio, ioflag, cr);
4328         } else {
4329                 auio.uio_fmode = FREAD;
4330                 auio.uio_extflg = UIO_COPY_CACHED;
4331                 auio.uio_llimit = MAXOFFSET_T;
4332                 error = rdip(ip, &auio, ioflag, cr);
4333         }
4334 
4335         if (aresid) {
4336                 *aresid = auio.uio_resid;
4337         } else if (auio.uio_resid) {
4338                 error = EIO;
4339         }
4340         return (error);
4341 }
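
/*
 * Typical in-kernel use of ufs_rdwri(), modeled on the ufs_symlink() call
 * above: synchronously write a small kernel buffer at offset 0 and use the
 * residual count to distinguish a short write from success.  Illustrative
 * sketch; "buf" and "buflen" are hypothetical, and the caller must hold
 * i_contents (see the ASSERT above).
 */
#if 0
        int residual = 0;

        rw_enter(&ip->i_contents, RW_WRITER);
        error = ufs_rdwri(UIO_WRITE, FWRITE | FDSYNC, ip, buf, buflen,
            (offset_t)0, UIO_SYSSPACE, &residual, cr);
        rw_exit(&ip->i_contents);
        if (error == 0 && residual != 0)
                error = EIO;    /* short write */
#endif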
4342 
4343 /*ARGSUSED*/
4344 static int
4345 ufs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
4346 {
4347         struct ufid *ufid;
4348         struct inode *ip = VTOI(vp);
4349 
4350         if (ip->i_ufsvfs == NULL)
4351                 return (EIO);
4352 
4353         if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
4354                 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
4355                 return (ENOSPC);
4356         }
4357 
4358         ufid = (struct ufid *)fidp;
4359         bzero((char *)ufid, sizeof (struct ufid));
4360         ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
4361         ufid->ufid_ino = ip->i_number;
4362         ufid->ufid_gen = ip->i_gen;
4363 
4364         return (0);
4365 }
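
/*
 * The file ID built above packs just the inode number and generation.  A
 * consumer on the lookup-by-fid side decodes it by casting back to a
 * struct ufid and comparing the generation with the inode's; a mismatch
 * means the handle refers to a reused inode.  Hedged, illustrative sketch
 * only; the real decode lives in the VFS_VGET path, not in this file.
 */
#if 0
        struct ufid *ufid = (struct ufid *)fidp;

        if (ufid->ufid_len != sizeof (struct ufid) - sizeof (ushort_t))
                return (EINVAL);
        /* ... look up the inode for ufid->ufid_ino, then: ... */
        if (ip->i_gen != ufid->ufid_gen)
                return (ESTALE);        /* typical stale-handle outcome */
#endif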
4366 
4367 /* ARGSUSED2 */
4368 static int
4369 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4370 {
4371         struct inode    *ip = VTOI(vp);
4372         struct ufsvfs   *ufsvfsp;
4373         int             forcedirectio;
4374 
4375         /*
4376          * Read case is easy.
4377          */
4378         if (!write_lock) {
4379                 rw_enter(&ip->i_rwlock, RW_READER);
4380                 return (V_WRITELOCK_FALSE);
4381         }
4382 
4383         /*
4384          * Caller has requested a writer lock, but that inhibits any
4385          * concurrency in the VOPs that follow. Acquire the lock shared
4386          * and defer exclusive access until it is known to be needed in
4387          * other VOP handlers. Some cases can be determined here.
4388          */
4389 
4390         /*
4391          * If directio is not set, there is no chance of concurrency,
4392          * so just acquire the lock exclusive. Beware of a forced
4393          * unmount before looking at the mount option.
4394          */
4395         ufsvfsp = ip->i_ufsvfs;
4396         forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
4397         if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
4398             !ufs_allow_shared_writes) {
4399                 rw_enter(&ip->i_rwlock, RW_WRITER);
4400                 return (V_WRITELOCK_TRUE);
4401         }
4402 
4403         /*
4404          * Mandatory locking forces acquiring i_rwlock exclusive.
4405          */
4406         if (MANDLOCK(vp, ip->i_mode)) {
4407                 rw_enter(&ip->i_rwlock, RW_WRITER);
4408                 return (V_WRITELOCK_TRUE);
4409         }
4410 
4411         /*
4412          * Acquire the lock shared in case a concurrent write follows.
4413          * Mandatory locking could have become enabled before the lock
4414          * was acquired. Re-check and upgrade if needed.
4415          */
4416         rw_enter(&ip->i_rwlock, RW_READER);
4417         if (MANDLOCK(vp, ip->i_mode)) {
4418                 rw_exit(&ip->i_rwlock);
4419                 rw_enter(&ip->i_rwlock, RW_WRITER);
4420                 return (V_WRITELOCK_TRUE);
4421         }
4422         return (V_WRITELOCK_FALSE);
4423 }
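
/*
 * Caller-side view of the shared-write optimization above: a writer asks
 * for V_WRITELOCK_TRUE, but with directio shared writes enabled
 * ufs_rwlock() may grant only a reader lock and return V_WRITELOCK_FALSE.
 * A caller that cares whether exclusivity was actually granted can look at
 * the return value.  Illustrative sketch, not part of this file:
 */
#if 0
        int held = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, ct);

        if (held == V_WRITELOCK_FALSE) {
                /* only a shared (reader) lock was granted */
        }
        /* ... perform the write-style VOP ... */
        VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, ct);
#endif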
4424 
4425 /*ARGSUSED*/
4426 static void
4427 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4428 {
4429         struct inode    *ip = VTOI(vp);
4430 
4431         rw_exit(&ip->i_rwlock);
4432 }
4433 
4434 /* ARGSUSED */
4435 static int
4436 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
4437         caller_context_t *ct)
4438 {
4439         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4440 }
4441 
4442 /* ARGSUSED */
4443 static int
4444 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4445         offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
4446         caller_context_t *ct)
4447 {
4448         struct inode *ip = VTOI(vp);
4449 
4450         if (ip->i_ufsvfs == NULL)
4451                 return (EIO);
4452 
4453         /*
4454          * If file is being mapped, disallow frlock.
4455          * XXX I am not holding tlock while checking i_mapcnt because the
4456          * current locking strategy drops all locks before calling fs_frlock.
4457          * So, mapcnt could change before we enter fs_frlock, making it
4458          * meaningless to have held tlock in the first place.
4459          */
4460         if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
4461                 return (EAGAIN);
4462         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4463 }
4464 
4465 /* ARGSUSED */
4466 static int
4467 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4468         offset_t offset, cred_t *cr, caller_context_t *ct)
4469 {
4470         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
4471         struct ulockfs *ulp;
4472         int error;
4473 
4474         if ((error = convoff(vp, bfp, 0, offset)) == 0) {
4475                 if (cmd == F_FREESP) {
4476                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4477                             ULOCKFS_SPACE_MASK);
4478                         if (error)
4479                                 return (error);
4480                         error = ufs_freesp(vp, bfp, flag, cr);
4481 
4482                         if (error == 0 && bfp->l_start == 0)
4483                                 vnevent_truncate(vp, ct);
4484                 } else if (cmd == F_ALLOCSP) {
4485                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4486                             ULOCKFS_FALLOCATE_MASK);
4487                         if (error)
4488                                 return (error);
4489                         error = ufs_allocsp(vp, bfp, cr);
4490                 } else
4491                         return (EINVAL); /* Command not handled here */
4492 
4493                 if (ulp)
4494                         ufs_lockfs_end(ulp);
4495 
4496         }
4497         return (error);
4498 }
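
/*
 * Userland reaches ufs_space() through fcntl(F_FREESP) or fcntl(F_ALLOCSP);
 * the struct flock64 describes the range, and convoff() above normalizes
 * l_start to an absolute offset.  A hedged, illustrative caller-side sketch
 * (userland code; "fd" is a hypothetical descriptor):
 */
#if 0
        struct flock64 fl;

        bzero(&fl, sizeof (fl));
        fl.l_whence = 0;        /* l_start is an absolute offset */
        fl.l_start = 65536;
        fl.l_len = 0;           /* 0 means free to end of file */
        (void) fcntl(fd, F_FREESP, &fl);
#endif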
4499 
4500 /*
4501  * Used to determine if read ahead should be done. Also used to
4502  * determine when write back occurs.
4503  */
4504 #define CLUSTSZ(ip)             ((ip)->i_ufsvfs->vfs_ioclustsz)
4505 
4506 /*
4507  * A faster version of ufs_getpage.
4508  *
4509  * We optimize by inlining the pvn_getpages iterator, eliminating
4510  * calls to bmap_read if file doesn't have UFS holes, and avoiding
4511  * the overhead of page_exists().
4512  *
4513  * When a file has UFS holes and ufs_getpage is called with S_READ,
4514  * we clear PROT_WRITE in *protp to avoid calling bmap_read. This
4515  * approach penalizes performance when a file with UFS holes is faulted
4516  * first in the S_READ mode, and then in the S_WRITE mode. We will get
4517  * two MMU faults in this case.
4518  *
4519  * XXX - the inode fields which control the sequential mode are not
4520  *       protected by any mutex. The read ahead can behave erratically if
4521  *       multiple processes access the file concurrently and some of
4522  *       them do so in sequential mode. One particularly bad case is
4523  *       when another thread changes the value of i_nextrio between the
4524  *       time this thread tests the i_nextrio value and the time it
4525  *       reads it again to use it as the offset for the read ahead.
4526  */
4527 /*ARGSUSED*/
4528 static int
4529 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
4530         page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
4531         enum seg_rw rw, struct cred *cr, caller_context_t *ct)
4532 {
4533         u_offset_t      uoff = (u_offset_t)off; /* type conversion */
4534         u_offset_t      pgoff;
4535         u_offset_t      eoff;
4536         struct inode    *ip = VTOI(vp);
4537         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
4538         struct fs       *fs;
4539         struct ulockfs  *ulp;
4540         page_t          **pl;
4541         caddr_t         pgaddr;
4542         krw_t           rwtype;
4543         int             err;
4544         int             has_holes;
4545         int             beyond_eof;
4546         int             seqmode;
4547         int             pgsize = PAGESIZE;
4548         int             dolock;
4549         int             do_qlock;
4550         int             trans_size;
4551 
4552         ASSERT((uoff & PAGEOFFSET) == 0);
4553 
4554         if (protp)
4555                 *protp = PROT_ALL;
4556 
4557         /*
4558          * Obey the lockfs protocol
4559          */
4560         err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
4561             rw == S_READ || rw == S_EXEC, protp);
4562         if (err)
4563                 goto out;
4564 
4565         fs = ufsvfsp->vfs_fs;
4566 
4567         if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
4568             !(vp->v_flag & VISSWAP)) {
4569                 /*
4570                  * Try to start a transaction, will return if blocking is
4571                  * expected to occur and the address space is not the
4572                  * kernel address space.
4573                  */
4574                 trans_size = TOP_GETPAGE_SIZE(ip);
4575                 if (seg->s_as != &kas) {
4576                         TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
4577                             trans_size, err)
4578                         if (err == EWOULDBLOCK) {
4579                                 /*
4580                                  * Use EDEADLK here because the VM code
4581                                  * can normally never see this error.
4582                                  */
4583                                 err = EDEADLK;
4584                                 ufs_lockfs_end(ulp);
4585                                 goto out;
4586                         }
4587                 } else {
4588                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4589                 }
4590         }
4591 
4592         if (vp->v_flag & VNOMAP) {
4593                 err = ENOSYS;
4594                 goto unlock;
4595         }
4596 
4597         seqmode = ip->i_nextr == uoff && rw != S_CREATE;
4598 
4599         rwtype = RW_READER;             /* start as a reader */
4600         dolock = (rw_owner(&ip->i_contents) != curthread);
4601         /*
4602          * If this thread owns the lock, i.e., this thread grabbed it
4603          * as writer somewhere above, then we don't need to grab the
4604          * lock as reader in this routine.
4605          */
4606         do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);
4607 
4608 retrylock:
4609         if (dolock) {
4610                 /*
4611                  * Grab the quota lock if we need to call
4612                  * bmap_write() below (with i_contents as writer).
4613                  */
4614                 if (do_qlock && rwtype == RW_WRITER)
4615                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4616                 rw_enter(&ip->i_contents, rwtype);
4617         }
4618 
4619         /*
4620          * We may be getting called as a side effect of a bmap using
4621          * fbread() when the blocks might be being allocated and the
4622          * size has not yet been up'ed.  In this case we want to be
4623          * able to return zero pages if we get back UFS_HOLE from
4624          * calling bmap for a non write case here.  We also might have
4625          * to read some frags from the disk into a page if we are
4626          * extending the number of frags for a given lbn in bmap().
4627          * Large Files: The read of i_size here is atomic because
4628          * i_contents is held here. If dolock is zero, the lock
4629          * is held in bmap routines.
4630          */
4631         beyond_eof = uoff + len >
4632             P2ROUNDUP_TYPED(ip->i_size, PAGESIZE, u_offset_t);
4633         if (beyond_eof && seg != segkmap) {
4634                 if (dolock) {
4635                         rw_exit(&ip->i_contents);
4636                         if (do_qlock && rwtype == RW_WRITER)
4637                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4638                 }
4639                 err = EFAULT;
4640                 goto unlock;
4641         }
4642 
4643         /*
4644          * Must hold i_contents lock throughout the call to pvn_getpages
4645          * since locked pages are returned from each call to ufs_getapage.
4646          * Must *not* return locked pages and then try for contents lock
4647          * due to lock ordering requirements (inode > page)
4648          */
4649 
4650         has_holes = bmap_has_holes(ip);
4651 
4652         if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
4653                 int     blk_size;
4654                 u_offset_t offset;
4655 
4656                 /*
4657                  * We must acquire the RW_WRITER lock in order to
4658                  * call bmap_write().
4659                  */
4660                 if (dolock && rwtype == RW_READER) {
4661                         rwtype = RW_WRITER;
4662 
4663                         /*
4664                          * Grab the quota lock before
4665                          * upgrading i_contents, but if we can't grab it
4666                          * don't wait here due to lock order:
4667                          * vfs_dqrwlock > i_contents.
4668                          */
4669                         if (do_qlock &&
4670                             rw_tryenter(&ufsvfsp->vfs_dqrwlock, RW_READER)
4671                             == 0) {
4672                                 rw_exit(&ip->i_contents);
4673                                 goto retrylock;
4674                         }
4675                         if (!rw_tryupgrade(&ip->i_contents)) {
4676                                 rw_exit(&ip->i_contents);
4677                                 if (do_qlock)
4678                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4679                                 goto retrylock;
4680                         }
4681                 }
4682 
4683                 /*
4684                  * May be allocating disk blocks for holes here as
4685                  * a result of mmap faults. write(2) does the bmap_write
4686                  * in rdip/wrip, not here. We are not dealing with frags
4687                  * in this case.
4688                  */
4689                 /*
4690                  * Large Files: We cast fs_bmask field to offset_t
4691                  * just as we do for MAXBMASK because uoff is a 64-bit
4692                  * data type. fs_bmask will still be a 32-bit type
4693                  * as we cannot change any ondisk data structures.
4694                  */
4695 
4696                 offset = uoff & (offset_t)fs->fs_bmask;
4697                 while (offset < uoff + len) {
4698                         blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
4699                         err = bmap_write(ip, offset, blk_size,
4700                             BI_NORMAL, NULL, cr);
4701                         if (ip->i_flag & (ICHG|IUPD))
4702                                 ip->i_seq++;
4703                         if (err)
4704                                 goto update_inode;
4705                         offset += blk_size; /* XXX - make this contig */
4706                 }
4707         }
4708 
4709         /*
4710          * Can be a reader from now on.
4711          */
4712         if (dolock && rwtype == RW_WRITER) {
4713                 rw_downgrade(&ip->i_contents);
4714                 /*
4715                  * We can release vfs_dqrwlock early so do it, but make
4716                  * sure we don't try to release it again at the bottom.
4717                  */
4718                 if (do_qlock) {
4719                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4720                         do_qlock = 0;
4721                 }
4722         }
4723 
4724         /*
4725          * We remove PROT_WRITE when the file has UFS holes
4726          * because we don't want to call bmap_read() to check whether
4727          * each page is backed with a disk block.
4728          */
4729         if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
4730                 *protp &= ~PROT_WRITE;
4731 
4732         err = 0;
4733 
4734         /*
4735          * The loop looks up pages in the range [off, off + len).
4736          * For each page, we first check if we should initiate an asynchronous
4737          * read ahead before we call page_lookup (we may sleep in page_lookup
4738          * for a previously initiated disk read).
4739          */
4740         eoff = (uoff + len);
4741         for (pgoff = uoff, pgaddr = addr, pl = plarr;
4742             pgoff < eoff; /* empty */) {
4743                 page_t  *pp;
4744                 u_offset_t      nextrio;
4745                 se_t    se;
4746                 int retval;
4747 
4748                 se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);
4749 
4750                 /* Handle async getpage (faultahead) */
4751                 if (plarr == NULL) {
4752                         ip->i_nextrio = pgoff;
4753                         (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4754                         pgoff += pgsize;
4755                         pgaddr += pgsize;
4756                         continue;
4757                 }
4758                 /*
4759                  * Check if we should initiate read ahead of next cluster.
4760                  * We call page_exists only when we need to confirm that
4761                  * we have the current page before we initiate the read ahead.
4762                  */
4763                 nextrio = ip->i_nextrio;
4764                 if (seqmode &&
4765                     pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
4766                     nextrio < ip->i_size && page_exists(vp, pgoff)) {
4767                         retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4768                         /*
4769                          * We always read ahead the next cluster of data
4770                          * starting from i_nextrio. If the page (vp,nextrio)
4771                          * is actually in core at this point, the routine
4772                          * ufs_getpage_ra() will stop pre-fetching data
4773                          * until we read that page in a synchronized manner
4774                          * through ufs_getpage_miss(). So, we should increase
4775                          * i_nextrio if the page (vp, nextrio) exists.
4776                          */
4777                         if ((retval == 0) && page_exists(vp, nextrio)) {
4778                                 ip->i_nextrio = nextrio + pgsize;
4779                         }
4780                 }
4781 
4782                 if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
4783                         /*
4784                          * We found the page in the page cache.
4785                          */
4786                         *pl++ = pp;
4787                         pgoff += pgsize;
4788                         pgaddr += pgsize;
4789                         len -= pgsize;
4790                         plsz -= pgsize;
4791                 } else  {
4792                         /*
4793                          * We have to create the page, or read it from disk.
4794                          */
4795                         if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
4796                             pl, plsz, rw, seqmode))
4797                                 goto error;
4798 
4799                         while (*pl != NULL) {
4800                                 pl++;
4801                                 pgoff += pgsize;
4802                                 pgaddr += pgsize;
4803                                 len -= pgsize;
4804                                 plsz -= pgsize;
4805                         }
4806                 }
4807         }
4808 
4809         /*
4810          * Return pages up to plsz if they are in the page cache.
4811          * We cannot return pages if there is a chance that they are
4812          * backed with a UFS hole and rw is S_WRITE or S_CREATE.
4813          */
4814         if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
4815 
4816                 ASSERT((protp == NULL) ||
4817                     !(has_holes && (*protp & PROT_WRITE)));
4818 
4819                 eoff = pgoff + plsz;
4820                 while (pgoff < eoff) {
4821                         page_t          *pp;
4822 
4823                         if ((pp = page_lookup_nowait(vp, pgoff,
4824                             SE_SHARED)) == NULL)
4825                                 break;
4826 
4827                         *pl++ = pp;
4828                         pgoff += pgsize;
4829                         plsz -= pgsize;
4830                 }
4831         }
4832 
4833         if (plarr)
4834                 *pl = NULL;                     /* Terminate page list */
4835         ip->i_nextr = pgoff;
4836 
4837 error:
4838         if (err && plarr) {
4839                 /*
4840                  * Release any pages we have locked.
4841                  */
4842                 while (pl > &plarr[0])
4843                         page_unlock(*--pl);
4844 
4845                 plarr[0] = NULL;
4846         }
4847 
4848 update_inode:
4849         /*
4850          * If the inode is not already marked for IACC (in rdip() for read)
4851          * and the inode is not marked for no access time update (in wrip()
4852          * for write) then update the inode access time and mod time now.
4853          */
4854         if ((ip->i_flag & (IACC | INOACC)) == 0) {
4855                 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
4856                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
4857                             (fs->fs_ronly == 0) &&
4858                             (!ufsvfsp->vfs_noatime)) {
4859                                 mutex_enter(&ip->i_tlock);
4860                                 ip->i_flag |= IACC;
4861                                 ITIMES_NOLOCK(ip);
4862                                 mutex_exit(&ip->i_tlock);
4863                         }
4864                 }
4865         }
4866 
4867         if (dolock) {
4868                 rw_exit(&ip->i_contents);
4869                 if (do_qlock && rwtype == RW_WRITER)
4870                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4871         }
4872 
4873 unlock:
4874         if (ulp) {
4875                 if ((rw == S_CREATE || rw == S_WRITE) &&
4876                     !(vp->v_flag & VISSWAP)) {
4877                         TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4878                 }
4879                 ufs_lockfs_end(ulp);
4880         }
4881 out:
4882         return (err);
4883 }
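
/*
 * Worked example of the sequential read-ahead window test in the main loop
 * of ufs_getpage() above, with hypothetical numbers: assume PAGESIZE = 8K,
 * CLUSTSZ(ip) = 64K and i_nextrio = 128K.  A fault at pgoff = 120K
 * satisfies pgoff <= i_nextrio and pgoff + CLUSTSZ(ip) = 184K >= i_nextrio,
 * so the next cluster starting at 128K is read ahead asynchronously
 * (provided the current page is already in core and 128K is below i_size).
 * A fault at pgoff = 32K is too far behind the window (32K + 64K < 128K)
 * and does not trigger read ahead.
 */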
4884 
4885 /*
4886  * ufs_getpage_miss is called when ufs_getpage missed the page in the page
4887  * cache. The page is either read from the disk, or it's created.
4888  * A page is created (without disk read) if rw == S_CREATE, or if
4889  * the page is not backed with a real disk block (UFS hole).
4890  */
4891 /* ARGSUSED */
4892 static int
4893 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
4894         caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
4895 {
4896         struct inode    *ip = VTOI(vp);
4897         page_t          *pp;
4898         daddr_t         bn;
4899         size_t          io_len;
4900         int             crpage = 0;
4901         int             err;
4902         int             contig;
4903         int             bsize = ip->i_fs->fs_bsize;
4904 
4905         /*
4906          * Figure out whether the page can be created, or must be
4907          * read from the disk.
4908          */
4909         if (rw == S_CREATE)
4910                 crpage = 1;
4911         else {
4912                 contig = 0;
4913                 if (err = bmap_read(ip, off, &bn, &contig))
4914                         return (err);
4915 
4916                 crpage = (bn == UFS_HOLE);
4917 
4918                 /*
4919                  * If it's also a fallocated block that hasn't been written to
4920                  * yet, we will treat it just like a UFS_HOLE and create
4921                  * a zero page for it.
4922                  */
4923                 if (ISFALLOCBLK(ip, bn))
4924                         crpage = 1;
4925         }
4926 
4927         if (crpage) {
4928                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg,
4929                     addr)) == NULL) {
4930                         return (ufs_fault(vp,
4931                             "ufs_getpage_miss: page_create == NULL"));
4932                 }
4933 
4934                 if (rw != S_CREATE)
4935                         pagezero(pp, 0, PAGESIZE);
4936 
4937                 io_len = PAGESIZE;
4938         } else {
4939                 u_offset_t      io_off;
4940                 uint_t  xlen;
4941                 struct buf      *bp;
4942                 ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
4943 
4944                 /*
4945                  * If access is not in sequential order, we read from disk
4946                  * in bsize units.
4947                  *
4948                  * We limit the size of the transfer to bsize if we are reading
4949                  * from the beginning of the file. Note in this situation we
4950                  * will hedge our bets and initiate an async read ahead of
4951                  * the second block.
4952                  */
4953                 if (!seq || off == 0)
4954                         contig = MIN(contig, bsize);
4955 
4956                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4957                     &io_len, off, contig, 0);
4958 
4959                 /*
4960                  * Some other thread has entered the page.
4961                  * ufs_getpage will retry page_lookup.
4962                  */
4963                 if (pp == NULL) {
4964                         pl[0] = NULL;
4965                         return (0);
4966                 }
4967 
4968                 /*
4969                  * Zero part of the page which we are not
4970                  * going to read from the disk.
4971                  */
4972                 xlen = io_len & PAGEOFFSET;
4973                 if (xlen != 0)
4974                         pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4975 
4976                 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
4977                 bp->b_edev = ip->i_dev;
4978                 bp->b_dev = cmpdev(ip->i_dev);
4979                 bp->b_blkno = bn;
4980                 bp->b_un.b_addr = (caddr_t)0;
4981                 bp->b_file = ip->i_vnode;
4982                 bp->b_offset = off;
4983 
4984                 if (ufsvfsp->vfs_log) {
4985                         lufs_read_strategy(ufsvfsp->vfs_log, bp);
4986                 } else if (ufsvfsp->vfs_snapshot) {
4987                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
4988                 } else {
4989                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
4990                         ub.ub_getpages.value.ul++;
4991                         (void) bdev_strategy(bp);
4992                         lwp_stat_update(LWP_STAT_INBLK, 1);
4993                 }
4994 
4995                 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
4996 
4997                 /*
4998                  * If the file access is sequential, initiate read ahead
4999                  * of the next cluster.
5000                  */
5001                 if (seq && ip->i_nextrio < ip->i_size)
5002                         (void) ufs_getpage_ra(vp, off, seg, addr);
5003                 err = biowait(bp);
5004                 pageio_done(bp);
5005 
5006                 if (err) {
5007                         pvn_read_done(pp, B_ERROR);
5008                         return (err);
5009                 }
5010         }
5011 
5012         pvn_plist_init(pp, pl, plsz, off, io_len, rw);
5013         return (0);
5014 }
5015 
5016 /*
5017  * Read ahead a cluster from the disk. Returns the length in bytes.
5018  */
5019 static int
5020 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
5021 {
5022         struct inode    *ip = VTOI(vp);
5023         page_t          *pp;
5024         u_offset_t      io_off = ip->i_nextrio;
5025         ufsvfs_t        *ufsvfsp;
5026         caddr_t         addr2 = addr + (io_off - off);
5027         struct buf      *bp;
5028         daddr_t         bn;
5029         size_t          io_len;
5030         int             err;
5031         int             contig;
5032         int             xlen;
5033         int             bsize = ip->i_fs->fs_bsize;
5034 
5035         /*
5036          * If the directio advisory is in effect on this file,
5037          * then do not do buffered read ahead. Read ahead makes
5038          * things harder for threads using directio, as they
5039          * will be forced to flush the pages from this vnode.
5040          */
5041         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5042                 return (0);
5043         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
5044                 return (0);
5045 
5046         /*
5047          * Is this test needed?
5048          */
5049         if (addr2 >= seg->s_base + seg->s_size)
5050                 return (0);
5051 
5052         contig = 0;
5053         err = bmap_read(ip, io_off, &bn, &contig);
5054         /*
5055          * If it's a UFS_HOLE or a fallocated block, do not perform
5056          * any read aheads since there probably is nothing to read ahead.
5057          */
5058         if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
5059                 return (0);
5060 
5061         /*
5062          * Limit the transfer size to bsize if this is the 2nd block.
5063          */
5064         if (io_off == (u_offset_t)bsize)
5065                 contig = MIN(contig, bsize);
5066 
5067         if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
5068             &io_len, io_off, contig, 1)) == NULL)
5069                 return (0);
5070 
5071         /*
5072          * Zero part of page which we are not going to read from disk
5073          */
5074         if ((xlen = (io_len & PAGEOFFSET)) > 0)
5075                 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
5076 
5077         ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
5078 
5079         bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
5080         bp->b_edev = ip->i_dev;
5081         bp->b_dev = cmpdev(ip->i_dev);
5082         bp->b_blkno = bn;
5083         bp->b_un.b_addr = (caddr_t)0;
5084         bp->b_file = ip->i_vnode;
5085         bp->b_offset = off;
5086 
5087         if (ufsvfsp->vfs_log) {
5088                 lufs_read_strategy(ufsvfsp->vfs_log, bp);
5089         } else if (ufsvfsp->vfs_snapshot) {
5090                 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5091         } else {
5092                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5093                 ub.ub_getras.value.ul++;
5094                 (void) bdev_strategy(bp);
5095                 lwp_stat_update(LWP_STAT_INBLK, 1);
5096         }
5097 
5098         return (io_len);
5099 }
5100 
5101 int     ufs_delay = 1;
5102 /*
5103  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
5104  *
5105  * LMXXX - the inode really ought to contain a pointer to one of these
5106  * async args.  Stuff gunk in there and just hand the whole mess off.
5107  * This would replace i_delaylen, i_delayoff.
5108  */
5109 /*ARGSUSED*/
5110 static int
5111 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
5112         struct cred *cr, caller_context_t *ct)
5113 {
5114         struct inode *ip = VTOI(vp);
5115         int err = 0;
5116 
5117         if (vp->v_count == 0) {
5118                 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
5119         }
5120 
5121         /*
5122          * XXX - Why should this check be made here?
5123          */
5124         if (vp->v_flag & VNOMAP) {
5125                 err = ENOSYS;
5126                 goto errout;
5127         }
5128 
5129         if (ip->i_ufsvfs == NULL) {
5130                 err = EIO;
5131                 goto errout;
5132         }
5133 
5134         if (flags & B_ASYNC) {
5135                 if (ufs_delay && len &&
5136                     (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
5137                         mutex_enter(&ip->i_tlock);
5138                         /*
5139                          * If nobody stalled, start a new cluster.
5140                          */
5141                         if (ip->i_delaylen == 0) {
5142                                 ip->i_delayoff = off;
5143                                 ip->i_delaylen = len;
5144                                 mutex_exit(&ip->i_tlock);
5145                                 goto errout;
5146                         }
5147                         /*
5148                          * If we have a full cluster or they are not contig,
5149                          * then push last cluster and start over.
5150                          */
5151                         if (ip->i_delaylen >= CLUSTSZ(ip) ||
5152                             ip->i_delayoff + ip->i_delaylen != off) {
5153                                 u_offset_t doff;
5154                                 size_t dlen;
5155 
5156                                 doff = ip->i_delayoff;
5157                                 dlen = ip->i_delaylen;
5158                                 ip->i_delayoff = off;
5159                                 ip->i_delaylen = len;
5160                                 mutex_exit(&ip->i_tlock);
5161                                 err = ufs_putpages(vp, doff, dlen,
5162                                     flags, cr);
5163                                 /* LMXXX - flags are new val, not old */
5164                                 goto errout;
5165                         }
5166                         /*
5167                          * There is something there, it's not full, and
5168                          * it is contig.
5169                          */
5170                         ip->i_delaylen += len;
5171                         mutex_exit(&ip->i_tlock);
5172                         goto errout;
5173                 }
5174                 /*
5175                  * Must have weird flags or we are not clustering.
5176                  */
5177         }
5178 
5179         err = ufs_putpages(vp, off, len, flags, cr);
5180 
5181 errout:
5182         return (err);
5183 }
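
/*
 * The B_ASYNC clustering in ufs_putpage() above batches delayed writes per
 * inode: a new request either starts a cluster (i_delaylen == 0), extends a
 * contiguous cluster that is still under CLUSTSZ(ip), or flushes the old
 * cluster and starts over.  The extension predicate, pulled out for clarity
 * (illustrative only; ip and off are the locals from the function above):
 */
#if 0
        int can_extend = (ip->i_delaylen != 0 &&
            ip->i_delayoff + ip->i_delaylen == off &&
            ip->i_delaylen < CLUSTSZ(ip));
#endif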
5184 
5185 /*
5186  * If len == 0, do from off to EOF.
5187  *
5188  * The normal cases should be len == 0 & off == 0 (entire vp list),
5189  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
5190  * (from pageout).
5191  */
5192 /*ARGSUSED*/
5193 static int
5194 ufs_putpages(
5195         struct vnode *vp,
5196         offset_t off,
5197         size_t len,
5198         int flags,
5199         struct cred *cr)
5200 {
5201         u_offset_t io_off;
5202         u_offset_t eoff;
5203         struct inode *ip = VTOI(vp);
5204         page_t *pp;
5205         size_t io_len;
5206         int err = 0;
5207         int dolock;
5208 
5209         if (vp->v_count == 0)
5210                 return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
5211         /*
5212          * Acquire the readers/write inode lock before locking
5213          * any pages in this inode.
5214          * The inode lock is held during i/o.
5215          */
5216         if (len == 0) {
5217                 mutex_enter(&ip->i_tlock);
5218                 ip->i_delayoff = ip->i_delaylen = 0;
5219                 mutex_exit(&ip->i_tlock);
5220         }
5221         dolock = (rw_owner(&ip->i_contents) != curthread);
5222         if (dolock) {
5223                 /*
5224                  * Must synchronize this thread and any possible thread
5225                  * operating in the window of vulnerability in wrip().
5226                  * It is dangerous to allow both a thread doing a putpage
5227                  * and a thread writing, so serialize them.  The exception
5228                  * is when the thread in wrip() does something which causes
5229                  * a putpage operation.  Then, the thread must be allowed
5230                  * to continue.  It may encounter a bmap_read problem in
5231                  * ufs_putapage, but that is handled in ufs_putapage.
5232                  * Allow async writers to proceed, we don't want to block
5233                  * the pageout daemon.
5234                  */
5235                 if (ip->i_writer == curthread)
5236                         rw_enter(&ip->i_contents, RW_READER);
5237                 else {
5238                         for (;;) {
5239                                 rw_enter(&ip->i_contents, RW_READER);
5240                                 mutex_enter(&ip->i_tlock);
5241                                 /*
5242                                  * If there is no thread in the critical
5243                                  * section of wrip(), then proceed.
5244                                  * Otherwise, wait until there isn't one.
5245                                  */
5246                                 if (ip->i_writer == NULL) {
5247                                         mutex_exit(&ip->i_tlock);
5248                                         break;
5249                                 }
5250                                 rw_exit(&ip->i_contents);
5251                                 /*
5252                                  * Bounce async writers when we have a writer
5253                                  * working on this file so we don't deadlock
5254                                  * the pageout daemon.
5255                                  */
5256                                 if (flags & B_ASYNC) {
5257                                         mutex_exit(&ip->i_tlock);
5258                                         return (0);
5259                                 }
5260                                 cv_wait(&ip->i_wrcv, &ip->i_tlock);
5261                                 mutex_exit(&ip->i_tlock);
5262                         }
5263                 }
5264         }
5265 
5266         if (!vn_has_cached_data(vp)) {
5267                 if (dolock)
5268                         rw_exit(&ip->i_contents);
5269                 return (0);
5270         }
5271 
5272         if (len == 0) {
5273                 /*
5274                  * Search the entire vp list for pages >= off.
5275                  */
5276                 err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage,
5277                     flags, cr);
5278         } else {
5279                 /*
5280                  * Loop over all offsets in the range looking for
5281                  * pages to deal with.
5282                  */
5283                 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
5284                         eoff = MIN(off + len, eoff);
5285                 else
5286                         eoff = off + len;
5287 
5288                 for (io_off = off; io_off < eoff; io_off += io_len) {
5289                         /*
5290                          * If we are not invalidating, synchronously
5291                          * freeing or writing pages, use the routine
5292                          * page_lookup_nowait() to prevent reclaiming
5293                          * them from the free list.
5294                          */
5295                         if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
5296                                 pp = page_lookup(vp, io_off,
5297                                     (flags & (B_INVAL | B_FREE)) ?
5298                                     SE_EXCL : SE_SHARED);
5299                         } else {
5300                                 pp = page_lookup_nowait(vp, io_off,
5301                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
5302                         }
5303 
5304                         if (pp == NULL || pvn_getdirty(pp, flags) == 0)
5305                                 io_len = PAGESIZE;
5306                         else {
5307                                 u_offset_t *io_offp = &io_off;
5308 
5309                                 err = ufs_putapage(vp, pp, io_offp, &io_len,
5310                                     flags, cr);
5311                                 if (err != 0)
5312                                         break;
5313                                 /*
5314                                  * "io_off" and "io_len" are returned as
5315                                  * the range of pages we actually wrote.
5316                                  * This allows us to skip ahead more quickly
5317                                  * since several pages may have been dealt
5318                                  * with by this iteration of the loop.
5319                                  */
5320                         }
5321                 }
5322         }
5323         if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
5324                 /*
5325                  * We have just sync'ed back all the pages on
5326                  * the inode, so turn off the IMODTIME flag.
5327                  */
5328                 mutex_enter(&ip->i_tlock);
5329                 ip->i_flag &= ~IMODTIME;
5330                 mutex_exit(&ip->i_tlock);
5331         }
5332         if (dolock)
5333                 rw_exit(&ip->i_contents);
5334         return (err);
5335 }
5336 
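     /*
      * ufs_iodone() is the b_iodone callback for page writes issued by
      * ufs_putapage().  It credits the completed buffer's byte count
      * against the inode's outstanding-write total and, if write
      * throttling is enabled and that total drops back to the ufs_LW
      * low-water mark, wakes threads sleeping on i_wrcv before
      * completing the buffer with iodone().
      */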
5337 static void
5338 ufs_iodone(buf_t *bp)
5339 {
5340         struct inode *ip;
5341 
5342         ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
5343 
5344         bp->b_iodone = NULL;
5345 
5346         ip = VTOI(bp->b_pages->p_vnode);
5347 
5348         mutex_enter(&ip->i_tlock);
5349         if (ip->i_writes >= ufs_LW) {
5350                 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW)
5351                         if (ufs_WRITES)
5352                                 cv_broadcast(&ip->i_wrcv); /* wake all up */
5353         } else {
5354                 ip->i_writes -= bp->b_bcount;
5355         }
5356 
5357         mutex_exit(&ip->i_tlock);
5358         iodone(bp);
5359 }
5360 
5361 /*
5362  * Write out a single page, possibly klustering adjacent
5363  * dirty pages.  The inode lock must be held.
5364  *
5365  * LMXXX - bsize < pagesize not done.
5366  */
5367 /*ARGSUSED*/
5368 int
5369 ufs_putapage(
5370         struct vnode *vp,
5371         page_t *pp,
5372         u_offset_t *offp,
5373         size_t *lenp,           /* return values */
5374         int flags,
5375         struct cred *cr)
5376 {
5377         u_offset_t io_off;
5378         u_offset_t off;
5379         struct inode *ip = VTOI(vp);
5380         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
5381         struct fs *fs;
5382         struct buf *bp;
5383         size_t io_len;
5384         daddr_t bn;
5385         int err;
5386         int contig;
5387         int dotrans;
5388 
5389         ASSERT(RW_LOCK_HELD(&ip->i_contents));
5390 
5391         if (ufsvfsp == NULL) {
5392                 err = EIO;
5393                 goto out_trace;
5394         }
5395 
5396         fs = ip->i_fs;
5397         ASSERT(fs->fs_ronly == 0);
5398 
5399         /*
5400          * If the modified time on the inode has not already been
5401          * set elsewhere (e.g. for write/setattr) we set the time now.
5402          * This gives us approximate modified times for mmap'ed files
5403          * which are modified via stores in the user address space.
5404          */
5405         if ((ip->i_flag & IMODTIME) == 0) {
5406                 mutex_enter(&ip->i_tlock);
5407                 ip->i_flag |= IUPD;
5408                 ip->i_seq++;
5409                 ITIMES_NOLOCK(ip);
5410                 mutex_exit(&ip->i_tlock);
5411         }
5412 
5413         /*
5414          * Align the request to a block boundary (for old file systems),
5415          * and go ask bmap() how contiguous things are for this file.
5416          */
5417         off = pp->p_offset & (offset_t)fs->fs_bmask;  /* block align it */
5418         contig = 0;
5419         err = bmap_read(ip, off, &bn, &contig);
5420         if (err)
5421                 goto out;
5422         if (bn == UFS_HOLE) {                   /* putpage never allocates */
5423                 /*
5424                  * logging device is in error mode; simply return EIO
5425                  */
5426                 if (TRANS_ISERROR(ufsvfsp)) {
5427                         err = EIO;
5428                         goto out;
5429                 }
5430                 /*
5431                  * Oops, the thread in the window in wrip() did some
5432                  * sort of operation which caused a putpage in the bad
5433                  * range.  In this case, just return an error which will
5434                  * cause the software modified bit on the page to be set
5435                  * and the page will get written out again later.
5436                  */
5437                 if (ip->i_writer == curthread) {
5438                         err = EIO;
5439                         goto out;
5440                 }
5441                 /*
5442                  * If the pager is trying to push a page in the bad range
5443                  * just tell it to try again later when things are better.
5444                  */
5445                 if (flags & B_ASYNC) {
5446                         err = EAGAIN;
5447                         goto out;
5448                 }
5449                 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
5450                 goto out;
5451         }
5452 
5453         /*
5454          * If it is a fallocate'd block, reverse the negativity since
5455          * we are now writing to it.
5456          */
5457         if (ISFALLOCBLK(ip, bn)) {
5458                 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
5459                 if (err)
5460                         goto out;
5461 
5462                 bn = -bn;
5463         }
5464 
5465         /*
5466          * Take the length (of contiguous bytes) passed back from bmap()
5467          * and _try_ to get a set of pages covering that extent.
5468          */
5469         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
5470 
5471         /*
5472          * May have run out of memory and not clustered backwards.
5473          * off          p_offset
5474          * [  pp - 1  ][   pp   ]
5475          * [    block           ]
5476          * We told bmap off, so we have to adjust the bn accordingly.
5477          */
5478         if (io_off > off) {
5479                 bn += btod(io_off - off);
5480                 contig -= (io_off - off);
5481         }
5482 
5483         /*
5484          * bmap was careful to tell us the right size, so use that.
5485          * There might be unallocated frags at the end.
5486          * LMXXX - bzero the end of the page?  We must be writing after EOF.
5487          */
5488         if (io_len > contig) {
5489                 ASSERT(io_len - contig < fs->fs_bsize);
5490                 io_len -= (io_len - contig);
5491         }
5492 
5493         /*
5494          * Handle the case where we are writing the last page after EOF.
5495          *
5496          * XXX - just a patch for i-mt3.
5497          */
5498         if (io_len == 0) {
5499                 ASSERT(pp->p_offset >=
5500                     (u_offset_t)(roundup(ip->i_size, PAGESIZE)));
5501                 io_len = PAGESIZE;
5502         }
5503 
5504         bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
5505 
5506         ULOCKFS_SET_MOD(ITOUL(ip));
5507 
5508         bp->b_edev = ip->i_dev;
5509         bp->b_dev = cmpdev(ip->i_dev);
5510         bp->b_blkno = bn;
5511         bp->b_un.b_addr = (caddr_t)0;
5512         bp->b_file = ip->i_vnode;
5513 
5514         /*
5515          * File contents of shadow or quota inodes are metadata, and updates
5516          * to these need to be put into a logging transaction. All direct
5517          * callers in UFS do that, but fsflush can come here _before_ the
5518          * normal codepath. An example would be updating ACL information, for
5519          * which the normal codepath would be:
5520          *      ufs_si_store()
5521          *      ufs_rdwri()
5522          *      wrip()
5523          *      segmap_release()
5524          *      VOP_PUTPAGE()
5525          * Here, fsflush can pick up the dirty page before segmap_release()
5526          * forces it out. If that happens, there's no transaction.
5527          * We therefore need to test whether a transaction exists, and if not
5528          * create one - for fsflush.
5529          */
5530         dotrans =
5531             (((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip) &&
5532             ((curthread->t_flag & T_DONTBLOCK) == 0) &&
5533             (TRANS_ISTRANS(ufsvfsp)));
5534 
5535         if (dotrans) {
5536                 curthread->t_flag |= T_DONTBLOCK;
5537                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5538         }
5539         if (TRANS_ISTRANS(ufsvfsp)) {
5540                 if ((ip->i_mode & IFMT) == IFSHAD) {
5541                         TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
5542                 } else if (ufsvfsp->vfs_qinod == ip) {
5543                         TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
5544                             0, 0);
5545                 }
5546         }
5547         if (dotrans) {
5548                 TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5549                 curthread->t_flag &= ~T_DONTBLOCK;
5550         }
5551 
5552         /* write throttle */
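             /*
              * Account for the bytes of this buffer in i_writes before
              * issuing it; ufs_iodone() subtracts them on completion and
              * signals i_wrcv so that throttled threads can resume.
              */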
5553 
5554         ASSERT(bp->b_iodone == NULL);
5555         bp->b_iodone = (int (*)())ufs_iodone;
5556         mutex_enter(&ip->i_tlock);
5557         ip->i_writes += bp->b_bcount;
5558         mutex_exit(&ip->i_tlock);
5559 
5560         if (bp->b_flags & B_ASYNC) {
5561                 if (ufsvfsp->vfs_log) {
5562                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5563                 } else if (ufsvfsp->vfs_snapshot) {
5564                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5565                 } else {
5566                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5567                         ub.ub_putasyncs.value.ul++;
5568                         (void) bdev_strategy(bp);
5569                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5570                 }
5571         } else {
5572                 if (ufsvfsp->vfs_log) {
5573                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5574                 } else if (ufsvfsp->vfs_snapshot) {
5575                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5576                 } else {
5577                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5578                         ub.ub_putsyncs.value.ul++;
5579                         (void) bdev_strategy(bp);
5580                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5581                 }
5582                 err = biowait(bp);
5583                 pageio_done(bp);
5584                 pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
5585         }
5586 
5587         pp = NULL;
5588 
5589 out:
5590         if (err != 0 && pp != NULL)
5591                 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
5592 
5593         if (offp)
5594                 *offp = io_off;
5595         if (lenp)
5596                 *lenp = io_len;
5597 out_trace:
5598         return (err);
5599 }
5600 
5601 uint64_t ufs_map_alock_retry_cnt;
5602 uint64_t ufs_map_lockfs_retry_cnt;
5603 
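     /*
      * Map the file into a process address space (VOP_MAP).  After the
      * usual argument checks, a VA range is chosen with choose_addr(),
      * the address space lock is taken with a try/delay loop (counted by
      * ufs_map_alock_retry_cnt) and the lockfs protocol is entered with
      * ufs_lockfs_trybegin().  If the file system is being locked, both
      * locks are dropped, we wait for the lock state to change and retry
      * from the top (counted by ufs_map_lockfs_retry_cnt).  The mapping
      * itself is created by segvn via as_map_locked().
      */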
5604 /* ARGSUSED */
5605 static int
5606 ufs_map(struct vnode *vp,
5607         offset_t off,
5608         struct as *as,
5609         caddr_t *addrp,
5610         size_t len,
5611         uchar_t prot,
5612         uchar_t maxprot,
5613         uint_t flags,
5614         struct cred *cr,
5615         caller_context_t *ct)
5616 {
5617         struct segvn_crargs vn_a;
5618         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
5619         struct ulockfs *ulp;
5620         int error, sig;
5621         k_sigset_t smask;
5622         caddr_t hint = *addrp;
5623 
5624         if (vp->v_flag & VNOMAP) {
5625                 error = ENOSYS;
5626                 goto out;
5627         }
5628 
5629         if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) {
5630                 error = ENXIO;
5631                 goto out;
5632         }
5633 
5634         if (vp->v_type != VREG) {
5635                 error = ENODEV;
5636                 goto out;
5637         }
5638 
5639 retry_map:
5640         *addrp = hint;
5641         /*
5642          * If file is being locked, disallow mapping.
5643          */
5644         if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
5645                 error = EAGAIN;
5646                 goto out;
5647         }
5648 
5649         as_rangelock(as);
5650         /*
5651          * Note that if we are retrying (because ufs_lockfs_trybegin failed in
5652          * the previous attempt), some other thread could have grabbed
5653          * the same VA range if MAP_FIXED is set. In that case, choose_addr
5654          * would unmap the valid VA range; that is OK.
5655          */
5656         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5657         if (error != 0) {
5658                 as_rangeunlock(as);
5659                 goto out;
5660         }
5661 
5662         /*
5663          * a_lock has to be acquired before entering the lockfs protocol
5664          * because that is the order in which pagefault works. Also we cannot
5665          * block on a_lock here because this waiting writer will prevent
5666          * further readers like ufs_read from progressing and could cause
5667          * deadlock between ufs_read/ufs_map/pagefault when a quiesce is
5668          * pending.
5669          */
5670         while (!AS_LOCK_TRYENTER(as, RW_WRITER)) {
5671                 ufs_map_alock_retry_cnt++;
5672                 delay(RETRY_LOCK_DELAY);
5673         }
5674 
5675         /*
5676          * We can't hold as->a_lock and wait for lockfs to succeed because
5677          * the proc tools might hang on a_lock, so call ufs_lockfs_trybegin()
5678          * instead.
5679          */
5680         if (error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK)) {
5681                 /*
5682                  * ufs_lockfs_trybegin() did not succeed. It is safer to give up
5683                  * as->a_lock and wait for ulp->ul_fs_lock status to change.
5684                  */
5685                 ufs_map_lockfs_retry_cnt++;
5686                 AS_LOCK_EXIT(as);
5687                 as_rangeunlock(as);
5688                 if (error == EIO)
5689                         goto out;
5690 
5691                 mutex_enter(&ulp->ul_lock);
5692                 while (ulp->ul_fs_lock & ULOCKFS_MAP_MASK) {
5693                         if (ULOCKFS_IS_SLOCK(ulp) || ufsvfsp->vfs_nointr) {
5694                                 cv_wait(&ulp->ul_cv, &ulp->ul_lock);
5695                         } else {
5696                                 sigintr(&smask, 1);
5697                                 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
5698                                 sigunintr(&smask);
5699                                 if (((ulp->ul_fs_lock & ULOCKFS_MAP_MASK) &&
5700                                     !sig) || ufsvfsp->vfs_dontblock) {
5701                                         mutex_exit(&ulp->ul_lock);
5702                                         return (EINTR);
5703                                 }
5704                         }
5705                 }
5706                 mutex_exit(&ulp->ul_lock);
5707                 goto retry_map;
5708         }
5709 
5710         vn_a.vp = vp;
5711         vn_a.offset = (u_offset_t)off;
5712         vn_a.type = flags & MAP_TYPE;
5713         vn_a.prot = prot;
5714         vn_a.maxprot = maxprot;
5715         vn_a.cred = cr;
5716         vn_a.amp = NULL;
5717         vn_a.flags = flags & ~MAP_TYPE;
5718         vn_a.szc = 0;
5719         vn_a.lgrp_mem_policy_flags = 0;
5720 
5721         error = as_map_locked(as, *addrp, len, segvn_create, &vn_a);
5722         if (ulp)
5723                 ufs_lockfs_end(ulp);
5724         as_rangeunlock(as);
5725 out:
5726         return (error);
5727 }
5728 
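     /*
      * ufs_addmap() and ufs_delmap() below simply maintain i_mapcnt,
      * the count of pages mapped into address spaces, as mappings of
      * this vnode are added and removed.
      */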
5729 /* ARGSUSED */
5730 static int
5731 ufs_addmap(struct vnode *vp,
5732         offset_t off,
5733         struct as *as,
5734         caddr_t addr,
5735         size_t  len,
5736         uchar_t  prot,
5737         uchar_t  maxprot,
5738         uint_t    flags,
5739         struct cred *cr,
5740         caller_context_t *ct)
5741 {
5742         struct inode *ip = VTOI(vp);
5743 
5744         if (vp->v_flag & VNOMAP) {
5745                 return (ENOSYS);
5746         }
5747 
5748         mutex_enter(&ip->i_tlock);
5749         ip->i_mapcnt += btopr(len);
5750         mutex_exit(&ip->i_tlock);
5751         return (0);
5752 }
5753 
5754 /*ARGSUSED*/
5755 static int
5756 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5757         size_t len, uint_t prot,  uint_t maxprot,  uint_t flags,
5758         struct cred *cr, caller_context_t *ct)
5759 {
5760         struct inode *ip = VTOI(vp);
5761 
5762         if (vp->v_flag & VNOMAP) {
5763                 return (ENOSYS);
5764         }
5765 
5766         mutex_enter(&ip->i_tlock);
5767         ip->i_mapcnt -= btopr(len);  /* Count released mappings */
5768         ASSERT(ip->i_mapcnt >= 0);
5769         mutex_exit(&ip->i_tlock);
5770         return (0);
5771 }
5772 /*
5773  * Return the answer requested to poll() for non-device files
5774  */
5775 struct pollhead ufs_pollhd;
5776 
5777 /* ARGSUSED */
5778 int
5779 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
5780         caller_context_t *ct)
5781 {
5782         struct ufsvfs   *ufsvfsp;
5783 
5784         *revp = 0;
5785         ufsvfsp = VTOI(vp)->i_ufsvfs;
5786 
5787         if (!ufsvfsp) {
5788                 *revp = POLLHUP;
5789                 goto out;
5790         }
5791 
5792         if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) ||
5793             ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) {
5794                 *revp |= POLLERR;
5795 
5796         } else {
5797                 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
5798                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5799                         *revp |= POLLOUT;
5800 
5801                 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly &&
5802                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5803                         *revp |= POLLWRBAND;
5804 
5805                 if (ev & POLLIN)
5806                         *revp |= POLLIN;
5807 
5808                 if (ev & POLLRDNORM)
5809                         *revp |= POLLRDNORM;
5810 
5811                 if (ev & POLLRDBAND)
5812                         *revp |= POLLRDBAND;
5813         }
5814 
5815         if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP)))
5816                 *revp |= POLLPRI;
5817 out:
5818         *phpp = !any && !*revp ? &ufs_pollhd : (struct pollhead *)NULL;
5819 
5820         return (0);
5821 }
5822 
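     /*
      * pathconf/fpathconf support (VOP_PATHCONF).  Most names fall
      * through to fs_pathconf(); the cases below are the ones UFS must
      * answer itself, including _PC_XATTR_EXISTS, which checks for an
      * extended attribute directory and unhooks it if it turns out to
      * be empty.
      */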
5823 /* ARGSUSED */
5824 static int
5825 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
5826         caller_context_t *ct)
5827 {
5828         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
5829         struct ulockfs  *ulp = NULL;
5830         struct inode    *sip = NULL;
5831         int             error;
5832         struct inode    *ip = VTOI(vp);
5833         int             issync;
5834 
5835         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
5836         if (error)
5837                 return (error);
5838 
5839         switch (cmd) {
5840                 /*
5841                  * Have to handle _PC_NAME_MAX here, because the normal way
5842                  * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
5843                  * results in a lock ordering reversal between
5844                  * ufs_lockfs_{begin,end}() and
5845                  * ufs_thread_{suspend,continue}().
5846                  *
5847                  * Keep in sync with ufs_statvfs().
5848                  */
5849         case _PC_NAME_MAX:
5850                 *valp = MAXNAMLEN;
5851                 break;
5852 
5853         case _PC_FILESIZEBITS:
5854                 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
5855                         *valp = UFS_FILESIZE_BITS;
5856                 else
5857                         *valp = 32;
5858                 break;
5859 
5860         case _PC_XATTR_EXISTS:
5861                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5862 
5863                         error =
5864                             ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, cr);
5865                         if (error ==  0 && sip != NULL) {
5866                                 /* Start transaction */
5867                                 if (ulp) {
5868                                         TRANS_BEGIN_CSYNC(ufsvfsp, issync,
5869                                             TOP_RMDIR, TOP_RMDIR_SIZE);
5870                                 }
5871                                 /*
5872                                  * Is the directory empty?
5873                                  */
5874                                 rw_enter(&sip->i_rwlock, RW_WRITER);
5875                                 rw_enter(&sip->i_contents, RW_WRITER);
5876                                 if (ufs_xattrdirempty(sip,
5877                                     sip->i_number, CRED())) {
5878                                         rw_enter(&ip->i_contents, RW_WRITER);
5879                                         ufs_unhook_shadow(ip, sip);
5880                                         rw_exit(&ip->i_contents);
5881 
5882                                         *valp = 0;
5883 
5884                                 } else
5885                                         *valp = 1;
5886                                 rw_exit(&sip->i_contents);
5887                                 rw_exit(&sip->i_rwlock);
5888                                 if (ulp) {
5889                                         TRANS_END_CSYNC(ufsvfsp, error, issync,
5890                                             TOP_RMDIR, TOP_RMDIR_SIZE);
5891                                 }
5892                                 VN_RELE(ITOV(sip));
5893                         } else if (error == ENOENT) {
5894                                 *valp = 0;
5895                                 error = 0;
5896                         }
5897                 } else {
5898                         error = fs_pathconf(vp, cmd, valp, cr, ct);
5899                 }
5900                 break;
5901 
5902         case _PC_ACL_ENABLED:
5903                 *valp = _ACL_ACLENT_ENABLED;
5904                 break;
5905 
5906         case _PC_MIN_HOLE_SIZE:
5907                 *valp = (ulong_t)ip->i_fs->fs_bsize;
5908                 break;
5909 
5910         case _PC_SATTR_ENABLED:
5911         case _PC_SATTR_EXISTS:
5912                 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5913                     (vp->v_type == VREG || vp->v_type == VDIR);
5914                 break;
5915 
5916         case _PC_TIMESTAMP_RESOLUTION:
5917                 /*
5918                  * UFS keeps only microsecond timestamp resolution.
5919                  * This is historical and will probably never change.
5920                  */
5921                 *valp = 1000L;
5922                 break;
5923 
5924         default:
5925                 error = fs_pathconf(vp, cmd, valp, cr, ct);
5926                 break;
5927         }
5928 
5929         if (ulp != NULL) {
5930                 ufs_lockfs_end(ulp);
5931         }
5932         return (error);
5933 }
5934 
5935 int ufs_pageio_writes, ufs_pageio_reads;
5936 
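     /*
      * Perform I/O directly on the caller-supplied, locked page list
      * (VOP_PAGEIO).  A NULL page list is a probe for whether the file
      * is usable for this kind of I/O (it must not have holes).
      * Otherwise the request is split into one buffer per contiguous
      * run of disk blocks reported by bmap_read(); the vmpss case
      * covers segvn faults on large-page mappings.
      */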
5937 /*ARGSUSED*/
5938 static int
5939 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5940         int flags, struct cred *cr, caller_context_t *ct)
5941 {
5942         struct inode *ip = VTOI(vp);
5943         struct ufsvfs *ufsvfsp;
5944         page_t *npp = NULL, *opp = NULL, *cpp = pp;
5945         struct buf *bp;
5946         daddr_t bn;
5947         size_t done_len = 0, cur_len = 0;
5948         int err = 0;
5949         int contig = 0;
5950         int dolock;
5951         int vmpss = 0;
5952         struct ulockfs *ulp;
5953 
5954         if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
5955             vp->v_mpssdata != NULL) {
5956                 vmpss = 1;
5957         }
5958 
5959         dolock = (rw_owner(&ip->i_contents) != curthread);
5960         /*
5961          * We need a better check.  Ideally, we would use another
5962          * vnodeops so that hlocked and forcibly unmounted file
5963          * systems would return EIO where appropriate and without
5964          * the need for these checks.
5965          */
5966         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5967                 return (EIO);
5968 
5969         /*
5970          * For the vmpss case (pp can also be NULL), respect the quiesce
5971          * protocol. ul_lock must be taken before locking pages, so we can't
5972          * use it here if pp is non-NULL because segvn already locked pages
5973          * SE_EXCL. Instead we rely on the fact that a forced umount or
5974          * applying a filesystem lock via ufs_fiolfs() will block in the
5975          * implicit call to ufs_flush() until we unlock the pages after the
5976          * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend
5977          * above 0 until they are done. We have to be careful not to increment
5978          * ul_vnops_cnt here after forceful unmount hlocks the file system.
5979          *
5980          * If pp is NULL use ul_lock to make sure we don't increment
5981          * ul_vnops_cnt after forceful unmount hlocks the file system.
5982          */
5983         if (vmpss || pp == NULL) {
5984                 ulp = &ufsvfsp->vfs_ulockfs;
5985                 if (pp == NULL)
5986                         mutex_enter(&ulp->ul_lock);
5987                 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) {
5988                         if (pp == NULL) {
5989                                 mutex_exit(&ulp->ul_lock);
5990                         }
5991                         return (vmpss ? EIO : EINVAL);
5992                 }
5993                 atomic_inc_ulong(&ulp->ul_vnops_cnt);
5994                 if (pp == NULL)
5995                         mutex_exit(&ulp->ul_lock);
5996                 if (ufs_quiesce_pend) {
5997                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5998                                 cv_broadcast(&ulp->ul_cv);
5999                         return (vmpss ? EIO : EINVAL);
6000                 }
6001         }
6002 
6003         if (dolock) {
6004                 /*
6005                  * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to
6006                  * handle a fault against a segment that maps vnode pages with
6007                  * large mappings.  Segvn creates pages and holds them locked
6008                  * SE_EXCL during VOP_PAGEIO() call. In this case we have to
6009                  * use rw_tryenter() to avoid a potential deadlock since in
6010                  * lock order i_contents needs to be taken first.
6011                  * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails.
6012                  */
6013                 if (!vmpss) {
6014                         rw_enter(&ip->i_contents, RW_READER);
6015                 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
6016                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6017                                 cv_broadcast(&ulp->ul_cv);
6018                         return (EDEADLK);
6019                 }
6020         }
6021 
6022         /*
6023          * Return an error to segvn because the pagefault request is beyond
6024          * PAGESIZE rounded EOF.
6025          */
6026         if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) {
6027                 if (dolock)
6028                         rw_exit(&ip->i_contents);
6029                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6030                         cv_broadcast(&ulp->ul_cv);
6031                 return (EFAULT);
6032         }
6033 
6034         if (pp == NULL) {
6035                 if (bmap_has_holes(ip)) {
6036                         err = ENOSYS;
6037                 } else {
6038                         err = EINVAL;
6039                 }
6040                 if (dolock)
6041                         rw_exit(&ip->i_contents);
6042                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6043                         cv_broadcast(&ulp->ul_cv);
6044                 return (err);
6045         }
6046 
6047         /*
6048          * Break the io request into chunks, one for each contiguous
6049          * stretch of disk blocks in the target file.
6050          */
6051         while (done_len < io_len) {
6052                 ASSERT(cpp);
6053                 contig = 0;
6054                 if (err = bmap_read(ip, (u_offset_t)(io_off + done_len),
6055                     &bn, &contig))
6056                         break;
6057 
6058                 if (bn == UFS_HOLE) {   /* No holey swapfiles */
6059                         if (vmpss) {
6060                                 err = EFAULT;
6061                                 break;
6062                         }
6063                         err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
6064                         break;
6065                 }
6066 
6067                 cur_len = MIN(io_len - done_len, contig);
6068                 /*
6069                  * Zero out a page beyond EOF when the last block of
6070                  * a file is a UFS fragment, so that ufs_pageio() can be used
6071                  * instead of ufs_getpage() to handle faults against
6072                  * segvn segments that use large pages.
6073                  */
6074                 page_list_break(&cpp, &npp, btopr(cur_len));
6075                 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
6076                         size_t xlen = cur_len & PAGEOFFSET;
6077                         pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
6078                 }
6079 
6080                 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
6081                 ASSERT(bp != NULL);
6082 
6083                 bp->b_edev = ip->i_dev;
6084                 bp->b_dev = cmpdev(ip->i_dev);
6085                 bp->b_blkno = bn;
6086                 bp->b_un.b_addr = (caddr_t)0;
6087                 bp->b_file = ip->i_vnode;
6088 
6089                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
6090                 ub.ub_pageios.value.ul++;
6091                 if (ufsvfsp->vfs_snapshot)
6092                         fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
6093                 else
6094                         (void) bdev_strategy(bp);
6095 
6096                 if (flags & B_READ)
6097                         ufs_pageio_reads++;
6098                 else
6099                         ufs_pageio_writes++;
6100                 if (flags & B_READ)
6101                         lwp_stat_update(LWP_STAT_INBLK, 1);
6102                 else
6103                         lwp_stat_update(LWP_STAT_OUBLK, 1);
6104                 /*
6105                  * If the request is not B_ASYNC, wait for i/o to complete
6106                  * and re-assemble the page list to return to the caller.
6107                  * If it is B_ASYNC we leave the page list in pieces and
6108                  * cleanup() will dispose of them.
6109                  */
6110                 if ((flags & B_ASYNC) == 0) {
6111                         err = biowait(bp);
6112                         pageio_done(bp);
6113                         if (err)
6114                                 break;
6115                         page_list_concat(&opp, &cpp);
6116                 }
6117                 cpp = npp;
6118                 npp = NULL;
6119                 if (flags & B_READ)
6120                         cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
6121                 done_len += cur_len;
6122         }
6123         ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
6124         if (err) {
6125                 if (flags & B_ASYNC) {
6126                         /* Cleanup unprocessed parts of list */
6127                         page_list_concat(&cpp, &npp);
6128                         if (flags & B_READ)
6129                                 pvn_read_done(cpp, B_ERROR);
6130                         else
6131                                 pvn_write_done(cpp, B_ERROR);
6132                 } else {
6133                         /* Re-assemble list and let caller clean up */
6134                         page_list_concat(&opp, &cpp);
6135                         page_list_concat(&opp, &npp);
6136                 }
6137         }
6138 
6139         if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) &&
6140             ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) {
6141                 mutex_enter(&ip->i_tlock);
6142                 ip->i_flag |= IACC;
6143                 ITIMES_NOLOCK(ip);
6144                 mutex_exit(&ip->i_tlock);
6145         }
6146 
6147         if (dolock)
6148                 rw_exit(&ip->i_contents);
6149         if (vmpss && !atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6150                 cv_broadcast(&ulp->ul_cv);
6151         return (err);
6152 }
6153 
6154 /*
6155  * Called when the kernel is in a frozen state to dump data
6156  * directly to the device. It uses a private dump data structure,
6157  * set up by ufs_dumpctl(), to locate the correct disk block to dump to.
6158  */
6159 /*ARGSUSED*/
6160 static int
6161 ufs_dump(vnode_t *vp, caddr_t addr, offset_t ldbn, offset_t dblks,
6162     caller_context_t *ct)
6163 {
6164         u_offset_t      file_size;
6165         struct inode    *ip = VTOI(vp);
6166         struct fs       *fs = ip->i_fs;
6167         daddr_t         dbn, lfsbn;
6168         int             disk_blks = fs->fs_bsize >> DEV_BSHIFT;
6169         int             error = 0;
6170         int             ndbs, nfsbs;
6171 
6172         /*
6173          * forced unmount case
6174          */
6175         if (ip->i_ufsvfs == NULL)
6176                 return (EIO);
6177         /*
6178          * Validate that the inode has not been modified since
6179          * the dump structure was allocated.
6180          */
6181         mutex_enter(&ip->i_tlock);
6182         if ((dump_info == NULL) ||
6183             (dump_info->ip != ip) ||
6184             (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
6185             (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
6186                 mutex_exit(&ip->i_tlock);
6187                 return (-1);
6188         }
6189         mutex_exit(&ip->i_tlock);
6190 
6191         /*
6192          * See that the file has room for this write
6193          */
6194         UFS_GET_ISIZE(&file_size, ip);
6195 
6196         if (ldbtob(ldbn + dblks) > file_size)
6197                 return (ENOSPC);
6198 
6199         /*
6200          * Find the physical disk block numbers from the dump
6201          * private data structure directly and write out the data
6202          * in contiguous block lumps
6203          */
6204         while (dblks > 0 && !error) {
6205                 lfsbn = (daddr_t)lblkno(fs, ldbtob(ldbn));
6206                 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
6207                 nfsbs = 1;
6208                 ndbs = disk_blks - ldbn % disk_blks;
6209                 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
6210                     nfsbs]) == dbn + ndbs) {
6211                         nfsbs++;
6212                         ndbs += disk_blks;
6213                 }
6214                 if (ndbs > dblks)
6215                         ndbs = dblks;
6216                 error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
6217                 addr += ldbtob((offset_t)ndbs);
6218                 dblks -= ndbs;
6219                 ldbn += ndbs;
6220         }
6221         return (error);
6222 
6223 }
6224 
6225 /*
6226  * Prepare the file system before and after the dump operation.
6227  *
6228  * action = DUMP_ALLOC:
6229  * Preparation before dump, allocate dump private data structure
6230  * to hold all the direct and indirect block info for dump.
6231  *
6232  * action = DUMP_FREE:
6233  * Clean up after dump, deallocate the dump private data structure.
6234  *
6235  * action = DUMP_SCAN:
6236  * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
6237  * if found, the starting file-relative DEV_BSIZE lbn is written
6238  * to *blkp; that lbn is intended for use with VOP_DUMP().
6239  */
6240 /*ARGSUSED*/
6241 static int
6242 ufs_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
6243 {
6244         struct inode    *ip = VTOI(vp);
6245         ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
6246         struct fs       *fs;
6247         daddr32_t       *dblk, *storeblk;
6248         daddr32_t       *nextblk, *endblk;
6249         struct buf      *bp;
6250         int             i, entry, entries;
6251         int             n, ncontig;
6252 
6253         /*
6254          * check for forced unmount
6255          */
6256         if (ufsvfsp == NULL)
6257                 return (EIO);
6258 
6259         if (action == DUMP_ALLOC) {
6260                 /*
6261                  * alloc and record dump_info
6262                  */
6263                 if (dump_info != NULL)
6264                         return (EINVAL);
6265 
6266                 ASSERT(vp->v_type == VREG);
6267                 fs = ufsvfsp->vfs_fs;
6268 
6269                 rw_enter(&ip->i_contents, RW_READER);
6270 
6271                 if (bmap_has_holes(ip)) {
6272                         rw_exit(&ip->i_contents);
6273                         return (EFAULT);
6274                 }
6275 
6276                 /*
6277                  * calculate and allocate space needed according to i_size
6278                  */
6279                 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
6280                 dump_info = kmem_alloc(sizeof (struct dump) +
6281                     (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP);
6282                 if (dump_info == NULL) {
6283                         rw_exit(&ip->i_contents);
6284                         return (ENOMEM);
6285                 }
6286 
6287                 /* Start saving the info */
6288                 dump_info->fsbs = entries;
6289                 dump_info->ip = ip;
6290                 storeblk = &dump_info->dblk[0];
6291 
6292                 /* Direct Blocks */
6293                 for (entry = 0; entry < NDADDR && entry < entries; entry++)
6294                         *storeblk++ = ip->i_db[entry];
6295 
6296                 /* Indirect Blocks */
6297                 for (i = 0; i < NIADDR; i++) {
6298                         int error = 0;
6299 
6300                         bp = UFS_BREAD(ufsvfsp,
6301                             ip->i_dev, fsbtodb(fs, ip->i_ib[i]), fs->fs_bsize);
6302                         if (bp->b_flags & B_ERROR)
6303                                 error = EIO;
6304                         else {
6305                                 dblk = bp->b_un.b_daddr;
6306                                 if ((storeblk = save_dblks(ip, ufsvfsp,
6307                                     storeblk, dblk, i, entries)) == NULL)
6308                                         error = EIO;
6309                         }
6310 
6311                         brelse(bp);
6312 
6313                         if (error != 0) {
6314                                 kmem_free(dump_info, sizeof (struct dump) +
6315                                     (entries - 1) * sizeof (daddr32_t));
6316                                 rw_exit(&ip->i_contents);
6317                                 dump_info = NULL;
6318                                 return (error);
6319                         }
6320                 }
6321                 /* and time stamp the information */
6322                 mutex_enter(&ip->i_tlock);
6323                 dump_info->time = ip->i_mtime;
6324                 mutex_exit(&ip->i_tlock);
6325 
6326                 rw_exit(&ip->i_contents);
6327         } else if (action == DUMP_FREE) {
6328                 /*
6329                  * free dump_info
6330                  */
6331                 if (dump_info == NULL)
6332                         return (EINVAL);
6333                 entries = dump_info->fsbs - 1;
6334                 kmem_free(dump_info, sizeof (struct dump) +
6335                     entries * sizeof (daddr32_t));
6336                 dump_info = NULL;
6337         } else if (action == DUMP_SCAN) {
6338                 /*
6339                  * scan dump_info
6340                  */
6341                 if (dump_info == NULL)
6342                         return (EINVAL);
6343 
6344                 dblk = dump_info->dblk;
6345                 nextblk = dblk + 1;
6346                 endblk = dblk + dump_info->fsbs - 1;
6347                 fs = ufsvfsp->vfs_fs;
6348                 ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
6349 
6350                 /*
6351                  * scan dblk[] entries; contig fs space is found when:
6352                  * ((current blkno + frags per block) == next blkno)
6353                  */
6354                 n = 0;
6355                 while (n < ncontig && dblk < endblk) {
6356                         if ((*dblk + fs->fs_frag) == *nextblk)
6357                                 n++;
6358                         else
6359                                 n = 0;
6360                         dblk++;
6361                         nextblk++;
6362                 }
6363 
6364                 /*
6365                  * index is where size bytes of contig space begins;
6366                  * conversion from index to the file's DEV_BSIZE lbn
6367                  * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
6368                  */
6369                 if (n == ncontig) {
6370                         i = (dblk - dump_info->dblk) - ncontig;
6371                         *blkp = i << (fs->fs_bshift - DEV_BSHIFT);
6372                 } else
6373                         return (EFAULT);
6374         }
6375         return (0);
6376 }
6377 
6378 /*
6379  * Recursive helper function for ufs_dumpctl().  It follows the indirect file
6380  * system blocks until it reaches the disk block addresses, which are
6381  * then stored into the given buffer, storeblk.
6382  */
6383 static daddr32_t *
6384 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
6385     daddr32_t *dblk, int level, int entries)
6386 {
6387         struct fs       *fs = ufsvfsp->vfs_fs;
6388         struct buf      *bp;
6389         int             i;
6390 
6391         if (level == 0) {
6392                 for (i = 0; i < NINDIR(fs); i++) {
6393                         if (storeblk - dump_info->dblk >= entries)
6394                                 break;
6395                         *storeblk++ = dblk[i];
6396                 }
6397                 return (storeblk);
6398         }
6399         for (i = 0; i < NINDIR(fs); i++) {
6400                 if (storeblk - dump_info->dblk >= entries)
6401                         break;
6402                 bp = UFS_BREAD(ufsvfsp,
6403                     ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
6404                 if (bp->b_flags & B_ERROR) {
6405                         brelse(bp);
6406                         return (NULL);
6407                 }
6408                 storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
6409                     level - 1, entries);
6410                 brelse(bp);
6411 
6412                 if (storeblk == NULL)
6413                         return (NULL);
6414         }
6415         return (storeblk);
6416 }
6417 
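     /*
      * Retrieve the ACLs of a file (VOP_GETSECATTR).  The work is done
      * by ufs_acl_get() with i_contents held as a reader; the lockfs
      * protocol and locks are only entered when vsa_mask actually asks
      * for ACL data.
      */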
6418 /* ARGSUSED */
6419 static int
6420 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
6421         struct cred *cr, caller_context_t *ct)
6422 {
6423         struct inode    *ip = VTOI(vp);
6424         struct ulockfs  *ulp;
6425         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
6426         ulong_t         vsa_mask = vsap->vsa_mask;
6427         int             err = EINVAL;
6428 
6429         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6430 
6431         /*
6432          * Only grab locks if needed - they're not needed to check vsa_mask
6433          * or if the mask contains no acl flags.
6434          */
6435         if (vsa_mask != 0) {
6436                 if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
6437                     ULOCKFS_GETATTR_MASK))
6438                         return (err);
6439 
6440                 rw_enter(&ip->i_contents, RW_READER);
6441                 err = ufs_acl_get(ip, vsap, flag, cr);
6442                 rw_exit(&ip->i_contents);
6443 
6444                 if (ulp)
6445                         ufs_lockfs_end(ulp);
6446         }
6447         return (err);
6448 }
6449 
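     /*
      * Set the ACLs of a file (VOP_SETSECATTR).  We are called with
      * i_rwlock held; for directories it must be dropped before the
      * logging transaction begins and reacquired inside it, per the
      * lock-ordering comments below.  ufs_acl_set() does the update.
      */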
6450 /* ARGSUSED */
6451 static int
6452 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr,
6453         caller_context_t *ct)
6454 {
6455         struct inode    *ip = VTOI(vp);
6456         struct ulockfs  *ulp = NULL;
6457         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
6458         ulong_t         vsa_mask = vsap->vsa_mask;
6459         int             err;
6460         int             haverwlock = 1;
6461         int             trans_size;
6462         int             donetrans = 0;
6463         int             retry = 1;
6464 
6465         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
6466 
6467         /* Abort now if the request is either empty or invalid. */
6468         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6469         if ((vsa_mask == 0) ||
6470             ((vsap->vsa_aclentp == NULL) &&
6471             (vsap->vsa_dfaclentp == NULL))) {
6472                 err = EINVAL;
6473                 goto out;
6474         }
6475 
6476         /*
6477          * Following convention, if this is a directory then we acquire the
6478          * inode's i_rwlock after starting a UFS logging transaction;
6479          * otherwise, we acquire it beforehand. Since we were called (and
6480          * must therefore return) with the lock held, we will have to drop it,
6481          * and later reacquire it, if operating on a directory.
6482          */
6483         if (vp->v_type == VDIR) {
6484                 rw_exit(&ip->i_rwlock);
6485                 haverwlock = 0;
6486         } else {
6487                 /* Upgrade the lock if required. */
6488                 if (!rw_write_held(&ip->i_rwlock)) {
6489                         rw_exit(&ip->i_rwlock);
6490                         rw_enter(&ip->i_rwlock, RW_WRITER);
6491                 }
6492         }
6493 
6494 again:
6495         ASSERT(!(vp->v_type == VDIR && haverwlock));
6496         if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
6497                 ulp = NULL;
6498                 retry = 0;
6499                 goto out;
6500         }
6501 
6502         /*
6503          * Check that the file system supports this operation. Note that
6504          * ufs_lockfs_begin() will have checked that the file system had
6505          * not been forcibly unmounted.
6506          */
6507         if (ufsvfsp->vfs_fs->fs_ronly) {
6508                 err = EROFS;
6509                 goto out;
6510         }
6511         if (ufsvfsp->vfs_nosetsec) {
6512                 err = ENOSYS;
6513                 goto out;
6514         }
6515 
6516         if (ulp) {
6517                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
6518                     trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
6519                 donetrans = 1;
6520         }
6521 
6522         if (vp->v_type == VDIR) {
6523                 rw_enter(&ip->i_rwlock, RW_WRITER);
6524                 haverwlock = 1;
6525         }
6526 
6527         ASSERT(haverwlock);
6528 
6529         /* Do the actual work. */
6530         rw_enter(&ip->i_contents, RW_WRITER);
6531         /*
6532          * Suppress out of inodes messages if we will retry.
6533          */
6534         if (retry)
6535                 ip->i_flag |= IQUIET;
6536         err = ufs_acl_set(ip, vsap, flag, cr);
6537         ip->i_flag &= ~IQUIET;
6538         rw_exit(&ip->i_contents);
6539 
6540 out:
6541         if (ulp) {
6542                 if (donetrans) {
6543                         /*
6544                          * top_end_async() can eventually call
6545                          * top_end_sync(), which can block. We must
6546                          * therefore observe the lock-ordering protocol
6547                          * here as well.
6548                          */
6549                         if (vp->v_type == VDIR) {
6550                                 rw_exit(&ip->i_rwlock);
6551                                 haverwlock = 0;
6552                         }
6553                         TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
6554                 }
6555                 ufs_lockfs_end(ulp);
6556         }
6557         /*
6558          * If no inodes are available, try scaring a logically-
6559          * free one out of the delete queue to someplace
6560          * where we can find it.
6561          */
6562         if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
6563                 ufs_delete_drain_wait(ufsvfsp, 1);
6564                 retry = 0;
6565                 if (vp->v_type == VDIR && haverwlock) {
6566                         rw_exit(&ip->i_rwlock);
6567                         haverwlock = 0;
6568                 }
6569                 goto again;
6570         }
6571         /*
6572          * If we need to reacquire the lock then it is safe to do so
6573          * as a reader. This is because ufs_rwunlock(), which will be
6574          * called by our caller after we return, does not differentiate
6575          * between shared and exclusive locks.
6576          */
6577         if (!haverwlock) {
6578                 ASSERT(vp->v_type == VDIR);
6579                 rw_enter(&ip->i_rwlock, RW_READER);
6580         }
6581 
6582         return (err);
6583 }
6584 
6585 /*
6586  * Locate the vnode to be used for an event notification. As this will
6587  * be called prior to the name space change, perform basic verification
6588  * that the change will be allowed.
6589  */
6590 
6591 static int
6592 ufs_eventlookup(struct vnode *dvp, char *nm, struct cred *cr,
6593     struct vnode **vpp)
6594 {
6595         int     namlen;
6596         int     error;
6597         struct vnode    *vp;
6598         struct inode    *ip;
6599         struct inode    *xip;
6600         struct ufsvfs   *ufsvfsp;
6601         struct ulockfs  *ulp;
6602 
6603         ip = VTOI(dvp);
6604         *vpp = NULL;
6605 
6606         if ((namlen = strlen(nm)) == 0)
6607                 return (EINVAL);
6608 
6609         if (nm[0] == '.') {
6610                 if (namlen == 1)
6611                         return (EINVAL);
6612                 else if ((namlen == 2) && nm[1] == '.') {
6613                         return (EEXIST);
6614                 }
6615         }
6616 
6617         /*
6618          * Check accessibility and write access of parent directory as we
6619          * only want to post the event if we're able to make a change.
6620          */
6621         if (error = ufs_diraccess(ip, IEXEC|IWRITE, cr))
6622                 return (error);
6623 
6624         if (vp = dnlc_lookup(dvp, nm)) {
6625                 if (vp == DNLC_NO_VNODE) {
6626                         VN_RELE(vp);
6627                         return (ENOENT);
6628                 }
6629 
6630                 *vpp = vp;
6631                 return (0);
6632         }
6633 
6634         /*
6635          * Keep the idle queue from getting too long by idling two
6636          * inodes before attempting to allocate another.
6637          * This operation must be performed before entering lockfs
6638          * or a transaction.
6639          */
6640         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
6641                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
6642                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
6643                         ufs_idle_some(ufs_lookup_idle_count);
6644                 }
6645 
6646         ufsvfsp = ip->i_ufsvfs;
6647 
6648 retry_lookup:
6649         if (error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK))
6650                 return (error);
6651 
6652         if ((error = ufs_dirlook(ip, nm, &xip, cr, 1, 1)) == 0) {
6653                 vp = ITOV(xip);
6654                 *vpp = vp;
6655         }
6656 
6657         if (ulp) {
6658                 ufs_lockfs_end(ulp);
6659         }
6660 
6661         if (error == EAGAIN)
6662                 goto retry_lookup;
6663 
6664         return (error);
6665 }