/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/ksynch.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/filio.h>
#include <sys/policy.h>

#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_snap.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_bio.h>
#include <sys/dirent.h>           /* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>
#include <sys/fssnap_if.h>
#include <sys/unistd.h>
#include <sys/sunddi.h>

#include <sys/filio.h>            /* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>

#include <fs/fs_subr.h>

#include <sys/fs/decomp.h>

static struct instats ins;

static  int ufs_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
static  int ufs_getpage_miss(struct vnode *, u_offset_t, size_t, struct seg *,
                caddr_t, struct page **, size_t, enum seg_rw, int);
static  int ufs_open(struct vnode **, int, struct cred *, caller_context_t *);
static  int ufs_close(struct vnode *, int, int, offset_t, struct cred *,
                caller_context_t *);
static  int ufs_read(struct vnode *, struct uio *, int, struct cred *,
                struct caller_context *);
static  int ufs_write(struct vnode *, struct uio *, int, struct cred *,
                struct caller_context *);
static  int ufs_ioctl(struct vnode *, int, intptr_t, int, struct cred *,
                int *, caller_context_t *);
static  int ufs_getattr(struct vnode *, struct vattr *, int, struct cred *,
                caller_context_t *);
static  int ufs_setattr(struct vnode *, struct vattr *, int, struct cred *,
                caller_context_t *);
static  int ufs_access(struct vnode *, int, int, struct cred *,
                caller_context_t *);
static  int ufs_lookup(struct vnode *, char *, struct vnode **,
                struct pathname *, int, struct vnode *, struct cred *,
                caller_context_t *, int *, pathname_t *);
static  int ufs_create(struct vnode *, char *, struct vattr *, enum vcexcl,
                int, struct vnode **, struct cred *, int,
                caller_context_t *, vsecattr_t  *);
static  int ufs_remove(struct vnode *, char *, struct cred *,
                caller_context_t *, int);
static  int ufs_link(struct vnode *, struct vnode *, char *, struct cred *,
                caller_context_t *, int);
static  int ufs_rename(struct vnode *, char *, struct vnode *, char *,
                struct cred *, caller_context_t *, int);
static  int ufs_mkdir(struct vnode *, char *, struct vattr *, struct vnode **,
                struct cred *, caller_context_t *, int, vsecattr_t *);
static  int ufs_rmdir(struct vnode *, char *, struct vnode *, struct cred *,
                caller_context_t *, int);
static  int ufs_readdir(struct vnode *, struct uio *, struct cred *, int *,
                caller_context_t *, int);
static  int ufs_symlink(struct vnode *, char *, struct vattr *, char *,
                struct cred *, caller_context_t *, int);
static  int ufs_readlink(struct vnode *, struct uio *, struct cred *,
                caller_context_t *);
static  int ufs_fsync(struct vnode *, int, struct cred *, caller_context_t *);
static  void ufs_inactive(struct vnode *, struct cred *, caller_context_t *);
static  int ufs_fid(struct vnode *, struct fid *, caller_context_t *);
static  int ufs_rwlock(struct vnode *, int, caller_context_t *);
static  void ufs_rwunlock(struct vnode *, int, caller_context_t *);
static  int ufs_seek(struct vnode *, offset_t, offset_t *, caller_context_t *);
static  int ufs_frlock(struct vnode *, int, struct flock64 *, int, offset_t,
                struct flk_callback *, struct cred *,
                caller_context_t *);
static  int ufs_space(struct vnode *, int, struct flock64 *, int, offset_t,
                cred_t *, caller_context_t *);
static  int ufs_getpage(struct vnode *, offset_t, size_t, uint_t *,
                struct page **, size_t, struct seg *, caddr_t,
                enum seg_rw, struct cred *, caller_context_t *);
static  int ufs_putpage(struct vnode *, offset_t, size_t, int, struct cred *,
                caller_context_t *);
static  int ufs_putpages(struct vnode *, offset_t, size_t, int, struct cred *);
static  int ufs_map(struct vnode *, offset_t, struct as *, caddr_t *, size_t,
                uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static  int ufs_addmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
                uchar_t, uchar_t, uint_t, struct cred *, caller_context_t *);
static  int ufs_delmap(struct vnode *, offset_t, struct as *, caddr_t,  size_t,
                uint_t, uint_t, uint_t, struct cred *, caller_context_t *);
static  int ufs_poll(vnode_t *, short, int, short *, struct pollhead **,
                caller_context_t *);
static  int ufs_dump(vnode_t *, caddr_t, offset_t, offset_t,
    caller_context_t *);
static  int ufs_l_pathconf(struct vnode *, int, ulong_t *, struct cred *,
                caller_context_t *);
static  int ufs_pageio(struct vnode *, struct page *, u_offset_t, size_t, int,
                struct cred *, caller_context_t *);
static  int ufs_dumpctl(vnode_t *, int, offset_t *, caller_context_t *);
static  daddr32_t *save_dblks(struct inode *, struct ufsvfs *, daddr32_t *,
                daddr32_t *, int, int);
static  int ufs_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
                caller_context_t *);
static  int ufs_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *,
                caller_context_t *);
static  int ufs_priv_access(void *, int, struct cred *);
static  int ufs_eventlookup(struct vnode *, char *, struct cred *,
    struct vnode **);
extern int as_map_locked(struct as *, caddr_t, size_t, int ((*)()), void *);

/*
 * For lockfs: ulockfs begin/end is now inlined in the ufs_xxx functions.
 *
 * XXX - ULOCKFS in fs_pathconf and ufs_ioctl is not inlined yet.
 */
struct vnodeops *ufs_vnodeops;

/* NOTE: "not blkd" below means that the operation isn't blocked by lockfs */
const fs_operation_def_t ufs_vnodeops_template[] = {
        VOPNAME_OPEN,           { .vop_open = ufs_open },       /* not blkd */
        VOPNAME_CLOSE,          { .vop_close = ufs_close },     /* not blkd */
        VOPNAME_READ,           { .vop_read = ufs_read },
        VOPNAME_WRITE,          { .vop_write = ufs_write },
        VOPNAME_IOCTL,          { .vop_ioctl = ufs_ioctl },
        VOPNAME_GETATTR,        { .vop_getattr = ufs_getattr },
        VOPNAME_SETATTR,        { .vop_setattr = ufs_setattr },
        VOPNAME_ACCESS,         { .vop_access = ufs_access },
        VOPNAME_LOOKUP,         { .vop_lookup = ufs_lookup },
        VOPNAME_CREATE,         { .vop_create = ufs_create },
        VOPNAME_REMOVE,         { .vop_remove = ufs_remove },
        VOPNAME_LINK,           { .vop_link = ufs_link },
        VOPNAME_RENAME,         { .vop_rename = ufs_rename },
        VOPNAME_MKDIR,          { .vop_mkdir = ufs_mkdir },
        VOPNAME_RMDIR,          { .vop_rmdir = ufs_rmdir },
        VOPNAME_READDIR,        { .vop_readdir = ufs_readdir },
        VOPNAME_SYMLINK,        { .vop_symlink = ufs_symlink },
        VOPNAME_READLINK,       { .vop_readlink = ufs_readlink },
        VOPNAME_FSYNC,          { .vop_fsync = ufs_fsync },
        VOPNAME_INACTIVE,       { .vop_inactive = ufs_inactive }, /* not blkd */
        VOPNAME_FID,            { .vop_fid = ufs_fid },
        VOPNAME_RWLOCK,         { .vop_rwlock = ufs_rwlock },   /* not blkd */
        VOPNAME_RWUNLOCK,       { .vop_rwunlock = ufs_rwunlock }, /* not blkd */
        VOPNAME_SEEK,           { .vop_seek = ufs_seek },
        VOPNAME_FRLOCK,         { .vop_frlock = ufs_frlock },
        VOPNAME_SPACE,          { .vop_space = ufs_space },
        VOPNAME_GETPAGE,        { .vop_getpage = ufs_getpage },
        VOPNAME_PUTPAGE,        { .vop_putpage = ufs_putpage },
        VOPNAME_MAP,            { .vop_map = ufs_map },
        VOPNAME_ADDMAP,         { .vop_addmap = ufs_addmap },   /* not blkd */
        VOPNAME_DELMAP,         { .vop_delmap = ufs_delmap },   /* not blkd */
        VOPNAME_POLL,           { .vop_poll = ufs_poll },       /* not blkd */
        VOPNAME_DUMP,           { .vop_dump = ufs_dump },
        VOPNAME_PATHCONF,       { .vop_pathconf = ufs_l_pathconf },
        VOPNAME_PAGEIO,         { .vop_pageio = ufs_pageio },
        VOPNAME_DUMPCTL,        { .vop_dumpctl = ufs_dumpctl },
        VOPNAME_GETSECATTR,     { .vop_getsecattr = ufs_getsecattr },
        VOPNAME_SETSECATTR,     { .vop_setsecattr = ufs_setsecattr },
        VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
        NULL,                   NULL
};
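
/*
 * Illustrative sketch only (the registration itself lives outside this
 * file): the template above is turned into the generic ufs_vnodeops
 * structure at file system initialization time, roughly along the lines
 * of
 *
 *      error = vn_make_ops("ufs", ufs_vnodeops_template, &ufs_vnodeops);
 *      if (error != 0)
 *              cmn_err(CE_WARN, "ufs: vnode ops registration failed");
 *
 * Only the vn_make_ops() call reflects the actual mechanism; the warning
 * message here is made up for illustration.
 */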

#define MAX_BACKFILE_COUNT      9999

/*
 * Created by ufs_dumpctl() to store a file's disk block info into memory.
 * Used by ufs_dump() to dump data to disk directly.
 */
struct dump {
        struct inode    *ip;            /* the file we contain */
        daddr_t         fsbs;           /* number of blocks stored */
        struct timeval32 time;          /* time stamp for the struct */
        daddr32_t       dblk[1];        /* place holder for block info */
};
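
/*
 * Note on dblk[]: the single-element array is a placeholder for a
 * variable-length block list.  ufs_dumpctl() is expected to allocate the
 * structure with extra space appended so dblk[] can hold one daddr32_t
 * per file system block of the file being dumped, roughly
 *
 *      kmem_alloc(sizeof (struct dump) +
 *          (nblocks - 1) * sizeof (daddr32_t), KM_SLEEP);
 *
 * The sizing expression above is illustrative (nblocks is a stand-in
 * name); see ufs_dumpctl() for the authoritative allocation.
 */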

static struct dump *dump_info = NULL;

/*
 * Previously no special action was required for ordinary files.
 * (Devices are handled through the device file system.)
 * Now that we support Large Files, the Large File API requires open to
 * fail if the file is large.
 * We could prevent data corruption by doing an atomic check of the file
 * size and truncating if the file is opened with the FTRUNC flag set,
 * but traditionally this is done by the vfs/vnode layers.  Taking care
 * of truncation here would change the existing semantics of VOP_OPEN,
 * so we chose not to implement anything here.  The check for file
 * size > 2GB is done at the vfs layer in vn_open().
 */

/* ARGSUSED */
static int
ufs_open(struct vnode **vpp, int flag, struct cred *cr, caller_context_t *ct)
{
        return (0);
}

/*ARGSUSED*/
static int
ufs_close(struct vnode *vp, int flag, int count, offset_t offset,
    struct cred *cr, caller_context_t *ct)
{
        cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
        cleanshares(vp, ttoproc(curthread)->p_pid);

        /*
         * Push partially filled cluster at last close.
         * ``last close'' is approximated because the dnlc
         * may have a hold on the vnode.
         * Checking for VBAD here will also act as a forced umount check.
         */
        if (vp->v_count <= 2 && vp->v_type != VBAD) {
                struct inode *ip = VTOI(vp);
                if (ip->i_delaylen) {
                        ins.in_poc.value.ul++;
                        (void) ufs_putpages(vp, ip->i_delayoff, ip->i_delaylen,
                            B_ASYNC | B_FREE, cr);
                        ip->i_delaylen = 0;
                }
        }

        return (0);
}

/*ARGSUSED*/
static int
ufs_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
    struct caller_context *ct)
{
        struct inode *ip = VTOI(vp);
        struct ufsvfs *ufsvfsp;
        struct ulockfs *ulp = NULL;
        int error = 0;
        int intrans = 0;

        ASSERT(RW_READ_HELD(&ip->i_rwlock));

        /*
         * Mandatory locking needs to be done before ufs_lockfs_begin()
         * and TRANS_BEGIN_SYNC() calls since mandatory locks can sleep.
         */
        if (MANDLOCK(vp, ip->i_mode)) {
                /*
                 * ufs_getattr ends up being called by chklock
                 */
                error = chklock(vp, FREAD, uiop->uio_loffset,
                    uiop->uio_resid, uiop->uio_fmode, ct);
                if (error)
                        goto out;
        }

        ufsvfsp = ip->i_ufsvfs;
        error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READ_MASK);
        if (error)
                goto out;

        /*
         * When a directory is opened for reading as a file (e.g. "cat .")
         * with the O_RSYNC, O_SYNC and O_DSYNC flags set, the locking
         * order has to be changed to avoid a deadlock with an update
         * taking place on that directory at the same time.
         */
        if ((ip->i_mode & IFMT) == IFDIR) {

                rw_enter(&ip->i_contents, RW_READER);
                error = rdip(ip, uiop, ioflag, cr);
                rw_exit(&ip->i_contents);

                if (error) {
                        if (ulp)
                                ufs_lockfs_end(ulp);
                        goto out;
                }

                if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
                    TRANS_ISTRANS(ufsvfsp)) {
                        rw_exit(&ip->i_rwlock);
                        TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
                            error);
                        ASSERT(!error);
                        TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
                            TOP_READ_SIZE);
                        rw_enter(&ip->i_rwlock, RW_READER);
                }
        } else {
                /*
                 * Only transact reads to files opened for sync-read and
                 * sync-write on a file system that is not write locked.
                 *
                 * The ``not write locked'' check prevents problems with
                 * enabling/disabling logging on a busy file system.  E.g.,
                 * logging exists at the beginning of the read but no
                 * longer exists at the end.
                 */
                if (ulp && (ioflag & FRSYNC) && (ioflag & (FSYNC | FDSYNC)) &&
                    TRANS_ISTRANS(ufsvfsp)) {
                        TRANS_BEGIN_SYNC(ufsvfsp, TOP_READ_SYNC, TOP_READ_SIZE,
                            error);
                        ASSERT(!error);
                        intrans = 1;
                }

                rw_enter(&ip->i_contents, RW_READER);
                error = rdip(ip, uiop, ioflag, cr);
                rw_exit(&ip->i_contents);

                if (intrans) {
                        TRANS_END_SYNC(ufsvfsp, error, TOP_READ_SYNC,
                            TOP_READ_SIZE);
                }
        }

        if (ulp) {
                ufs_lockfs_end(ulp);
        }
out:

        return (error);
}

extern  int     ufs_HW;         /* high water mark */
extern  int     ufs_LW;         /* low water mark */
int     ufs_WRITES = 1;         /* XXX - enable/disable */
int     ufs_throttles = 0;      /* throttling count */
int     ufs_allow_shared_writes = 1;    /* directio shared writes */
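
/*
 * Note on the write throttle: ufs_HW and ufs_LW are high and low water
 * marks for the amount of outstanding write data on a single inode
 * (tracked in ip->i_writes).  With ufs_WRITES enabled, ufs_write() below
 * sleeps on ip->i_wrcv while i_writes exceeds ufs_HW; the write
 * completion path is expected to wake those sleepers once i_writes
 * drains back below ufs_LW.  ufs_throttles counts how often the
 * throttle was taken.
 */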

static int
ufs_check_rewrite(struct inode *ip, struct uio *uiop, int ioflag)
{
        int     shared_write;

        /*
         * If the FDSYNC flag is set then ignore the global
         * ufs_allow_shared_writes in this case.
         */
        shared_write = (ioflag & FDSYNC) | ufs_allow_shared_writes;

        /*
         * Filter to determine if this request is suitable as a
         * concurrent rewrite. This write must not allocate blocks
         * by extending the file or filling in holes. No use trying
         * through FSYNC descriptors as the inode will be synchronously
         * updated after the write. The uio structure has not yet been
         * checked for sanity, so assume nothing.
         */
        return (((ip->i_mode & IFMT) == IFREG) && !(ioflag & FAPPEND) &&
            (uiop->uio_loffset >= (offset_t)0) &&
            (uiop->uio_loffset < ip->i_size) && (uiop->uio_resid > 0) &&
            ((ip->i_size - uiop->uio_loffset) >= uiop->uio_resid) &&
            !(ioflag & FSYNC) && !bmap_has_holes(ip) &&
            shared_write);
}

/*ARGSUSED*/
static int
ufs_write(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
        struct inode *ip = VTOI(vp);
        struct ufsvfs *ufsvfsp;
        struct ulockfs *ulp;
        int retry = 1;
        int error, resv, resid = 0;
        int directio_status;
        int exclusive;
        int rewriteflg;
        long start_resid = uiop->uio_resid;

        ASSERT(RW_LOCK_HELD(&ip->i_rwlock));

retry_mandlock:
        /*
         * Mandatory locking needs to be done before ufs_lockfs_begin()
         * and TRANS_BEGIN_[A]SYNC() calls since mandatory locks can sleep.
         * Check for forced unmounts normally done in ufs_lockfs_begin().
         */
        if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
                error = EIO;
                goto out;
        }
        if (MANDLOCK(vp, ip->i_mode)) {

                ASSERT(RW_WRITE_HELD(&ip->i_rwlock));

                /*
                 * ufs_getattr ends up being called by chklock
                 */
                error = chklock(vp, FWRITE, uiop->uio_loffset,
                    uiop->uio_resid, uiop->uio_fmode, ct);
                if (error)
                        goto out;
        }

        /* i_rwlock can change in chklock */
        exclusive = rw_write_held(&ip->i_rwlock);
        rewriteflg = ufs_check_rewrite(ip, uiop, ioflag);

        /*
         * Check for fast-path special case of directio re-writes.
         */
        if ((ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) &&
            !exclusive && rewriteflg) {

                error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
                if (error)
                        goto out;

                rw_enter(&ip->i_contents, RW_READER);
                error = ufs_directio_write(ip, uiop, ioflag, 1, cr,
                    &directio_status);
                if (directio_status == DIRECTIO_SUCCESS) {
                        uint_t i_flag_save;

                        if (start_resid != uiop->uio_resid)
                                error = 0;
                        /*
                         * Special treatment of access times for re-writes.
                         * If IMOD is not already set, then convert it
                         * to IMODACC for this operation. This defers
                         * entering a delta into the log until the inode
                         * is flushed. This mimics what is done for read
                         * operations and inode access time.
                         */
                        mutex_enter(&ip->i_tlock);
                        i_flag_save = ip->i_flag;
                        ip->i_flag |= IUPD | ICHG;
                        ip->i_seq++;
                        ITIMES_NOLOCK(ip);
                        if ((i_flag_save & IMOD) == 0) {
                                ip->i_flag &= ~IMOD;
                                ip->i_flag |= IMODACC;
                        }
                        mutex_exit(&ip->i_tlock);
                        rw_exit(&ip->i_contents);
                        if (ulp)
                                ufs_lockfs_end(ulp);
                        goto out;
                }
                rw_exit(&ip->i_contents);
                if (ulp)
                        ufs_lockfs_end(ulp);
        }

        if (!exclusive && !rw_tryupgrade(&ip->i_rwlock)) {
                rw_exit(&ip->i_rwlock);
                rw_enter(&ip->i_rwlock, RW_WRITER);
                /*
                 * Mandatory locking could have been enabled
                 * after dropping the i_rwlock.
                 */
                if (MANDLOCK(vp, ip->i_mode))
                        goto retry_mandlock;
        }

        error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
        if (error)
                goto out;

        /*
         * Amount of log space needed for this write
         */
        if (!rewriteflg || !(ioflag & FDSYNC))
                TRANS_WRITE_RESV(ip, uiop, ulp, &resv, &resid);

        /*
         * Throttle writes.
         */
        if (ufs_WRITES && (ip->i_writes > ufs_HW)) {
                mutex_enter(&ip->i_tlock);
                while (ip->i_writes > ufs_HW) {
                        ufs_throttles++;
                        cv_wait(&ip->i_wrcv, &ip->i_tlock);
                }
                mutex_exit(&ip->i_tlock);
        }

        /*
         * Enter Transaction
         *
         * If the write is a rewrite there is no need to open a transaction
         * if the FDSYNC flag is set and not the FSYNC.  In this case just
         * set the IMODACC flag so the inode update is done at a later time,
         * thus avoiding the overhead of a logging transaction that is
         * not required.
         */
        if (ioflag & (FSYNC|FDSYNC)) {
                if (ulp) {
                        if (rewriteflg) {
                                uint_t i_flag_save;

                                rw_enter(&ip->i_contents, RW_READER);
                                mutex_enter(&ip->i_tlock);
                                i_flag_save = ip->i_flag;
                                ip->i_flag |= IUPD | ICHG;
                                ip->i_seq++;
                                ITIMES_NOLOCK(ip);
                                if ((i_flag_save & IMOD) == 0) {
                                        ip->i_flag &= ~IMOD;
                                        ip->i_flag |= IMODACC;
                                }
                                mutex_exit(&ip->i_tlock);
                                rw_exit(&ip->i_contents);
                        } else {
                                int terr = 0;
                                TRANS_BEGIN_SYNC(ufsvfsp, TOP_WRITE_SYNC, resv,
                                    terr);
                                ASSERT(!terr);
                        }
                }
        } else {
                if (ulp)
                        TRANS_BEGIN_ASYNC(ufsvfsp, TOP_WRITE, resv);
        }

        /*
         * Write the file
         */
        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        rw_enter(&ip->i_contents, RW_WRITER);
        if ((ioflag & FAPPEND) != 0 && (ip->i_mode & IFMT) == IFREG) {
                /*
                 * In append mode start at end of file.
                 */
                uiop->uio_loffset = ip->i_size;
        }

        /*
         * Mild optimisation: don't call ufs_trans_write() unless we have to.
         * Also, suppress file system full messages if we will retry.
         */
        if (retry)
                ip->i_flag |= IQUIET;
        if (resid) {
                TRANS_WRITE(ip, uiop, ioflag, error, ulp, cr, resv, resid);
        } else {
                error = wrip(ip, uiop, ioflag, cr);
        }
        ip->i_flag &= ~IQUIET;

        rw_exit(&ip->i_contents);
        rw_exit(&ufsvfsp->vfs_dqrwlock);

        /*
         * Leave Transaction
         */
        if (ulp) {
                if (ioflag & (FSYNC|FDSYNC)) {
                        if (!rewriteflg) {
                                int terr = 0;

                                TRANS_END_SYNC(ufsvfsp, terr, TOP_WRITE_SYNC,
                                    resv);
                                if (error == 0)
                                        error = terr;
                        }
                } else {
                        TRANS_END_ASYNC(ufsvfsp, TOP_WRITE, resv);
                }
                ufs_lockfs_end(ulp);
        }
out:
        if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
                /*
                 * Any blocks tied up in pending deletes?
                 */
                ufs_delete_drain_wait(ufsvfsp, 1);
                retry = 0;
                goto retry_mandlock;
        }

        if (error == ENOSPC && (start_resid != uiop->uio_resid))
                error = 0;

        return (error);
}

/*
 * Don't cache write blocks to files with the sticky bit set.
 * Used to keep swap files from blowing the page cache on a server.
 */
int stickyhack = 1;

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 */
int     freebehind = 1;
int     smallfile = 0;
u_offset_t smallfile64 = 32 * 1024;

/*
 * While we should, in most cases, cache the pages for write, we
 * may also want to cache the pages for read as long as they are
 * frequently re-usable.
 *
 * If cache_read_ahead = 1, the pages for read will go to the tail
 * of the cache list when they are released, otherwise go to the head.
 */
int     cache_read_ahead = 0;

/*
 * Freebehind exists so that as we read large files sequentially we
 * don't consume most of memory with pages from a few files.  It takes
 * longer to re-read multiple small files from disk than it does to read
 * one large one sequentially.  As system memory grows, customers need
 * to retain bigger chunks of files in memory.  The advent of the
 * cachelist opens up the possibility of freeing pages to the head or
 * tail of the list.
 *
 * Not freeing a page is a bet that the page will be read again before
 * its segmap slot is needed for something else.  If we lose the bet,
 * it means some other thread is burdened with the page free we did
 * not do.  If we win we save a free and reclaim.
 *
 * Freeing a page at the tail vs the head of the cachelist is a bet that
 * the page will survive until the next read.  It's also saying that this
 * page is more likely to be re-used than a page freed some time ago
 * and never reclaimed.
 *
 * Freebehind maintains a range of file offsets [smallfile1; smallfile2]:
 *
 *            0 < offset < smallfile1 : pages are not freed.
 *   smallfile1 < offset < smallfile2 : pages freed to tail of cachelist.
 *   smallfile2 < offset              : pages freed to head of cachelist.
 *
 * The range is computed at most once per second and depends on
 * freemem and ncpus_online.  Both parameters are bounded to be
 * >= smallfile && >= smallfile64.
 *
 * smallfile1 = (free memory / ncpu) / 1000
 * smallfile2 = (free memory / ncpu) / 10
 *
 * A few example values:
 *
 *       Free Mem (in Bytes) [smallfile1; smallfile2]  [smallfile1; smallfile2]
 *                                 ncpus_online = 4          ncpus_online = 64
 *       ------------------  -----------------------   -----------------------
 *             1G                   [256K;  25M]               [32K; 1.5M]
 *            10G                   [2.5M; 250M]              [156K;  15M]
 *           100G                    [25M; 2.5G]              [1.5M; 150M]
 */

#define SMALLFILE1_D 1000
#define SMALLFILE2_D 10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0;    /* lbolt value of when to recompute */
uint_t smallfile1_d = SMALLFILE1_D;
uint_t smallfile2_d = SMALLFILE2_D;
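
/*
 * Illustrative sketch only (the actual recomputation happens in the read
 * path later in this file and is not reproduced here): roughly once per
 * second the thresholds above are refreshed from freemem and
 * ncpus_online along the lines of
 *
 *      if (smallfile_update < ddi_get_lbolt()) {
 *              u_offset_t per_cpu = (u_offset_t)freemem * PAGESIZE /
 *                  ncpus_online;
 *              smallfile1 = MAX(per_cpu / smallfile1_d, smallfile64);
 *              smallfile2 = MAX(per_cpu / smallfile2_d, smallfile64);
 *              smallfile_update = ddi_get_lbolt() + hz;
 *      }
 *
 * The bounding against smallfile64 follows the block comment above; the
 * exact expressions may differ in detail from the real code.
 */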
 730 
 731 /*
 732  * wrip does the real work of write requests for ufs.
 733  */
 734 int
 735 wrip(struct inode *ip, struct uio *uio, int ioflag, struct cred *cr)
 736 {
 737         rlim64_t limit = uio->uio_llimit;
 738         u_offset_t off;
 739         u_offset_t old_i_size;
 740         struct fs *fs;
 741         struct vnode *vp;
 742         struct ufsvfs *ufsvfsp;
 743         caddr_t base;
 744         long start_resid = uio->uio_resid;   /* save starting resid */
 745         long premove_resid;                     /* resid before uiomove() */
 746         uint_t flags;
 747         int newpage;
 748         int iupdat_flag, directio_status;
 749         int n, on, mapon;
 750         int error, pagecreate;
 751         int do_dqrwlock;                /* drop/reacquire vfs_dqrwlock */
 752         int32_t iblocks;
 753         int     new_iblocks;
 754 
 755         /*
 756          * ip->i_size is incremented before the uiomove
 757          * is done on a write.  If the move fails (bad user
 758          * address) reset ip->i_size.
 759          * The better way would be to increment ip->i_size
 760          * only if the uiomove succeeds.
 761          */
 762         int i_size_changed = 0;
 763         o_mode_t type;
 764         int i_seq_needed = 0;
 765 
 766         vp = ITOV(ip);
 767 
 768         /*
 769          * check for forced unmount - should not happen as
 770          * the request passed the lockfs checks.
 771          */
 772         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
 773                 return (EIO);
 774 
 775         fs = ip->i_fs;
 776 
 777         ASSERT(RW_WRITE_HELD(&ip->i_contents));
 778 
 779         /* check for valid filetype */
 780         type = ip->i_mode & IFMT;
 781         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
 782             (type != IFLNK) && (type != IFSHAD)) {
 783                 return (EIO);
 784         }
 785 
 786         /*
 787          * the actual limit of UFS file size
 788          * is UFS_MAXOFFSET_T
 789          */
 790         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 791                 limit = MAXOFFSET_T;
 792 
 793         if (uio->uio_loffset >= limit) {
 794                 proc_t *p = ttoproc(curthread);
 795 
 796                 mutex_enter(&p->p_lock);
 797                 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
 798                     p, RCA_UNSAFE_SIGINFO);
 799                 mutex_exit(&p->p_lock);
 800                 return (EFBIG);
 801         }
 802 
 803         /*
 804          * if largefiles are disallowed, the limit is
 805          * the pre-largefiles value of 2GB
 806          */
 807         if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
 808                 limit = MIN(UFS_MAXOFFSET_T, limit);
 809         else
 810                 limit = MIN(MAXOFF32_T, limit);
 811 
 812         if (uio->uio_loffset < (offset_t)0) {
 813                 return (EINVAL);
 814         }
 815         if (uio->uio_resid == 0) {
 816                 return (0);
 817         }
 818 
 819         if (uio->uio_loffset >= limit)
 820                 return (EFBIG);
 821 
 822         ip->i_flag |= INOACC;        /* don't update ref time in getpage */
 823 
 824         if (ioflag & (FSYNC|FDSYNC)) {
 825                 ip->i_flag |= ISYNC;
 826                 iupdat_flag = 1;
 827         }
 828         /*
 829          * Try to go direct
 830          */
 831         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
 832                 uio->uio_llimit = limit;
 833                 error = ufs_directio_write(ip, uio, ioflag, 0, cr,
 834                     &directio_status);
 835                 /*
 836                  * If ufs_directio wrote to the file or set the flags,
 837                  * we need to update i_seq, but it may be deferred.
 838                  */
 839                 if (start_resid != uio->uio_resid ||
 840                     (ip->i_flag & (ICHG|IUPD))) {
 841                         i_seq_needed = 1;
 842                         ip->i_flag |= ISEQ;
 843                 }
 844                 if (directio_status == DIRECTIO_SUCCESS)
 845                         goto out;
 846         }
 847 
 848         /*
 849          * Behavior with respect to dropping/reacquiring vfs_dqrwlock:
 850          *
 851          * o shadow inodes: vfs_dqrwlock is not held at all
 852          * o quota updates: vfs_dqrwlock is read or write held
 853          * o other updates: vfs_dqrwlock is read held
 854          *
 855          * The first case is the only one where we do not hold
 856          * vfs_dqrwlock at all while entering wrip().
 857          * We must make sure not to downgrade/drop vfs_dqrwlock if we
 858          * have it as writer, i.e. if we are updating the quota inode.
 859          * There is no potential deadlock scenario in this case as
 860          * ufs_getpage() takes care of this and avoids reacquiring
 861          * vfs_dqrwlock in that case.
 862          *
 863          * This check is done here since the above conditions do not change
 864          * and we possibly loop below, so save a few cycles.
 865          */
 866         if ((type == IFSHAD) ||
 867             (rw_owner(&ufsvfsp->vfs_dqrwlock) == curthread)) {
 868                 do_dqrwlock = 0;
 869         } else {
 870                 do_dqrwlock = 1;
 871         }
 872 
 873         /*
 874          * Large Files: We cast MAXBMASK to offset_t
 875          * inorder to mask out the higher bits. Since offset_t
 876          * is a signed value, the high order bit set in MAXBMASK
 877          * value makes it do the right thing by having all bits 1
 878          * in the higher word. May be removed for _SOLARIS64_.
 879          */
 880 
 881         fs = ip->i_fs;
 882         do {
 883                 u_offset_t uoff = uio->uio_loffset;
 884                 off = uoff & (offset_t)MAXBMASK;
 885                 mapon = (int)(uoff & (offset_t)MAXBOFFSET);
 886                 on = (int)blkoff(fs, uoff);
 887                 n = (int)MIN(fs->fs_bsize - on, uio->uio_resid);
 888                 new_iblocks = 1;
 889 
 890                 if (type == IFREG && uoff + n >= limit) {
 891                         if (uoff >= limit) {
 892                                 error = EFBIG;
 893                                 goto out;
 894                         }
 895                         /*
 896                          * since uoff + n >= limit,
 897                          * therefore n >= limit - uoff, and n is an int
 898                          * so it is safe to cast it to an int
 899                          */
 900                         n = (int)(limit - (rlim64_t)uoff);
 901                 }
 902                 if (uoff + n > ip->i_size) {
 903                         /*
 904                          * We are extending the length of the file.
 905                          * bmap is used so that we are sure that
 906                          * if we need to allocate new blocks, that it
 907                          * is done here before we up the file size.
 908                          */
 909                         error = bmap_write(ip, uoff, (int)(on + n),
 910                             mapon == 0, NULL, cr);
 911                         /*
 912                          * bmap_write never drops i_contents so if
 913                          * the flags are set it changed the file.
 914                          */
 915                         if (ip->i_flag & (ICHG|IUPD)) {
 916                                 i_seq_needed = 1;
 917                                 ip->i_flag |= ISEQ;
 918                         }
 919                         if (error)
 920                                 break;
 921                         /*
 922                          * There is a window of vulnerability here.
 923                          * The sequence of operations: allocate file
 924                          * system blocks, uiomove the data into pages,
 925                          * and then update the size of the file in the
 926                          * inode, must happen atomically.  However, due
 927                          * to current locking constraints, this can not
 928                          * be done.
 929                          */
 930                         ASSERT(ip->i_writer == NULL);
 931                         ip->i_writer = curthread;
 932                         i_size_changed = 1;
 933                         /*
 934                          * If we are writing from the beginning of
 935                          * the mapping, we can just create the
 936                          * pages without having to read them.
 937                          */
 938                         pagecreate = (mapon == 0);
 939                 } else if (n == MAXBSIZE) {
 940                         /*
 941                          * Going to do a whole mappings worth,
 942                          * so we can just create the pages w/o
 943                          * having to read them in.  But before
 944                          * we do that, we need to make sure any
 945                          * needed blocks are allocated first.
 946                          */
 947                         iblocks = ip->i_blocks;
 948                         error = bmap_write(ip, uoff, (int)(on + n),
 949                             BI_ALLOC_ONLY, NULL, cr);
 950                         /*
 951                          * bmap_write never drops i_contents so if
 952                          * the flags are set it changed the file.
 953                          */
 954                         if (ip->i_flag & (ICHG|IUPD)) {
 955                                 i_seq_needed = 1;
 956                                 ip->i_flag |= ISEQ;
 957                         }
 958                         if (error)
 959                                 break;
 960                         pagecreate = 1;
 961                         /*
 962                          * check if the new created page needed the
 963                          * allocation of new disk blocks.
 964                          */
 965                         if (iblocks == ip->i_blocks)
 966                                 new_iblocks = 0; /* no new blocks allocated */
 967                 } else {
 968                         pagecreate = 0;
 969                         /*
 970                          * In sync mode flush the indirect blocks which
 971                          * may have been allocated and not written on
 972                          * disk. In above cases bmap_write will allocate
 973                          * in sync mode.
 974                          */
 975                         if (ioflag & (FSYNC|FDSYNC)) {
 976                                 error = ufs_indirblk_sync(ip, uoff);
 977                                 if (error)
 978                                         break;
 979                         }
 980                 }
 981 
 982                 /*
 983                  * At this point we can enter ufs_getpage() in one
 984                  * of two ways:
 985                  * 1) segmap_getmapflt() calls ufs_getpage() when the
 986                  *    forcefault parameter is true (pagecreate == 0)
 987                  * 2) uiomove() causes a page fault.
 988                  *
 989                  * We have to drop the contents lock to prevent the VM
 990                  * system from trying to reacquire it in ufs_getpage()
 991                  * should the uiomove cause a pagefault.
 992                  *
 993                  * We have to drop the reader vfs_dqrwlock here as well.
 994                  */
 995                 rw_exit(&ip->i_contents);
 996                 if (do_dqrwlock) {
 997                         ASSERT(RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
 998                         ASSERT(!(RW_WRITE_HELD(&ufsvfsp->vfs_dqrwlock)));
 999                         rw_exit(&ufsvfsp->vfs_dqrwlock);
1000                 }
1001 
1002                 newpage = 0;
1003                 premove_resid = uio->uio_resid;
1004 
1005                 /*
1006                  * Touch the page and fault it in if it is not in core
1007                  * before segmap_getmapflt or vpm_data_copy can lock it.
1008                  * This is to avoid the deadlock if the buffer is mapped
1009                  * to the same file through mmap which we want to write.
1010                  */
1011                 uio_prefaultpages((long)n, uio);
1012 
1013                 if (vpm_enable) {
1014                         /*
1015                          * Copy data. If new pages are created, part of
1016                          * the page that is not written will be initizliazed
1017                          * with zeros.
1018                          */
1019                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1020                             uio, !pagecreate, &newpage, 0, S_WRITE);
1021                 } else {
1022 
1023                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1024                             (uint_t)n, !pagecreate, S_WRITE);
1025 
1026                         /*
1027                          * segmap_pagecreate() returns 1 if it calls
1028                          * page_create_va() to allocate any pages.
1029                          */
1030 
1031                         if (pagecreate)
1032                                 newpage = segmap_pagecreate(segkmap, base,
1033                                     (size_t)n, 0);
1034 
1035                         error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
1036                 }
1037 
1038                 /*
1039                  * If "newpage" is set, then a new page was created and it
1040                  * does not contain valid data, so it needs to be initialized
1041                  * at this point.
1042                  * Otherwise the page contains old data, which was overwritten
1043                  * partially or as a whole in uiomove.
1044                  * If there is only one iovec structure within uio, then
1045                  * on error uiomove will not be able to update uio->uio_loffset
1046                  * and we would zero the whole page here!
1047                  *
1048                  * If uiomove fails because of an error, the old valid data
1049                  * is kept instead of filling the rest of the page with zero's.
1050                  */
1051                 if (!vpm_enable && newpage &&
1052                     uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
1053                         /*
1054                          * We created pages w/o initializing them completely,
1055                          * thus we need to zero the part that wasn't set up.
1056                          * This happens on most EOF write cases and if
1057                          * we had some sort of error during the uiomove.
1058                          */
1059                         int nzero, nmoved;
1060 
1061                         nmoved = (int)(uio->uio_loffset - (off + mapon));
1062                         ASSERT(nmoved >= 0 && nmoved <= n);
1063                         nzero = roundup(on + n, PAGESIZE) - nmoved;
1064                         ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
1065                         (void) kzero(base + mapon + nmoved, (uint_t)nzero);
1066                 }
1067 
1068                 /*
1069                  * Unlock the pages allocated by page_create_va()
1070                  * in segmap_pagecreate()
1071                  */
1072                 if (!vpm_enable && newpage)
1073                         segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
1074 
1075                 /*
1076                  * If the size of the file changed, then update the
1077                  * size field in the inode now.  This can't be done
1078                  * before the call to segmap_pageunlock or there is
1079                  * a potential deadlock with callers to ufs_putpage().
1080                  * They will be holding i_contents and trying to lock
1081                  * a page, while this thread is holding a page locked
1082                  * and trying to acquire i_contents.
1083                  */
1084                 if (i_size_changed) {
1085                         rw_enter(&ip->i_contents, RW_WRITER);
1086                         old_i_size = ip->i_size;
1087                         UFS_SET_ISIZE(uoff + n, ip);
1088                         TRANS_INODE(ufsvfsp, ip);
1089                         /*
1090                          * file has grown larger than 2GB. Set flag
1091                          * in superblock to indicate this, if it
1092                          * is not already set.
1093                          */
1094                         if ((ip->i_size > MAXOFF32_T) &&
1095                             !(fs->fs_flags & FSLARGEFILES)) {
1096                                 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
1097                                 mutex_enter(&ufsvfsp->vfs_lock);
1098                                 fs->fs_flags |= FSLARGEFILES;
1099                                 ufs_sbwrite(ufsvfsp);
1100                                 mutex_exit(&ufsvfsp->vfs_lock);
1101                         }
1102                         mutex_enter(&ip->i_tlock);
1103                         ip->i_writer = NULL;
1104                         cv_broadcast(&ip->i_wrcv);
1105                         mutex_exit(&ip->i_tlock);
1106                         rw_exit(&ip->i_contents);
1107                 }
1108 
1109                 if (error) {
1110                         /*
1111                          * If we failed on a write, we may have already
1112                          * allocated file blocks as well as pages.  It's
1113                          * hard to undo the block allocation, but we must
1114                          * be sure to invalidate any pages that may have
1115                          * been allocated.
1116                          *
1117                          * If the page was created without initialization
1118                          * then we must check if it should be possible
1119                          * to destroy the new page and to keep the old data
1120                          * on the disk.
1121                          *
1122                          * It is possible to destroy the page without
1123                          * having to write back its contents only when
1124                          * - the size of the file keeps unchanged
1125                          * - bmap_write() did not allocate new disk blocks
1126                          *   it is possible to create big files using "seek" and
1127                          *   write to the end of the file. A "write" to a
1128                          *   position before the end of the file would not
1129                          *   change the size of the file but it would allocate
1130                          *   new disk blocks.
1131                          * - uiomove intended to overwrite the whole page.
1132                          * - a new page was created (newpage == 1).
1133                          */
1134 
1135                         if (i_size_changed == 0 && new_iblocks == 0 &&
1136                             newpage) {
1137 
1138                                 /* unwind what uiomove eventually last did */
1139                                 uio->uio_resid = premove_resid;
1140 
1141                                 /*
1142                                  * destroy the page, do not write ambiguous
1143                                  * data to the disk.
1144                                  */
1145                                 flags = SM_DESTROY;
1146                         } else {
1147                                 /*
1148                                  * write the page back to the disk, if dirty,
1149                                  * and remove the page from the cache.
1150                                  */
1151                                 flags = SM_INVAL;
1152                         }
1153 
1154                         if (vpm_enable) {
1155                                 /*
1156                                  *  Flush pages.
1157                                  */
1158                                 (void) vpm_sync_pages(vp, off, n, flags);
1159                         } else {
1160                                 (void) segmap_release(segkmap, base, flags);
1161                         }
1162                 } else {
1163                         flags = 0;
1164                         /*
1165                          * Force write back for synchronous write cases.
1166                          */
1167                         if ((ioflag & (FSYNC|FDSYNC)) || type == IFDIR) {
1168                                 /*
1169                                  * If the sticky bit is set but the
1170                                  * execute bit is not set, we do a
1171                                  * synchronous write back and free
1172                                  * the page when done.  We set up swap
1173                                  * files to be handled this way to
1174                                  * prevent servers from keeping around
1175                                  * the client's swap pages too long.
1176                                  * XXX - there ought to be a better way.
1177                                  */
1178                                 if (IS_SWAPVP(vp)) {
1179                                         flags = SM_WRITE | SM_FREE |
1180                                             SM_DONTNEED;
1181                                         iupdat_flag = 0;
1182                                 } else {
1183                                         flags = SM_WRITE;
1184                                 }
1185                         } else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
1186                                 /*
1187                                  * Have written a whole block.
1188                                  * Start an asynchronous write and
1189                                  * mark the buffer to indicate that
1190                                  * it won't be needed again soon.
1191                                  */
1192                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
1193                         }
1194                         if (vpm_enable) {
1195                                 /*
1196                                  * Flush pages.
1197                                  */
1198                                 error = vpm_sync_pages(vp, off, n, flags);
1199                         } else {
1200                                 error = segmap_release(segkmap, base, flags);
1201                         }
1202                         /*
1203                          * If the operation failed and is synchronous,
1204                          * then we need to unwind what uiomove() last
1205                          * did so we can potentially return an error to
1206                          * the caller.  If this write operation was
1207                          * done in two pieces and the first succeeded,
1208                          * then we won't return an error for the second
1209                          * piece that failed.  However, we only want to
1210                          * return a resid value that reflects what was
1211                          * really done.
1212                          *
1213                          * Failures for non-synchronous operations can
1214                          * be ignored since the page subsystem will
1215                          * retry the operation until it succeeds or the
1216                          * file system is unmounted.
1217                          */
1218                         if (error) {
1219                                 if ((ioflag & (FSYNC | FDSYNC)) ||
1220                                     type == IFDIR) {
1221                                         uio->uio_resid = premove_resid;
1222                                 } else {
1223                                         error = 0;
1224                                 }
1225                         }
1226                 }
1227 
1228                 /*
1229                  * Re-acquire contents lock.
1230                  * If it was dropped, reacquire reader vfs_dqrwlock as well.
1231                  */
1232                 if (do_dqrwlock)
1233                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
1234                 rw_enter(&ip->i_contents, RW_WRITER);
1235 
1236                 /*
1237                  * If the uiomove() failed or if a synchronous
1238                  * page push failed, fix up i_size.
1239                  */
1240                 if (error) {
1241                         if (i_size_changed) {
1242                                 /*
1243                                  * The uiomove failed, and we
1244                          * allocated blocks, so get rid
1245                                  * of them.
1246                                  */
1247                                 (void) ufs_itrunc(ip, old_i_size, 0, cr);
1248                         }
1249                 } else {
1250                         /*
1251                          * XXX - Can this be out of the loop?
1252                          */
1253                         ip->i_flag |= IUPD | ICHG;
1254                         /*
1255                          * Only do one increase of i_seq for multiple
1256                          * pieces.  Because we drop locks, record
1257                          * the fact that we changed the timestamp and
1258                          * are deferring the increase in case another thread
1259                          * pushes our timestamp update.
1260                          */
1261                         i_seq_needed = 1;
1262                         ip->i_flag |= ISEQ;
1263                         if (i_size_changed)
1264                                 ip->i_flag |= IATTCHG;
1265                         if ((ip->i_mode & (IEXEC | (IEXEC >> 3) |
1266                             (IEXEC >> 6))) != 0 &&
1267                             (ip->i_mode & (ISUID | ISGID)) != 0 &&
1268                             secpolicy_vnode_setid_retain(cr,
1269                             (ip->i_mode & ISUID) != 0 && ip->i_uid == 0) != 0) {
1270                                 /*
1271                                  * Clear Set-UID & Set-GID bits on
1272                                  * successful write if not privileged
1273                                  * and at least one of the execute bits
1274                                  * is set.  If we always clear Set-GID,
1275                                  * mandatory file and record locking is
1276                          * unusable.
1277                                  */
1278                                 ip->i_mode &= ~(ISUID | ISGID);
1279                         }
1280                 }
1281                 /*
1282                  * In the case that the FDSYNC flag is set and this is a
1283                  * "rewrite", we won't log a delta.
1284                  * The FSYNC flag overrides all cases.
1285                  */
1286                 if (!ufs_check_rewrite(ip, uio, ioflag) || !(ioflag & FDSYNC)) {
1287                         TRANS_INODE(ufsvfsp, ip);
1288                 }
1289         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1290 
1291 out:
1292         /*
1293          * Make sure i_seq is increased at least once per write
1294          */
1295         if (i_seq_needed) {
1296                 ip->i_seq++;
1297                 ip->i_flag &= ~ISEQ;     /* no longer deferred */
1298         }
1299 
1300         /*
1301          * Inode is updated according to this table -
1302          *
1303          *   FSYNC        FDSYNC(posix.4)
1304          *   --------------------------
1305          *   always@      IATTCHG|IBDWRITE
1306          *
1307          * @ -  If we are doing a synchronous write, the only time we
1308          *      should not be sync'ing the ip here is if we have the
1309          *      stickyhack activated, the file is marked with the sticky
1310          *      bit and no exec bit, the file length has not been changed,
1311          *      and no new blocks have been allocated during this write.
1312          */
1313 
1314         if ((ip->i_flag & ISYNC) != 0) {
1315                 /*
1316                  * we have eliminated nosync
1317                  */
1318                 if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
1319                     ((ioflag & FSYNC) && iupdat_flag)) {
1320                         ufs_iupdat(ip, 1);
1321                 }
1322         }
1323 
1324         /*
1325          * If we've already done a partial write, terminate
1326          * the write but return no error, unless the error is ENOSPC,
1327          * because the caller can detect that, free resources and
1328          * try again.
1329          */
1330         if ((start_resid != uio->uio_resid) && (error != ENOSPC))
1331                 error = 0;
1332 
1333         ip->i_flag &= ~(INOACC | ISYNC);
1334         ITIMES_NOLOCK(ip);
1335         return (error);
1336 }
1337 
1338 /*
1339  * rdip does the real work of read requests for ufs.
1340  */
1341 int
1342 rdip(struct inode *ip, struct uio *uio, int ioflag, cred_t *cr)
1343 {
1344         u_offset_t off;
1345         caddr_t base;
1346         struct fs *fs;
1347         struct ufsvfs *ufsvfsp;
1348         struct vnode *vp;
1349         long oresid = uio->uio_resid;
1350         u_offset_t n, on, mapon;
1351         int error = 0;
1352         int doupdate = 1;
1353         uint_t flags;
1354         int dofree, directio_status;
1355         krw_t rwtype;
1356         o_mode_t type;
1357         clock_t now;
1358 
1359         vp = ITOV(ip);
1360 
1361         ASSERT(RW_LOCK_HELD(&ip->i_contents));
1362 
1363         ufsvfsp = ip->i_ufsvfs;
1364 
1365         if (ufsvfsp == NULL)
1366                 return (EIO);
1367 
1368         fs = ufsvfsp->vfs_fs;
1369 
1370         /* check for valid filetype */
1371         type = ip->i_mode & IFMT;
1372         if ((type != IFREG) && (type != IFDIR) && (type != IFATTRDIR) &&
1373             (type != IFLNK) && (type != IFSHAD)) {
1374                 return (EIO);
1375         }
1376 
1377         if (uio->uio_loffset > UFS_MAXOFFSET_T) {
1378                 error = 0;
1379                 goto out;
1380         }
1381         if (uio->uio_loffset < (offset_t)0) {
1382                 return (EINVAL);
1383         }
1384         if (uio->uio_resid == 0) {
1385                 return (0);
1386         }
1387 
1388         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (fs->fs_ronly == 0) &&
1389             (!ufsvfsp->vfs_noatime)) {
1390                 mutex_enter(&ip->i_tlock);
1391                 ip->i_flag |= IACC;
1392                 mutex_exit(&ip->i_tlock);
1393         }
1394         /*
1395          * Try to go direct
1396          */
1397         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio) {
1398                 error = ufs_directio_read(ip, uio, cr, &directio_status);
1399                 if (directio_status == DIRECTIO_SUCCESS)
1400                         goto out;
1401         }
1402 
1403         rwtype = (rw_write_held(&ip->i_contents) ? RW_WRITER : RW_READER);
1404 
1405         do {
1406                 offset_t diff;
1407                 u_offset_t uoff = uio->uio_loffset;
1408                 off = uoff & (offset_t)MAXBMASK;
1409                 mapon = (u_offset_t)(uoff & (offset_t)MAXBOFFSET);
1410                 on = (u_offset_t)blkoff(fs, uoff);
1411                 n = MIN((u_offset_t)fs->fs_bsize - on,
1412                     (u_offset_t)uio->uio_resid);
1413 
1414                 diff = ip->i_size - uoff;
1415 
1416                 if (diff <= (offset_t)0) {
1417                         error = 0;
1418                         goto out;
1419                 }
1420                 if (diff < (offset_t)n)
1421                         n = (int)diff;
1422 
1423                 /*
1424                  * We update smallfile2 and smallfile1 at most every second.
1425                  */
1426                 now = ddi_get_lbolt();
1427                 if (now >= smallfile_update) {
1428                         uint64_t percpufreeb;
1429                         if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
1430                         if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
1431                         percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
1432                         smallfile1 = percpufreeb / smallfile1_d;
1433                         smallfile2 = percpufreeb / smallfile2_d;
1434                         smallfile1 = MAX(smallfile1, smallfile);
1435                         smallfile1 = MAX(smallfile1, smallfile64);
1436                         smallfile2 = MAX(smallfile1, smallfile2);
1437                         smallfile_update = now + hz;
1438                 }
1439 
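                     /*
                      * Free-behind heuristic: when the freebehind tunable is
                      * set, this read continues a sequential pattern
                      * (i_nextr matches the page-aligned offset) and we are
                      * past the smallfile1 threshold, the pages just read
                      * may be released once the data has been copied out.
                      */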
1440                 dofree = freebehind &&
1441                     ip->i_nextr == (off & PAGEMASK) && off > smallfile1;
1442 
1443                 /*
1444                  * At this point we can enter ufs_getpage() in one of two
1445                  * ways:
1446                  * 1) segmap_getmapflt() calls ufs_getpage() when the
1447                  *    forcefault parameter is true (value of 1 is passed)
1448                  * 2) uiomove() causes a page fault.
1449                  *
1450                  * We cannot hold onto an i_contents reader lock without
1451                  * risking deadlock in ufs_getpage() so drop a reader lock.
1452                  * The ufs_getpage() dolock logic already allows for a
1453                  * thread holding i_contents as writer to work properly
1454                  * so we keep a writer lock.
1455                  */
1456                 if (rwtype == RW_READER)
1457                         rw_exit(&ip->i_contents);
1458 
1459                 if (vpm_enable) {
1460                         /*
1461                          * Copy data.
1462                          */
1463                         error = vpm_data_copy(vp, (off + mapon), (uint_t)n,
1464                             uio, 1, NULL, 0, S_READ);
1465                 } else {
1466                         base = segmap_getmapflt(segkmap, vp, (off + mapon),
1467                             (uint_t)n, 1, S_READ);
1468                         error = uiomove(base + mapon, (long)n, UIO_READ, uio);
1469                 }
1470 
1471                 flags = 0;
1472                 if (!error) {
1473                         /*
1474                          * If reading sequentially we won't need this
1475                          * buffer again soon.  For offsets in the range
1476                          * [smallfile1, smallfile2] release the pages
1477                          * at the tail of the cache list; larger
1478                          * offsets are released at the head.
1479                          */
1480                         if (dofree) {
1481                                 flags = SM_FREE | SM_ASYNC;
1482                                 if ((cache_read_ahead == 0) &&
1483                                     (off > smallfile2))
1484                                         flags |= SM_DONTNEED;
1485                         }
1486                         /*
1487                          * In POSIX SYNC (FSYNC and FDSYNC) read mode,
1488                          * we want to make sure that the page which has
1489                          * been read is written to disk if it is dirty,
1490                          * and that the corresponding indirect blocks
1491                          * are flushed out as well.
1492                          */
1493                         if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
1494                                 flags &= ~SM_ASYNC;
1495                                 flags |= SM_WRITE;
1496                         }
1497                         if (vpm_enable) {
1498                                 error = vpm_sync_pages(vp, off, n, flags);
1499                         } else {
1500                                 error = segmap_release(segkmap, base, flags);
1501                         }
1502                 } else {
1503                         if (vpm_enable) {
1504                                 (void) vpm_sync_pages(vp, off, n, flags);
1505                         } else {
1506                                 (void) segmap_release(segkmap, base, flags);
1507                         }
1508                 }
1509 
1510                 if (rwtype == RW_READER)
1511                         rw_enter(&ip->i_contents, rwtype);
1512         } while (error == 0 && uio->uio_resid > 0 && n != 0);
1513 out:
1514         /*
1515          * Inode is updated according to this table if FRSYNC is set.
1516          *
1517          *   FSYNC        FDSYNC(posix.4)
1518          *   --------------------------
1519          *   always       IATTCHG|IBDWRITE
1520          */
1521         /*
1522          * The inode is not updated if we're logging and the inode is a
1523          * directory with FRSYNC, FSYNC and FDSYNC flags set.
1524          */
1525         if (ioflag & FRSYNC) {
1526                 if (TRANS_ISTRANS(ufsvfsp) && ((ip->i_mode & IFMT) == IFDIR)) {
1527                         doupdate = 0;
1528                 }
1529                 if (doupdate) {
1530                         if ((ioflag & FSYNC) ||
1531                             ((ioflag & FDSYNC) &&
1532                             (ip->i_flag & (IATTCHG|IBDWRITE)))) {
1533                                 ufs_iupdat(ip, 1);
1534                         }
1535                 }
1536         }
1537         /*
1538          * If we've already done a partial read, terminate
1539          * the read but return no error.
1540          */
1541         if (oresid != uio->uio_resid)
1542                 error = 0;
1543         ITIMES(ip);
1544 
1545         return (error);
1546 }
1547 
1548 /* ARGSUSED */
1549 static int
1550 ufs_ioctl(
1551         struct vnode    *vp,
1552         int             cmd,
1553         intptr_t        arg,
1554         int             flag,
1555         struct cred     *cr,
1556         int             *rvalp,
1557         caller_context_t *ct)
1558 {
1559         struct lockfs   lockfs, lockfs_out;
1560         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
1561         char            *comment, *original_comment;
1562         struct fs       *fs;
1563         struct ulockfs  *ulp;
1564         offset_t        off;
1565         extern int      maxphys;
1566         int             error;
1567         int             issync;
1568         int             trans_size;
1569 
1570 
1571         /*
1572          * forcibly unmounted
1573          */
1574         if (ufsvfsp == NULL || vp->v_vfsp == NULL ||
1575             vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)
1576                 return (EIO);
1577         fs = ufsvfsp->vfs_fs;
1578 
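             /*
              * Quota control is handled ahead of the main ioctl switch:
              * it takes its own lockfs window and wraps quotactl() in an
              * async transaction.
              */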
1579         if (cmd == Q_QUOTACTL) {
1580                 error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_QUOTA_MASK);
1581                 if (error)
1582                         return (error);
1583 
1584                 if (ulp) {
1585                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_QUOTA,
1586                             TOP_SETQUOTA_SIZE(fs));
1587                 }
1588 
1589                 error = quotactl(vp, arg, flag, cr);
1590 
1591                 if (ulp) {
1592                         TRANS_END_ASYNC(ufsvfsp, TOP_QUOTA,
1593                             TOP_SETQUOTA_SIZE(fs));
1594                         ufs_lockfs_end(ulp);
1595                 }
1596                 return (error);
1597         }
1598 
1599         switch (cmd) {
1600                 case _FIOLFS:
1601                         /*
1602                          * file system locking
1603                          */
1604                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1605                                 return (EPERM);
1606 
1607                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1608                                 if (copyin((caddr_t)arg, &lockfs,
1609                                     sizeof (struct lockfs)))
1610                                         return (EFAULT);
1611                         }
1612 #ifdef _SYSCALL32_IMPL
1613                         else {
1614                                 struct lockfs32 lockfs32;
1615                                 /* Translate ILP32 lockfs to LP64 lockfs */
1616                                 if (copyin((caddr_t)arg, &lockfs32,
1617                                     sizeof (struct lockfs32)))
1618                                         return (EFAULT);
1619                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1620                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1621                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1622                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1623                                 lockfs.lf_comment =
1624                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1625                         }
1626 #endif /* _SYSCALL32_IMPL */
1627 
1628                         if (lockfs.lf_comlen) {
1629                                 if (lockfs.lf_comlen > LOCKFS_MAXCOMMENTLEN)
1630                                         return (ENAMETOOLONG);
1631                                 comment =
1632                                     kmem_alloc(lockfs.lf_comlen, KM_SLEEP);
1633                                 if (copyin(lockfs.lf_comment, comment,
1634                                     lockfs.lf_comlen)) {
1635                                         kmem_free(comment, lockfs.lf_comlen);
1636                                         return (EFAULT);
1637                                 }
1638                                 original_comment = lockfs.lf_comment;
1639                                 lockfs.lf_comment = comment;
1640                         }
1641                         if ((error = ufs_fiolfs(vp, &lockfs, 0)) == 0) {
1642                                 lockfs.lf_comment = original_comment;
1643 
1644                                 if ((flag & DATAMODEL_MASK) ==
1645                                     DATAMODEL_NATIVE) {
1646                                         (void) copyout(&lockfs, (caddr_t)arg,
1647                                             sizeof (struct lockfs));
1648                                 }
1649 #ifdef _SYSCALL32_IMPL
1650                                 else {
1651                                         struct lockfs32 lockfs32;
1652                                         /* Translate LP64 to ILP32 lockfs */
1653                                         lockfs32.lf_lock =
1654                                             (uint32_t)lockfs.lf_lock;
1655                                         lockfs32.lf_flags =
1656                                             (uint32_t)lockfs.lf_flags;
1657                                         lockfs32.lf_key =
1658                                             (uint32_t)lockfs.lf_key;
1659                                         lockfs32.lf_comlen =
1660                                             (uint32_t)lockfs.lf_comlen;
1661                                         lockfs32.lf_comment =
1662                                             (uint32_t)(uintptr_t)
1663                                             lockfs.lf_comment;
1664                                         (void) copyout(&lockfs32, (caddr_t)arg,
1665                                             sizeof (struct lockfs32));
1666                                 }
1667 #endif /* _SYSCALL32_IMPL */
1668 
1669                         } else {
1670                                 if (lockfs.lf_comlen)
1671                                         kmem_free(comment, lockfs.lf_comlen);
1672                         }
1673                         return (error);
1674 
1675                 case _FIOLFSS:
1676                         /*
1677                          * get file system locking status
1678                          */
1679 
1680                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1681                                 if (copyin((caddr_t)arg, &lockfs,
1682                                     sizeof (struct lockfs)))
1683                                         return (EFAULT);
1684                         }
1685 #ifdef _SYSCALL32_IMPL
1686                         else {
1687                                 struct lockfs32 lockfs32;
1688                                 /* Translate ILP32 lockfs to LP64 lockfs */
1689                                 if (copyin((caddr_t)arg, &lockfs32,
1690                                     sizeof (struct lockfs32)))
1691                                         return (EFAULT);
1692                                 lockfs.lf_lock = (ulong_t)lockfs32.lf_lock;
1693                                 lockfs.lf_flags = (ulong_t)lockfs32.lf_flags;
1694                                 lockfs.lf_key = (ulong_t)lockfs32.lf_key;
1695                                 lockfs.lf_comlen = (ulong_t)lockfs32.lf_comlen;
1696                                 lockfs.lf_comment =
1697                                     (caddr_t)(uintptr_t)lockfs32.lf_comment;
1698                         }
1699 #endif /* _SYSCALL32_IMPL */
1700 
1701                         if (error = ufs_fiolfss(vp, &lockfs_out))
1702                                 return (error);
1703                         lockfs.lf_lock = lockfs_out.lf_lock;
1704                         lockfs.lf_key = lockfs_out.lf_key;
1705                         lockfs.lf_flags = lockfs_out.lf_flags;
1706                         lockfs.lf_comlen = MIN(lockfs.lf_comlen,
1707                             lockfs_out.lf_comlen);
1708 
1709                         if ((flag & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
1710                                 if (copyout(&lockfs, (caddr_t)arg,
1711                                     sizeof (struct lockfs)))
1712                                         return (EFAULT);
1713                         }
1714 #ifdef _SYSCALL32_IMPL
1715                         else {
1716                                 /* Translate LP64 to ILP32 lockfs */
1717                                 struct lockfs32 lockfs32;
1718                                 lockfs32.lf_lock = (uint32_t)lockfs.lf_lock;
1719                                 lockfs32.lf_flags = (uint32_t)lockfs.lf_flags;
1720                                 lockfs32.lf_key = (uint32_t)lockfs.lf_key;
1721                                 lockfs32.lf_comlen = (uint32_t)lockfs.lf_comlen;
1722                                 lockfs32.lf_comment =
1723                                     (uint32_t)(uintptr_t)lockfs.lf_comment;
1724                                 if (copyout(&lockfs32, (caddr_t)arg,
1725                                     sizeof (struct lockfs32)))
1726                                         return (EFAULT);
1727                         }
1728 #endif /* _SYSCALL32_IMPL */
1729 
1730                         if (lockfs.lf_comlen &&
1731                             lockfs.lf_comment && lockfs_out.lf_comment)
1732                                 if (copyout(lockfs_out.lf_comment,
1733                                     lockfs.lf_comment, lockfs.lf_comlen))
1734                                         return (EFAULT);
1735                         return (0);
1736 
1737                 case _FIOSATIME:
1738                         /*
1739                          * set access time
1740                          */
1741 
1742                         /*
1743                          * if mounted w/o atime, return quietly.
1744                          * I briefly thought about returning ENOSYS, but
1745                          * figured that most apps would consider this fatal;
1746                          * the idea is to make this as seamless as possible.
1747                          */
1748                         if (ufsvfsp->vfs_noatime)
1749                                 return (0);
1750 
1751                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1752                             ULOCKFS_SETATTR_MASK);
1753                         if (error)
1754                                 return (error);
1755 
1756                         if (ulp) {
1757                                 trans_size = (int)TOP_SETATTR_SIZE(VTOI(vp));
1758                                 TRANS_BEGIN_CSYNC(ufsvfsp, issync,
1759                                     TOP_SETATTR, trans_size);
1760                         }
1761 
1762                         error = ufs_fiosatime(vp, (struct timeval *)arg,
1763                             flag, cr);
1764 
1765                         if (ulp) {
1766                                 TRANS_END_CSYNC(ufsvfsp, error, issync,
1767                                     TOP_SETATTR, trans_size);
1768                                 ufs_lockfs_end(ulp);
1769                         }
1770                         return (error);
1771 
1772                 case _FIOSDIO:
1773                         /*
1774                          * set delayed-io
1775                          */
1776                         return (ufs_fiosdio(vp, (uint_t *)arg, flag, cr));
1777 
1778                 case _FIOGDIO:
1779                         /*
1780                          * get delayed-io
1781                          */
1782                         return (ufs_fiogdio(vp, (uint_t *)arg, flag, cr));
1783 
1784                 case _FIOIO:
1785                         /*
1786                          * inode open
1787                          */
1788                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1789                             ULOCKFS_VGET_MASK);
1790                         if (error)
1791                                 return (error);
1792 
1793                         error = ufs_fioio(vp, (struct fioio *)arg, flag, cr);
1794 
1795                         if (ulp) {
1796                                 ufs_lockfs_end(ulp);
1797                         }
1798                         return (error);
1799 
1800                 case _FIOFFS:
1801                         /*
1802                          * file system flush (push w/invalidate)
1803                          */
1804                         if ((caddr_t)arg != NULL)
1805                                 return (EINVAL);
1806                         return (ufs_fioffs(vp, NULL, cr));
1807 
1808                 case _FIOISBUSY:
1809                         /*
1810                          * Contract-private interface for Legato.
1811                          * Purge this vnode from the DNLC and decide
1812                          * whether this vnode is busy (*arg == 1) or not
1813                          * (*arg == 0).
1814                          */
1815                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1816                                 return (EPERM);
1817                         error = ufs_fioisbusy(vp, (int *)arg, cr);
1818                         return (error);
1819 
1820                 case _FIODIRECTIO:
1821                         return (ufs_fiodirectio(vp, (int)arg, cr));
1822 
1823                 case _FIOTUNE:
1824                         /*
1825                          * Tune the file system (aka setting fs attributes)
1826                          */
1827                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1828                             ULOCKFS_SETATTR_MASK);
1829                         if (error)
1830                                 return (error);
1831 
1832                         error = ufs_fiotune(vp, (struct fiotune *)arg, cr);
1833 
1834                         if (ulp)
1835                                 ufs_lockfs_end(ulp);
1836                         return (error);
1837 
1838                 case _FIOLOGENABLE:
1839                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1840                                 return (EPERM);
1841                         return (ufs_fiologenable(vp, (void *)arg, cr, flag));
1842 
1843                 case _FIOLOGDISABLE:
1844                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1845                                 return (EPERM);
1846                         return (ufs_fiologdisable(vp, (void *)arg, cr, flag));
1847 
1848                 case _FIOISLOG:
1849                         return (ufs_fioislog(vp, (void *)arg, cr, flag));
1850 
1851                 case _FIOSNAPSHOTCREATE_MULTI:
1852                 {
1853                         struct fiosnapcreate_multi      fc, *fcp;
1854                         size_t  fcm_size;
1855 
1856                         if (copyin((void *)arg, &fc, sizeof (fc)))
1857                                 return (EFAULT);
1858                         if (fc.backfilecount > MAX_BACKFILE_COUNT)
1859                                 return (EINVAL);
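                             /*
                              * fcm_size: the structure already includes room
                              * for one backing-file entry, so only
                              * backfilecount - 1 additional ints are needed.
                              */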
1860                         fcm_size = sizeof (struct fiosnapcreate_multi) +
1861                             (fc.backfilecount - 1) * sizeof (int);
1862                         fcp = (struct fiosnapcreate_multi *)
1863                             kmem_alloc(fcm_size, KM_SLEEP);
1864                         if (copyin((void *)arg, fcp, fcm_size)) {
1865                                 kmem_free(fcp, fcm_size);
1866                                 return (EFAULT);
1867                         }
1868                         error = ufs_snap_create(vp, fcp, cr);
1869                         /*
1870                          * Do copyout even if there is an error because
1871                          * the details of the error are stored in fcp.
1872                          */
1873                         if (copyout(fcp, (void *)arg, fcm_size))
1874                                 error = EFAULT;
1875                         kmem_free(fcp, fcm_size);
1876                         return (error);
1877                 }
1878 
1879                 case _FIOSNAPSHOTDELETE:
1880                 {
1881                         struct fiosnapdelete    fc;
1882 
1883                         if (copyin((void *)arg, &fc, sizeof (fc)))
1884                                 return (EFAULT);
1885                         error = ufs_snap_delete(vp, &fc, cr);
1886                         if (!error && copyout(&fc, (void *)arg, sizeof (fc)))
1887                                 error = EFAULT;
1888                         return (error);
1889                 }
1890 
1891                 case _FIOGETSUPERBLOCK:
1892                         if (copyout(fs, (void *)arg, SBSIZE))
1893                                 return (EFAULT);
1894                         return (0);
1895 
1896                 case _FIOGETMAXPHYS:
1897                         if (copyout(&maxphys, (void *)arg, sizeof (maxphys)))
1898                                 return (EFAULT);
1899                         return (0);
1900 
1901                 /*
1902                  * The following 3 ioctls are for TSufs support,
1903                  * although they could potentially be used elsewhere.
1904                  */
1905                 case _FIO_SET_LUFS_DEBUG:
1906                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1907                                 return (EPERM);
1908                         lufs_debug = (uint32_t)arg;
1909                         return (0);
1910 
1911                 case _FIO_SET_LUFS_ERROR:
1912                         if (secpolicy_fs_config(cr, ufsvfsp->vfs_vfs) != 0)
1913                                 return (EPERM);
1914                         TRANS_SETERROR(ufsvfsp);
1915                         return (0);
1916 
1917                 case _FIO_GET_TOP_STATS:
1918                 {
1919                         fio_lufs_stats_t *ls;
1920                         ml_unit_t *ul = ufsvfsp->vfs_log;
1921 
1922                         ls = kmem_zalloc(sizeof (*ls), KM_SLEEP);
1923                         ls->ls_debug = ul->un_debug; /* return debug value */
1924                         /* Copy structure if statistics are being kept */
1925                         if (ul->un_logmap->mtm_tops) {
1926                                 ls->ls_topstats = *(ul->un_logmap->mtm_tops);
1927                         }
1928                         error = 0;
1929                         if (copyout(ls, (void *)arg, sizeof (*ls)))
1930                                 error = EFAULT;
1931                         kmem_free(ls, sizeof (*ls));
1932                         return (error);
1933                 }
1934 
1935                 case _FIO_SEEK_DATA:
1936                 case _FIO_SEEK_HOLE:
1937                         if (ddi_copyin((void *)arg, &off, sizeof (off), flag))
1938                                 return (EFAULT);
1939                         /* offset parameter is in/out */
1940                         error = ufs_fio_holey(vp, cmd, &off);
1941                         if (error)
1942                                 return (error);
1943                         if (ddi_copyout(&off, (void *)arg, sizeof (off), flag))
1944                                 return (EFAULT);
1945                         return (0);
1946 
1947                 case _FIO_COMPRESSED:
1948                 {
1949                         /*
1950                          * This is a project-private ufs ioctl() to mark
1951                          * the inode as belonging to a compressed
1952                          * file. This is used to mark individual
1953                          * compressed files in a miniroot archive.
1954                          * The files compressed in this manner are
1955                          * automatically decompressed by the dcfs filesystem
1956                          * (via an interception in ufs_lookup - see decompvp())
1957                          * which is layered on top of ufs on a system running
1958                          * from the archive. See uts/common/fs/dcfs for details.
1959                          * This ioctl only marks the file as compressed - the
1960                          * actual compression is done by fiocompress (a
1961                          * userland utility) which invokes this ioctl().
1962                          */
1963                         struct inode *ip = VTOI(vp);
1964 
1965                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
1966                             ULOCKFS_SETATTR_MASK);
1967                         if (error)
1968                                 return (error);
1969 
1970                         if (ulp) {
1971                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_IUPDAT,
1972                                     TOP_IUPDAT_SIZE(ip));
1973                         }
1974 
1975                         error = ufs_mark_compressed(vp);
1976 
1977                         if (ulp) {
1978                                 TRANS_END_ASYNC(ufsvfsp, TOP_IUPDAT,
1979                                     TOP_IUPDAT_SIZE(ip));
1980                                 ufs_lockfs_end(ulp);
1981                         }
1982 
1983                         return (error);
1984 
1985                 }
1986 
1987                 default:
1988                         return (ENOTTY);
1989         }
1990 }
1991 
1992 
1993 /* ARGSUSED */
1994 static int
1995 ufs_getattr(struct vnode *vp, struct vattr *vap, int flags,
1996     struct cred *cr, caller_context_t *ct)
1997 {
1998         struct inode *ip = VTOI(vp);
1999         struct ufsvfs *ufsvfsp;
2000         int err;
2001 
2002         if (vap->va_mask == AT_SIZE) {
2003                 /*
2004                  * For performance, if only the size is requested, don't
2005                  * bother with anything else.
2006                  */
2007                 UFS_GET_ISIZE(&vap->va_size, ip);
2008                 return (0);
2009         }
2010 
2011         /*
2012          * inlined lockfs checks
2013          */
2014         ufsvfsp = ip->i_ufsvfs;
2015         if ((ufsvfsp == NULL) || ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) {
2016                 err = EIO;
2017                 goto out;
2018         }
2019 
2020         rw_enter(&ip->i_contents, RW_READER);
2021         /*
2022          * Return all the attributes.  This should be refined so
2023          * that it only returns what's asked for.
2024          */
2025 
2026         /*
2027          * Copy from inode table.
2028          */
2029         vap->va_type = vp->v_type;
2030         vap->va_mode = ip->i_mode & MODEMASK;
2031         /*
2032          * If there is an ACL and there is a mask entry, then do the
2033          * extra work that completes the equivalent of an acltomode(3)
2034          * call.  According to POSIX P1003.1e, the acl mask should be
2035          * returned in the group permissions field.
2036          *
2037          * - start with the original permission and mode bits (from above)
2038          * - clear the group owner bits
2039          * - add in the mask bits.
2040          */
2041         if (ip->i_ufs_acl && ip->i_ufs_acl->aclass.acl_ismask) {
2042                 vap->va_mode &= ~((VREAD | VWRITE | VEXEC) >> 3);
2043                 vap->va_mode |=
2044                     (ip->i_ufs_acl->aclass.acl_maskbits & PERMMASK) << 3;
2045         }
2046         vap->va_uid = ip->i_uid;
2047         vap->va_gid = ip->i_gid;
2048         vap->va_fsid = ip->i_dev;
2049         vap->va_nodeid = (ino64_t)ip->i_number;
2050         vap->va_nlink = ip->i_nlink;
2051         vap->va_size = ip->i_size;
2052         if (vp->v_type == VCHR || vp->v_type == VBLK)
2053                 vap->va_rdev = ip->i_rdev;
2054         else
2055                 vap->va_rdev = 0;    /* not a block or char special */
2056         mutex_enter(&ip->i_tlock);
2057         ITIMES_NOLOCK(ip);      /* mark correct time in inode */
2058         vap->va_seq = ip->i_seq;
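             /* inode times are kept in microseconds; vattr wants nanoseconds */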
2059         vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
2060         vap->va_atime.tv_nsec = ip->i_atime.tv_usec*1000;
2061         vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
2062         vap->va_mtime.tv_nsec = ip->i_mtime.tv_usec*1000;
2063         vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
2064         vap->va_ctime.tv_nsec = ip->i_ctime.tv_usec*1000;
2065         mutex_exit(&ip->i_tlock);
2066 
2067         switch (ip->i_mode & IFMT) {
2068 
2069         case IFBLK:
2070                 vap->va_blksize = MAXBSIZE;          /* was BLKDEV_IOSIZE */
2071                 break;
2072 
2073         case IFCHR:
2074                 vap->va_blksize = MAXBSIZE;
2075                 break;
2076 
2077         default:
2078                 vap->va_blksize = ip->i_fs->fs_bsize;
2079                 break;
2080         }
2081         vap->va_nblocks = (fsblkcnt64_t)ip->i_blocks;
2082         rw_exit(&ip->i_contents);
2083         err = 0;
2084 
2085 out:
2086         return (err);
2087 }
2088 
2089 /*
2090  * Special wrapper to provide a callback for secpolicy_vnode_setattr().
2091  * The i_contents lock is already held by the caller and we need to
2092  * declare the inode as a 'void *' argument.
2093  */
2094 static int
2095 ufs_priv_access(void *vip, int mode, struct cred *cr)
2096 {
2097         struct inode *ip = vip;
2098 
2099         return (ufs_iaccess(ip, mode, cr, 0));
2100 }
2101 
2102 /*ARGSUSED4*/
2103 static int
2104 ufs_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cr,
2105     caller_context_t *ct)
2106 {
2107         struct inode *ip = VTOI(vp);
2108         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2109         struct fs *fs;
2110         struct ulockfs *ulp;
2111         char *errmsg1;
2112         char *errmsg2;
2113         long blocks;
2114         long int mask = vap->va_mask;
2115         size_t len1, len2;
2116         int issync;
2117         int trans_size;
2118         int dotrans;
2119         int dorwlock;
2120         int error;
2121         int owner_change;
2122         int dodqlock;
2123         timestruc_t now;
2124         vattr_t oldva;
2125         int retry = 1;
2126         int indeadlock;
2127 
2128         /*
2129          * Cannot set these attributes.
2130          */
2131         if ((mask & AT_NOSET) || (mask & AT_XVATTR))
2132                 return (EINVAL);
2133 
2134         /*
2135          * check for forced unmount
2136          */
2137         if (ufsvfsp == NULL)
2138                 return (EIO);
2139 
2140         fs = ufsvfsp->vfs_fs;
2141         if (fs->fs_ronly != 0)
2142                 return (EROFS);
2143 
2144 again:
2145         errmsg1 = NULL;
2146         errmsg2 = NULL;
2147         dotrans = 0;
2148         dorwlock = 0;
2149         dodqlock = 0;
2150 
2151         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK);
2152         if (error)
2153                 goto out;
2154 
2155         /*
2156          * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
2157          * This follows the protocol for read()/write().
2158          */
2159         if (vp->v_type != VDIR) {
2160                 /*
2161                  * ufs_tryirwlock uses rw_tryenter and checks for SLOCK
2162                  * to avoid an i_rwlock vs. ufs_lockfs_begin deadlock.
2163                  * If a deadlock is possible, the operation is retried.
2164                  */
2165                 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_file);
2166                 if (indeadlock) {
2167                         if (ulp)
2168                                 ufs_lockfs_end(ulp);
2169                         goto again;
2170                 }
2171                 dorwlock = 1;
2172         }
2173 
2174         /*
2175          * Truncate file.  Must have write permission and not be a directory.
2176          */
2177         if (mask & AT_SIZE) {
2178                 rw_enter(&ip->i_contents, RW_WRITER);
2179                 if (vp->v_type == VDIR) {
2180                         error = EISDIR;
2181                         goto update_inode;
2182                 }
2183                 if (error = ufs_iaccess(ip, IWRITE, cr, 0))
2184                         goto update_inode;
2185 
2186                 rw_exit(&ip->i_contents);
2187                 error = TRANS_ITRUNC(ip, vap->va_size, 0, cr);
2188                 if (error) {
2189                         rw_enter(&ip->i_contents, RW_WRITER);
2190                         goto update_inode;
2191                 }
2192 
2193                 if (error == 0 && vap->va_size)
2194                         vnevent_truncate(vp, ct);
2195         }
2196 
2197         if (ulp) {
2198                 trans_size = (int)TOP_SETATTR_SIZE(ip);
2199                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SETATTR, trans_size);
2200                 ++dotrans;
2201         }
2202 
2203         /*
2204          * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
2205          * This follows the protocol established by
2206          * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
2207          */
2208         if (vp->v_type == VDIR) {
2209                 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_SETATTR,
2210                     retry_dir);
2211                 if (indeadlock)
2212                         goto again;
2213                 dorwlock = 1;
2214         }
2215 
2216         /*
2217          * Grab quota lock if we are changing the file's owner.
2218          */
2219         if (mask & AT_UID) {
2220                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2221                 dodqlock = 1;
2222         }
2223         rw_enter(&ip->i_contents, RW_WRITER);
2224 
2225         oldva.va_mode = ip->i_mode;
2226         oldva.va_uid = ip->i_uid;
2227         oldva.va_gid = ip->i_gid;
2228 
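             /*
              * AT_SIZE was handled above via TRANS_ITRUNC, so clear it
              * before the generic setattr policy check.
              */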
2229         vap->va_mask &= ~AT_SIZE;
2230 
2231         error = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2232             ufs_priv_access, ip);
2233         if (error)
2234                 goto update_inode;
2235 
2236         mask = vap->va_mask;
2237 
2238         /*
2239          * Change file access modes.
2240          */
2241         if (mask & AT_MODE) {
2242                 ip->i_mode = (ip->i_mode & IFMT) | (vap->va_mode & ~IFMT);
2243                 TRANS_INODE(ufsvfsp, ip);
2244                 ip->i_flag |= ICHG;
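                     /*
                      * When stickyhack is enabled, a non-directory whose
                      * mode has the sticky bit set but not the owner
                      * execute bit is treated as swap-like; keep the
                      * VSWAPLIKE vnode flag in step with the new mode.
                      */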
2245                 if (stickyhack) {
2246                         mutex_enter(&vp->v_lock);
2247                         if ((ip->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
2248                                 vp->v_flag |= VSWAPLIKE;
2249                         else
2250                                 vp->v_flag &= ~VSWAPLIKE;
2251                         mutex_exit(&vp->v_lock);
2252                 }
2253         }
2254         if (mask & (AT_UID|AT_GID)) {
2255                 if (mask & AT_UID) {
2256                         /*
2257                          * Don't change ownership of the quota inode.
2258                          */
2259                         if (ufsvfsp->vfs_qinod == ip) {
2260                                 ASSERT(ufsvfsp->vfs_qflags & MQ_ENABLED);
2261                                 error = EINVAL;
2262                                 goto update_inode;
2263                         }
2264 
2265                         /*
2266                          * No real ownership change.
2267                          */
2268                         if (ip->i_uid == vap->va_uid) {
2269                                 blocks = 0;
2270                                 owner_change = 0;
2271                         }
2272                         /*
2273                          * Remove the blocks and the file, from the old user's
2274                          * quota.
2275                          */
2276                         else {
2277                                 blocks = ip->i_blocks;
2278                                 owner_change = 1;
2279 
2280                                 (void) chkdq(ip, -blocks, /* force */ 1, cr,
2281                                     (char **)NULL, (size_t *)NULL);
2282                                 (void) chkiq(ufsvfsp, /* change */ -1, ip,
2283                                     (uid_t)ip->i_uid, /* force */ 1, cr,
2284                                     (char **)NULL, (size_t *)NULL);
2285                                 dqrele(ip->i_dquot);
2286                         }
2287 
2288                         ip->i_uid = vap->va_uid;
2289 
2290                         /*
2291                          * There is a real ownership change.
2292                          */
2293                         if (owner_change) {
2294                                 /*
2295                                  * Add the blocks and the file to the new
2296                                  * user's quota.
2297                                  */
2298                                 ip->i_dquot = getinoquota(ip);
2299                                 (void) chkdq(ip, blocks, /* force */ 1, cr,
2300                                     &errmsg1, &len1);
2301                                 (void) chkiq(ufsvfsp, /* change */ 1,
2302                                     (struct inode *)NULL, (uid_t)ip->i_uid,
2303                                     /* force */ 1, cr, &errmsg2, &len2);
2304                         }
2305                 }
2306                 if (mask & AT_GID) {
2307                         ip->i_gid = vap->va_gid;
2308                 }
2309                 TRANS_INODE(ufsvfsp, ip);
2310                 ip->i_flag |= ICHG;
2311         }
2312         /*
2313          * Change file access or modified times.
2314          */
2315         if (mask & (AT_ATIME|AT_MTIME)) {
2316                 /* Check that the time value is within ufs range */
2317                 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2318                     ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2319                         error = EOVERFLOW;
2320                         goto update_inode;
2321                 }
2322 
2323                 /*
2324                  * if the "noatime" mount option is set and only an atime
2325                  * update is requested, do nothing. No error is returned.
2326                  */
2327                 if ((ufsvfsp->vfs_noatime) &&
2328                     ((mask & (AT_ATIME|AT_MTIME)) == AT_ATIME))
2329                         goto skip_atime;
2330 
2331                 if (mask & AT_ATIME) {
2332                         ip->i_atime.tv_sec = vap->va_atime.tv_sec;
2333                         ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2334                         ip->i_flag &= ~IACC;
2335                 }
2336                 if (mask & AT_MTIME) {
2337                         ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
2338                         ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2339                         gethrestime(&now);
2340                         if (now.tv_sec > TIME32_MAX) {
2341                                 /*
2342                                  * In 2038, ctime sticks forever.
2343                                  */
2344                                 ip->i_ctime.tv_sec = TIME32_MAX;
2345                                 ip->i_ctime.tv_usec = 0;
2346                         } else {
2347                                 ip->i_ctime.tv_sec = now.tv_sec;
2348                                 ip->i_ctime.tv_usec = now.tv_nsec / 1000;
2349                         }
2350                         ip->i_flag &= ~(IUPD|ICHG);
2351                         ip->i_flag |= IMODTIME;
2352                 }
2353                 TRANS_INODE(ufsvfsp, ip);
2354                 ip->i_flag |= IMOD;
2355         }
2356 
2357 skip_atime:
2358         /*
2359          * The presence of a shadow inode may indicate an ACL, but does
2360          * not imply an ACL.  Future FSD types should be handled here too
2361          * and check for the presence of the attribute-specific data
2362          * before referencing it.
2363          */
2364         if (ip->i_shadow) {
2365                 /*
2366                  * XXX if ufs_iupdat is changed to sandbagged write fix
2367                  * ufs_acl_setattr to push ip to keep acls consistent
2368                  *
2369                  * Suppress out of inodes messages if we will retry.
2370                  */
2371                 if (retry)
2372                         ip->i_flag |= IQUIET;
2373                 error = ufs_acl_setattr(ip, vap, cr);
2374                 ip->i_flag &= ~IQUIET;
2375         }
2376 
2377 update_inode:
2378         /*
2379          * Setattr always increases the sequence number
2380          */
2381         ip->i_seq++;
2382 
2383         /*
2384          * if nfsd and not logging; push synchronously
2385          */
2386         if ((curthread->t_flag & T_DONTPEND) && !TRANS_ISTRANS(ufsvfsp)) {
2387                 ufs_iupdat(ip, 1);
2388         } else {
2389                 ITIMES_NOLOCK(ip);
2390         }
2391 
2392         rw_exit(&ip->i_contents);
2393         if (dodqlock) {
2394                 rw_exit(&ufsvfsp->vfs_dqrwlock);
2395         }
2396         if (dorwlock)
2397                 rw_exit(&ip->i_rwlock);
2398 
2399         if (ulp) {
2400                 if (dotrans) {
2401                         int terr = 0;
2402                         TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SETATTR,
2403                             trans_size);
2404                         if (error == 0)
2405                                 error = terr;
2406                 }
2407                 ufs_lockfs_end(ulp);
2408         }
2409 out:
2410         /*
2411          * If out of inodes or blocks, see if we can free something
2412          * up from the delete queue.
2413          */
2414         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
2415                 ufs_delete_drain_wait(ufsvfsp, 1);
2416                 retry = 0;
2417                 if (errmsg1 != NULL)
2418                         kmem_free(errmsg1, len1);
2419                 if (errmsg2 != NULL)
2420                         kmem_free(errmsg2, len2);
2421                 goto again;
2422         }
2423         if (errmsg1 != NULL) {
2424                 uprintf(errmsg1);
2425                 kmem_free(errmsg1, len1);
2426         }
2427         if (errmsg2 != NULL) {
2428                 uprintf(errmsg2);
2429                 kmem_free(errmsg2, len2);
2430         }
2431         return (error);
2432 }
2433 
2434 /*ARGSUSED*/
2435 static int
2436 ufs_access(struct vnode *vp, int mode, int flags, struct cred *cr,
2437     caller_context_t *ct)
2438 {
2439         struct inode *ip = VTOI(vp);
2440 
2441         if (ip->i_ufsvfs == NULL)
2442                 return (EIO);
2443 
2444         /*
2445          * The ufs_iaccess function wants to be called with
2446          * mode bits expressed as "ufs specific" bits.
2447          * I.e., VWRITE|VREAD|VEXEC do not make sense to
2448          * ufs_iaccess() but IWRITE|IREAD|IEXEC do.
2449          * But since they're the same we just pass the vnode mode
2450          * bits and simply verify that assumption at compile time.
2451          */
2452 #if IWRITE != VWRITE || IREAD != VREAD || IEXEC != VEXEC
2453 #error "ufs_access needs to map Vmodes to Imodes"
2454 #endif
2455         return (ufs_iaccess(ip, mode, cr, 1));
2456 }
2457 
2458 /* ARGSUSED */
2459 static int
2460 ufs_readlink(struct vnode *vp, struct uio *uiop, struct cred *cr,
2461     caller_context_t *ct)
2462 {
2463         struct inode *ip = VTOI(vp);
2464         struct ufsvfs *ufsvfsp;
2465         struct ulockfs *ulp;
2466         int error;
2467         int fastsymlink;
2468 
2469         if (vp->v_type != VLNK) {
2470                 error = EINVAL;
2471                 goto nolockout;
2472         }
2473 
2474         /*
2475          * If the symbolic link is empty there is nothing to read.
2476          * Fast-track these empty symbolic links.
2477          */
2478         if (ip->i_size == 0) {
2479                 error = 0;
2480                 goto nolockout;
2481         }
2482 
2483         ufsvfsp = ip->i_ufsvfs;
2484         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READLINK_MASK);
2485         if (error)
2486                 goto nolockout;
2487         /*
2488          * The ip->i_rwlock protects the data blocks used for FASTSYMLINK
2489          */
2490 again:
2491         fastsymlink = 0;
2492         if (ip->i_flag & IFASTSYMLNK) {
2493                 rw_enter(&ip->i_rwlock, RW_READER);
2494                 rw_enter(&ip->i_contents, RW_READER);
2495                 if (ip->i_flag & IFASTSYMLNK) {
2496                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
2497                             (ip->i_fs->fs_ronly == 0) &&
2498                             (!ufsvfsp->vfs_noatime)) {
2499                                 mutex_enter(&ip->i_tlock);
2500                                 ip->i_flag |= IACC;
2501                                 mutex_exit(&ip->i_tlock);
2502                         }
2503                         error = uiomove((caddr_t)&ip->i_db[1],
2504                             MIN(ip->i_size, uiop->uio_resid),
2505                             UIO_READ, uiop);
2506                         ITIMES(ip);
2507                         ++fastsymlink;
2508                 }
2509                 rw_exit(&ip->i_contents);
2510                 rw_exit(&ip->i_rwlock);
2511         }
2512         if (!fastsymlink) {
2513                 ssize_t size;   /* number of bytes read  */
2514                 caddr_t basep;  /* pointer to input data */
2515                 ino_t ino;
2516                 long  igen;
2517                 struct uio tuio;        /* temp uio struct */
2518                 struct uio *tuiop;
2519                 iovec_t tiov;           /* temp iovec struct */
2520                 char kbuf[FSL_SIZE];    /* buffer to hold fast symlink */
2521                 int tflag = 0;          /* flag to indicate temp vars used */
2522 
2523                 ino = ip->i_number;
2524                 igen = ip->i_gen;
2525                 size = uiop->uio_resid;
2526                 basep = uiop->uio_iov->iov_base;
2527                 tuiop = uiop;
2528 
2529                 rw_enter(&ip->i_rwlock, RW_WRITER);
2530                 rw_enter(&ip->i_contents, RW_WRITER);
2531                 if (ip->i_flag & IFASTSYMLNK) {
2532                         rw_exit(&ip->i_contents);
2533                         rw_exit(&ip->i_rwlock);
2534                         goto again;
2535                 }
2536 
2537                 /* can this be a fast symlink and is it a user buffer? */
2538                 if (ip->i_size <= FSL_SIZE &&
2539                     (uiop->uio_segflg == UIO_USERSPACE ||
2540                     uiop->uio_segflg == UIO_USERISPACE)) {
2541 
2542                         bzero(&tuio, sizeof (struct uio));
2543                         /*
2544                          * Set up a kernel buffer to read the link into.  This
2545                          * fixes a race condition where the user buffer could be
2546                          * corrupted before we copy it into the inode.
2547                          */
2548                         size = ip->i_size;
2549                         tiov.iov_len = size;
2550                         tiov.iov_base = kbuf;
2551                         tuio.uio_iov = &tiov;
2552                         tuio.uio_iovcnt = 1;
2553                         tuio.uio_offset = uiop->uio_offset;
2554                         tuio.uio_segflg = UIO_SYSSPACE;
2555                         tuio.uio_fmode = uiop->uio_fmode;
2556                         tuio.uio_extflg = uiop->uio_extflg;
2557                         tuio.uio_limit = uiop->uio_limit;
2558                         tuio.uio_resid = size;
2559 
2560                         basep = tuio.uio_iov->iov_base;
2561                         tuiop = &tuio;
2562                         tflag = 1;
2563                 }
2564 
2565                 error = rdip(ip, tuiop, 0, cr);
2566                 if (!(error == 0 && ip->i_number == ino && ip->i_gen == igen)) {
2567                         rw_exit(&ip->i_contents);
2568                         rw_exit(&ip->i_rwlock);
2569                         goto out;
2570                 }
2571 
2572                 if (tflag == 0)
2573                         size -= uiop->uio_resid;
2574 
2575                 if ((tflag == 0 && ip->i_size <= FSL_SIZE &&
2576                     ip->i_size == size) || (tflag == 1 &&
2577                     tuio.uio_resid == 0)) {
2578                         error = kcopy(basep, &ip->i_db[1], ip->i_size);
2579                         if (error == 0) {
2580                                 ip->i_flag |= IFASTSYMLNK;
2581                                 /*
2582                                  * free page
2583                                  */
2584                                 (void) VOP_PUTPAGE(ITOV(ip),
2585                                     (offset_t)0, PAGESIZE,
2586                                     (B_DONTNEED | B_FREE | B_FORCE | B_ASYNC),
2587                                     cr, ct);
2588                         } else {
2589                                 int i;
2590                                 /* error, clear garbage left behind */
2591                                 for (i = 1; i < NDADDR; i++)
2592                                         ip->i_db[i] = 0;
2593                                 for (i = 0; i < NIADDR; i++)
2594                                         ip->i_ib[i] = 0;
2595                         }
2596                 }
2597                 if (tflag == 1) {
2598                         /* now, copy it into the user buffer */
2599                         error = uiomove((caddr_t)kbuf,
2600                             MIN(size, uiop->uio_resid),
2601                             UIO_READ, uiop);
2602                 }
2603                 rw_exit(&ip->i_contents);
2604                 rw_exit(&ip->i_rwlock);
2605         }
2606 out:
2607         if (ulp) {
2608                 ufs_lockfs_end(ulp);
2609         }
2610 nolockout:
2611         return (error);
2612 }
2613 
2614 /* ARGSUSED */
2615 static int
2616 ufs_fsync(struct vnode *vp, int syncflag, struct cred *cr, caller_context_t *ct)
2617 {
2618         struct inode *ip = VTOI(vp);
2619         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
2620         struct ulockfs *ulp;
2621         int error;
2622 
2623         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FSYNC_MASK);
2624         if (error)
2625                 return (error);
2626 
2627         if (TRANS_ISTRANS(ufsvfsp)) {
2628                 /*
2629                  * First push out any data pages
2630                  */
2631                 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2632                     (vp->v_type != VCHR) && !(IS_SWAPVP(vp))) {
2633                         error = VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
2634                             0, CRED(), ct);
2635                         if (error)
2636                                 goto out;
2637                 }
2638 
2639                 /*
2640                  * Delta any delayed inode time updates
2641                  * and push the inode to the log.
2642                  * All other inode deltas will have already been delta'd
2643                  * and will be pushed during the commit.
2644                  */
2645                 if (!(syncflag & FDSYNC) &&
2646                     ((ip->i_flag & (IMOD|IMODACC)) == IMODACC)) {
2647                         if (ulp) {
2648                                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_FSYNC,
2649                                     TOP_SYNCIP_SIZE);
2650                         }
2651                         rw_enter(&ip->i_contents, RW_READER);
2652                         mutex_enter(&ip->i_tlock);
2653                         ip->i_flag &= ~IMODTIME;
2654                         mutex_exit(&ip->i_tlock);
2655                         ufs_iupdat(ip, I_SYNC);
2656                         rw_exit(&ip->i_contents);
2657                         if (ulp) {
2658                                 TRANS_END_ASYNC(ufsvfsp, TOP_FSYNC,
2659                                     TOP_SYNCIP_SIZE);
2660                         }
2661                 }
2662 
2663                 /*
2664                  * Commit the Moby transaction
2665                  *
2666                  * Deltas have already been made so we just need to
2667                  * commit them with a synchronous transaction.
2668                  * TRANS_BEGIN_SYNC() will return an error
2669                  * if there are no deltas to commit, i.e. the
2670                  * transaction would be empty.
2671                  */
2672                 if (ulp) {
2673                         TRANS_BEGIN_SYNC(ufsvfsp, TOP_FSYNC, TOP_COMMIT_SIZE,
2674                             error);
2675                         if (error) {
2676                                 error = 0; /* commit wasn't needed */
2677                                 goto out;
2678                         }
2679                         TRANS_END_SYNC(ufsvfsp, error, TOP_FSYNC,
2680                             TOP_COMMIT_SIZE);
2681                 }
2682         } else {        /* not logging */
2683                 if (!(IS_SWAPVP(vp)))
2684                         if (syncflag & FNODSYNC) {
2685                                 /* Just update the inode only */
2686                                 TRANS_IUPDAT(ip, 1);
2687                                 error = 0;
2688                         } else if (syncflag & FDSYNC)
2689                                 /* Do data-synchronous writes */
2690                                 error = TRANS_SYNCIP(ip, 0, I_DSYNC, TOP_FSYNC);
2691                         else
2692                                 /* Do synchronous writes */
2693                                 error = TRANS_SYNCIP(ip, 0, I_SYNC, TOP_FSYNC);
2694 
2695                 rw_enter(&ip->i_contents, RW_WRITER);
2696                 if (!error)
2697                         error = ufs_sync_indir(ip);
2698                 rw_exit(&ip->i_contents);
2699         }
2700 out:
2701         if (ulp) {
2702                 ufs_lockfs_end(ulp);
2703         }
2704         return (error);
2705 }
2706 
2707 /*ARGSUSED*/
2708 static void
2709 ufs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
2710 {
2711         ufs_iinactive(VTOI(vp));
2712 }
2713 
2714 /*
2715  * Unix file system operations having to do with directory manipulation.
2716  */
2717 int ufs_lookup_idle_count = 2;  /* Number of inodes to idle each time */
2718 /* ARGSUSED */
2719 static int
2720 ufs_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
2721     struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cr,
2722     caller_context_t *ct, int *direntflags, pathname_t *realpnp)
2723 {
2724         struct inode *ip;
2725         struct inode *sip;
2726         struct inode *xip;
2727         struct ufsvfs *ufsvfsp;
2728         struct ulockfs *ulp;
2729         struct vnode *vp;
2730         int error;
2731 
2732         /*
2733          * Check flags for type of lookup (regular file or attribute file)
2734          */
2735 
2736         ip = VTOI(dvp);
2737 
2738         if (flags & LOOKUP_XATTR) {
2739 
2740                 /*
2741                  * If not mounted with XATTR support then return EINVAL
2742                  */
2743 
2744                 if (!(ip->i_ufsvfs->vfs_vfs->vfs_flag & VFS_XATTR))
2745                         return (EINVAL);
2746                 /*
2747                  * We don't allow recursive attributes...
2748                  * Maybe someday we will.
2749                  */
2750                 if ((ip->i_cflags & IXATTR)) {
2751                         return (EINVAL);
2752                 }
2753 
2754                 if ((vp = dnlc_lookup(dvp, XATTR_DIR_NAME)) == NULL) {
2755                         error = ufs_xattr_getattrdir(dvp, &sip, flags, cr);
2756                         if (error) {
2757                                 *vpp = NULL;
2758                                 goto out;
2759                         }
2760 
2761                         vp = ITOV(sip);
2762                         dnlc_update(dvp, XATTR_DIR_NAME, vp);
2763                 }
2764 
2765                 /*
2766                  * Check accessibility of directory.
2767                  */
2768                 if (vp == DNLC_NO_VNODE) {
2769                         VN_RELE(vp);
2770                         error = ENOENT;
2771                         goto out;
2772                 }
2773                 if ((error = ufs_iaccess(VTOI(vp), IEXEC, cr, 1)) != 0) {
2774                         VN_RELE(vp);
2775                         goto out;
2776                 }
2777 
2778                 *vpp = vp;
2779                 return (0);
2780         }
2781 
2782         /*
2783          * Check for a null component, which we should treat as
2784          * looking at dvp from within its parent, so we don't
2785          * need a call to ufs_iaccess(), as it has already been
2786          * done.
2787          */
2788         if (nm[0] == 0) {
2789                 VN_HOLD(dvp);
2790                 error = 0;
2791                 *vpp = dvp;
2792                 goto out;
2793         }
2794 
2795         /*
2796          * Check for ".", i.e. itself.  This is a quick check that
2797          * avoids adding "." into the dnlc (such entries have been
2798          * seen to occupy >10% of the cache).
2799          */
2800         if ((nm[0] == '.') && (nm[1] == 0)) {
2801                 /*
2802                  * Don't return without checking accessibility
2803                  * of the directory.  We only need the hold if
2804                  * we are going to return dvp.
2805                  */
2806                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) == 0) {
2807                         VN_HOLD(dvp);
2808                         *vpp = dvp;
2809                 }
2810                 goto out;
2811         }
2812 
2813         /*
2814          * Fast path: Check the directory name lookup cache.
2815          */
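             /*
              * Note that dnlc_lookup() can also return DNLC_NO_VNODE, a
              * cached negative entry, which is mapped to ENOENT below.
              */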
2816         if (vp = dnlc_lookup(dvp, nm)) {
2817                 /*
2818                  * Check accessibility of directory.
2819                  */
2820                 if ((error = ufs_iaccess(ip, IEXEC, cr, 1)) != 0) {
2821                         VN_RELE(vp);
2822                         goto out;
2823                 }
2824                 if (vp == DNLC_NO_VNODE) {
2825                         VN_RELE(vp);
2826                         error = ENOENT;
2827                         goto out;
2828                 }
2829                 xip = VTOI(vp);
2830                 ulp = NULL;
2831                 goto fastpath;
2832         }
2833 
2834         /*
2835          * Keep the idle queue from getting too long by
2836          * idling two inodes before attempting to allocate another.
2837          * This operation must be performed before entering
2838          * lockfs or a transaction.
2839          */
2840         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
2841                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
2842                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
2843                         ufs_idle_some(ufs_lookup_idle_count);
2844                 }
2845 
2846 retry_lookup:
2847         /*
2848          * Check accessibility of directory.
2849          */
2850         if (error = ufs_diraccess(ip, IEXEC, cr))
2851                 goto out;
2852 
2853         ufsvfsp = ip->i_ufsvfs;
2854         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK);
2855         if (error)
2856                 goto out;
2857 
2858         error = ufs_dirlook(ip, nm, &xip, cr, 1, 0);
2859 
2860 fastpath:
2861         if (error == 0) {
2862                 ip = xip;
2863                 *vpp = ITOV(ip);
2864 
2865                 /*
2866                  * If vnode is a device return special vnode instead.
2867                  */
2868                 if (IS_DEVVP(*vpp)) {
2869                         struct vnode *newvp;
2870 
2871                         newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type,
2872                             cr);
2873                         VN_RELE(*vpp);
2874                         if (newvp == NULL)
2875                                 error = ENOSYS;
2876                         else
2877                                 *vpp = newvp;
2878                 } else if (ip->i_cflags & ICOMPRESS) {
2879                         struct vnode *newvp;
2880 
2881                         /*
2882                          * Compressed file, substitute dcfs vnode
2883                          */
2884                         newvp = decompvp(*vpp, cr, ct);
2885                         VN_RELE(*vpp);
2886                         if (newvp == NULL)
2887                                 error = ENOSYS;
2888                         else
2889                                 *vpp = newvp;
2890                 }
2891         }
2892         if (ulp) {
2893                 ufs_lockfs_end(ulp);
2894         }
2895 
2896         if (error == EAGAIN)
2897                 goto retry_lookup;
2898 
2899 out:
2900         return (error);
2901 }
2902 
2903 /*ARGSUSED*/
2904 static int
2905 ufs_create(struct vnode *dvp, char *name, struct vattr *vap, enum vcexcl excl,
2906     int mode, struct vnode **vpp, struct cred *cr, int flag,
2907     caller_context_t *ct, vsecattr_t *vsecp)
2908 {
2909         struct inode *ip;
2910         struct inode *xip;
2911         struct inode *dip;
2912         struct vnode *xvp;
2913         struct ufsvfs *ufsvfsp;
2914         struct ulockfs *ulp;
2915         int error;
2916         int issync;
2917         int truncflag;
2918         int trans_size;
2919         int noentry;
2920         int defer_dip_seq_update = 0;   /* need to defer update of dip->i_seq */
2921         int retry = 1;
2922         int indeadlock;
2923 
2924 again:
2925         ip = VTOI(dvp);
2926         ufsvfsp = ip->i_ufsvfs;
2927         truncflag = 0;
2928 
2929         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_CREATE_MASK);
2930         if (error)
2931                 goto out;
2932 
2933         if (ulp) {
2934                 trans_size = (int)TOP_CREATE_SIZE(ip);
2935                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_CREATE, trans_size);
2936         }
2937 
2938         if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
2939                 vap->va_mode &= ~VSVTX;
2940 
2941         if (*name == '\0') {
2942                 /*
2943                  * Null component name refers to the directory itself.
2944                  */
2945                 VN_HOLD(dvp);
2946                 /*
2947                  * Even though this is an error case, we need to grab the
2948                  * quota lock since the error handling code below is common.
2949                  */
2950                 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2951                 rw_enter(&ip->i_contents, RW_WRITER);
2952                 error = EEXIST;
2953         } else {
2954                 xip = NULL;
2955                 noentry = 0;
2956                 /*
2957                  * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
2958                  * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
2959                  * possible, retries the operation.
2960                  */
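                     /*
                      * A rough sketch of what the macro does (not its
                      * actual body):
                      *
                      *  if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
                      *      if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
                      *          back out of the transaction and lockfs,
                      *          and set indeadlock so the caller retries;
                      *      } else {
                      *          rw_enter(&ip->i_rwlock, RW_WRITER);
                      *      }
                      *  }
                      */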
2961                 ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_CREATE,
2962                     retry_dir);
2963                 if (indeadlock)
2964                         goto again;
2965 
2966                 xvp = dnlc_lookup(dvp, name);
2967                 if (xvp == DNLC_NO_VNODE) {
2968                         noentry = 1;
2969                         VN_RELE(xvp);
2970                         xvp = NULL;
2971                 }
2972                 if (xvp) {
2973                         rw_exit(&ip->i_rwlock);
2974                         if (error = ufs_iaccess(ip, IEXEC, cr, 1)) {
2975                                 VN_RELE(xvp);
2976                         } else {
2977                                 error = EEXIST;
2978                                 xip = VTOI(xvp);
2979                         }
2980                 } else {
2981                         /*
2982                          * Suppress file system full message if we will retry
2983                          */
2984                         error = ufs_direnter_cm(ip, name, DE_CREATE,
2985                             vap, &xip, cr, (noentry | (retry ? IQUIET : 0)));
2986                         if (error == EAGAIN) {
2987                                 if (ulp) {
2988                                         TRANS_END_CSYNC(ufsvfsp, error, issync,
2989                                             TOP_CREATE, trans_size);
2990                                         ufs_lockfs_end(ulp);
2991                                 }
2992                                 goto again;
2993                         }
2994                         rw_exit(&ip->i_rwlock);
2995                 }
2996                 ip = xip;
2997                 if (ip != NULL) {
2998                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
2999                         rw_enter(&ip->i_contents, RW_WRITER);
3000                 }
3001         }
3002 
3003         /*
3004          * If the file already exists and this is a non-exclusive create,
3005          * check permissions and allow access for non-directories.
3006          * Read-only create of an existing directory is also allowed.
3007          * We fail an exclusive create of anything which already exists.
3008          */
3009         if (error == EEXIST) {
3010                 dip = VTOI(dvp);
3011                 if (excl == NONEXCL) {
3012                         if ((((ip->i_mode & IFMT) == IFDIR) ||
3013                             ((ip->i_mode & IFMT) == IFATTRDIR)) &&
3014                             (mode & IWRITE))
3015                                 error = EISDIR;
3016                         else if (mode)
3017                                 error = ufs_iaccess(ip, mode, cr, 0);
3018                         else
3019                                 error = 0;
3020                 }
3021                 if (error) {
3022                         rw_exit(&ip->i_contents);
3023                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3024                         VN_RELE(ITOV(ip));
3025                         goto unlock;
3026                 }
3027                 /*
3028                  * If the error EEXIST was set, then i_seq can not
3029                  * have been updated. The sequence number interface
3030                  * is defined such that a non-error VOP_CREATE must
3031                  * increase the dir va_seq by at least one. If we
3032                  * have cleared the error, increase i_seq. Note that
3033                  * we are increasing the dir i_seq and in rare cases
3034                  * ip may actually be from the dvp, so we already have
3035                  * the locks and it will not be subject to truncation.
3036                  * In case we have to update i_seq of the parent
3037                  * directory dip, we have to defer it till we have
3038                  * released our locks on ip due to lock ordering requirements.
3039                  */
3040                 if (ip != dip)
3041                         defer_dip_seq_update = 1;
3042                 else
3043                         ip->i_seq++;
3044 
3045                 if (((ip->i_mode & IFMT) == IFREG) &&
3046                     (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
3047                         /*
3048                          * Truncate regular files, if requested by caller.
3049                          * Grab i_rwlock to make sure no one else is
3050                          * currently writing to the file (we promised
3051                          * bmap we would do this).
3052                          * Must get the locks in the correct order.
3053                          */
3054                         if (ip->i_size == 0) {
3055                                 ip->i_flag |= ICHG | IUPD;
3056                                 ip->i_seq++;
3057                                 TRANS_INODE(ufsvfsp, ip);
3058                         } else {
3059                                 /*
3060                                  * Large Files: Why this check here?
3061                                  * Though we also do this in vn_create(), we
3062                                  * really want to guarantee that we do not
3063                                  * destroy large file data, so we check the
3064                                  * size atomically while holding the
3065                                  * contents lock.
3066                                  */
3067                                 if (flag && !(flag & FOFFMAX) &&
3068                                     ((ip->i_mode & IFMT) == IFREG) &&
3069                                     (ip->i_size > (offset_t)MAXOFF32_T)) {
3070                                         rw_exit(&ip->i_contents);
3071                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3072                                         error = EOVERFLOW;
3073                                         goto unlock;
3074                                 }
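                                     /*
                                      * For logging file systems, defer the
                                      * truncation until after the create
                                      * transaction has ended; see the
                                      * truncflag handling past the "unlock"
                                      * label below.
                                      */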
3075                                 if (TRANS_ISTRANS(ufsvfsp))
3076                                         truncflag++;
3077                                 else {
3078                                         rw_exit(&ip->i_contents);
3079                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3080                                         ufs_tryirwlock_trans(&ip->i_rwlock,
3081                                             RW_WRITER, TOP_CREATE,
3082                                             retry_file);
3083                                         if (indeadlock) {
3084                                                 VN_RELE(ITOV(ip));
3085                                                 goto again;
3086                                         }
3087                                         rw_enter(&ufsvfsp->vfs_dqrwlock,
3088                                             RW_READER);
3089                                         rw_enter(&ip->i_contents, RW_WRITER);
3090                                         (void) ufs_itrunc(ip, (u_offset_t)0, 0,
3091                                             cr);
3092                                         rw_exit(&ip->i_rwlock);
3093                                 }
3094 
3095                         }
3096                         if (error == 0) {
3097                                 vnevent_create(ITOV(ip), ct);
3098                         }
3099                 }
3100         }
3101 
3102         if (error) {
3103                 if (ip != NULL) {
3104                         rw_exit(&ufsvfsp->vfs_dqrwlock);
3105                         rw_exit(&ip->i_contents);
3106                 }
3107                 goto unlock;
3108         }
3109 
3110         *vpp = ITOV(ip);
3111         ITIMES(ip);
3112         rw_exit(&ip->i_contents);
3113         rw_exit(&ufsvfsp->vfs_dqrwlock);
3114 
3115         /*
3116          * If vnode is a device return special vnode instead.
3117          */
3118         if (!error && IS_DEVVP(*vpp)) {
3119                 struct vnode *newvp;
3120 
3121                 newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
3122                 VN_RELE(*vpp);
3123                 if (newvp == NULL) {
3124                         error = ENOSYS;
3125                         goto unlock;
3126                 }
3127                 truncflag = 0;
3128                 *vpp = newvp;
3129         }
3130 unlock:
3131 
3132         /*
3133          * Do the deferred update of the parent directory's sequence
3134          * number now.
3135          */
3136         if (defer_dip_seq_update == 1) {
3137                 rw_enter(&dip->i_contents, RW_READER);
3138                 mutex_enter(&dip->i_tlock);
3139                 dip->i_seq++;
3140                 mutex_exit(&dip->i_tlock);
3141                 rw_exit(&dip->i_contents);
3142         }
3143 
3144         if (ulp) {
3145                 int terr = 0;
3146 
3147                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_CREATE,
3148                     trans_size);
3149 
3150                 /*
3151                  * If we haven't had a more interesting failure
3152                  * already, then anything that might've happened
3153                  * here should be reported.
3154                  */
3155                 if (error == 0)
3156                         error = terr;
3157         }
3158 
3159         if (!error && truncflag) {
3160                 ufs_tryirwlock(&ip->i_rwlock, RW_WRITER, retry_trunc);
3161                 if (indeadlock) {
3162                         if (ulp)
3163                                 ufs_lockfs_end(ulp);
3164                         VN_RELE(ITOV(ip));
3165                         goto again;
3166                 }
3167                 (void) TRANS_ITRUNC(ip, (u_offset_t)0, 0, cr);
3168                 rw_exit(&ip->i_rwlock);
3169         }
3170 
3171         if (ulp)
3172                 ufs_lockfs_end(ulp);
3173 
3174         /*
3175          * If no inodes available, try to free one up out of the
3176          * pending delete queue.
3177          */
3178         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3179                 ufs_delete_drain_wait(ufsvfsp, 1);
3180                 retry = 0;
3181                 goto again;
3182         }
3183 
3184 out:
3185         return (error);
3186 }
3187 
3188 extern int ufs_idle_max;
3189 /*ARGSUSED*/
3190 static int
3191 ufs_remove(struct vnode *vp, char *nm, struct cred *cr, caller_context_t *ct,
3192     int flags)
3193 {
3194         struct inode *ip = VTOI(vp);
3195         struct ufsvfs *ufsvfsp  = ip->i_ufsvfs;
3196         struct ulockfs *ulp;
3197         vnode_t *rmvp = NULL;   /* Vnode corresponding to name being removed */
3198         int indeadlock;
3199         int error;
3200         int issync;
3201         int trans_size;
3202 
3203         /*
3204          * don't let the delete queue get too long
3205          */
3206         if (ufsvfsp == NULL) {
3207                 error = EIO;
3208                 goto out;
3209         }
3210         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3211                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3212 
3213         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3214         if (rmvp != NULL) {
3215                 /* Only send the event if there were no errors */
3216                 if (error == 0)
3217                         vnevent_remove(rmvp, vp, nm, ct);
3218                 VN_RELE(rmvp);
3219         }
3220 
3221 retry_remove:
3222         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_REMOVE_MASK);
3223         if (error)
3224                 goto out;
3225 
3226         if (ulp)
3227                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_REMOVE,
3228                     trans_size = (int)TOP_REMOVE_SIZE(VTOI(vp)));
3229 
3230         /*
3231          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3232          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3233          * possible, retries the operation.
3234          */
3235         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_REMOVE, retry);
3236         if (indeadlock)
3237                 goto retry_remove;
3238         error = ufs_dirremove(ip, nm, (struct inode *)0, (struct vnode *)0,
3239             DR_REMOVE, cr);
3240         rw_exit(&ip->i_rwlock);
3241 
3242         if (ulp) {
3243                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_REMOVE, trans_size);
3244                 ufs_lockfs_end(ulp);
3245         }
3246 
3247 out:
3248         return (error);
3249 }
3250 
3251 /*
3252  * Link a file or a directory.  Only privileged processes are allowed to
3253  * make links to directories.
3254  */
3255 /*ARGSUSED*/
3256 static int
3257 ufs_link(struct vnode *tdvp, struct vnode *svp, char *tnm, struct cred *cr,
3258     caller_context_t *ct, int flags)
3259 {
3260         struct inode *sip;
3261         struct inode *tdp = VTOI(tdvp);
3262         struct ufsvfs *ufsvfsp = tdp->i_ufsvfs;
3263         struct ulockfs *ulp;
3264         struct vnode *realvp;
3265         int error;
3266         int issync;
3267         int trans_size;
3268         int isdev;
3269         int indeadlock;
3270 
3271 retry_link:
3272         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LINK_MASK);
3273         if (error)
3274                 goto out;
3275 
3276         if (ulp)
3277                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_LINK,
3278                     trans_size = (int)TOP_LINK_SIZE(VTOI(tdvp)));
3279 
3280         if (VOP_REALVP(svp, &realvp, ct) == 0)
3281                 svp = realvp;
3282 
3283         /*
3284          * Make sure a link of an extended attribute is valid.
3285          * We only support hard linking of an attr in an ATTRDIR to an ATTRDIR.
3286          *
3287          * Make certain we don't attempt to look at a device node as
3288          * a ufs inode.
3289          */
3290 
3291         isdev = IS_DEVVP(svp);
3292         if (((isdev == 0) && ((VTOI(svp)->i_cflags & IXATTR) == 0) &&
3293             ((tdp->i_mode & IFMT) == IFATTRDIR)) ||
3294             ((isdev == 0) && (VTOI(svp)->i_cflags & IXATTR) &&
3295             ((tdp->i_mode & IFMT) == IFDIR))) {
3296                 error = EINVAL;
3297                 goto unlock;
3298         }
3299 
3300         sip = VTOI(svp);
3301         if ((svp->v_type == VDIR &&
3302             secpolicy_fs_linkdir(cr, ufsvfsp->vfs_vfs) != 0) ||
3303             (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)) {
3304                 error = EPERM;
3305                 goto unlock;
3306         }
3307 
3308         /*
3309          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3310          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3311          * possible, retries the operation.
3312          */
3313         ufs_tryirwlock_trans(&tdp->i_rwlock, RW_WRITER, TOP_LINK, retry);
3314         if (indeadlock)
3315                 goto retry_link;
3316         error = ufs_direnter_lr(tdp, tnm, DE_LINK, (struct inode *)0,
3317             sip, cr);
3318         rw_exit(&tdp->i_rwlock);
3319 
3320 unlock:
3321         if (ulp) {
3322                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_LINK, trans_size);
3323                 ufs_lockfs_end(ulp);
3324         }
3325 
3326         if (!error) {
3327                 vnevent_link(svp, ct);
3328         }
3329 out:
3330         return (error);
3331 }
3332 
3333 uint64_t ufs_rename_retry_cnt;
3334 uint64_t ufs_rename_upgrade_retry_cnt;
3335 uint64_t ufs_rename_dircheck_retry_cnt;
3336 clock_t  ufs_rename_backoff_delay = 1;
3337 
3338 /*
3339  * Rename a file or directory.
3340  * We are given the vnode and entry string of the source and the
3341  * vnode and entry string of the place we want to move the source
3342  * to (the target). The essential operation is:
3343  *      unlink(target);
3344  *      link(source, target);
3345  *      unlink(source);
3346  * but "atomically".  Can't do full commit without saving state in
3347  * the inode on disk, which isn't feasible at this time.  Best we
3348  * can do is always guarantee that the TARGET exists.
3349  */
3350 
3351 /*ARGSUSED*/
3352 static int
3353 ufs_rename(struct vnode *sdvp, char *snm, struct vnode *tdvp, char *tnm,
3354     struct cred *cr, caller_context_t *ct, int flags)
3355 {
3356         struct inode *sip = NULL;       /* source inode */
3357         struct inode *ip = NULL;        /* check inode */
3358         struct inode *sdp;              /* old (source) parent inode */
3359         struct inode *tdp;              /* new (target) parent inode */
3360         struct vnode *svp = NULL;       /* source vnode */
3361         struct vnode *tvp = NULL;       /* target vnode, if it exists */
3362         struct vnode *realvp;
3363         struct ufsvfs *ufsvfsp;
3364         struct ulockfs *ulp = NULL;
3365         struct ufs_slot slot;
3366         timestruc_t now;
3367         int error;
3368         int issync;
3369         int trans_size;
3370         krwlock_t *first_lock;
3371         krwlock_t *second_lock;
3372         krwlock_t *reverse_lock;
3373         int serr, terr;
3374 
3375         sdp = VTOI(sdvp);
3376         slot.fbp = NULL;
3377         ufsvfsp = sdp->i_ufsvfs;
3378 
3379         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3380                 tdvp = realvp;
3381 
3382         /* Must do this before taking locks in case of DNLC miss */
3383         terr = ufs_eventlookup(tdvp, tnm, cr, &tvp);
3384         serr = ufs_eventlookup(sdvp, snm, cr, &svp);
3385 
3386         if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) {
3387                 if (tvp != NULL)
3388                         vnevent_pre_rename_dest(tvp, tdvp, tnm, ct);
3389 
3390                 /*
3391                  * Notify the target directory of the rename event
3392                  * if source and target directories are not the same.
3393                  */
3394                 if (sdvp != tdvp)
3395                         vnevent_pre_rename_dest_dir(tdvp, svp, tnm, ct);
3396 
3397                 if (svp != NULL)
3398                         vnevent_pre_rename_src(svp, sdvp, snm, ct);
3399         }
3400 
3401         if (svp != NULL)
3402                 VN_RELE(svp);
3403 
3404 retry_rename:
3405         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK);
3406         if (error)
3407                 goto unlock;
3408 
3409         if (ulp)
3410                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME,
3411                     trans_size = (int)TOP_RENAME_SIZE(sdp));
3412 
3413         if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3414                 tdvp = realvp;
3415 
3416         tdp = VTOI(tdvp);
3417 
3418         /*
3419          * We only allow renaming of attributes from ATTRDIR to ATTRDIR.
3420          */
3421         if ((tdp->i_mode & IFMT) != (sdp->i_mode & IFMT)) {
3422                 error = EINVAL;
3423                 goto unlock;
3424         }
3425 
3426         /*
3427          * Check accessibility of directory.
3428          */
3429         if (error = ufs_diraccess(sdp, IEXEC, cr))
3430                 goto unlock;
3431 
3432         /*
3433          * Look up inode of file we're supposed to rename.
3434          */
3435         gethrestime(&now);
3436         if (error = ufs_dirlook(sdp, snm, &sip, cr, 0, 0)) {
3437                 if (error == EAGAIN) {
3438                         if (ulp) {
3439                                 TRANS_END_CSYNC(ufsvfsp, error, issync,
3440                                     TOP_RENAME, trans_size);
3441                                 ufs_lockfs_end(ulp);
3442                         }
3443                         goto retry_rename;
3444                 }
3445 
3446                 goto unlock;
3447         }
3448 
3449         /*
3450          * Lock both the source and target directories (they may be
3451          * the same) to provide the atomicity semantics that were
3452          * previously provided by the per-file-system vfs_rename_lock.
3453          *
3454          * With vfs_rename_lock removed to allow simultaneous renames
3455          * within a file system, ufs_dircheckpath can deadlock while
3456          * traversing back to ensure that source is not a parent directory
3457          * of target parent directory. This is because we get into
3458          * ufs_dircheckpath with the sdp and tdp locks held as RW_WRITER.
3459          * If the tdp and sdp of the simultaneous renames happen to be
3460          * in the path of each other, it can lead to a deadlock. This
3461          * can be avoided by getting the locks as RW_READER here and then
3462          * upgrading to RW_WRITER after completing the ufs_dircheckpath.
3463          *
3464          * We hold the target directory's i_rwlock after calling
3465          * ufs_lockfs_begin but in many other operations (like ufs_readdir)
3466          * VOP_RWLOCK is explicitly called by the filesystem independent code
3467          * before calling the file system operation. In these cases the order
3468          * is reversed (i.e., i_rwlock is taken first and then ufs_lockfs_begin
3469          * is called). This is fine as long as ufs_lockfs_begin acts as a VOP
3470          * counter but with ufs_quiesce setting the SLOCK bit this becomes a
3471          * synchronizing object which might lead to a deadlock. So we use
3472          * rw_tryenter instead of rw_enter. If we fail to get this lock and
3473          * find that SLOCK bit is set, we call ufs_lockfs_end and restart the
3474          * operation.
3475          */
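             /*
              * In outline, the protocol below is:
              * 1) rw_tryenter() both directories' i_rwlocks as RW_READER,
              *    backing off (and possibly reversing the order) when a
              *    try fails;
              * 2) run ufs_dircheckpath() with only the reader locks held;
              * 3) rw_tryupgrade() both locks to RW_WRITER, dropping both
              *    and retrying from 1) if either upgrade fails.
              */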
3476 retry:
3477         first_lock = &tdp->i_rwlock;
3478         second_lock = &sdp->i_rwlock;
3479 retry_firstlock:
3480         if (!rw_tryenter(first_lock, RW_READER)) {
3481                 /*
3482                  * We didn't get the lock. Check if the SLOCK is set in the
3483                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3484                  * and wait for SLOCK to be cleared.
3485                  */
3486 
3487                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3488                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3489                             trans_size);
3490                         ufs_lockfs_end(ulp);
3491                         goto retry_rename;
3492 
3493                 } else {
3494                         /*
3495                          * SLOCK isn't set so this is a genuine synchronization
3496                          * case. Let's try again after giving them a breather.
3497                          */
3498                         delay(RETRY_LOCK_DELAY);
3499                         goto  retry_firstlock;
3500                 }
3501         }
3502         /*
3503          * Need to check whether tdp and sdp are the same.
3504          */
3505         if ((tdp != sdp) && (!rw_tryenter(second_lock, RW_READER))) {
3506                 /*
3507                  * We didn't get the lock. Check if the SLOCK is set in the
3508                  * ufsvfs. If yes, we might be in a deadlock. Safer to give up
3509                  * and wait for SLOCK to be cleared.
3510                  */
3511 
3512                 rw_exit(first_lock);
3513                 if (ulp && ULOCKFS_IS_SLOCK(ulp)) {
3514                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME,
3515                             trans_size);
3516                         ufs_lockfs_end(ulp);
3517                         goto retry_rename;
3518 
3519                 } else {
3520                         /*
3521                          * So we couldn't get the second level peer lock *and*
3522                          * the SLOCK bit isn't set.  We may be contending with
3523                          * someone who wants these locks the other way round.
3524                          * Reverse the locks in case there is heavy contention
3525                          * for the second level lock.
3526                          */
3527                         reverse_lock = first_lock;
3528                         first_lock = second_lock;
3529                         second_lock = reverse_lock;
3530                         ufs_rename_retry_cnt++;
3531                         goto  retry_firstlock;
3532                 }
3533         }
3534 
3535         if (sip == tdp) {
3536                 error = EINVAL;
3537                 goto errout;
3538         }
3539         /*
3540          * Make sure we can delete the source entry.  This requires
3541          * write permission on the containing directory.
3542          * Check for sticky directories.
3543          */
3544         rw_enter(&sdp->i_contents, RW_READER);
3545         rw_enter(&sip->i_contents, RW_READER);
3546         if ((error = ufs_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
3547             (error = ufs_sticky_remove_access(sdp, sip, cr)) != 0) {
3548                 rw_exit(&sip->i_contents);
3549                 rw_exit(&sdp->i_contents);
3550                 goto errout;
3551         }
3552 
3553         /*
3554          * If this is a rename of a directory and the parent is
3555          * different (".." must be changed), then the source
3556          * directory must not be in the directory hierarchy
3557          * above the target, as this would orphan everything
3558          * below the source directory.  Also the user must have
3559          * write permission in the source so as to be able to
3560          * change "..".
3561          */
3562         if ((((sip->i_mode & IFMT) == IFDIR) ||
3563             ((sip->i_mode & IFMT) == IFATTRDIR)) && sdp != tdp) {
3564                 ino_t   inum;
3565 
3566                 if (error = ufs_iaccess(sip, IWRITE, cr, 0)) {
3567                         rw_exit(&sip->i_contents);
3568                         rw_exit(&sdp->i_contents);
3569                         goto errout;
3570                 }
3571                 inum = sip->i_number;
3572                 rw_exit(&sip->i_contents);
3573                 rw_exit(&sdp->i_contents);
3574                 if ((error = ufs_dircheckpath(inum, tdp, sdp, cr))) {
3575                         /*
3576                          * If we got EAGAIN ufs_dircheckpath detected a
3577                          * potential deadlock and backed out. We need
3578                          * to retry the operation since sdp and tdp have
3579                          * to be released to avoid the deadlock.
3580                          */
3581                         if (error == EAGAIN) {
3582                                 rw_exit(&tdp->i_rwlock);
3583                                 if (tdp != sdp)
3584                                         rw_exit(&sdp->i_rwlock);
3585                                 delay(ufs_rename_backoff_delay);
3586                                 ufs_rename_dircheck_retry_cnt++;
3587                                 goto retry;
3588                         }
3589                         goto errout;
3590                 }
3591         } else {
3592                 rw_exit(&sip->i_contents);
3593                 rw_exit(&sdp->i_contents);
3594         }
3595 
3596 
3597         /*
3598          * Check for renaming '.' or '..' or alias of '.'
3599          */
3600         if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0 || sdp == sip) {
3601                 error = EINVAL;
3602                 goto errout;
3603         }
3604 
3605         /*
3606          * Simultaneous renames can deadlock in ufs_dircheckpath since it
3607          * tries to traverse back the file tree with both tdp and sdp held
3608          * as RW_WRITER. To avoid that we have to hold the tdp and sdp locks
3609          * as RW_READERS  till ufs_dircheckpath is done.
3610          * Now that ufs_dircheckpath is done with, we can upgrade the locks
3611          * to RW_WRITER.
3612          */
3613         if (!rw_tryupgrade(&tdp->i_rwlock)) {
3614                 /*
3615                  * The upgrade failed.  We have to give up the lock
3616                  * to avoid deadlocking with someone else who is
3617                  * waiting for the writer lock.  With the lock gone, we
3618                  * cannot be sure the checks done above will hold
3619                  * good when we eventually get them back as writer.
3620                  * So if we can't upgrade we drop the locks and retry
3621                  * everything again.
3622                  */
3623                 rw_exit(&tdp->i_rwlock);
3624                 if (tdp != sdp)
3625                         rw_exit(&sdp->i_rwlock);
3626                 delay(ufs_rename_backoff_delay);
3627                 ufs_rename_upgrade_retry_cnt++;
3628                 goto retry;
3629         }
3630         if (tdp != sdp) {
3631                 if (!rw_tryupgrade(&sdp->i_rwlock)) {
3632                         /*
3633                          * The upgrade failed.  We have to give up the lock
3634                          * to avoid deadlocking with someone else who is
3635                          * waiting for the writer lock.  With the lock gone, we
3636                          * cannot be sure the checks done above will hold
3637                          * good when we eventually get them back as writer.
3638                          * So if we can't upgrade we drop the locks and retry
3639                          * everything again.
3640                          */
3641                         rw_exit(&tdp->i_rwlock);
3642                         rw_exit(&sdp->i_rwlock);
3643                         delay(ufs_rename_backoff_delay);
3644                         ufs_rename_upgrade_retry_cnt++;
3645                         goto retry;
3646                 }
3647         }
3648 
3649         /*
3650          * Now that all the locks are held check to make sure another thread
3651          * didn't slip in and take out the sip.
3652          */
3653         slot.status = NONE;
3654         if ((sip->i_ctime.tv_usec * 1000) > now.tv_nsec ||
3655             sip->i_ctime.tv_sec > now.tv_sec) {
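                     /*
                      * sip's ctime is newer than the timestamp taken before
                      * the ufs_dirlook() above, so something has changed it;
                      * re-check that snm still exists in sdp before linking
                      * it into tdp.
                      */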
3656                 rw_enter(&sdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
3657                 rw_enter(&sdp->i_contents, RW_WRITER);
3658                 error = ufs_dircheckforname(sdp, snm, strlen(snm), &slot,
3659                     &ip, cr, 0);
3660                 rw_exit(&sdp->i_contents);
3661                 rw_exit(&sdp->i_ufsvfs->vfs_dqrwlock);
3662                 if (error) {
3663                         goto errout;
3664                 }
3665                 if (ip == NULL) {
3666                         error = ENOENT;
3667                         goto errout;
3668                 } else {
3669                         /*
3670                          * If the inode was found, we need to drop the v_count
3671                          * so as not to keep the filesystem from being
3672                          * unmounted at a later time.
3673                          */
3674                         VN_RELE(ITOV(ip));
3675                 }
3676 
3677                 /*
3678                  * Release the slot.fbp that has the page mapped and
3679                  * locked SE_SHARED, and could be used in
3680                  * ufs_direnter_lr() which needs to get the SE_EXCL lock
3681                  * on said page.
3682                  */
3683                 if (slot.fbp) {
3684                         fbrelse(slot.fbp, S_OTHER);
3685                         slot.fbp = NULL;
3686                 }
3687         }
3688 
3689         /*
3690          * Link source to the target.
3691          */
3692         if (error = ufs_direnter_lr(tdp, tnm, DE_RENAME, sdp, sip, cr)) {
3693                 /*
3694                  * ESAME isn't really an error; it indicates that the
3695                  * operation should not be done because the source and target
3696                  * are the same file, but that no error should be reported.
3697                  */
3698                 if (error == ESAME)
3699                         error = 0;
3700                 goto errout;
3701         }
3702 
3703         if (error == 0 && tvp != NULL)
3704                 vnevent_rename_dest(tvp, tdvp, tnm, ct);
3705 
3706         /*
3707          * Unlink the source.
3708          * Remove the source entry.  ufs_dirremove() checks that the entry
3709          * still reflects sip, and returns an error if it doesn't.
3710          * If the entry has changed just forget about it.  Release
3711          * the source inode.
3712          */
3713         if ((error = ufs_dirremove(sdp, snm, sip, (struct vnode *)0,
3714             DR_RENAME, cr)) == ENOENT)
3715                 error = 0;
3716 
3717         if (error == 0) {
3718                 vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
3719                 /*
3720                  * Notify the target directory of the rename event
3721                  * if source and target directories are not the same.
3722                  */
3723                 if (sdvp != tdvp)
3724                         vnevent_rename_dest_dir(tdvp, ct);
3725         }
3726 
3727 errout:
3728         if (slot.fbp)
3729                 fbrelse(slot.fbp, S_OTHER);
3730 
3731         rw_exit(&tdp->i_rwlock);
3732         if (sdp != tdp) {
3733                 rw_exit(&sdp->i_rwlock);
3734         }
3735 
3736 unlock:
3737         if (tvp != NULL)
3738                 VN_RELE(tvp);
3739         if (sip != NULL)
3740                 VN_RELE(ITOV(sip));
3741 
3742         if (ulp) {
3743                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size);
3744                 ufs_lockfs_end(ulp);
3745         }
3746 
3747         return (error);
3748 }
3749 
3750 /*ARGSUSED*/
3751 static int
3752 ufs_mkdir(struct vnode *dvp, char *dirname, struct vattr *vap,
3753     struct vnode **vpp, struct cred *cr, caller_context_t *ct, int flags,
3754     vsecattr_t *vsecp)
3755 {
3756         struct inode *ip;
3757         struct inode *xip;
3758         struct ufsvfs *ufsvfsp;
3759         struct ulockfs *ulp;
3760         int error;
3761         int issync;
3762         int trans_size;
3763         int indeadlock;
3764         int retry = 1;
3765 
3766         ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
3767 
3768         /*
3769          * Can't make directory in attr hidden dir
3770          */
3771         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
3772                 return (EINVAL);
3773 
3774 again:
3775         ip = VTOI(dvp);
3776         ufsvfsp = ip->i_ufsvfs;
3777         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
3778         if (error)
3779                 goto out;
3780         if (ulp)
3781                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR,
3782                     trans_size = (int)TOP_MKDIR_SIZE(ip));
3783 
3784         /*
3785          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3786          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3787          * possible, retries the operation.
3788          */
3789         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_MKDIR, retry);
3790         if (indeadlock)
3791                 goto again;
3792 
3793         error = ufs_direnter_cm(ip, dirname, DE_MKDIR, vap, &xip, cr,
3794             (retry ? IQUIET : 0));
3795         if (error == EAGAIN) {
3796                 if (ulp) {
3797                         TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_MKDIR,
3798                             trans_size);
3799                         ufs_lockfs_end(ulp);
3800                 }
3801                 goto again;
3802         }
3803 
3804         rw_exit(&ip->i_rwlock);
3805         if (error == 0) {
3806                 ip = xip;
3807                 *vpp = ITOV(ip);
3808         } else if (error == EEXIST)
3809                 VN_RELE(ITOV(xip));
3810 
3811         if (ulp) {
3812                 int terr = 0;
3813                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_MKDIR, trans_size);
3814                 ufs_lockfs_end(ulp);
3815                 if (error == 0)
3816                         error = terr;
3817         }
3818 out:
3819         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
3820                 ufs_delete_drain_wait(ufsvfsp, 1);
3821                 retry = 0;
3822                 goto again;
3823         }
3824 
3825         return (error);
3826 }
3827 
3828 /*ARGSUSED*/
3829 static int
3830 ufs_rmdir(struct vnode *vp, char *nm, struct vnode *cdir, struct cred *cr,
3831     caller_context_t *ct, int flags)
3832 {
3833         struct inode *ip = VTOI(vp);
3834         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
3835         struct ulockfs *ulp;
3836         vnode_t *rmvp = NULL;   /* Vnode of removed directory */
3837         int error;
3838         int issync;
3839         int trans_size;
3840         int indeadlock;
3841 
3842         /*
3843          * don't let the delete queue get too long
3844          */
3845         if (ufsvfsp == NULL) {
3846                 error = EIO;
3847                 goto out;
3848         }
3849         if (ufsvfsp->vfs_delete.uq_ne > ufs_idle_max)
3850                 ufs_delete_drain(vp->v_vfsp, 1, 1);
3851 
3852         error = ufs_eventlookup(vp, nm, cr, &rmvp);
3853         if (rmvp != NULL) {
3854                 /* Only send the event if there were no errors */
3855                 if (error == 0)
3856                         vnevent_rmdir(rmvp, vp, nm, ct);
3857                 VN_RELE(rmvp);
3858         }
3859 
3860 retry_rmdir:
3861         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RMDIR_MASK);
3862         if (error)
3863                 goto out;
3864 
3865         if (ulp)
3866                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RMDIR,
3867                     trans_size = TOP_RMDIR_SIZE);
3868 
3869         /*
3870          * ufs_tryirwlock_trans uses rw_tryenter and checks for SLOCK
3871          * to avoid i_rwlock, ufs_lockfs_begin deadlock. If deadlock
3872          * possible, retries the operation.
3873          */
3874         ufs_tryirwlock_trans(&ip->i_rwlock, RW_WRITER, TOP_RMDIR, retry);
3875         if (indeadlock)
3876                 goto retry_rmdir;
3877         error = ufs_dirremove(ip, nm, (struct inode *)0, cdir, DR_RMDIR, cr);
3878 
3879         rw_exit(&ip->i_rwlock);
3880 
3881         if (ulp) {
3882                 TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RMDIR,
3883                     trans_size);
3884                 ufs_lockfs_end(ulp);
3885         }
3886 
3887 out:
3888         return (error);
3889 }
3890 
3891 /* ARGSUSED */
3892 static int
3893 ufs_readdir(struct vnode *vp, struct uio *uiop, struct cred *cr, int *eofp,
3894     caller_context_t *ct, int flags)
3895 {
3896         struct iovec *iovp;
3897         struct inode *ip;
3898         struct direct *idp;
3899         struct dirent64 *odp;
3900         struct fbuf *fbp;
3901         struct ufsvfs *ufsvfsp;
3902         struct ulockfs *ulp;
3903         caddr_t outbuf;
3904         size_t bufsize;
3905         uint_t offset;
3906         uint_t bytes_wanted, total_bytes_wanted;
3907         int incount = 0;
3908         int outcount = 0;
3909         int error;
3910 
3911         ip = VTOI(vp);
3912         ASSERT(RW_READ_HELD(&ip->i_rwlock));
3913 
3914         if (uiop->uio_loffset >= MAXOFF32_T) {
3915                 if (eofp)
3916                         *eofp = 1;
3917                 return (0);
3918         }
3919 
3920         /*
3921          * Check if we have been called with a valid iov_len
3922          * and bail out if not, otherwise we could loop
3923          * forever further down.
3924          */
3925         if (uiop->uio_iov->iov_len <= 0) {
3926                 error = EINVAL;
3927                 goto out;
3928         }
3929 
3930         /*
3931          * Large Files: When we come here we are guaranteed that
3932          * uio_offset can be used safely. The high word is zero.
3933          */
3934 
3935         ufsvfsp = ip->i_ufsvfs;
3936         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_READDIR_MASK);
3937         if (error)
3938                 goto out;
3939 
3940         iovp = uiop->uio_iov;
3941         total_bytes_wanted = iovp->iov_len;
3942 
3943         /* Large Files: directory files should not be "large" */
3944 
3945         ASSERT(ip->i_size <= MAXOFF32_T);
3946 
3947         /* Force offset to be valid (to guard against bogus lseek() values) */
3948         offset = (uint_t)uiop->uio_offset & ~(DIRBLKSIZ - 1);
3949 
3950         /* Quit if at end of file or link count of zero (posix) */
3951         if (offset >= (uint_t)ip->i_size || ip->i_nlink <= 0) {
3952                 if (eofp)
3953                         *eofp = 1;
3954                 error = 0;
3955                 goto unlock;
3956         }
3957 
3958         /*
3959          * Get space to change directory entries into fs independent format.
3960          * Do fast alloc for the most commonly used request size (filesystem
3961          * block size).
3962          */
3963         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1) {
3964                 bufsize = total_bytes_wanted;
3965                 outbuf = kmem_alloc(bufsize, KM_SLEEP);
3966                 odp = (struct dirent64 *)outbuf;
3967         } else {
3968                 bufsize = total_bytes_wanted;
3969                 odp = (struct dirent64 *)iovp->iov_base;
3970         }
3971 
3972 nextblk:
3973         bytes_wanted = total_bytes_wanted;
3974 
3975         /* Truncate request to file size */
3976         if (offset + bytes_wanted > (int)ip->i_size)
3977                 bytes_wanted = (int)(ip->i_size - offset);
3978 
3979         /* Comply with MAXBSIZE boundary restrictions of fbread() */
3980         if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
3981                 bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
3982 
3983         /*
3984          * Read in the next chunk.
3985          * We are still holding the i_rwlock.
3986          */
3987         error = fbread(vp, (offset_t)offset, bytes_wanted, S_OTHER, &fbp);
3988 
3989         if (error)
3990                 goto update_inode;
3991         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) && (ip->i_fs->fs_ronly == 0) &&
3992             (!ufsvfsp->vfs_noatime)) {
3993                 ip->i_flag |= IACC;
3994         }
3995         incount = 0;
3996         idp = (struct direct *)fbp->fb_addr;
3997         if (idp->d_ino == 0 && idp->d_reclen == 0 && idp->d_namlen == 0) {
3998                 cmn_err(CE_WARN, "ufs_readdir: bad dir, inumber = %llu, "
3999                     "fs = %s\n",
4000                     (u_longlong_t)ip->i_number, ufsvfsp->vfs_fs->fs_fsmnt);
4001                 fbrelse(fbp, S_OTHER);
4002                 error = ENXIO;
4003                 goto update_inode;
4004         }
4005         /* Transform to file-system independent format */
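        /*
         * Each on-disk UFS entry (struct direct) below is converted into a
         * dirent64 record: the output record length comes from
         * DIRENT64_RECLEN() (which rounds the record up for alignment),
         * and d_off is set to the offset of the next on-disk entry so a
         * later readdir can resume from that point.
         */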
4006         while (incount < bytes_wanted) {
4007                 /*
4008                  * If the current directory entry is mangled, then skip
4009                  * to the next block.  It would be nice to set the FSBAD
4010                  * flag in the super-block so that a fsck is forced on
4011                  * next reboot, but locking is a problem.
4012                  */
4013                 if (idp->d_reclen & 0x3) {
4014                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4015                         break;
4016                 }
4017 
4018                 /* Skip to requested offset and skip empty entries */
4019                 if (idp->d_ino != 0 && offset >= (uint_t)uiop->uio_offset) {
4020                         ushort_t this_reclen =
4021                             DIRENT64_RECLEN(idp->d_namlen);
4022                         /* Buffer too small for any entries */
4023                         if (!outcount && this_reclen > bufsize) {
4024                                 fbrelse(fbp, S_OTHER);
4025                                 error = EINVAL;
4026                                 goto update_inode;
4027                         }
4028                         /* If would overrun the buffer, quit */
4029                         if (outcount + this_reclen > bufsize) {
4030                                 break;
4031                         }
4032                         /* Take this entry */
4033                         odp->d_ino = (ino64_t)idp->d_ino;
4034                         odp->d_reclen = (ushort_t)this_reclen;
4035                         odp->d_off = (offset_t)(offset + idp->d_reclen);
4036 
4037                         /* use strncpy(9f) to zero out uninitialized bytes */
4038 
4039                         ASSERT(strlen(idp->d_name) + 1 <=
4040                             DIRENT64_NAMELEN(this_reclen));
4041                         (void) strncpy(odp->d_name, idp->d_name,
4042                             DIRENT64_NAMELEN(this_reclen));
4043                         outcount += odp->d_reclen;
4044                         odp = (struct dirent64 *)
4045                             ((intptr_t)odp + odp->d_reclen);
4046                         ASSERT(outcount <= bufsize);
4047                 }
4048                 if (idp->d_reclen) {
4049                         incount += idp->d_reclen;
4050                         offset += idp->d_reclen;
4051                         idp = (struct direct *)((intptr_t)idp + idp->d_reclen);
4052                 } else {
4053                         offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
4054                         break;
4055                 }
4056         }
4057         /* Release the chunk */
4058         fbrelse(fbp, S_OTHER);
4059 
4060         /* Read a whole block but got no entries; read another if not at EOF */
4061 
4062         /*
4063          * Large Files: casting i_size to int here is not a problem
4064          * because directory sizes are always less than MAXOFF32_T.
4065          * See assertion above.
4066          */
4067 
4068         if (offset < (int)ip->i_size && !outcount)
4069                 goto nextblk;
4070 
4071         /* Copy out the entry data */
4072         if (uiop->uio_segflg == UIO_SYSSPACE && uiop->uio_iovcnt == 1) {
4073                 iovp->iov_base += outcount;
4074                 iovp->iov_len -= outcount;
4075                 uiop->uio_resid -= outcount;
4076                 uiop->uio_offset = offset;
4077         } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ,
4078             uiop)) == 0)
4079                 uiop->uio_offset = offset;
4080 update_inode:
4081         ITIMES(ip);
4082         if (uiop->uio_segflg != UIO_SYSSPACE || uiop->uio_iovcnt != 1)
4083                 kmem_free(outbuf, bufsize);
4084 
4085         if (eofp && error == 0)
4086                 *eofp = (uiop->uio_offset >= (int)ip->i_size);
4087 unlock:
4088         if (ulp) {
4089                 ufs_lockfs_end(ulp);
4090         }
4091 out:
4092         return (error);
4093 }
4094 
4095 /*ARGSUSED*/
4096 static int
4097 ufs_symlink(struct vnode *dvp, char *linkname, struct vattr *vap, char *target,
4098     struct cred *cr, caller_context_t *ct, int flags)
4099 {
4100         struct inode *ip, *dip = VTOI(dvp);
4101         struct ufsvfs *ufsvfsp = dip->i_ufsvfs;
4102         struct ulockfs *ulp;
4103         int error;
4104         int issync;
4105         int trans_size;
4106         int residual;
4107         int ioflag;
4108         int retry = 1;
4109 
4110         /*
4111          * No symlinks in attrdirs at this time
4112          */
4113         if ((VTOI(dvp)->i_mode & IFMT) == IFATTRDIR)
4114                 return (EINVAL);
4115 
4116 again:
4117         ip = (struct inode *)NULL;
4118         vap->va_type = VLNK;
4119         vap->va_rdev = 0;
4120 
4121         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SYMLINK_MASK);
4122         if (error)
4123                 goto out;
4124 
4125         if (ulp)
4126                 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_SYMLINK,
4127                     trans_size = (int)TOP_SYMLINK_SIZE(dip));
4128 
4129         /*
4130          * We must create the inode before the directory entry, to avoid
4131          * racing with readlink().  ufs_dirmakeinode requires that we
4132          * hold the quota lock as reader, and directory locks as writer.
4133          */
4134 
4135         rw_enter(&dip->i_rwlock, RW_WRITER);
4136         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4137         rw_enter(&dip->i_contents, RW_WRITER);
4138 
4139         /*
4140          * Suppress any out-of-inodes messages if we will retry on
4141          * ENOSPC.
4142          */
4143         if (retry)
4144                 dip->i_flag |= IQUIET;
4145 
4146         error = ufs_dirmakeinode(dip, &ip, vap, DE_SYMLINK, cr);
4147 
4148         dip->i_flag &= ~IQUIET;
4149 
4150         rw_exit(&dip->i_contents);
4151         rw_exit(&ufsvfsp->vfs_dqrwlock);
4152         rw_exit(&dip->i_rwlock);
4153 
4154         if (error)
4155                 goto unlock;
4156 
4157         /*
4158          * OK.  The inode has been created.  Write out the data of the
4159          * symbolic link.  Since symbolic links are metadata, and should
4160          * remain consistent across a system crash, we need to force the
4161          * data out synchronously.
4162          *
4163          * (This is a change from the semantics in earlier releases, which
4164          * only created symbolic links synchronously if the semi-documented
4165          * 'syncdir' option was set, or if we were being invoked by the NFS
4166          * server, which requires symbolic links to be created synchronously.)
4167          *
4168          * We need to pass in a pointer for the residual length; otherwise
4169          * ufs_rdwri() will always return EIO if it can't write the data,
4170          * even if the error was really ENOSPC or EDQUOT.
4171          */
4172 
4173         ioflag = FWRITE | FDSYNC;
4174         residual = 0;
4175 
4176         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4177         rw_enter(&ip->i_contents, RW_WRITER);
4178 
4179         /*
4180          * Suppress file system full messages if we will retry
4181          */
4182         if (retry)
4183                 ip->i_flag |= IQUIET;
4184 
4185         error = ufs_rdwri(UIO_WRITE, ioflag, ip, target, strlen(target),
4186             (offset_t)0, UIO_SYSSPACE, &residual, cr);
4187 
4188         ip->i_flag &= ~IQUIET;
4189 
4190         if (error) {
4191                 rw_exit(&ip->i_contents);
4192                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4193                 goto remove;
4194         }
4195 
4196         /*
4197          * If the link's data is small enough, we can cache it in the inode.
4198          * This is a "fast symbolic link".  We don't use the first direct
4199          * block because that's actually used to point at the symbolic link's
4200          * contents on disk; but we know that none of the other direct or
4201          * indirect blocks can be used because symbolic links are restricted
4202          * to be smaller than a file system block.
4203          */
4204 
4205         ASSERT(MAXPATHLEN <= VBSIZE(ITOV(ip)));
4206 
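        /*
         * A fast symlink keeps the target bytes in the spare block pointer
         * slots of the inode itself (i_db[1..] and i_ib[]); the
         * IFASTSYMLNK flag lets readlink() copy the target straight out of
         * the inode without reading the first data block.
         */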
4207         if (ip->i_size > 0 && ip->i_size <= FSL_SIZE) {
4208                 if (kcopy(target, &ip->i_db[1], ip->i_size) == 0) {
4209                         ip->i_flag |= IFASTSYMLNK;
4210                 } else {
4211                         int i;
4212                         /* error, clear garbage left behind */
4213                         for (i = 1; i < NDADDR; i++)
4214                                 ip->i_db[i] = 0;
4215                         for (i = 0; i < NIADDR; i++)
4216                                 ip->i_ib[i] = 0;
4217                 }
4218         }
4219 
4220         rw_exit(&ip->i_contents);
4221         rw_exit(&ufsvfsp->vfs_dqrwlock);
4222 
4223         /*
4224          * OK.  We've successfully created the symbolic link.  All that
4225          * remains is to insert it into the appropriate directory.
4226          */
4227 
4228         rw_enter(&dip->i_rwlock, RW_WRITER);
4229         error = ufs_direnter_lr(dip, linkname, DE_SYMLINK, NULL, ip, cr);
4230         rw_exit(&dip->i_rwlock);
4231 
4232         /*
4233          * Fall through into remove-on-error code.  We're either done, or we
4234          * need to remove the inode (if we couldn't insert it).
4235          */
4236 
4237 remove:
4238         if (error && (ip != NULL)) {
4239                 rw_enter(&ip->i_contents, RW_WRITER);
4240                 ip->i_nlink--;
4241                 ip->i_flag |= ICHG;
4242                 ip->i_seq++;
4243                 ufs_setreclaim(ip);
4244                 rw_exit(&ip->i_contents);
4245         }
4246 
4247 unlock:
4248         if (ip != NULL)
4249                 VN_RELE(ITOV(ip));
4250 
4251         if (ulp) {
4252                 int terr = 0;
4253 
4254                 TRANS_END_CSYNC(ufsvfsp, terr, issync, TOP_SYMLINK,
4255                     trans_size);
4256                 ufs_lockfs_end(ulp);
4257                 if (error == 0)
4258                         error = terr;
4259         }
4260 
4261         /*
4262          * We may have failed due to lack of an inode or of a block to
4263          * store the target in.  Try flushing the delete queue to free up
4264          * logically-available resources and try again.
4265          */
4266         if ((error == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
4267                 ufs_delete_drain_wait(ufsvfsp, 1);
4268                 retry = 0;
4269                 goto again;
4270         }
4271 
4272 out:
4273         return (error);
4274 }
4275 
4276 /*
4277  * UFS-specific routine used to do UFS I/O; the caller must hold i_contents.
4278  */
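/*
 * Illustrative call (hypothetical caller and buffer, not taken verbatim
 * from this file): write a target buffer synchronously at offset 0 while
 * holding ip->i_contents, much as ufs_symlink() does above.
 *
 *	int resid;
 *	int err = ufs_rdwri(UIO_WRITE, FWRITE | FDSYNC, ip, buf,
 *	    strlen(buf), (offset_t)0, UIO_SYSSPACE, &resid, cr);
 */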
4279 int
4280 ufs_rdwri(enum uio_rw rw, int ioflag, struct inode *ip, caddr_t base,
4281     ssize_t len, offset_t offset, enum uio_seg seg, int *aresid,
4282     struct cred *cr)
4283 {
4284         struct uio auio;
4285         struct iovec aiov;
4286         int error;
4287 
4288         ASSERT(RW_LOCK_HELD(&ip->i_contents));
4289 
4290         bzero((caddr_t)&auio, sizeof (uio_t));
4291         bzero((caddr_t)&aiov, sizeof (iovec_t));
4292 
4293         aiov.iov_base = base;
4294         aiov.iov_len = len;
4295         auio.uio_iov = &aiov;
4296         auio.uio_iovcnt = 1;
4297         auio.uio_loffset = offset;
4298         auio.uio_segflg = (short)seg;
4299         auio.uio_resid = len;
4300 
4301         if (rw == UIO_WRITE) {
4302                 auio.uio_fmode = FWRITE;
4303                 auio.uio_extflg = UIO_COPY_DEFAULT;
4304                 auio.uio_llimit = curproc->p_fsz_ctl;
4305                 error = wrip(ip, &auio, ioflag, cr);
4306         } else {
4307                 auio.uio_fmode = FREAD;
4308                 auio.uio_extflg = UIO_COPY_CACHED;
4309                 auio.uio_llimit = MAXOFFSET_T;
4310                 error = rdip(ip, &auio, ioflag, cr);
4311         }
4312 
4313         if (aresid) {
4314                 *aresid = auio.uio_resid;
4315         } else if (auio.uio_resid) {
4316                 error = EIO;
4317         }
4318         return (error);
4319 }
4320 
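/*
 * Build a file identifier for this vnode (used for NFS file handles)
 * from the inode number and generation count.  If the caller's fid
 * buffer is too small, report the required length in fid_len and
 * return ENOSPC.
 */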
4321 /*ARGSUSED*/
4322 static int
4323 ufs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
4324 {
4325         struct ufid *ufid;
4326         struct inode *ip = VTOI(vp);
4327 
4328         if (ip->i_ufsvfs == NULL)
4329                 return (EIO);
4330 
4331         if (fidp->fid_len < (sizeof (struct ufid) - sizeof (ushort_t))) {
4332                 fidp->fid_len = sizeof (struct ufid) - sizeof (ushort_t);
4333                 return (ENOSPC);
4334         }
4335 
4336         ufid = (struct ufid *)fidp;
4337         bzero((char *)ufid, sizeof (struct ufid));
4338         ufid->ufid_len = sizeof (struct ufid) - sizeof (ushort_t);
4339         ufid->ufid_ino = ip->i_number;
4340         ufid->ufid_gen = ip->i_gen;
4341 
4342         return (0);
4343 }
4344 
4345 /* ARGSUSED2 */
4346 static int
4347 ufs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4348 {
4349         struct inode    *ip = VTOI(vp);
4350         struct ufsvfs   *ufsvfsp;
4351         int             forcedirectio;
4352 
4353         /*
4354          * Read case is easy.
4355          */
4356         if (!write_lock) {
4357                 rw_enter(&ip->i_rwlock, RW_READER);
4358                 return (V_WRITELOCK_FALSE);
4359         }
4360 
4361         /*
4362          * Caller has requested a writer lock, but that inhibits any
4363          * concurrency in the VOPs that follow. Acquire the lock shared
4364          * and defer exclusive access until it is known to be needed in
4365          * other VOP handlers. Some cases can be determined here.
4366          */
4367 
4368         /*
4369          * If directio is not set, there is no chance of concurrency,
4370          * so just acquire the lock exclusive. Beware of a forced
4371          * unmount before looking at the mount option.
4372          */
4373         ufsvfsp = ip->i_ufsvfs;
4374         forcedirectio = ufsvfsp ? ufsvfsp->vfs_forcedirectio : 0;
4375         if (!(ip->i_flag & IDIRECTIO || forcedirectio) ||
4376             !ufs_allow_shared_writes) {
4377                 rw_enter(&ip->i_rwlock, RW_WRITER);
4378                 return (V_WRITELOCK_TRUE);
4379         }
4380 
4381         /*
4382          * Mandatory locking forces acquiring i_rwlock exclusive.
4383          */
4384         if (MANDLOCK(vp, ip->i_mode)) {
4385                 rw_enter(&ip->i_rwlock, RW_WRITER);
4386                 return (V_WRITELOCK_TRUE);
4387         }
4388 
4389         /*
4390          * Acquire the lock shared in case a concurrent write follows.
4391          * Mandatory locking could have become enabled before the lock
4392          * was acquired. Re-check and upgrade if needed.
4393          */
4394         rw_enter(&ip->i_rwlock, RW_READER);
4395         if (MANDLOCK(vp, ip->i_mode)) {
4396                 rw_exit(&ip->i_rwlock);
4397                 rw_enter(&ip->i_rwlock, RW_WRITER);
4398                 return (V_WRITELOCK_TRUE);
4399         }
4400         return (V_WRITELOCK_FALSE);
4401 }
4402 
4403 /*ARGSUSED*/
4404 static void
4405 ufs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
4406 {
4407         struct inode    *ip = VTOI(vp);
4408 
4409         rw_exit(&ip->i_rwlock);
4410 }
4411 
4412 /* ARGSUSED */
4413 static int
4414 ufs_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
4415 {
4416         return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4417 }
4418 
4419 /* ARGSUSED */
4420 static int
4421 ufs_frlock(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4422     offset_t offset, struct flk_callback *flk_cbp, struct cred *cr,
4423     caller_context_t *ct)
4424 {
4425         struct inode *ip = VTOI(vp);
4426 
4427         if (ip->i_ufsvfs == NULL)
4428                 return (EIO);
4429 
4430         /*
4431          * If file is being mapped, disallow frlock.
4432          * XXX I am not holding tlock while checking i_mapcnt because the
4433          * current locking strategy drops all locks before calling fs_frlock.
4434          * So, mapcnt could change before we enter fs_frlock, making it
4435          * meaningless to have held tlock in the first place.
4436          */
4437         if (ip->i_mapcnt > 0 && MANDLOCK(vp, ip->i_mode))
4438                 return (EAGAIN);
4439         return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4440 }
4441 
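/*
 * Manipulate allocated space for a file: F_FREESP releases the byte
 * range described by bfp via ufs_freesp() (and posts a truncate vnevent
 * when the range starts at offset 0), F_ALLOCSP preallocates the range
 * via ufs_allocsp(), and any other command is rejected with EINVAL.
 */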
4442 /* ARGSUSED */
4443 static int
4444 ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
4445     offset_t offset, cred_t *cr, caller_context_t *ct)
4446 {
4447         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
4448         struct ulockfs *ulp;
4449         int error;
4450 
4451         if ((error = convoff(vp, bfp, 0, offset)) == 0) {
4452                 if (cmd == F_FREESP) {
4453                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4454                             ULOCKFS_SPACE_MASK);
4455                         if (error)
4456                                 return (error);
4457                         error = ufs_freesp(vp, bfp, flag, cr);
4458 
4459                         if (error == 0 && bfp->l_start == 0)
4460                                 vnevent_truncate(vp, ct);
4461                 } else if (cmd == F_ALLOCSP) {
4462                         error = ufs_lockfs_begin(ufsvfsp, &ulp,
4463                             ULOCKFS_FALLOCATE_MASK);
4464                         if (error)
4465                                 return (error);
4466                         error = ufs_allocsp(vp, bfp, cr);
4467                 } else
4468                         return (EINVAL); /* Command not handled here */
4469 
4470                 if (ulp)
4471                         ufs_lockfs_end(ulp);
4472 
4473         }
4474         return (error);
4475 }
4476 
4477 /*
4478  * Used to determine if read ahead should be done. Also used
4479  * to determine when write back occurs.
4480  */
4481 #define CLUSTSZ(ip)             ((ip)->i_ufsvfs->vfs_ioclustsz)
4482 
4483 /*
4484  * A faster version of ufs_getpage.
4485  *
4486  * We optimize by inlining the pvn_getpages iterator, eliminating
4487  * calls to bmap_read if the file doesn't have UFS holes, and avoiding
4488  * the overhead of page_exists().
4489  *
4490  * When a file has UFS holes and ufs_getpage is called with S_READ,
4491  * we set *protp to PROT_READ to avoid calling bmap_read. This approach
4492  * penalizes performance when a file with UFS holes is faulted
4493  * first in S_READ mode and then in S_WRITE mode, since we will take
4494  * two MMU faults in that case.
4495  *
4496  * XXX - the inode fields which control the sequential mode are not
4497  *       protected by any mutex. The read ahead will act wild if
4498  *       multiple processes will access the file concurrently and
4499  *       some of them in sequential mode. One particularly bad case
4500  *       is when another thread changes the value of i_nextrio between
4501  *       the time this thread tests the i_nextrio value and then reads it
4502  *       again to use it as the offset for the read ahead.
4503  */
4504 /*ARGSUSED*/
4505 static int
4506 ufs_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
4507     page_t *plarr[], size_t plsz, struct seg *seg, caddr_t addr,
4508     enum seg_rw rw, struct cred *cr, caller_context_t *ct)
4509 {
4510         u_offset_t      uoff = (u_offset_t)off; /* type conversion */
4511         u_offset_t      pgoff;
4512         u_offset_t      eoff;
4513         struct inode    *ip = VTOI(vp);
4514         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
4515         struct fs       *fs;
4516         struct ulockfs  *ulp;
4517         page_t          **pl;
4518         caddr_t         pgaddr;
4519         krw_t           rwtype;
4520         int             err;
4521         int             has_holes;
4522         int             beyond_eof;
4523         int             seqmode;
4524         int             pgsize = PAGESIZE;
4525         int             dolock;
4526         int             do_qlock;
4527         int             trans_size;
4528 
4529         ASSERT((uoff & PAGEOFFSET) == 0);
4530 
4531         if (protp)
4532                 *protp = PROT_ALL;
4533 
4534         /*
4535          * Obey the lockfs protocol
4536          */
4537         err = ufs_lockfs_begin_getpage(ufsvfsp, &ulp, seg,
4538             rw == S_READ || rw == S_EXEC, protp);
4539         if (err)
4540                 goto out;
4541 
4542         fs = ufsvfsp->vfs_fs;
4543 
4544         if (ulp && (rw == S_CREATE || rw == S_WRITE) &&
4545             !(vp->v_flag & VISSWAP)) {
4546                 /*
4547                  * Try to start a transaction; return early if blocking is
4548                  * expected to occur and the address space is not the
4549                  * kernel address space.
4550                  */
4551                 trans_size = TOP_GETPAGE_SIZE(ip);
4552                 if (seg->s_as != &kas) {
4553                         TRANS_TRY_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE,
4554                             trans_size, err)
4555                         if (err == EWOULDBLOCK) {
4556                                 /*
4557                                  * Use EDEADLK here because the VM code
4558                                  * can normally never see this error.
4559                                  */
4560                                 err = EDEADLK;
4561                                 ufs_lockfs_end(ulp);
4562                                 goto out;
4563                         }
4564                 } else {
4565                         TRANS_BEGIN_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4566                 }
4567         }
4568 
4569         if (vp->v_flag & VNOMAP) {
4570                 err = ENOSYS;
4571                 goto unlock;
4572         }
4573 
4574         seqmode = ip->i_nextr == uoff && rw != S_CREATE;
4575 
4576         rwtype = RW_READER;             /* start as a reader */
4577         dolock = (rw_owner(&ip->i_contents) != curthread);
4578         /*
4579          * If this thread owns the lock, i.e., this thread grabbed it
4580          * as writer somewhere above, then we don't need to grab the
4581          * lock as reader in this routine.
4582          */
4583         do_qlock = (rw_owner(&ufsvfsp->vfs_dqrwlock) != curthread);
4584 
4585 retrylock:
4586         if (dolock) {
4587                 /*
4588                  * Grab the quota lock if we need to call
4589                  * bmap_write() below (with i_contents as writer).
4590                  */
4591                 if (do_qlock && rwtype == RW_WRITER)
4592                         rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
4593                 rw_enter(&ip->i_contents, rwtype);
4594         }
4595 
4596         /*
4597          * We may be getting called as a side effect of a bmap using
4598          * fbread() when the blocks might be being allocated and the
4599          * size has not yet been up'ed.  In this case we want to be
4600          * able to return zero pages if we get back UFS_HOLE from
4601          * calling bmap for a non write case here.  We also might have
4602          * to read some frags from the disk into a page if we are
4603          * extending the number of frags for a given lbn in bmap().
4604          * Large Files: The read of i_size here is atomic because
4605          * i_contents is held here. If dolock is zero, the lock
4606          * is held in bmap routines.
4607          */
4608         beyond_eof = uoff + len >
4609             P2ROUNDUP_TYPED(ip->i_size, PAGESIZE, u_offset_t);
4610         if (beyond_eof && seg != segkmap) {
4611                 if (dolock) {
4612                         rw_exit(&ip->i_contents);
4613                         if (do_qlock && rwtype == RW_WRITER)
4614                                 rw_exit(&ufsvfsp->vfs_dqrwlock);
4615                 }
4616                 err = EFAULT;
4617                 goto unlock;
4618         }
4619 
4620         /*
4621          * Must hold i_contents lock throughout the call to pvn_getpages
4622          * since locked pages are returned from each call to ufs_getapage.
4623          * Must *not* return locked pages and then try for contents lock
4624          * due to lock ordering requirements (inode > page)
4625          */
4626 
4627         has_holes = bmap_has_holes(ip);
4628 
4629         if ((rw == S_WRITE || rw == S_CREATE) && has_holes && !beyond_eof) {
4630                 int     blk_size;
4631                 u_offset_t offset;
4632 
4633                 /*
4634                  * We must acquire the RW_WRITER lock in order to
4635                  * call bmap_write().
4636                  */
4637                 if (dolock && rwtype == RW_READER) {
4638                         rwtype = RW_WRITER;
4639 
4640                         /*
4641                          * Grab the quota lock before
4642                          * upgrading i_contents, but if we can't grab it
4643                          * don't wait here due to lock order:
4644                          * vfs_dqrwlock > i_contents.
4645                          */
4646                         if (do_qlock &&
4647                             rw_tryenter(&ufsvfsp->vfs_dqrwlock, RW_READER)
4648                             == 0) {
4649                                 rw_exit(&ip->i_contents);
4650                                 goto retrylock;
4651                         }
4652                         if (!rw_tryupgrade(&ip->i_contents)) {
4653                                 rw_exit(&ip->i_contents);
4654                                 if (do_qlock)
4655                                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4656                                 goto retrylock;
4657                         }
4658                 }
4659 
4660                 /*
4661                  * May be allocating disk blocks for holes here as
4662                  * a result of mmap faults. write(2) does the bmap_write
4663                  * in rdip/wrip, not here. We are not dealing with frags
4664                  * in this case.
4665                  */
4666                 /*
4667                  * Large Files: We cast fs_bmask field to offset_t
4668                  * just as we do for MAXBMASK because uoff is a 64-bit
4669                  * data type. fs_bmask will still be a 32-bit type
4670                  * as we cannot change any ondisk data structures.
4671                  */
4672 
4673                 offset = uoff & (offset_t)fs->fs_bmask;
4674                 while (offset < uoff + len) {
4675                         blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
4676                         err = bmap_write(ip, offset, blk_size,
4677                             BI_NORMAL, NULL, cr);
4678                         if (ip->i_flag & (ICHG|IUPD))
4679                                 ip->i_seq++;
4680                         if (err)
4681                                 goto update_inode;
4682                         offset += blk_size; /* XXX - make this contig */
4683                 }
4684         }
4685 
4686         /*
4687          * Can be a reader from now on.
4688          */
4689         if (dolock && rwtype == RW_WRITER) {
4690                 rw_downgrade(&ip->i_contents);
4691                 /*
4692                  * We can release vfs_dqrwlock early so do it, but make
4693                  * sure we don't try to release it again at the bottom.
4694                  */
4695                 if (do_qlock) {
4696                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4697                         do_qlock = 0;
4698                 }
4699         }
4700 
4701         /*
4702          * We remove PROT_WRITE in cases when the file has UFS holes
4703          * because we don't want to call bmap_read() to check each
4704          * page if it is backed with a disk block.
4705          */
4706         if (protp && has_holes && rw != S_WRITE && rw != S_CREATE)
4707                 *protp &= ~PROT_WRITE;
4708 
4709         err = 0;
4710 
4711         /*
4712          * The loop looks up pages in the range [off, off + len).
4713          * For each page, we first check if we should initiate an asynchronous
4714          * read ahead before we call page_lookup (we may sleep in page_lookup
4715          * for a previously initiated disk read).
4716          */
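        /*
         * Three cases are handled for each page: a NULL plarr means this
         * is a faultahead request, so read ahead is simply scheduled; a
         * page found by page_lookup() is returned locked; otherwise
         * ufs_getpage_miss() creates the page or reads it from disk.
         */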
4717         eoff = (uoff + len);
4718         for (pgoff = uoff, pgaddr = addr, pl = plarr;
4719             pgoff < eoff; /* empty */) {
4720                 page_t  *pp;
4721                 u_offset_t      nextrio;
4722                 se_t    se;
4723                 int retval;
4724 
4725                 se = ((rw == S_CREATE || rw == S_OTHER) ? SE_EXCL : SE_SHARED);
4726 
4727                 /* Handle async getpage (faultahead) */
4728                 if (plarr == NULL) {
4729                         ip->i_nextrio = pgoff;
4730                         (void) ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4731                         pgoff += pgsize;
4732                         pgaddr += pgsize;
4733                         continue;
4734                 }
4735                 /*
4736                  * Check if we should initiate read ahead of next cluster.
4737                  * We call page_exists only when we need to confirm that
4738                  * we have the current page before we initiate the read ahead.
4739                  */
4740                 nextrio = ip->i_nextrio;
4741                 if (seqmode &&
4742                     pgoff + CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
4743                     nextrio < ip->i_size && page_exists(vp, pgoff)) {
4744                         retval = ufs_getpage_ra(vp, pgoff, seg, pgaddr);
4745                         /*
4746                          * We always read ahead the next cluster of data
4747                          * starting from i_nextrio. If the page (vp,nextrio)
4748                          * is actually in core at this point, the routine
4749                          * ufs_getpage_ra() will stop pre-fetching data
4750                          * until we read that page in a synchronized manner
4751                          * through ufs_getpage_miss(). So, we should increase
4752                          * i_nextrio if the page (vp, nextrio) exists.
4753                          */
4754                         if ((retval == 0) && page_exists(vp, nextrio)) {
4755                                 ip->i_nextrio = nextrio + pgsize;
4756                         }
4757                 }
4758 
4759                 if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
4760                         /*
4761                          * We found the page in the page cache.
4762                          */
4763                         *pl++ = pp;
4764                         pgoff += pgsize;
4765                         pgaddr += pgsize;
4766                         len -= pgsize;
4767                         plsz -= pgsize;
4768                 } else  {
4769                         /*
4770                          * We have to create the page, or read it from disk.
4771                          */
4772                         if (err = ufs_getpage_miss(vp, pgoff, len, seg, pgaddr,
4773                             pl, plsz, rw, seqmode))
4774                                 goto error;
4775 
4776                         while (*pl != NULL) {
4777                                 pl++;
4778                                 pgoff += pgsize;
4779                                 pgaddr += pgsize;
4780                                 len -= pgsize;
4781                                 plsz -= pgsize;
4782                         }
4783                 }
4784         }
4785 
4786         /*
4787          * Return pages up to plsz if they are in the page cache.
4788          * We cannot return pages if there is a chance that they are
4789          * backed with a UFS hole and rw is S_WRITE or S_CREATE.
4790          */
4791         if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
4792 
4793                 ASSERT((protp == NULL) ||
4794                     !(has_holes && (*protp & PROT_WRITE)));
4795 
4796                 eoff = pgoff + plsz;
4797                 while (pgoff < eoff) {
4798                         page_t          *pp;
4799 
4800                         if ((pp = page_lookup_nowait(vp, pgoff,
4801                             SE_SHARED)) == NULL)
4802                                 break;
4803 
4804                         *pl++ = pp;
4805                         pgoff += pgsize;
4806                         plsz -= pgsize;
4807                 }
4808         }
4809 
4810         if (plarr)
4811                 *pl = NULL;                     /* Terminate page list */
4812         ip->i_nextr = pgoff;
4813 
4814 error:
4815         if (err && plarr) {
4816                 /*
4817                  * Release any pages we have locked.
4818                  */
4819                 while (pl > &plarr[0])
4820                         page_unlock(*--pl);
4821 
4822                 plarr[0] = NULL;
4823         }
4824 
4825 update_inode:
4826         /*
4827          * If the inode is not already marked for IACC (in rdip() for read)
4828          * and the inode is not marked for no access time update (in wrip()
4829          * for write) then update the inode access time and mod time now.
4830          */
4831         if ((ip->i_flag & (IACC | INOACC)) == 0) {
4832                 if ((rw != S_OTHER) && (ip->i_mode & IFMT) != IFDIR) {
4833                         if (!ULOCKFS_IS_NOIACC(ITOUL(ip)) &&
4834                             (fs->fs_ronly == 0) &&
4835                             (!ufsvfsp->vfs_noatime)) {
4836                                 mutex_enter(&ip->i_tlock);
4837                                 ip->i_flag |= IACC;
4838                                 ITIMES_NOLOCK(ip);
4839                                 mutex_exit(&ip->i_tlock);
4840                         }
4841                 }
4842         }
4843 
4844         if (dolock) {
4845                 rw_exit(&ip->i_contents);
4846                 if (do_qlock && rwtype == RW_WRITER)
4847                         rw_exit(&ufsvfsp->vfs_dqrwlock);
4848         }
4849 
4850 unlock:
4851         if (ulp) {
4852                 if ((rw == S_CREATE || rw == S_WRITE) &&
4853                     !(vp->v_flag & VISSWAP)) {
4854                         TRANS_END_ASYNC(ufsvfsp, TOP_GETPAGE, trans_size);
4855                 }
4856                 ufs_lockfs_end(ulp);
4857         }
4858 out:
4859         return (err);
4860 }
4861 
4862 /*
4863  * ufs_getpage_miss is called when ufs_getpage missed the page in the page
4864  * cache. The page is either read from the disk, or it's created.
4865  * A page is created (without disk read) if rw == S_CREATE, or if
4866  * the page is not backed with a real disk block (UFS hole).
4867  */
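/*
 * When the page must be read, bmap_read() supplies the physical block
 * and the length of the contiguous allocation, and pvn_read_kluster()
 * is used so that adjacent pages of that extent are brought in with a
 * single I/O.
 */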
4868 /* ARGSUSED */
4869 static int
4870 ufs_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
4871     caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int seq)
4872 {
4873         struct inode    *ip = VTOI(vp);
4874         page_t          *pp;
4875         daddr_t         bn;
4876         size_t          io_len;
4877         int             crpage = 0;
4878         int             err;
4879         int             contig;
4880         int             bsize = ip->i_fs->fs_bsize;
4881 
4882         /*
4883          * Figure out whether the page can be created, or must be
4884          * read from the disk.
4885          */
4886         if (rw == S_CREATE)
4887                 crpage = 1;
4888         else {
4889                 contig = 0;
4890                 if (err = bmap_read(ip, off, &bn, &contig))
4891                         return (err);
4892 
4893                 crpage = (bn == UFS_HOLE);
4894 
4895                 /*
4896                  * If it's also a fallocated block that hasn't been written to
4897                  * yet, we will treat it just like a UFS_HOLE and create
4898                  * a zero page for it.
4899                  */
4900                 if (ISFALLOCBLK(ip, bn))
4901                         crpage = 1;
4902         }
4903 
4904         if (crpage) {
4905                 if ((pp = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg,
4906                     addr)) == NULL) {
4907                         return (ufs_fault(vp,
4908                             "ufs_getpage_miss: page_create == NULL"));
4909                 }
4910 
4911                 if (rw != S_CREATE)
4912                         pagezero(pp, 0, PAGESIZE);
4913 
4914                 io_len = PAGESIZE;
4915         } else {
4916                 u_offset_t      io_off;
4917                 uint_t  xlen;
4918                 struct buf      *bp;
4919                 ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
4920 
4921                 /*
4922                  * If access is not in sequential order, we read from disk
4923                  * in bsize units.
4924                  *
4925                  * We limit the size of the transfer to bsize if we are reading
4926                  * from the beginning of the file. Note in this situation we
4927                  * will hedge our bets and initiate an async read ahead of
4928                  * the second block.
4929                  */
4930                 if (!seq || off == 0)
4931                         contig = MIN(contig, bsize);
4932 
4933                 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4934                     &io_len, off, contig, 0);
4935 
4936                 /*
4937                  * Some other thread has entered the page.
4938                  * ufs_getpage will retry page_lookup.
4939                  */
4940                 if (pp == NULL) {
4941                         pl[0] = NULL;
4942                         return (0);
4943                 }
4944 
4945                 /*
4946                  * Zero part of the page which we are not
4947                  * going to read from the disk.
4948                  */
4949                 xlen = io_len & PAGEOFFSET;
4950                 if (xlen != 0)
4951                         pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
4952 
4953                 bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ);
4954                 bp->b_edev = ip->i_dev;
4955                 bp->b_dev = cmpdev(ip->i_dev);
4956                 bp->b_blkno = bn;
4957                 bp->b_un.b_addr = (caddr_t)0;
4958                 bp->b_file = ip->i_vnode;
4959                 bp->b_offset = off;
4960 
4961                 if (ufsvfsp->vfs_log) {
4962                         lufs_read_strategy(ufsvfsp->vfs_log, bp);
4963                 } else if (ufsvfsp->vfs_snapshot) {
4964                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
4965                 } else {
4966                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
4967                         ub.ub_getpages.value.ul++;
4968                         (void) bdev_strategy(bp);
4969                         lwp_stat_update(LWP_STAT_INBLK, 1);
4970                 }
4971 
4972                 ip->i_nextrio = off + ((io_len + PAGESIZE - 1) & PAGEMASK);
4973 
4974                 /*
4975                  * If the file access is sequential, initiate read ahead
4976                  * of the next cluster.
4977                  */
4978                 if (seq && ip->i_nextrio < ip->i_size)
4979                         (void) ufs_getpage_ra(vp, off, seg, addr);
4980                 err = biowait(bp);
4981                 pageio_done(bp);
4982 
4983                 if (err) {
4984                         pvn_read_done(pp, B_ERROR);
4985                         return (err);
4986                 }
4987         }
4988 
4989         pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4990         return (0);
4991 }
4992 
4993 /*
4994  * Read ahead a cluster from the disk. Returns the length in bytes (0 if none).
4995  */
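/*
 * The read ahead always starts at i_nextrio (set up by ufs_getpage and
 * ufs_getpage_miss); on success i_nextrio is advanced past the kluster
 * just issued so the next call continues where this one left off.
 */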
4996 static int
4997 ufs_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
4998 {
4999         struct inode    *ip = VTOI(vp);
5000         page_t          *pp;
5001         u_offset_t      io_off = ip->i_nextrio;
5002         ufsvfs_t        *ufsvfsp;
5003         caddr_t         addr2 = addr + (io_off - off);
5004         struct buf      *bp;
5005         daddr_t         bn;
5006         size_t          io_len;
5007         int             err;
5008         int             contig;
5009         int             xlen;
5010         int             bsize = ip->i_fs->fs_bsize;
5011 
5012         /*
5013          * If the directio advisory is in effect on this file,
5014          * then do not do buffered read ahead. Read ahead makes
5015          * it more difficult on threads using directio as they
5016          * will be forced to flush the pages from this vnode.
5017          */
5018         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5019                 return (0);
5020         if (ip->i_flag & IDIRECTIO || ufsvfsp->vfs_forcedirectio)
5021                 return (0);
5022 
5023         /*
5024          * Is this test needed?
5025          */
5026         if (addr2 >= seg->s_base + seg->s_size)
5027                 return (0);
5028 
5029         contig = 0;
5030         err = bmap_read(ip, io_off, &bn, &contig);
5031         /*
5032          * If it's a UFS_HOLE or a fallocated block, do not perform
5033          * any read aheads since there is probably nothing to read ahead.
5034          */
5035         if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
5036                 return (0);
5037 
5038         /*
5039          * Limit the transfer size to bsize if this is the 2nd block.
5040          */
5041         if (io_off == (u_offset_t)bsize)
5042                 contig = MIN(contig, bsize);
5043 
5044         if ((pp = pvn_read_kluster(vp, io_off, seg, addr2, &io_off,
5045             &io_len, io_off, contig, 1)) == NULL)
5046                 return (0);
5047 
5048         /*
5049          * Zero part of page which we are not going to read from disk
5050          */
5051         if ((xlen = (io_len & PAGEOFFSET)) > 0)
5052                 pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
5053 
5054         ip->i_nextrio = (io_off + io_len + PAGESIZE - 1) & PAGEMASK;
5055 
5056         bp = pageio_setup(pp, io_len, ip->i_devvp, B_READ | B_ASYNC);
5057         bp->b_edev = ip->i_dev;
5058         bp->b_dev = cmpdev(ip->i_dev);
5059         bp->b_blkno = bn;
5060         bp->b_un.b_addr = (caddr_t)0;
5061         bp->b_file = ip->i_vnode;
5062         bp->b_offset = off;
5063 
5064         if (ufsvfsp->vfs_log) {
5065                 lufs_read_strategy(ufsvfsp->vfs_log, bp);
5066         } else if (ufsvfsp->vfs_snapshot) {
5067                 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5068         } else {
5069                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5070                 ub.ub_getras.value.ul++;
5071                 (void) bdev_strategy(bp);
5072                 lwp_stat_update(LWP_STAT_INBLK, 1);
5073         }
5074 
5075         return (io_len);
5076 }
5077 
5078 int     ufs_delay = 1;
5079 /*
5080  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE, B_ASYNC}
5081  *
5082  * LMXXX - the inode really ought to contain a pointer to one of these
5083  * async args.  Stuff gunk in there and just hand the whole mess off.
5084  * This would replace i_delaylen, i_delayoff.
5085  */
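/*
 * For eligible B_ASYNC requests the dirty range is not pushed right
 * away; contiguous ranges are instead accumulated in i_delayoff and
 * i_delaylen, and the previous cluster is pushed once it reaches
 * CLUSTSZ(ip) bytes or a non-contiguous range arrives.
 */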
5086 /*ARGSUSED*/
5087 static int
5088 ufs_putpage(struct vnode *vp, offset_t off, size_t len, int flags,
5089     struct cred *cr, caller_context_t *ct)
5090 {
5091         struct inode *ip = VTOI(vp);
5092         int err = 0;
5093 
5094         if (vp->v_count == 0) {
5095                 return (ufs_fault(vp, "ufs_putpage: bad v_count == 0"));
5096         }
5097 
5098         /*
5099          * XXX - Why should this check be made here?
5100          */
5101         if (vp->v_flag & VNOMAP) {
5102                 err = ENOSYS;
5103                 goto errout;
5104         }
5105 
5106         if (ip->i_ufsvfs == NULL) {
5107                 err = EIO;
5108                 goto errout;
5109         }
5110 
5111         if (flags & B_ASYNC) {
5112                 if (ufs_delay && len &&
5113                     (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
5114                         mutex_enter(&ip->i_tlock);
5115                         /*
5116                          * If nobody stalled, start a new cluster.
5117                          */
5118                         if (ip->i_delaylen == 0) {
5119                                 ip->i_delayoff = off;
5120                                 ip->i_delaylen = len;
5121                                 mutex_exit(&ip->i_tlock);
5122                                 goto errout;
5123                         }
5124                         /*
5125                          * If we have a full cluster or they are not contig,
5126                          * then push last cluster and start over.
5127                          */
5128                         if (ip->i_delaylen >= CLUSTSZ(ip) ||
5129                             ip->i_delayoff + ip->i_delaylen != off) {
5130                                 u_offset_t doff;
5131                                 size_t dlen;
5132 
5133                                 doff = ip->i_delayoff;
5134                                 dlen = ip->i_delaylen;
5135                                 ip->i_delayoff = off;
5136                                 ip->i_delaylen = len;
5137                                 mutex_exit(&ip->i_tlock);
5138                                 err = ufs_putpages(vp, doff, dlen,
5139                                     flags, cr);
5140                                 /* LMXXX - flags are new val, not old */
5141                                 goto errout;
5142                         }
5143                         /*
5144                          * There is something there, it's not full, and
5145                          * it is contig.
5146                          */
5147                         ip->i_delaylen += len;
5148                         mutex_exit(&ip->i_tlock);
5149                         goto errout;
5150                 }
5151                 /*
5152                  * Must have weird flags or we are not clustering.
5153                  */
5154         }
5155 
5156         err = ufs_putpages(vp, off, len, flags, cr);
5157 
5158 errout:
5159         return (err);
5160 }
5161 
5162 /*
5163  * If len == 0, do from off to EOF.
5164  *
5165  * The normal cases should be len == 0 & off == 0 (entire vp list),
5166  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
5167  * (from pageout).
5168  */
5169 /*ARGSUSED*/
5170 static int
5171 ufs_putpages(struct vnode *vp, offset_t off, size_t len, int flags,
5172     struct cred *cr)
5173 {
5174         u_offset_t io_off;
5175         u_offset_t eoff;
5176         struct inode *ip = VTOI(vp);
5177         page_t *pp;
5178         size_t io_len;
5179         int err = 0;
5180         int dolock;
5181 
5182         if (vp->v_count == 0)
5183                 return (ufs_fault(vp, "ufs_putpages: v_count == 0"));
5184         /*
5185          * Acquire the readers/writer inode lock before locking
5186          * any pages in this inode.
5187          * The inode lock is held during i/o.
5188          */
5189         if (len == 0) {
5190                 mutex_enter(&ip->i_tlock);
5191                 ip->i_delayoff = ip->i_delaylen = 0;
5192                 mutex_exit(&ip->i_tlock);
5193         }
5194         dolock = (rw_owner(&ip->i_contents) != curthread);
5195         if (dolock) {
5196                 /*
5197                  * Must synchronize this thread and any possible thread
5198                  * operating in the window of vulnerability in wrip().
5199                  * It is dangerous to allow both a thread doing a putpage
5200                  * and a thread writing, so serialize them.  The exception
5201                  * is when the thread in wrip() does something which causes
5202                  * a putpage operation.  Then, the thread must be allowed
5203                  * to continue.  It may encounter a bmap_read problem in
5204                  * ufs_putapage, but that is handled in ufs_putapage.
5205                  * Allow async writers to proceed, we don't want to block
5206                  * the pageout daemon.
5207                  */
5208                 if (ip->i_writer == curthread)
5209                         rw_enter(&ip->i_contents, RW_READER);
5210                 else {
5211                         for (;;) {
5212                                 rw_enter(&ip->i_contents, RW_READER);
5213                                 mutex_enter(&ip->i_tlock);
5214                                 /*
5215                                  * If there is no thread in the critical
5216                                  * section of wrip(), then proceed.
5217                                  * Otherwise, wait until there isn't one.
5218                                  */
5219                                 if (ip->i_writer == NULL) {
5220                                         mutex_exit(&ip->i_tlock);
5221                                         break;
5222                                 }
5223                                 rw_exit(&ip->i_contents);
5224                                 /*
5225                                  * Bounce async writers when we have a writer
5226                                  * working on this file so we don't deadlock
5227                                  * the pageout daemon.
5228                                  */
5229                                 if (flags & B_ASYNC) {
5230                                         mutex_exit(&ip->i_tlock);
5231                                         return (0);
5232                                 }
5233                                 cv_wait(&ip->i_wrcv, &ip->i_tlock);
5234                                 mutex_exit(&ip->i_tlock);
5235                         }
5236                 }
5237         }
5238 
5239         if (!vn_has_cached_data(vp)) {
5240                 if (dolock)
5241                         rw_exit(&ip->i_contents);
5242                 return (0);
5243         }
5244 
5245         if (len == 0) {
5246                 /*
5247                  * Search the entire vp list for pages >= off.
5248                  */
5249                 err = pvn_vplist_dirty(vp, (u_offset_t)off, ufs_putapage,
5250                     flags, cr);
5251         } else {
5252                 /*
5253                  * Loop over all offsets in the range looking for
5254                  * pages to deal with.
5255                  */
5256                 if ((eoff = blkroundup(ip->i_fs, ip->i_size)) != 0)
5257                         eoff = MIN(off + len, eoff);
5258                 else
5259                         eoff = off + len;
5260 
5261                 for (io_off = off; io_off < eoff; io_off += io_len) {
5262                         /*
5263                          * If we are not invalidating, synchronously
5264                          * freeing or writing pages, use the routine
5265                          * page_lookup_nowait() to prevent reclaiming
5266                          * them from the free list.
5267                          */
5268                         if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
5269                                 pp = page_lookup(vp, io_off,
5270                                     (flags & (B_INVAL | B_FREE)) ?
5271                                     SE_EXCL : SE_SHARED);
5272                         } else {
5273                                 pp = page_lookup_nowait(vp, io_off,
5274                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
5275                         }
5276 
5277                         if (pp == NULL || pvn_getdirty(pp, flags) == 0)
5278                                 io_len = PAGESIZE;
5279                         else {
5280                                 u_offset_t *io_offp = &io_off;
5281 
5282                                 err = ufs_putapage(vp, pp, io_offp, &io_len,
5283                                     flags, cr);
5284                                 if (err != 0)
5285                                         break;
5286                                 /*
5287                                  * "io_off" and "io_len" are returned as
5288                                  * the range of pages we actually wrote.
5289                                  * This allows us to skip ahead more quickly
5290                                  * since several pages may have been dealt
5291                                  * with by this iteration of the loop.
5292                                  */
5293                         }
5294                 }
5295         }
5296         if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
5297                 /*
5298                  * We have just sync'ed back all the pages on
5299                  * the inode; turn off the IMODTIME flag.
5300                  */
5301                 mutex_enter(&ip->i_tlock);
5302                 ip->i_flag &= ~IMODTIME;
5303                 mutex_exit(&ip->i_tlock);
5304         }
5305         if (dolock)
5306                 rw_exit(&ip->i_contents);
5307         return (err);
5308 }
5309 
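     /*
      * I/O completion routine for page writes issued by ufs_putapage().
      * It subtracts the completed byte count from the inode's i_writes
      * and, when that count drops back to the ufs_LW low-water mark (and
      * write throttling via ufs_WRITES is enabled), wakes any writers
      * sleeping on i_wrcv before passing the buffer on to iodone().
      */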
5310 static void
5311 ufs_iodone(buf_t *bp)
5312 {
5313         struct inode *ip;
5314 
5315         ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
5316 
5317         bp->b_iodone = NULL;
5318 
5319         ip = VTOI(bp->b_pages->p_vnode);
5320 
5321         mutex_enter(&ip->i_tlock);
5322         if (ip->i_writes >= ufs_LW) {
5323                 if ((ip->i_writes -= bp->b_bcount) <= ufs_LW)
5324                         if (ufs_WRITES)
5325                                 cv_broadcast(&ip->i_wrcv); /* wake all up */
5326         } else {
5327                 ip->i_writes -= bp->b_bcount;
5328         }
5329 
5330         mutex_exit(&ip->i_tlock);
5331         iodone(bp);
5332 }
5333 
5334 /*
5335  * Write out a single page, possibly klustering adjacent
5336  * dirty pages.  The inode lock must be held.
5337  *
5338  * LMXXX - bsize < pagesize not done.
5339  */
5340 /*ARGSUSED*/
5341 int
5342 ufs_putapage(struct vnode *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
5343     int flags, struct cred *cr)
5344 {
5345         u_offset_t io_off;
5346         u_offset_t off;
5347         struct inode *ip = VTOI(vp);
5348         struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
5349         struct fs *fs;
5350         struct buf *bp;
5351         size_t io_len;
5352         daddr_t bn;
5353         int err;
5354         int contig;
5355         int dotrans;
5356 
5357         ASSERT(RW_LOCK_HELD(&ip->i_contents));
5358 
5359         if (ufsvfsp == NULL) {
5360                 err = EIO;
5361                 goto out_trace;
5362         }
5363 
5364         fs = ip->i_fs;
5365         ASSERT(fs->fs_ronly == 0);
5366 
5367         /*
5368          * If the modified time on the inode has not already been
5369          * set elsewhere (e.g. for write/setattr) we set the time now.
5370          * This gives us approximate modified times for mmap'ed files
5371          * which are modified via stores in the user address space.
5372          */
5373         if ((ip->i_flag & IMODTIME) == 0) {
5374                 mutex_enter(&ip->i_tlock);
5375                 ip->i_flag |= IUPD;
5376                 ip->i_seq++;
5377                 ITIMES_NOLOCK(ip);
5378                 mutex_exit(&ip->i_tlock);
5379         }
5380 
5381         /*
5382          * Align the request to a block boundary (for old file systems),
5383          * and go ask bmap() how contiguous things are for this file.
5384          */
5385         off = pp->p_offset & (offset_t)fs->fs_bmask;  /* block align it */
5386         contig = 0;
5387         err = bmap_read(ip, off, &bn, &contig);
5388         if (err)
5389                 goto out;
5390         if (bn == UFS_HOLE) {                   /* putpage never allocates */
5391                 /*
5392                  * logging device is in error mode; simply return EIO
5393                  */
5394                 if (TRANS_ISERROR(ufsvfsp)) {
5395                         err = EIO;
5396                         goto out;
5397                 }
5398                 /*
5399                  * Oops, the thread in the window in wrip() did some
5400                  * sort of operation which caused a putpage in the bad
5401                  * range.  In this case, just return an error which will
5402                  * cause the software modified bit on the page to be set
5403                  * and the page will get written out again later.
5404                  */
5405                 if (ip->i_writer == curthread) {
5406                         err = EIO;
5407                         goto out;
5408                 }
5409                 /*
5410                  * If the pager is trying to push a page in the bad range
5411                  * just tell it to try again later when things are better.
5412                  */
5413                 if (flags & B_ASYNC) {
5414                         err = EAGAIN;
5415                         goto out;
5416                 }
5417                 err = ufs_fault(ITOV(ip), "ufs_putapage: bn == UFS_HOLE");
5418                 goto out;
5419         }
5420 
5421         /*
5422          * If it is a fallocate'd block, reverse the negativity since
5423          * we are now writing to it.
5424          */
5425         if (ISFALLOCBLK(ip, bn)) {
5426                 err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
5427                 if (err)
5428                         goto out;
5429 
5430                 bn = -bn;
5431         }
5432 
5433         /*
5434          * Take the length (of contiguous bytes) passed back from bmap()
5435          * and _try_ to get a set of pages covering that extent.
5436          */
5437         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, off, contig, flags);
5438 
5439         /*
5440          * May have run out of memory and not clustered backwards.
5441          * off          p_offset
5442          * [  pp - 1  ][   pp   ]
5443          * [    block           ]
5444          * We told bmap off, so we have to adjust the bn accordingly.
5445          */
5446         if (io_off > off) {
5447                 bn += btod(io_off - off);
5448                 contig -= (io_off - off);
5449         }
5450 
5451         /*
5452          * bmap was careful to tell us the right size so use that.
5453          * There might be unallocated frags at the end.
5454          * LMXXX - bzero the end of the page?  We must be writing after EOF.
5455          */
5456         if (io_len > contig) {
5457                 ASSERT(io_len - contig < fs->fs_bsize);
5458                 io_len -= (io_len - contig);
5459         }
5460 
5461         /*
5462          * Handle the case where we are writing the last page after EOF.
5463          *
5464          * XXX - just a patch for i-mt3.
5465          */
5466         if (io_len == 0) {
5467                 ASSERT(pp->p_offset >=
5468                     (u_offset_t)(roundup(ip->i_size, PAGESIZE)));
5469                 io_len = PAGESIZE;
5470         }
5471 
5472         bp = pageio_setup(pp, io_len, ip->i_devvp, B_WRITE | flags);
5473 
5474         ULOCKFS_SET_MOD(ITOUL(ip));
5475 
5476         bp->b_edev = ip->i_dev;
5477         bp->b_dev = cmpdev(ip->i_dev);
5478         bp->b_blkno = bn;
5479         bp->b_un.b_addr = (caddr_t)0;
5480         bp->b_file = ip->i_vnode;
5481 
5482         /*
5483          * File contents of shadow or quota inodes are metadata, and updates
5484          * to these need to be put into a logging transaction. All direct
5485          * callers in UFS do that, but fsflush can come here _before_ the
5486          * normal codepath. An example would be updating ACL information, for
5487          * which the normal codepath would be:
5488          *      ufs_si_store()
5489          *      ufs_rdwri()
5490          *      wrip()
5491          *      segmap_release()
5492          *      VOP_PUTPAGE()
5493          * Here, fsflush can pick up the dirty page before segmap_release()
5494          * forces it out. If that happens, there's no transaction.
5495          * We therefore need to test whether a transaction exists, and if not
5496          * create one - for fsflush.
5497          */
5498         dotrans =
5499             (((ip->i_mode & IFMT) == IFSHAD || ufsvfsp->vfs_qinod == ip) &&
5500             ((curthread->t_flag & T_DONTBLOCK) == 0) &&
5501             (TRANS_ISTRANS(ufsvfsp)));
5502 
5503         if (dotrans) {
5504                 curthread->t_flag |= T_DONTBLOCK;
5505                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5506         }
5507         if (TRANS_ISTRANS(ufsvfsp)) {
5508                 if ((ip->i_mode & IFMT) == IFSHAD) {
5509                         TRANS_BUF(ufsvfsp, 0, io_len, bp, DT_SHAD);
5510                 } else if (ufsvfsp->vfs_qinod == ip) {
5511                         TRANS_DELTA(ufsvfsp, ldbtob(bn), bp->b_bcount, DT_QR,
5512                             0, 0);
5513                 }
5514         }
5515         if (dotrans) {
5516                 TRANS_END_ASYNC(ufsvfsp, TOP_PUTPAGE, TOP_PUTPAGE_SIZE(ip));
5517                 curthread->t_flag &= ~T_DONTBLOCK;
5518         }
5519 
5520         /* write throttle */
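             /*
              * Account for the bytes being pushed out: i_writes is bumped
              * here under i_tlock and dropped again in ufs_iodone(), which
              * wakes any writer sleeping on i_wrcv once the outstanding
              * count falls back to ufs_LW.
              */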
5521 
5522         ASSERT(bp->b_iodone == NULL);
5523         bp->b_iodone = (int (*)())ufs_iodone;
5524         mutex_enter(&ip->i_tlock);
5525         ip->i_writes += bp->b_bcount;
5526         mutex_exit(&ip->i_tlock);
5527 
5528         if (bp->b_flags & B_ASYNC) {
5529                 if (ufsvfsp->vfs_log) {
5530                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5531                 } else if (ufsvfsp->vfs_snapshot) {
5532                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5533                 } else {
5534                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5535                         ub.ub_putasyncs.value.ul++;
5536                         (void) bdev_strategy(bp);
5537                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5538                 }
5539         } else {
5540                 if (ufsvfsp->vfs_log) {
5541                         lufs_write_strategy(ufsvfsp->vfs_log, bp);
5542                 } else if (ufsvfsp->vfs_snapshot) {
5543                         fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
5544                 } else {
5545                         ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
5546                         ub.ub_putsyncs.value.ul++;
5547                         (void) bdev_strategy(bp);
5548                         lwp_stat_update(LWP_STAT_OUBLK, 1);
5549                 }
5550                 err = biowait(bp);
5551                 pageio_done(bp);
5552                 pvn_write_done(pp, ((err) ? B_ERROR : 0) | B_WRITE | flags);
5553         }
5554 
5555         pp = NULL;
5556 
5557 out:
5558         if (err != 0 && pp != NULL)
5559                 pvn_write_done(pp, B_ERROR | B_WRITE | flags);
5560 
5561         if (offp)
5562                 *offp = io_off;
5563         if (lenp)
5564                 *lenp = io_len;
5565 out_trace:
5566         return (err);
5567 }
5568 
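     /*
      * Diagnostic counters: how many times ufs_map() had to retry taking
      * as->a_lock with AS_LOCK_TRYENTER(), and how many times it had to
      * back out and restart because ufs_lockfs_trybegin() failed.
      */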
5569 uint64_t ufs_map_alock_retry_cnt;
5570 uint64_t ufs_map_lockfs_retry_cnt;
5571 
5572 /* ARGSUSED */
5573 static int
5574 ufs_map(struct vnode *vp, offset_t off, struct as *as, caddr_t *addrp,
5575     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr,
5576     caller_context_t *ct)
5577 {
5578         struct segvn_crargs vn_a;
5579         struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
5580         struct ulockfs *ulp;
5581         int error, sig;
5582         k_sigset_t smask;
5583         caddr_t hint = *addrp;
5584 
5585         if (vp->v_flag & VNOMAP) {
5586                 error = ENOSYS;
5587                 goto out;
5588         }
5589 
5590         if (off < (offset_t)0 || (offset_t)(off + len) < (offset_t)0) {
5591                 error = ENXIO;
5592                 goto out;
5593         }
5594 
5595         if (vp->v_type != VREG) {
5596                 error = ENODEV;
5597                 goto out;
5598         }
5599 
5600 retry_map:
5601         *addrp = hint;
5602         /*
5603          * If file is being locked, disallow mapping.
5604          */
5605         if (vn_has_mandatory_locks(vp, VTOI(vp)->i_mode)) {
5606                 error = EAGAIN;
5607                 goto out;
5608         }
5609 
5610         as_rangelock(as);
5611         /*
5612          * Note that if we are retrying (because ufs_lockfs_trybegin failed in
5613          * the previous attempt), some other thread could have grabbed
5614          * the same VA range if MAP_FIXED is set. In that case, choose_addr()
5615          * would unmap the valid VA range, which is acceptable.
5616          */
5617         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5618         if (error != 0) {
5619                 as_rangeunlock(as);
5620                 goto out;
5621         }
5622 
5623         /*
5624          * a_lock has to be acquired before entering the lockfs protocol
5625          * because that is the order in which pagefault works. Also we cannot
5626          * block on a_lock here because this waiting writer will prevent
5627          * further readers like ufs_read from progressing and could cause
5628          * deadlock between ufs_read/ufs_map/pagefault when a quiesce is
5629          * pending.
5630          */
5631         while (!AS_LOCK_TRYENTER(as, RW_WRITER)) {
5632                 ufs_map_alock_retry_cnt++;
5633                 delay(RETRY_LOCK_DELAY);
5634         }
5635 
5636         /*
5637          * We can't hold as->a_lock and wait for lockfs to succeed because
5638          * the proc tools might hang on a_lock, so call ufs_lockfs_trybegin()
5639          * instead.
5640          */
5641         if (error = ufs_lockfs_trybegin(ufsvfsp, &ulp, ULOCKFS_MAP_MASK)) {
5642                 /*
5643                  * ufs_lockfs_trybegin() did not succeed. It is safer to give up
5644                  * as->a_lock and wait for ulp->ul_fs_lock status to change.
5645                  */
5646                 ufs_map_lockfs_retry_cnt++;
5647                 AS_LOCK_EXIT(as);
5648                 as_rangeunlock(as);
5649                 if (error == EIO)
5650                         goto out;
5651 
5652                 mutex_enter(&ulp->ul_lock);
5653                 while (ulp->ul_fs_lock & ULOCKFS_MAP_MASK) {
5654                         if (ULOCKFS_IS_SLOCK(ulp) || ufsvfsp->vfs_nointr) {
5655                                 cv_wait(&ulp->ul_cv, &ulp->ul_lock);
5656                         } else {
5657                                 sigintr(&smask, 1);
5658                                 sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock);
5659                                 sigunintr(&smask);
5660                                 if (((ulp->ul_fs_lock & ULOCKFS_MAP_MASK) &&
5661                                     !sig) || ufsvfsp->vfs_dontblock) {
5662                                         mutex_exit(&ulp->ul_lock);
5663                                         return (EINTR);
5664                                 }
5665                         }
5666                 }
5667                 mutex_exit(&ulp->ul_lock);
5668                 goto retry_map;
5669         }
5670 
5671         vn_a.vp = vp;
5672         vn_a.offset = (u_offset_t)off;
5673         vn_a.type = flags & MAP_TYPE;
5674         vn_a.prot = prot;
5675         vn_a.maxprot = maxprot;
5676         vn_a.cred = cr;
5677         vn_a.amp = NULL;
5678         vn_a.flags = flags & ~MAP_TYPE;
5679         vn_a.szc = 0;
5680         vn_a.lgrp_mem_policy_flags = 0;
5681 
5682         error = as_map_locked(as, *addrp, len, segvn_create, &vn_a);
5683         if (ulp)
5684                 ufs_lockfs_end(ulp);
5685         as_rangeunlock(as);
5686 out:
5687         return (error);
5688 }
5689 
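     /*
      * ufs_addmap() and ufs_delmap() keep i_mapcnt, a per-inode count (in
      * pages) of how much of the file is currently mapped into user
      * address spaces; it is updated under i_tlock.
      */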
5690 /* ARGSUSED */
5691 static int
5692 ufs_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5693     size_t len, uchar_t  prot, uchar_t  maxprot, uint_t    flags,
5694     struct cred *cr, caller_context_t *ct)
5695 {
5696         struct inode *ip = VTOI(vp);
5697 
5698         if (vp->v_flag & VNOMAP) {
5699                 return (ENOSYS);
5700         }
5701 
5702         mutex_enter(&ip->i_tlock);
5703         ip->i_mapcnt += btopr(len);
5704         mutex_exit(&ip->i_tlock);
5705         return (0);
5706 }
5707 
5708 /*ARGSUSED*/
5709 static int
5710 ufs_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
5711     size_t len, uint_t prot,  uint_t maxprot,  uint_t flags, struct cred *cr,
5712     caller_context_t *ct)
5713 {
5714         struct inode *ip = VTOI(vp);
5715 
5716         if (vp->v_flag & VNOMAP) {
5717                 return (ENOSYS);
5718         }
5719 
5720         mutex_enter(&ip->i_tlock);
5721         ip->i_mapcnt -= btopr(len);  /* Count released mappings */
5722         ASSERT(ip->i_mapcnt >= 0);
5723         mutex_exit(&ip->i_tlock);
5724         return (0);
5725 }
5726 /*
5727  * Return the answer requested to poll() for non-device files
5728  */
5729 struct pollhead ufs_pollhd;
5730 
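     /*
      * All UFS vnodes share the single ufs_pollhd pollhead; a poller is
      * only parked on it when none of the requested events are currently
      * ready (see the end of ufs_poll() below).
      */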
5731 /* ARGSUSED */
5732 int
5733 ufs_poll(vnode_t *vp, short ev, int any, short *revp, struct pollhead **phpp,
5734     caller_context_t *ct)
5735 {
5736         struct ufsvfs   *ufsvfsp;
5737 
5738         /*
5739          * Regular files reject edge-triggered pollers.
5740          * See the comment in fs_poll() for a more detailed explanation.
5741          */
5742         if (ev & POLLET) {
5743                 return (EPERM);
5744         }
5745 
5746         *revp = 0;
5747         ufsvfsp = VTOI(vp)->i_ufsvfs;
5748 
5749         if (!ufsvfsp) {
5750                 *revp = POLLHUP;
5751                 goto out;
5752         }
5753 
5754         if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs) ||
5755             ULOCKFS_IS_ELOCK(&ufsvfsp->vfs_ulockfs)) {
5756                 *revp |= POLLERR;
5757 
5758         } else {
5759                 if ((ev & POLLOUT) && !ufsvfsp->vfs_fs->fs_ronly &&
5760                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5761                         *revp |= POLLOUT;
5762 
5763                 if ((ev & POLLWRBAND) && !ufsvfsp->vfs_fs->fs_ronly &&
5764                     !ULOCKFS_IS_WLOCK(&ufsvfsp->vfs_ulockfs))
5765                         *revp |= POLLWRBAND;
5766 
5767                 if (ev & POLLIN)
5768                         *revp |= POLLIN;
5769 
5770                 if (ev & POLLRDNORM)
5771                         *revp |= POLLRDNORM;
5772 
5773                 if (ev & POLLRDBAND)
5774                         *revp |= POLLRDBAND;
5775         }
5776 
5777         if ((ev & POLLPRI) && (*revp & (POLLERR|POLLHUP)))
5778                 *revp |= POLLPRI;
5779 out:
5780         if (*revp == 0 && ! any) {
5781                 *phpp = &ufs_pollhd;
5782         }
5783 
5784         return (0);
5785 }
5786 
5787 /* ARGSUSED */
5788 static int
5789 ufs_l_pathconf(struct vnode *vp, int cmd, ulong_t *valp, struct cred *cr,
5790     caller_context_t *ct)
5791 {
5792         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
5793         struct ulockfs  *ulp = NULL;
5794         struct inode    *sip = NULL;
5795         int             error;
5796         struct inode    *ip = VTOI(vp);
5797         int             issync;
5798 
5799         error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_PATHCONF_MASK);
5800         if (error)
5801                 return (error);
5802 
5803         switch (cmd) {
5804                 /*
5805                  * Have to handle _PC_NAME_MAX here, because the normal way
5806                  * [fs_pathconf() -> VOP_STATVFS() -> ufs_statvfs()]
5807                  * results in a lock ordering reversal between
5808                  * ufs_lockfs_{begin,end}() and
5809                  * ufs_thread_{suspend,continue}().
5810                  *
5811                  * Keep in sync with ufs_statvfs().
5812                  */
5813         case _PC_NAME_MAX:
5814                 *valp = MAXNAMLEN;
5815                 break;
5816 
5817         case _PC_FILESIZEBITS:
5818                 if (ufsvfsp->vfs_lfflags & UFS_LARGEFILES)
5819                         *valp = UFS_FILESIZE_BITS;
5820                 else
5821                         *valp = 32;
5822                 break;
5823 
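                     /*
                      * For _PC_XATTR_EXISTS, look up the extended attribute
                      * directory; if one exists but turns out to be empty,
                      * unhook the shadow inode and report that no extended
                      * attributes exist.
                      */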
5824         case _PC_XATTR_EXISTS:
5825                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
5826 
5827                         error =
5828                             ufs_xattr_getattrdir(vp, &sip, LOOKUP_XATTR, cr);
5829                         if (error ==  0 && sip != NULL) {
5830                                 /* Start transaction */
5831                                 if (ulp) {
5832                                         TRANS_BEGIN_CSYNC(ufsvfsp, issync,
5833                                             TOP_RMDIR, TOP_RMDIR_SIZE);
5834                                 }
5835                                 /*
5836                                  * Is the directory empty?
5837                                  */
5838                                 rw_enter(&sip->i_rwlock, RW_WRITER);
5839                                 rw_enter(&sip->i_contents, RW_WRITER);
5840                                 if (ufs_xattrdirempty(sip,
5841                                     sip->i_number, CRED())) {
5842                                         rw_enter(&ip->i_contents, RW_WRITER);
5843                                         ufs_unhook_shadow(ip, sip);
5844                                         rw_exit(&ip->i_contents);
5845 
5846                                         *valp = 0;
5847 
5848                                 } else
5849                                         *valp = 1;
5850                                 rw_exit(&sip->i_contents);
5851                                 rw_exit(&sip->i_rwlock);
5852                                 if (ulp) {
5853                                         TRANS_END_CSYNC(ufsvfsp, error, issync,
5854                                             TOP_RMDIR, TOP_RMDIR_SIZE);
5855                                 }
5856                                 VN_RELE(ITOV(sip));
5857                         } else if (error == ENOENT) {
5858                                 *valp = 0;
5859                                 error = 0;
5860                         }
5861                 } else {
5862                         error = fs_pathconf(vp, cmd, valp, cr, ct);
5863                 }
5864                 break;
5865 
5866         case _PC_ACL_ENABLED:
5867                 *valp = _ACL_ACLENT_ENABLED;
5868                 break;
5869 
5870         case _PC_MIN_HOLE_SIZE:
5871                 *valp = (ulong_t)ip->i_fs->fs_bsize;
5872                 break;
5873 
5874         case _PC_SATTR_ENABLED:
5875         case _PC_SATTR_EXISTS:
5876                 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5877                     (vp->v_type == VREG || vp->v_type == VDIR);
5878                 break;
5879 
5880         case _PC_TIMESTAMP_RESOLUTION:
5881                 /*
5882                  * UFS keeps only microsecond timestamp resolution.
5883                  * This is historical and will probably never change.
5884                  */
5885                 *valp = 1000L;
5886                 break;
5887 
5888         default:
5889                 error = fs_pathconf(vp, cmd, valp, cr, ct);
5890                 break;
5891         }
5892 
5893         if (ulp != NULL) {
5894                 ufs_lockfs_end(ulp);
5895         }
5896         return (error);
5897 }
5898 
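     /*
      * Diagnostic counters: the number of write and read chunks issued
      * through ufs_pageio() below.
      */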
5899 int ufs_pageio_writes, ufs_pageio_reads;
5900 
5901 /*ARGSUSED*/
5902 static int
5903 ufs_pageio(struct vnode *vp, page_t *pp, u_offset_t io_off, size_t io_len,
5904     int flags, struct cred *cr, caller_context_t *ct)
5905 {
5906         struct inode *ip = VTOI(vp);
5907         struct ufsvfs *ufsvfsp;
5908         page_t *npp = NULL, *opp = NULL, *cpp = pp;
5909         struct buf *bp;
5910         daddr_t bn;
5911         size_t done_len = 0, cur_len = 0;
5912         int err = 0;
5913         int contig = 0;
5914         int dolock;
5915         int vmpss = 0;
5916         struct ulockfs *ulp;
5917 
5918         if ((flags & B_READ) && pp != NULL && pp->p_vnode == vp &&
5919             vp->v_mpssdata != NULL) {
5920                 vmpss = 1;
5921         }
5922 
5923         dolock = (rw_owner(&ip->i_contents) != curthread);
5924         /*
5925          * We need a better check.  Ideally, we would use another set of
5926          * vnodeops so that hlocked and forcibly unmounted file
5927          * systems would return EIO where appropriate, without the
5928          * need for these checks.
5929          */
5930         if ((ufsvfsp = ip->i_ufsvfs) == NULL)
5931                 return (EIO);
5932 
5933         /*
5934          * For vmpss (pp can be NULL) case respect the quiesce protocol.
5935          * ul_lock must be taken before locking pages so we can't use it here
5936          * if pp is non NULL because segvn already locked pages
5937          * SE_EXCL. Instead we rely on the fact that a forced umount or
5938          * applying a filesystem lock via ufs_fiolfs() will block in the
5939          * implicit call to ufs_flush() until we unlock the pages after the
5940          * return to segvn. Other ufs_quiesce() callers keep ufs_quiesce_pend
5941          * above 0 until they are done. We have to be careful not to increment
5942          * ul_vnops_cnt here after forceful unmount hlocks the file system.
5943          *
5944          * If pp is NULL use ul_lock to make sure we don't increment
5945          * ul_vnops_cnt after forceful unmount hlocks the file system.
5946          */
5947         if (vmpss || pp == NULL) {
5948                 ulp = &ufsvfsp->vfs_ulockfs;
5949                 if (pp == NULL)
5950                         mutex_enter(&ulp->ul_lock);
5951                 if (ulp->ul_fs_lock & ULOCKFS_GETREAD_MASK) {
5952                         if (pp == NULL) {
5953                                 mutex_exit(&ulp->ul_lock);
5954                         }
5955                         return (vmpss ? EIO : EINVAL);
5956                 }
5957                 atomic_inc_ulong(&ulp->ul_vnops_cnt);
5958                 if (pp == NULL)
5959                         mutex_exit(&ulp->ul_lock);
5960                 if (ufs_quiesce_pend) {
5961                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5962                                 cv_broadcast(&ulp->ul_cv);
5963                         return (vmpss ? EIO : EINVAL);
5964                 }
5965         }
5966 
5967         if (dolock) {
5968                 /*
5969                  * segvn may call VOP_PAGEIO() instead of VOP_GETPAGE() to
5970                  * handle a fault against a segment that maps vnode pages with
5971                  * large mappings.  Segvn creates pages and holds them locked
5972                  * SE_EXCL during VOP_PAGEIO() call. In this case we have to
5973                  * use rw_tryenter() to avoid a potential deadlock since in
5974                  * lock order i_contents needs to be taken first.
5975                  * Segvn will retry via VOP_GETPAGE() if VOP_PAGEIO() fails.
5976                  */
5977                 if (!vmpss) {
5978                         rw_enter(&ip->i_contents, RW_READER);
5979                 } else if (!rw_tryenter(&ip->i_contents, RW_READER)) {
5980                         if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5981                                 cv_broadcast(&ulp->ul_cv);
5982                         return (EDEADLK);
5983                 }
5984         }
5985 
5986         /*
5987          * Return an error to segvn because the pagefault request is beyond
5988          * PAGESIZE rounded EOF.
5989          */
5990         if (vmpss && btopr(io_off + io_len) > btopr(ip->i_size)) {
5991                 if (dolock)
5992                         rw_exit(&ip->i_contents);
5993                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
5994                         cv_broadcast(&ulp->ul_cv);
5995                 return (EFAULT);
5996         }
5997 
5998         if (pp == NULL) {
5999                 if (bmap_has_holes(ip)) {
6000                         err = ENOSYS;
6001                 } else {
6002                         err = EINVAL;
6003                 }
6004                 if (dolock)
6005                         rw_exit(&ip->i_contents);
6006                 if (!atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6007                         cv_broadcast(&ulp->ul_cv);
6008                 return (err);
6009         }
6010 
6011         /*
6012          * Break the io request into chunks, one for each contiguous
6013          * stretch of disk blocks in the target file.
6014          */
6015         while (done_len < io_len) {
6016                 ASSERT(cpp);
6017                 contig = 0;
6018                 if (err = bmap_read(ip, (u_offset_t)(io_off + done_len),
6019                     &bn, &contig))
6020                         break;
6021 
6022                 if (bn == UFS_HOLE) {   /* No holey swapfiles */
6023                         if (vmpss) {
6024                                 err = EFAULT;
6025                                 break;
6026                         }
6027                         err = ufs_fault(ITOV(ip), "ufs_pageio: bn == UFS_HOLE");
6028                         break;
6029                 }
6030 
6031                 cur_len = MIN(io_len - done_len, contig);
6032                 /*
6033                  * Zero out a page beyond EOF when the last block of
6034                  * a file is a UFS fragment, so that ufs_pageio() can be used
6035                  * instead of ufs_getpage() to handle faults against
6036                  * segvn segments that use large pages.
6037                  */
6038                 page_list_break(&cpp, &npp, btopr(cur_len));
6039                 if ((flags & B_READ) && (cur_len & PAGEOFFSET)) {
6040                         size_t xlen = cur_len & PAGEOFFSET;
6041                         pagezero(cpp->p_prev, xlen, PAGESIZE - xlen);
6042                 }
6043 
6044                 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
6045                 ASSERT(bp != NULL);
6046 
6047                 bp->b_edev = ip->i_dev;
6048                 bp->b_dev = cmpdev(ip->i_dev);
6049                 bp->b_blkno = bn;
6050                 bp->b_un.b_addr = (caddr_t)0;
6051                 bp->b_file = ip->i_vnode;
6052 
6053                 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
6054                 ub.ub_pageios.value.ul++;
6055                 if (ufsvfsp->vfs_snapshot)
6056                         fssnap_strategy(&(ufsvfsp->vfs_snapshot), bp);
6057                 else
6058                         (void) bdev_strategy(bp);
6059 
6060                 if (flags & B_READ)
6061                         ufs_pageio_reads++;
6062                 else
6063                         ufs_pageio_writes++;
6064                 if (flags & B_READ)
6065                         lwp_stat_update(LWP_STAT_INBLK, 1);
6066                 else
6067                         lwp_stat_update(LWP_STAT_OUBLK, 1);
6068                 /*
6069                  * If the request is not B_ASYNC, wait for i/o to complete
6070                  * and re-assemble the page list to return to the caller.
6071                  * If it is B_ASYNC we leave the page list in pieces and
6072                  * cleanup() will dispose of them.
6073                  */
6074                 if ((flags & B_ASYNC) == 0) {
6075                         err = biowait(bp);
6076                         pageio_done(bp);
6077                         if (err)
6078                                 break;
6079                         page_list_concat(&opp, &cpp);
6080                 }
6081                 cpp = npp;
6082                 npp = NULL;
6083                 if (flags & B_READ)
6084                         cur_len = P2ROUNDUP_TYPED(cur_len, PAGESIZE, size_t);
6085                 done_len += cur_len;
6086         }
6087         ASSERT(err || (cpp == NULL && npp == NULL && done_len == io_len));
6088         if (err) {
6089                 if (flags & B_ASYNC) {
6090                         /* Cleanup unprocessed parts of list */
6091                         page_list_concat(&cpp, &npp);
6092                         if (flags & B_READ)
6093                                 pvn_read_done(cpp, B_ERROR);
6094                         else
6095                                 pvn_write_done(cpp, B_ERROR);
6096                 } else {
6097                         /* Re-assemble list and let caller clean up */
6098                         page_list_concat(&opp, &cpp);
6099                         page_list_concat(&opp, &npp);
6100                 }
6101         }
6102 
6103         if (vmpss && !(ip->i_flag & IACC) && !ULOCKFS_IS_NOIACC(ulp) &&
6104             ufsvfsp->vfs_fs->fs_ronly == 0 && !ufsvfsp->vfs_noatime) {
6105                 mutex_enter(&ip->i_tlock);
6106                 ip->i_flag |= IACC;
6107                 ITIMES_NOLOCK(ip);
6108                 mutex_exit(&ip->i_tlock);
6109         }
6110 
6111         if (dolock)
6112                 rw_exit(&ip->i_contents);
6113         if (vmpss && !atomic_dec_ulong_nv(&ulp->ul_vnops_cnt))
6114                 cv_broadcast(&ulp->ul_cv);
6115         return (err);
6116 }
6117 
6118 /*
6119  * Called when the kernel is in a frozen state to dump data
6120  * directly to the device. It uses a private dump data structure,
6121  * set up by ufs_dumpctl(), to locate the correct disk block to which to dump.
6122  */
6123 /*ARGSUSED*/
6124 static int
6125 ufs_dump(vnode_t *vp, caddr_t addr, offset_t ldbn, offset_t dblks,
6126     caller_context_t *ct)
6127 {
6128         u_offset_t      file_size;
6129         struct inode    *ip = VTOI(vp);
6130         struct fs       *fs = ip->i_fs;
6131         daddr_t         dbn, lfsbn;
6132         int             disk_blks = fs->fs_bsize >> DEV_BSHIFT;
6133         int             error = 0;
6134         int             ndbs, nfsbs;
6135 
6136         /*
6137          * forced unmount case
6138          */
6139         if (ip->i_ufsvfs == NULL)
6140                 return (EIO);
6141         /*
6142          * Validate that the inode has not been modified since
6143          * the dump structure was allocated.
6144          */
6145         mutex_enter(&ip->i_tlock);
6146         if ((dump_info == NULL) ||
6147             (dump_info->ip != ip) ||
6148             (dump_info->time.tv_sec != ip->i_mtime.tv_sec) ||
6149             (dump_info->time.tv_usec != ip->i_mtime.tv_usec)) {
6150                 mutex_exit(&ip->i_tlock);
6151                 return (-1);
6152         }
6153         mutex_exit(&ip->i_tlock);
6154 
6155         /*
6156          * See that the file has room for this write
6157          */
6158         UFS_GET_ISIZE(&file_size, ip);
6159 
6160         if (ldbtob(ldbn + dblks) > file_size)
6161                 return (ENOSPC);
6162 
6163         /*
6164          * Find the physical disk block numbers from the dump
6165          * private data structure directly and write out the data
6166          * in contiguous block lumps
6167          */
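             /*
              * disk_blks is the number of DEV_BSIZE sectors per file
              * system block, so (ldbn % disk_blks) is the sector offset
              * within the current block and lfsbn indexes the file's fs
              * blocks in dump_info->dblk[].
              */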
6168         while (dblks > 0 && !error) {
6169                 lfsbn = (daddr_t)lblkno(fs, ldbtob(ldbn));
6170                 dbn = fsbtodb(fs, dump_info->dblk[lfsbn]) + ldbn % disk_blks;
6171                 nfsbs = 1;
6172                 ndbs = disk_blks - ldbn % disk_blks;
6173                 while (ndbs < dblks && fsbtodb(fs, dump_info->dblk[lfsbn +
6174                     nfsbs]) == dbn + ndbs) {
6175                         nfsbs++;
6176                         ndbs += disk_blks;
6177                 }
6178                 if (ndbs > dblks)
6179                         ndbs = dblks;
6180                 error = bdev_dump(ip->i_dev, addr, dbn, ndbs);
6181                 addr += ldbtob((offset_t)ndbs);
6182                 dblks -= ndbs;
6183                 ldbn += ndbs;
6184         }
6185         return (error);
6186 
6187 }
6188 
6189 /*
6190  * Prepare the file system before and after the dump operation.
6191  *
6192  * action = DUMP_ALLOC:
6193  * Preparation before dump, allocate dump private data structure
6194  * to hold all the direct and indirect block info for dump.
6195  *
6196  * action = DUMP_FREE:
6197  * Clean up after dump, deallocate the dump private data structure.
6198  *
6199  * action = DUMP_SCAN:
6200  * Scan dump_info for *blkp DEV_BSIZE blocks of contig fs space;
6201  * if found, the starting file-relative DEV_BSIZE lbn is written
6202  * to *blkp; that lbn is intended for use with VOP_DUMP().
6203  */
6204 /*ARGSUSED*/
6205 static int
6206 ufs_dumpctl(vnode_t *vp, int action, offset_t *blkp, caller_context_t *ct)
6207 {
6208         struct inode    *ip = VTOI(vp);
6209         ufsvfs_t        *ufsvfsp = ip->i_ufsvfs;
6210         struct fs       *fs;
6211         daddr32_t       *dblk, *storeblk;
6212         daddr32_t       *nextblk, *endblk;
6213         struct buf      *bp;
6214         int             i, entry, entries;
6215         int             n, ncontig;
6216 
6217         /*
6218          * check for forced unmount
6219          */
6220         if (ufsvfsp == NULL)
6221                 return (EIO);
6222 
6223         if (action == DUMP_ALLOC) {
6224                 /*
6225                  * alloc and record dump_info
6226                  */
6227                 if (dump_info != NULL)
6228                         return (EINVAL);
6229 
6230                 ASSERT(vp->v_type == VREG);
6231                 fs = ufsvfsp->vfs_fs;
6232 
6233                 rw_enter(&ip->i_contents, RW_READER);
6234 
6235                 if (bmap_has_holes(ip)) {
6236                         rw_exit(&ip->i_contents);
6237                         return (EFAULT);
6238                 }
6239 
6240                 /*
6241                  * calculate and allocate space needed according to i_size
6242                  */
6243                 entries = (int)lblkno(fs, blkroundup(fs, ip->i_size));
6244                 dump_info = kmem_alloc(sizeof (struct dump) +
6245                     (entries - 1) * sizeof (daddr32_t), KM_NOSLEEP);
6246                 if (dump_info == NULL) {
6247                         rw_exit(&ip->i_contents);
6248                         return (ENOMEM);
6249                 }
6250 
6251                 /* Start saving the info */
6252                 dump_info->fsbs = entries;
6253                 dump_info->ip = ip;
6254                 storeblk = &dump_info->dblk[0];
6255 
6256                 /* Direct Blocks */
6257                 for (entry = 0; entry < NDADDR && entry < entries; entry++)
6258                         *storeblk++ = ip->i_db[entry];
6259 
6260                 /* Indirect Blocks */
6261                 for (i = 0; i < NIADDR; i++) {
6262                         int error = 0;
6263 
6264                         bp = UFS_BREAD(ufsvfsp,
6265                             ip->i_dev, fsbtodb(fs, ip->i_ib[i]), fs->fs_bsize);
6266                         if (bp->b_flags & B_ERROR)
6267                                 error = EIO;
6268                         else {
6269                                 dblk = bp->b_un.b_daddr;
6270                                 if ((storeblk = save_dblks(ip, ufsvfsp,
6271                                     storeblk, dblk, i, entries)) == NULL)
6272                                         error = EIO;
6273                         }
6274 
6275                         brelse(bp);
6276 
6277                         if (error != 0) {
6278                                 kmem_free(dump_info, sizeof (struct dump) +
6279                                     (entries - 1) * sizeof (daddr32_t));
6280                                 rw_exit(&ip->i_contents);
6281                                 dump_info = NULL;
6282                                 return (error);
6283                         }
6284                 }
6285                 /* and time stamp the information */
6286                 mutex_enter(&ip->i_tlock);
6287                 dump_info->time = ip->i_mtime;
6288                 mutex_exit(&ip->i_tlock);
6289 
6290                 rw_exit(&ip->i_contents);
6291         } else if (action == DUMP_FREE) {
6292                 /*
6293                  * free dump_info
6294                  */
6295                 if (dump_info == NULL)
6296                         return (EINVAL);
6297                 entries = dump_info->fsbs - 1;
6298                 kmem_free(dump_info, sizeof (struct dump) +
6299                     entries * sizeof (daddr32_t));
6300                 dump_info = NULL;
6301         } else if (action == DUMP_SCAN) {
6302                 /*
6303                  * scan dump_info
6304                  */
6305                 if (dump_info == NULL)
6306                         return (EINVAL);
6307 
6308                 dblk = dump_info->dblk;
6309                 nextblk = dblk + 1;
6310                 endblk = dblk + dump_info->fsbs - 1;
6311                 fs = ufsvfsp->vfs_fs;
6312                 ncontig = *blkp >> (fs->fs_bshift - DEV_BSHIFT);
6313 
6314                 /*
6315                  * scan dblk[] entries; contig fs space is found when:
6316                  * ((current blkno + frags per block) == next blkno)
6317                  */
6318                 n = 0;
6319                 while (n < ncontig && dblk < endblk) {
6320                         if ((*dblk + fs->fs_frag) == *nextblk)
6321                                 n++;
6322                         else
6323                                 n = 0;
6324                         dblk++;
6325                         nextblk++;
6326                 }
6327 
6328                 /*
6329                  * index is where size bytes of contig space begins;
6330                  * conversion from index to the file's DEV_BSIZE lbn
6331                  * is equivalent to:  (index * fs_bsize) / DEV_BSIZE
6332                  */
6333                 if (n == ncontig) {
6334                         i = (dblk - dump_info->dblk) - ncontig;
6335                         *blkp = i << (fs->fs_bshift - DEV_BSHIFT);
6336                 } else
6337                         return (EFAULT);
6338         }
6339         return (0);
6340 }
6341 
6342 /*
6343  * Recursive helper function for ufs_dumpctl().  It follows the indirect file
6344  * system blocks until it reaches the disk block addresses, which are
6345  * then stored into the given buffer, storeblk.
6346  */
6347 static daddr32_t *
6348 save_dblks(struct inode *ip, struct ufsvfs *ufsvfsp,  daddr32_t *storeblk,
6349     daddr32_t *dblk, int level, int entries)
6350 {
6351         struct fs       *fs = ufsvfsp->vfs_fs;
6352         struct buf      *bp;
6353         int             i;
6354 
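             /*
              * At level 0, dblk[] already contains data block addresses
              * and is copied straight into storeblk; otherwise each entry
              * names an indirect block that is read and recursed into one
              * level closer to the data.
              */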
6355         if (level == 0) {
6356                 for (i = 0; i < NINDIR(fs); i++) {
6357                         if (storeblk - dump_info->dblk >= entries)
6358                                 break;
6359                         *storeblk++ = dblk[i];
6360                 }
6361                 return (storeblk);
6362         }
6363         for (i = 0; i < NINDIR(fs); i++) {
6364                 if (storeblk - dump_info->dblk >= entries)
6365                         break;
6366                 bp = UFS_BREAD(ufsvfsp,
6367                     ip->i_dev, fsbtodb(fs, dblk[i]), fs->fs_bsize);
6368                 if (bp->b_flags & B_ERROR) {
6369                         brelse(bp);
6370                         return (NULL);
6371                 }
6372                 storeblk = save_dblks(ip, ufsvfsp, storeblk, bp->b_un.b_daddr,
6373                     level - 1, entries);
6374                 brelse(bp);
6375 
6376                 if (storeblk == NULL)
6377                         return (NULL);
6378         }
6379         return (storeblk);
6380 }
6381 
6382 /* ARGSUSED */
6383 static int
6384 ufs_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag,
6385     struct cred *cr, caller_context_t *ct)
6386 {
6387         struct inode    *ip = VTOI(vp);
6388         struct ulockfs  *ulp;
6389         struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
6390         ulong_t         vsa_mask = vsap->vsa_mask;
6391         int             err = EINVAL;
6392 
6393         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6394 
6395         /*
6396          * Only grab locks if needed - they're not needed to check vsa_mask
6397          * or if the mask contains no acl flags.
6398          */
6399         if (vsa_mask != 0) {
6400                 if (err = ufs_lockfs_begin(ufsvfsp, &ulp,
6401                     ULOCKFS_GETATTR_MASK))
6402                         return (err);
6403 
6404                 rw_enter(&ip->i_contents, RW_READER);
6405                 err = ufs_acl_get(ip, vsap, flag, cr);
6406                 rw_exit(&ip->i_contents);
6407 
6408                 if (ulp)
6409                         ufs_lockfs_end(ulp);
6410         }
6411         return (err);
6412 }
6413 
6414 /* ARGSUSED */
6415 static int
6416 ufs_setsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *cr,
6417     caller_context_t *ct)
6418 {
6419         struct inode    *ip = VTOI(vp);
6420         struct ulockfs  *ulp = NULL;
6421         struct ufsvfs   *ufsvfsp = VTOI(vp)->i_ufsvfs;
6422         ulong_t         vsa_mask = vsap->vsa_mask;
6423         int             err;
6424         int             haverwlock = 1;
6425         int             trans_size;
6426         int             donetrans = 0;
6427         int             retry = 1;
6428 
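             /*
              * 'retry' allows one restart of the operation: the first pass
              * sets IQUIET to suppress out-of-inode messages, and if
              * ufs_acl_set() fails with ENOSPC on a logging file system we
              * drain the delete queue once and try again.
              */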
6429         ASSERT(RW_LOCK_HELD(&ip->i_rwlock));
6430 
6431         /* Abort now if the request is either empty or invalid. */
6432         vsa_mask &= (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT);
6433         if ((vsa_mask == 0) ||
6434             ((vsap->vsa_aclentp == NULL) &&
6435             (vsap->vsa_dfaclentp == NULL))) {
6436                 err = EINVAL;
6437                 goto out;
6438         }
6439 
6440         /*
6441          * Following convention, if this is a directory then we acquire the
6442          * inode's i_rwlock after starting a UFS logging transaction;
6443          * otherwise, we acquire it beforehand. Since we were called (and
6444          * must therefore return) with the lock held, we will have to drop it,
6445          * and later reacquire it, if operating on a directory.
6446          */
6447         if (vp->v_type == VDIR) {
6448                 rw_exit(&ip->i_rwlock);
6449                 haverwlock = 0;
6450         } else {
6451                 /* Upgrade the lock if required. */
6452                 if (!rw_write_held(&ip->i_rwlock)) {
6453                         rw_exit(&ip->i_rwlock);
6454                         rw_enter(&ip->i_rwlock, RW_WRITER);
6455                 }
6456         }
6457 
6458 again:
6459         ASSERT(!(vp->v_type == VDIR && haverwlock));
6460         if (err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SETATTR_MASK)) {
6461                 ulp = NULL;
6462                 retry = 0;
6463                 goto out;
6464         }
6465 
6466         /*
6467          * Check that the file system supports this operation. Note that
6468          * ufs_lockfs_begin() will have checked that the file system had
6469          * not been forcibly unmounted.
6470          */
6471         if (ufsvfsp->vfs_fs->fs_ronly) {
6472                 err = EROFS;
6473                 goto out;
6474         }
6475         if (ufsvfsp->vfs_nosetsec) {
6476                 err = ENOSYS;
6477                 goto out;
6478         }
6479 
6480         if (ulp) {
6481                 TRANS_BEGIN_ASYNC(ufsvfsp, TOP_SETSECATTR,
6482                     trans_size = TOP_SETSECATTR_SIZE(VTOI(vp)));
6483                 donetrans = 1;
6484         }
6485 
6486         if (vp->v_type == VDIR) {
6487                 rw_enter(&ip->i_rwlock, RW_WRITER);
6488                 haverwlock = 1;
6489         }
6490 
6491         ASSERT(haverwlock);
6492 
6493         /* Do the actual work. */
6494         rw_enter(&ip->i_contents, RW_WRITER);
6495         /*
6496          * Suppress out of inodes messages if we will retry.
6497          */
6498         if (retry)
6499                 ip->i_flag |= IQUIET;
6500         err = ufs_acl_set(ip, vsap, flag, cr);
6501         ip->i_flag &= ~IQUIET;
6502         rw_exit(&ip->i_contents);
6503 
6504 out:
6505         if (ulp) {
6506                 if (donetrans) {
6507                         /*
6508                          * top_end_async() can eventually call
6509                          * top_end_sync(), which can block. We must
6510                          * therefore observe the lock-ordering protocol
6511                          * here as well.
6512                          */
6513                         if (vp->v_type == VDIR) {
6514                                 rw_exit(&ip->i_rwlock);
6515                                 haverwlock = 0;
6516                         }
6517                         TRANS_END_ASYNC(ufsvfsp, TOP_SETSECATTR, trans_size);
6518                 }
6519                 ufs_lockfs_end(ulp);
6520         }
6521         /*
6522          * If no inodes are available, try scaring a logically-
6523          * free one out of the delete queue to someplace
6524          * where we can find it.
6525          */
6526         if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
6527                 ufs_delete_drain_wait(ufsvfsp, 1);
6528                 retry = 0;
6529                 if (vp->v_type == VDIR && haverwlock) {
6530                         rw_exit(&ip->i_rwlock);
6531                         haverwlock = 0;
6532                 }
6533                 goto again;
6534         }
6535         /*
6536          * If we need to reacquire the lock then it is safe to do so
6537          * as a reader. This is because ufs_rwunlock(), which will be
6538          * called by our caller after we return, does not differentiate
6539          * between shared and exclusive locks.
6540          */
6541         if (!haverwlock) {
6542                 ASSERT(vp->v_type == VDIR);
6543                 rw_enter(&ip->i_rwlock, RW_READER);
6544         }
6545 
6546         return (err);
6547 }
6548 
6549 /*
6550  * Locate the vnode to be used for an event notification. As this will
6551  * be called prior to the name space change perform basic verification
6552  * that the change will be allowed.
6553  */
6554 
6555 static int
6556 ufs_eventlookup(struct vnode *dvp, char *nm, struct cred *cr,
6557     struct vnode **vpp)
6558 {
6559         int     namlen;
6560         int     error;
6561         struct vnode    *vp;
6562         struct inode    *ip;
6563         struct inode    *xip;
6564         struct ufsvfs   *ufsvfsp;
6565         struct ulockfs  *ulp;
6566 
6567         ip = VTOI(dvp);
6568         *vpp = NULL;
6569 
6570         if ((namlen = strlen(nm)) == 0)
6571                 return (EINVAL);
6572 
6573         if (nm[0] == '.') {
6574                 if (namlen == 1)
6575                         return (EINVAL);
6576                 else if ((namlen == 2) && nm[1] == '.') {
6577                         return (EEXIST);
6578                 }
6579         }
6580 
6581         /*
6582          * Check accessibility and write access of parent directory as we
6583          * only want to post the event if we're able to make a change.
6584          */
6585         if (error = ufs_diraccess(ip, IEXEC|IWRITE, cr))
6586                 return (error);
6587 
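             /*
              * Try the name cache first; a negative entry (DNLC_NO_VNODE)
              * means the name is already known not to exist.
              */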
6588         if (vp = dnlc_lookup(dvp, nm)) {
6589                 if (vp == DNLC_NO_VNODE) {
6590                         VN_RELE(vp);
6591                         return (ENOENT);
6592                 }
6593 
6594                 *vpp = vp;
6595                 return (0);
6596         }
6597 
6598         /*
6599          * Keep the idle queue from getting too long by idling two
6600          * inodes before attempting to allocate another.
6601          * This operation must be performed before entering lockfs
6602          * or a transaction.
6603          */
6604         if (ufs_idle_q.uq_ne > ufs_idle_q.uq_hiwat)
6605                 if ((curthread->t_flag & T_DONTBLOCK) == 0) {
6606                         ins.in_lidles.value.ul += ufs_lookup_idle_count;
6607                         ufs_idle_some(ufs_lookup_idle_count);
6608                 }
6609 
6610         ufsvfsp = ip->i_ufsvfs;
6611 
6612 retry_lookup:
6613         if (error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_LOOKUP_MASK))
6614                 return (error);
6615 
6616         if ((error = ufs_dirlook(ip, nm, &xip, cr, 1, 1)) == 0) {
6617                 vp = ITOV(xip);
6618                 *vpp = vp;
6619         }
6620 
6621         if (ulp) {
6622                 ufs_lockfs_end(ulp);
6623         }
6624 
6625         if (error == EAGAIN)
6626                 goto retry_lookup;
6627 
6628         return (error);
6629 }