big-one New usr/src/uts/common/fs/nfs/nfs

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  *
  25  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
  26  *      All rights reserved.
  27  */
  28 
  29 /*
  30  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  31  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  32  */
  33 
  34 #include <sys/param.h>
  35 #include <sys/types.h>
  36 #include <sys/systm.h>
  37 #include <sys/cred.h>
  38 #include <sys/time.h>
  39 #include <sys/vnode.h>
  40 #include <sys/vfs.h>
  41 #include <sys/vfs_opreg.h>
  42 #include <sys/file.h>
  43 #include <sys/filio.h>
  44 #include <sys/uio.h>
  45 #include <sys/buf.h>
  46 #include <sys/mman.h>
  47 #include <sys/pathname.h>
  48 #include <sys/dirent.h>
  49 #include <sys/debug.h>
  50 #include <sys/vmsystm.h>
  51 #include <sys/fcntl.h>
  52 #include <sys/flock.h>
  53 #include <sys/swap.h>
  54 #include <sys/errno.h>
  55 #include <sys/strsubr.h>
  56 #include <sys/sysmacros.h>
  57 #include <sys/kmem.h>
  58 #include <sys/cmn_err.h>
  59 #include <sys/pathconf.h>
  60 #include <sys/utsname.h>
  61 #include <sys/dnlc.h>
  62 #include <sys/acl.h>
  63 #include <sys/atomic.h>
  64 #include <sys/policy.h>
  65 #include <sys/sdt.h>
  66 
  67 #include <rpc/types.h>
  68 #include <rpc/auth.h>
  69 #include <rpc/clnt.h>
  70 
  71 #include <nfs/nfs.h>
  72 #include <nfs/nfs_clnt.h>
  73 #include <nfs/rnode.h>
  74 #include <nfs/nfs_acl.h>
  75 #include <nfs/lm.h>
  76 
  77 #include <vm/hat.h>
  78 #include <vm/as.h>
  79 #include <vm/page.h>
  80 #include <vm/pvn.h>
  81 #include <vm/seg.h>
  82 #include <vm/seg_map.h>
  83 #include <vm/seg_kpm.h>
  84 #include <vm/seg_vn.h>
  85 
  86 #include <fs/fs_subr.h>
  87 
  88 #include <sys/ddi.h>
  89 
  90 static int      nfs_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
  91                         cred_t *);
  92 static int      nfswrite(vnode_t *, caddr_t, uint_t, int, cred_t *);
  93 static int      nfsread(vnode_t *, caddr_t, uint_t, int, size_t *, cred_t *);
  94 static int      nfssetattr(vnode_t *, struct vattr *, int, cred_t *);
  95 static int      nfslookup_dnlc(vnode_t *, char *, vnode_t **, cred_t *);
  96 static int      nfslookup_otw(vnode_t *, char *, vnode_t **, cred_t *, int);
  97 static int      nfsrename(vnode_t *, char *, vnode_t *, char *, cred_t *,
  98                         caller_context_t *);
  99 static int      nfsreaddir(vnode_t *, rddir_cache *, cred_t *);
 100 static int      nfs_bio(struct buf *, cred_t *);
 101 static int      nfs_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
 102                         page_t *[], size_t, struct seg *, caddr_t,
 103                         enum seg_rw, cred_t *);
 104 static void     nfs_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
 105                         cred_t *);
 106 static int      nfs_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
 107                         int, cred_t *);
 108 static int      nfs_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
 109                         int, cred_t *);
 110 static void     nfs_delmap_callback(struct as *, void *, uint_t);
 111 
 112 /*
 113  * Error flags used to pass information about certain special errors
 114  * which need to be handled specially.
 115  */
 116 #define NFS_EOF                 -98
 117 
 118 /*
 119  * These are the vnode ops routines which implement the vnode interface to
 120  * the networked file system.  These routines just take their parameters,
 121  * make them look networkish by putting the right info into interface structs,
 122  * and then calling the appropriate remote routine(s) to do the work.
 123  *
 124  * Note on directory name lookup caching:  If we detect a stale fhandle,
 125  * we purge the directory cache relative to that vnode.  This way, the
 126  * user won't get burned by the cache repeatedly.  See <nfs/rnode.h> for
 127  * more details on rnode locking.
 128  */
 129 
 130 static int      nfs_open(vnode_t **, int, cred_t *, caller_context_t *);
 131 static int      nfs_close(vnode_t *, int, int, offset_t, cred_t *,
 132                         caller_context_t *);
 133 static int      nfs_read(vnode_t *, struct uio *, int, cred_t *,
 134                         caller_context_t *);
 135 static int      nfs_write(vnode_t *, struct uio *, int, cred_t *,
 136                         caller_context_t *);
 137 static int      nfs_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
 138                         caller_context_t *);
 139 static int      nfs_getattr(vnode_t *, struct vattr *, int, cred_t *,
 140                         caller_context_t *);
 141 static int      nfs_setattr(vnode_t *, struct vattr *, int, cred_t *,
 142                         caller_context_t *);
 143 static int      nfs_access(vnode_t *, int, int, cred_t *, caller_context_t *);
 144 static int      nfs_accessx(void *, int, cred_t *);
 145 static int      nfs_readlink(vnode_t *, struct uio *, cred_t *,
 146                         caller_context_t *);
 147 static int      nfs_fsync(vnode_t *, int, cred_t *, caller_context_t *);
 148 static void     nfs_inactive(vnode_t *, cred_t *, caller_context_t *);
 149 static int      nfs_lookup(vnode_t *, char *, vnode_t **, struct pathname *,
 150                         int, vnode_t *, cred_t *, caller_context_t *,
 151                         int *, pathname_t *);
 152 static int      nfs_create(vnode_t *, char *, struct vattr *, enum vcexcl,
 153                         int, vnode_t **, cred_t *, int, caller_context_t *,
 154                         vsecattr_t *);
 155 static int      nfs_remove(vnode_t *, char *, cred_t *, caller_context_t *,
 156                         int);
 157 static int      nfs_link(vnode_t *, vnode_t *, char *, cred_t *,
 158                         caller_context_t *, int);
 159 static int      nfs_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
 160                         caller_context_t *, int);
 161 static int      nfs_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
 162                         cred_t *, caller_context_t *, int, vsecattr_t *);
 163 static int      nfs_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
 164                         caller_context_t *, int);
 165 static int      nfs_symlink(vnode_t *, char *, struct vattr *, char *,
 166                         cred_t *, caller_context_t *, int);
 167 static int      nfs_readdir(vnode_t *, struct uio *, cred_t *, int *,
 168                         caller_context_t *, int);
 169 static int      nfs_fid(vnode_t *, fid_t *, caller_context_t *);
 170 static int      nfs_rwlock(vnode_t *, int, caller_context_t *);
 171 static void     nfs_rwunlock(vnode_t *, int, caller_context_t *);
 172 static int      nfs_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
 173 static int      nfs_getpage(vnode_t *, offset_t, size_t, uint_t *,
 174                         page_t *[], size_t, struct seg *, caddr_t,
 175                         enum seg_rw, cred_t *, caller_context_t *);
 176 static int      nfs_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
 177                         caller_context_t *);
 178 static int      nfs_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
 179                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 180 static int      nfs_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 181                         uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
 182 static int      nfs_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
 183                         struct flk_callback *, cred_t *, caller_context_t *);
 184 static int      nfs_space(vnode_t *, int, struct flock64 *, int, offset_t,
 185                         cred_t *, caller_context_t *);
 186 static int      nfs_realvp(vnode_t *, vnode_t **, caller_context_t *);
 187 static int      nfs_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
 188                         uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
 189 static int      nfs_pathconf(vnode_t *, int, ulong_t *, cred_t *,
 190                         caller_context_t *);
 191 static int      nfs_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
 192                         cred_t *, caller_context_t *);
 193 static int      nfs_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 194                         caller_context_t *);
 195 static int      nfs_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
 196                         caller_context_t *);
 197 static int      nfs_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
 198                         caller_context_t *);
 199 
     /*
      * Pointer to the NFS vnode operations vector; returned by
      * nfs_getvnodeops().  It is not assigned in this part of the file --
      * presumably it is built from nfs_vnodeops_template during
      * initialization (TODO confirm).
      */
 200 struct vnodeops *nfs_vnodeops;
 201 
     /*
      * Table pairing each vnode operation name (VOPNAME_*) with the routine
      * that implements it for NFS.  All handlers are the nfs_* routines
      * prototyped above, except nfs_dump (no static prototype in this file
      * section, so presumably declared in a header) and VOPNAME_VNEVENT,
      * which uses fs_vnevent_support.  The final NULL, NULL pair presumably
      * marks the end of the table.
      */
 202 const fs_operation_def_t nfs_vnodeops_template[] = {
 203         VOPNAME_OPEN,           { .vop_open = nfs_open },
 204         VOPNAME_CLOSE,          { .vop_close = nfs_close },
 205         VOPNAME_READ,           { .vop_read = nfs_read },
 206         VOPNAME_WRITE,          { .vop_write = nfs_write },
 207         VOPNAME_IOCTL,          { .vop_ioctl = nfs_ioctl },
 208         VOPNAME_GETATTR,        { .vop_getattr = nfs_getattr },
 209         VOPNAME_SETATTR,        { .vop_setattr = nfs_setattr },
 210         VOPNAME_ACCESS,         { .vop_access = nfs_access },
 211         VOPNAME_LOOKUP,         { .vop_lookup = nfs_lookup },
 212         VOPNAME_CREATE,         { .vop_create = nfs_create },
 213         VOPNAME_REMOVE,         { .vop_remove = nfs_remove },
 214         VOPNAME_LINK,           { .vop_link = nfs_link },
 215         VOPNAME_RENAME,         { .vop_rename = nfs_rename },
 216         VOPNAME_MKDIR,          { .vop_mkdir = nfs_mkdir },
 217         VOPNAME_RMDIR,          { .vop_rmdir = nfs_rmdir },
 218         VOPNAME_READDIR,        { .vop_readdir = nfs_readdir },
 219         VOPNAME_SYMLINK,        { .vop_symlink = nfs_symlink },
 220         VOPNAME_READLINK,       { .vop_readlink = nfs_readlink },
 221         VOPNAME_FSYNC,          { .vop_fsync = nfs_fsync },
 222         VOPNAME_INACTIVE,       { .vop_inactive = nfs_inactive },
 223         VOPNAME_FID,            { .vop_fid = nfs_fid },
 224         VOPNAME_RWLOCK,         { .vop_rwlock = nfs_rwlock },
 225         VOPNAME_RWUNLOCK,       { .vop_rwunlock = nfs_rwunlock },
 226         VOPNAME_SEEK,           { .vop_seek = nfs_seek },
 227         VOPNAME_FRLOCK,         { .vop_frlock = nfs_frlock },
 228         VOPNAME_SPACE,          { .vop_space = nfs_space },
 229         VOPNAME_REALVP,         { .vop_realvp = nfs_realvp },
 230         VOPNAME_GETPAGE,        { .vop_getpage = nfs_getpage },
 231         VOPNAME_PUTPAGE,        { .vop_putpage = nfs_putpage },
 232         VOPNAME_MAP,            { .vop_map = nfs_map },
 233         VOPNAME_ADDMAP,         { .vop_addmap = nfs_addmap },
 234         VOPNAME_DELMAP,         { .vop_delmap = nfs_delmap },
 235         VOPNAME_DUMP,           { .vop_dump = nfs_dump },
 236         VOPNAME_PATHCONF,       { .vop_pathconf = nfs_pathconf },
 237         VOPNAME_PAGEIO,         { .vop_pageio = nfs_pageio },
 238         VOPNAME_SETSECATTR,     { .vop_setsecattr = nfs_setsecattr },
 239         VOPNAME_GETSECATTR,     { .vop_getsecattr = nfs_getsecattr },
 240         VOPNAME_SHRLOCK,        { .vop_shrlock = nfs_shrlock },
 241         VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 242         NULL,                   NULL
 243 };
 244 
 245 /*
      * Accessor that returns the current value of the global nfs_vnodeops
      * pointer.
      *
 246  * XXX:  This is referenced in modstubs.s
 247  */
 248 struct vnodeops *
 249 nfs_getvnodeops(void)
 250 {
 251         return (nfs_vnodeops);
 252 }
 253 
     /*
      * Open entry point for NFS vnodes (installed as .vop_open in
      * nfs_vnodeops_template).
      *
      * Fails with EIO when nfs_zone() is not the zone recorded in the mount
      * (mi_zone).  Otherwise stores the caller's credentials in rp->r_cred
      * if none are stored yet, and then runs the cache consistency checks
      * described in the comment in the body.
      *
      * Returns 0, or the error from nfs_validate_caches() or
      * nfs_getattr_otw().  The flag and ct arguments are not used.
      */
 254 /* ARGSUSED */
 255 static int
 256 nfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 257 {
 258         int error;
 259         struct vattr va;
 260         rnode_t *rp;
 261         vnode_t *vp;
 262 
 263         vp = *vpp;
 264         rp = VTOR(vp);
 265         if (nfs_zone() != VTOMI(vp)->mi_zone)
 266                 return (EIO);
             /*
              * Remember the opener's credentials on the rnode if it has
              * none yet; a hold is taken on cr before it is stored.
              */
 267         mutex_enter(&rp->r_statelock);
 268         if (rp->r_cred == NULL) {
 269                 crhold(cr);
 270                 rp->r_cred = cr;
 271         }
 272         mutex_exit(&rp->r_statelock);
 273 
 274         /*
 275          * If there is no cached data or if close-to-open
 276          * consistency checking is turned off, we can avoid
 277          * the over the wire getattr.  Otherwise, if the
 278          * file system is mounted readonly, then just verify
 279          * the caches are up to date using the normal mechanism.
 280          * Else, if the file is not mmap'd, then just mark
 281          * the attributes as timed out.  They will be refreshed
 282          * and the caches validated prior to being used.
 283          * Else, the file system is mounted writeable so
 284          * force an over the wire GETATTR in order to ensure
 285          * that all cached data is valid.
              *
              * Note that the outer test below also takes the checking
              * path whenever vp->v_count > 1, independent of cached data
              * and of MI_NOCTO.
 286          */
 287         if (vp->v_count > 1 ||
 288             ((vn_has_cached_data(vp) || HAVE_RDDIR_CACHE(rp)) &&
 289             !(VTOMI(vp)->mi_flags & MI_NOCTO))) {
 290                 if (vn_is_readonly(vp))
 291                         error = nfs_validate_caches(vp, cr);
 292                 else if (rp->r_mapcnt == 0 && vp->v_count == 1) {
 293                         PURGE_ATTRCACHE(vp);
 294                         error = 0;
 295                 } else {
 296                         va.va_mask = AT_ALL;
 297                         error = nfs_getattr_otw(vp, &va, cr);
 298                 }
 299         } else
 300                 error = 0;
 301 
 302         return (error);
 303 }
 304 
     /*
      * Close entry point for NFS vnodes (installed as .vop_close in
      * nfs_vnodeops_template).
      *
      * Returns EIO without doing anything when called from a zone other
      * than the mount's.  Otherwise releases the locks held on the file
      * (local locks and shares when MI_LLOCK is set, network locks via
      * nfs_lockrelease() otherwise).  When count > 1 that is all that is
      * done and 0 is returned.  Otherwise, a file open for write with
      * cached data is flushed with nfs_putpage(), and the error saved in
      * rp->r_error is fetched and cleared.
      *
      * Returns 0, a flush error from nfs_putpage(), or the error that was
      * saved in rp->r_error.
      */
 305 /* ARGSUSED */
 306 static int
 307 nfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 308         caller_context_t *ct)
 309 {
 310         rnode_t *rp;
 311         int error;
 312         struct vattr va;
 313 
 314         /*
 315          * zone_enter(2) prevents processes from changing zones with NFS files
 316          * open; if we happen to get here from the wrong zone we can't do
 317          * anything over the wire.
 318          */
 319         if (VTOMI(vp)->mi_zone != nfs_zone()) {
 320                 /*
 321                  * We could attempt to clean up locks, except we're sure
 322                  * that the current process didn't acquire any locks on
 323                  * the file: any attempt to lock a file belonging to another zone
 324                  * will fail, and one can't lock an NFS file and then change
 325                  * zones, as that fails too.
 326                  *
 327                  * Returning an error here is the sane thing to do.  A
 328                  * subsequent call to VN_RELE() which translates to a
 329                  * nfs_inactive() will clean up state: if the zone of the
 330                  * vnode's origin is still alive and kicking, an async worker
 331                  * thread will handle the request (from the correct zone), and
 332                  * everything (minus the final nfs_getattr_otw() call) should
 333                  * be OK. If the zone is going away nfs_async_inactive() will
 334                  * throw away cached pages inline.
 335                  */
 336                 return (EIO);
 337         }
 338 
 339         /*
 340          * If we are using local locking for this filesystem, then
 341          * release all of the SYSV style record locks.  Otherwise,
 342          * we are doing network locking and we need to release all
 343          * of the network locks.  All of the locks held by this
 344          * process on this file are released no matter what the
 345          * incoming reference count is.
 346          */
 347         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
 348                 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 349                 cleanshares(vp, ttoproc(curthread)->p_pid);
 350         } else
 351                 nfs_lockrelease(vp, flag, offset, cr);
 352 
             /*
              * With count > 1 nothing further is done; the flushing and
              * error reporting below happen only when count <= 1.
              */
 353         if (count > 1)
 354                 return (0);
 355 
 356         /*
 357          * If the file has been `unlinked', then purge the
 358          * DNLC so that this vnode will get recycled quicker
 359          * and the .nfs* file on the server will get removed.
 360          */
 361         rp = VTOR(vp);
 362         if (rp->r_unldvp != NULL)
 363                 dnlc_purge_vp(vp);
 364 
 365         /*
 366          * If the file was open for write and there are pages,
 367          * then if the file system was mounted using the "no-close-
 368          *      to-open" semantics, then start an asynchronous flush
 369          *      of all of the pages in the file.
 370          * else the file system was not mounted using the "no-close-
 371          *      to-open" semantics, then do a synchronous flush and
 372          *      commit of all of the dirty and uncommitted pages.
 373          *
 374          * The asynchronous flush of the pages in the "nocto" path
 375          * mostly just associates a cred pointer with the rnode so
 376          * writes which happen later will have a better chance of
 377          * working.  It also starts the data being written to the
 378          * server, but without unnecessarily delaying the application.
              *
              * EAGAIN from the asynchronous flush is not treated as a
              * failure.  If the flush itself fails, its error is returned
              * and rp->r_error is left untouched; otherwise the error saved
              * in rp->r_error is picked up and cleared under r_statelock.
 379          */
 380         if ((flag & FWRITE) && vn_has_cached_data(vp)) {
 381                 if ((VTOMI(vp)->mi_flags & MI_NOCTO)) {
 382                         error = nfs_putpage(vp, (offset_t)0, 0, B_ASYNC,
 383                             cr, ct);
 384                         if (error == EAGAIN)
 385                                 error = 0;
 386                 } else
 387                         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
 388                 if (!error) {
 389                         mutex_enter(&rp->r_statelock);
 390                         error = rp->r_error;
 391                         rp->r_error = 0;
 392                         mutex_exit(&rp->r_statelock);
 393                 }
 394         } else {
 395                 mutex_enter(&rp->r_statelock);
 396                 error = rp->r_error;
 397                 rp->r_error = 0;
 398                 mutex_exit(&rp->r_statelock);
 399         }
 400 
 401         /*
 402          * If RWRITEATTR is set, then issue an over the wire GETATTR to
 403          * refresh the attribute cache with a set of attributes which
 404          * weren't returned from a WRITE.  This will enable the close-
 405          * to-open processing to work.
              *
              * The result of the GETATTR is deliberately ignored; it does
              * not affect the value returned from close.
              *
              * NOTE(review): unlike nfs_open(), va.va_mask is not set
              * before this call; presumably nfs_getattr_otw() does not
              * read it -- confirm.
 406          */
 407         if (rp->r_flags & RWRITEATTR)
 408                 (void) nfs_getattr_otw(vp, &va, cr);
 409 
 410         return (error);
 411 }
 412 
     /*
      * Read entry point for NFS vnodes (installed as .vop_read in
      * nfs_vnodeops_template).
      *
      * The caller must already hold rp->r_rwlock as a reader (asserted
      * below).  Checks are made in this order:
      *   - EIO if nfs_zone() is not the mount's zone;
      *   - EISDIR if the vnode is not a regular file (VREG);
      *   - 0 if nothing was requested (uio_resid == 0);
      *   - EFBIG if the starting offset is beyond MAXOFF32_T;
      *   - EINVAL if the starting offset is negative or the end of the
      *     request (offset + resid) is beyond MAXOFF32_T.
      *
      * Data is then moved to the caller either through a temporary kernel
      * buffer filled by nfsread() (cache bypass path) or from the cached
      * file pages via vpm or segmap.  Returns 0 or an errno value; EINTR is
      * returned if the wait for a cache purge is interrupted by a signal.
      * The ioflag and ct arguments are not used.
      */
 413 /* ARGSUSED */
 414 static int
 415 nfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 416         caller_context_t *ct)
 417 {
 418         rnode_t *rp;
 419         u_offset_t off;
 420         offset_t diff;
 421         int on;
 422         size_t n;
 423         caddr_t base;
 424         uint_t flags;
 425         int error;
 426         mntinfo_t *mi;
 427 
 428         rp = VTOR(vp);
 429         mi = VTOMI(vp);
 430 
 431         if (nfs_zone() != mi->mi_zone)
 432                 return (EIO);
 433 
 434         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
 435 
 436         if (vp->v_type != VREG)
 437                 return (EISDIR);
 438 
 439         if (uiop->uio_resid == 0)
 440                 return (0);
 441 
 442         if (uiop->uio_loffset > MAXOFF32_T)
 443                 return (EFBIG);
 444 
 445         if (uiop->uio_loffset < 0 ||
 446             uiop->uio_loffset + uiop->uio_resid > MAXOFF32_T)
 447                 return (EINVAL);
 448 
 449         /*
 450          * Bypass VM if caching has been disabled (e.g., locking) or if
 451          * using client-side direct I/O and the file is not mmap'd and
 452          * there are no cached pages.
 453          */
 454         if ((vp->v_flag & VNOCACHE) ||
 455             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 456             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 457             !vn_has_cached_data(vp))) {
 458                 size_t bufsize;
 459                 size_t resid = 0;
 460 
 461                 /*
 462                  * Let's try to do read in as large a chunk as we can
 463                  * (Filesystem (NFS client) bsize if possible/needed).
 464                  * For V3, this is 32K and for V2, this is 8K.
 465                  */
 466                 bufsize = MIN(uiop->uio_resid, VTOMI(vp)->mi_curread);
 467                 base = kmem_alloc(bufsize, KM_SLEEP);
                     /*
                      * Read up to bufsize bytes per pass into the temporary
                      * buffer and copy out what was obtained.  resid is
                      * presumably the part of the request nfsread() did not
                      * transfer, so n becomes the byte count actually read;
                      * the loop stops on error, when the request is
                      * satisfied, or when a pass transfers nothing (n == 0).
                      */
 468                 do {
 469                         n = MIN(uiop->uio_resid, bufsize);
 470                         error = nfsread(vp, base, uiop->uio_offset, n,
 471                             &resid, cr);
 472                         if (!error) {
 473                                 n -= resid;
 474                                 error = uiomove(base, n, UIO_READ, uiop);
 475                         }
 476                 } while (!error && uiop->uio_resid > 0 && n > 0);
 477                 kmem_free(base, bufsize);
 478                 return (error);
 479         }
 480 
 481         error = 0;
 482 
             /*
              * Cached path: each pass handles at most MAXBSIZE - on bytes,
              * where on is the offset within the current mapping window.
              */
 483         do {
 484                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 485                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 486                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 487 
 488                 error = nfs_validate_caches(vp, cr);
 489                 if (error)
 490                         break;
 491 
                     /*
                      * Wait (interruptibly) while RINCACHEPURGE is set on the
                      * rnode, then use r_size under r_statelock to limit the
                      * transfer to the bytes remaining before end of file.
                      * A non-positive diff means the offset is at or past
                      * r_size, so the read ends here.
                      */
 492                 mutex_enter(&rp->r_statelock);
 493                 while (rp->r_flags & RINCACHEPURGE) {
 494                         if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 495                                 mutex_exit(&rp->r_statelock);
 496                                 return (EINTR);
 497                         }
 498                 }
 499                 diff = rp->r_size - uiop->uio_loffset;
 500                 mutex_exit(&rp->r_statelock);
 501                 if (diff <= 0)
 502                         break;
 503                 if (diff < n)
 504                         n = (size_t)diff;
 505 
 506                 if (vpm_enable) {
 507                         /*
 508                          * Copy data.
 509                          */
 510                         error = vpm_data_copy(vp, off + on, n, uiop,
 511                             1, NULL, 0, S_READ);
 512                 } else {
 513                         base = segmap_getmapflt(segkmap, vp, off + on, n,
 514                             1, S_READ);
 515                         error = uiomove(base + on, n, UIO_READ, uiop);
 516                 }
 517 
 518                 if (!error) {
 519                         /*
 520                          * If read a whole block or read to eof,
 521                          * won't need this buffer again soon.
 522                          */
 523                         mutex_enter(&rp->r_statelock);
 524                         if (n + on == MAXBSIZE ||
 525                             uiop->uio_loffset == rp->r_size)
 526                                 flags = SM_DONTNEED;
 527                         else
 528                                 flags = 0;
 529                         mutex_exit(&rp->r_statelock);
                             /*
                              * On the success path the result of the
                              * release/sync becomes the error for this pass.
                              */
 530                         if (vpm_enable) {
 531                                 error = vpm_sync_pages(vp, off, n, flags);
 532                         } else {
 533                                 error = segmap_release(segkmap, base, flags);
 534                         }
 535                 } else {
                             /*
                              * The copy failed: still release the mapping, but
                              * keep the copy error as the return value.
                              */
 536                         if (vpm_enable) {
 537                                 (void) vpm_sync_pages(vp, off, n, 0);
 538                         } else {
 539                                 (void) segmap_release(segkmap, base, 0);
 540                         }
 541                 }
 542         } while (!error && uiop->uio_resid > 0);
 543 
 544         return (error);
 545 }
 546 
 547 /* ARGSUSED */
 548 static int
 549 nfs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
 550         caller_context_t *ct)
 551 {
 552         rnode_t *rp;
 553         u_offset_t off;
 554         caddr_t base;
 555         uint_t flags;
 556         int remainder;
 557         size_t n;
 558         int on;
 559         int error;
 560         int resid;
 561         offset_t offset;
 562         rlim_t limit;
 563         mntinfo_t *mi;
 564 
 565         rp = VTOR(vp);
 566 
 567         mi = VTOMI(vp);
 568         if (nfs_zone() != mi->mi_zone)
 569                 return (EIO);
 570         if (vp->v_type != VREG)
 571                 return (EISDIR);
 572 
 573         if (uiop->uio_resid == 0)
 574                 return (0);
 575 
 576         if (ioflag & FAPPEND) {
 577                 struct vattr va;
 578 
 579                 /*
 580                  * Must serialize if appending.
 581                  */
 582                 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
 583                         nfs_rw_exit(&rp->r_rwlock);
 584                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
 585                             INTR(vp)))
 586                                 return (EINTR);
 587                 }
 588 
 589                 va.va_mask = AT_SIZE;
 590                 error = nfsgetattr(vp, &va, cr);
 591                 if (error)
 592                         return (error);
 593                 uiop->uio_loffset = va.va_size;
 594         }
 595 
 596         if (uiop->uio_loffset > MAXOFF32_T)
 597                 return (EFBIG);
 598 
 599         offset = uiop->uio_loffset + uiop->uio_resid;
 600 
 601         if (uiop->uio_loffset < 0 || offset > MAXOFF32_T)
 602                 return (EINVAL);
 603 
 604         if (uiop->uio_llimit > (rlim64_t)MAXOFF32_T) {
 605                 limit = MAXOFF32_T;
 606         } else {
 607                 limit = (rlim_t)uiop->uio_llimit;
 608         }
 609 
 610         /*
 611          * Check to make sure that the process will not exceed
 612          * its limit on file size.  It is okay to write up to
 613          * the limit, but not beyond.  Thus, the write which
 614          * reaches the limit will be short and the next write
 615          * will return an error.
 616          */
 617         remainder = 0;
 618         if (offset > limit) {
 619                 remainder = offset - limit;
 620                 uiop->uio_resid = limit - uiop->uio_offset;
 621                 if (uiop->uio_resid <= 0) {
 622                         proc_t *p = ttoproc(curthread);
 623 
 624                         uiop->uio_resid += remainder;
 625                         mutex_enter(&p->p_lock);
 626                         (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
 627                             p->p_rctls, p, RCA_UNSAFE_SIGINFO);
 628                         mutex_exit(&p->p_lock);
 629                         return (EFBIG);
 630                 }
 631         }
 632 
 633         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp)))
 634                 return (EINTR);
 635 
 636         /*
 637          * Bypass VM if caching has been disabled (e.g., locking) or if
 638          * using client-side direct I/O and the file is not mmap'd and
 639          * there are no cached pages.
 640          */
 641         if ((vp->v_flag & VNOCACHE) ||
 642             (((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO)) &&
 643             rp->r_mapcnt == 0 && rp->r_inmap == 0 &&
 644             !vn_has_cached_data(vp))) {
 645                 size_t bufsize;
 646                 int count;
 647                 uint_t org_offset;
 648 
 649 nfs_fwrite:
 650                 if (rp->r_flags & RSTALE) {
 651                         resid = uiop->uio_resid;
 652                         offset = uiop->uio_loffset;
 653                         error = rp->r_error;
 654                         /*
 655                          * A close may have cleared r_error, if so,
 656                          * propagate ESTALE error return properly
 657                          */
 658                         if (error == 0)
 659                                 error = ESTALE;
 660                         goto bottom;
 661                 }
 662                 bufsize = MIN(uiop->uio_resid, mi->mi_curwrite);
 663                 base = kmem_alloc(bufsize, KM_SLEEP);
 664                 do {
 665                         resid = uiop->uio_resid;
 666                         offset = uiop->uio_loffset;
 667                         count = MIN(uiop->uio_resid, bufsize);
 668                         org_offset = uiop->uio_offset;
 669                         error = uiomove(base, count, UIO_WRITE, uiop);
 670                         if (!error) {
 671                                 error = nfswrite(vp, base, org_offset,
 672                                     count, cr);
 673                         }
 674                 } while (!error && uiop->uio_resid > 0);
 675                 kmem_free(base, bufsize);
 676                 goto bottom;
 677         }
 678 
 679         do {
 680                 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
 681                 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
 682                 n = MIN(MAXBSIZE - on, uiop->uio_resid);
 683 
 684                 resid = uiop->uio_resid;
 685                 offset = uiop->uio_loffset;
 686 
 687                 if (rp->r_flags & RSTALE) {
 688                         error = rp->r_error;
 689                         /*
 690                          * A close may have cleared r_error, if so,
 691                          * propagate ESTALE error return properly
 692                          */
 693                         if (error == 0)
 694                                 error = ESTALE;
 695                         break;
 696                 }
 697 
 698                 /*
 699                  * Don't create dirty pages faster than they
 700                  * can be cleaned so that the system doesn't
 701                  * get imbalanced.  If the async queue is
 702                  * maxed out, then wait for it to drain before
 703                  * creating more dirty pages.  Also, wait for
 704                  * any threads doing pagewalks in the vop_getattr
 705                  * entry points so that they don't block for
 706                  * long periods.
 707                  */
 708                 mutex_enter(&rp->r_statelock);
 709                 while ((mi->mi_max_threads != 0 &&
 710                     rp->r_awcount > 2 * mi->mi_max_threads) ||
 711                     rp->r_gcount > 0) {
 712                         if (INTR(vp)) {
 713                                 klwp_t *lwp = ttolwp(curthread);
 714 
 715                                 if (lwp != NULL)
 716                                         lwp->lwp_nostop++;
 717                                 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
 718                                         mutex_exit(&rp->r_statelock);
 719                                         if (lwp != NULL)
 720                                                 lwp->lwp_nostop--;
 721                                         error = EINTR;
 722                                         goto bottom;
 723                                 }
 724                                 if (lwp != NULL)
 725                                         lwp->lwp_nostop--;
 726                         } else
 727                                 cv_wait(&rp->r_cv, &rp->r_statelock);
 728                 }
 729                 mutex_exit(&rp->r_statelock);
 730 
 731                 /*
 732                  * Touch the page and fault it in if it is not in core
 733                  * before segmap_getmapflt or vpm_data_copy can lock it.
 734                  * This is to avoid the deadlock if the buffer is mapped
 735                  * to the same file through mmap which we want to write.
 736                  */
 737                 uio_prefaultpages((long)n, uiop);
 738 
 739                 if (vpm_enable) {
 740                         /*
 741                          * It will use kpm mappings, so no need to
 742                          * pass an address.
 743                          */
 744                         error = writerp(rp, NULL, n, uiop, 0);
 745                 } else  {
 746                         if (segmap_kpm) {
 747                                 int pon = uiop->uio_loffset & PAGEOFFSET;
 748                                 size_t pn = MIN(PAGESIZE - pon,
 749                                     uiop->uio_resid);
 750                                 int pagecreate;
 751 
 752                                 mutex_enter(&rp->r_statelock);
 753                                 pagecreate = (pon == 0) && (pn == PAGESIZE ||
 754                                     uiop->uio_loffset + pn >= rp->r_size);
 755                                 mutex_exit(&rp->r_statelock);
 756 
 757                                 base = segmap_getmapflt(segkmap, vp, off + on,
 758                                     pn, !pagecreate, S_WRITE);
 759 
 760                                 error = writerp(rp, base + pon, n, uiop,
 761                                     pagecreate);
 762 
 763                         } else {
 764                                 base = segmap_getmapflt(segkmap, vp, off + on,
 765                                     n, 0, S_READ);
 766                                 error = writerp(rp, base + on, n, uiop, 0);
 767                         }
 768                 }
 769 
 770                 if (!error) {
 771                         if (mi->mi_flags & MI_NOAC)
 772                                 flags = SM_WRITE;
 773                         else if (n + on == MAXBSIZE || IS_SWAPVP(vp)) {
 774                                 /*
 775                                  * Have written a whole block.
 776                                  * Start an asynchronous write
 777                                  * and mark the buffer to
 778                                  * indicate that it won't be
 779                                  * needed again soon.
 780                                  */
 781                                 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
 782                         } else
 783                                 flags = 0;
 784                         if ((ioflag & (FSYNC|FDSYNC)) ||
 785                             (rp->r_flags & ROUTOFSPACE)) {
 786                                 flags &= ~SM_ASYNC;
 787                                 flags |= SM_WRITE;
 788                         }
 789                         if (vpm_enable) {
 790                                 error = vpm_sync_pages(vp, off, n, flags);
 791                         } else {
 792                                 error = segmap_release(segkmap, base, flags);
 793                         }
 794                 } else {
 795                         if (vpm_enable) {
 796                                 (void) vpm_sync_pages(vp, off, n, 0);
 797                         } else {
 798                                 (void) segmap_release(segkmap, base, 0);
 799                         }
 800                         /*
 801                          * In the event that we got an access error while
 802                          * faulting in a page for a write-only file just
 803                          * force a write.
 804                          */
 805                         if (error == EACCES)
 806                                 goto nfs_fwrite;
 807                 }
 808         } while (!error && uiop->uio_resid > 0);
 809 
 810 bottom:
 811         if (error) {
 812                 uiop->uio_resid = resid + remainder;
 813                 uiop->uio_loffset = offset;
 814         } else
 815                 uiop->uio_resid += remainder;
 816 
 817         nfs_rw_exit(&rp->r_lkserlock);
 818 
 819         return (error);
 820 }
 821 
 822 /*
 823  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 824  */
 825 static int
 826 nfs_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
 827         int flags, cred_t *cr)
 828 {
 829         struct buf *bp;
 830         int error;
 831 
 832         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
 833         bp = pageio_setup(pp, len, vp, flags);
 834         ASSERT(bp != NULL);
 835 
 836         /*
 837          * pageio_setup should have set b_addr to 0.  This
 838          * is correct since we want to do I/O on a page
 839          * boundary.  bp_mapin will use this addr to calculate
 840          * an offset, and then set b_addr to the kernel virtual
 841          * address it allocated for us.
 842          */
 843         ASSERT(bp->b_un.b_addr == 0);
 844 
 845         bp->b_edev = 0;
 846         bp->b_dev = 0;
 847         bp->b_lblkno = lbtodb(off);
 848         bp->b_file = vp;
 849         bp->b_offset = (offset_t)off;
 850         bp_mapin(bp);
 851 
 852         error = nfs_bio(bp, cr);
 853 
 854         bp_mapout(bp);
 855         pageio_done(bp);
 856 
 857         return (error);
 858 }
 859 
 860 /*
 861  * Write to file.  Writes to remote server in largest size
 862  * chunks that the server can handle.  Write is synchronous.
 863  */
 864 static int
 865 nfswrite(vnode_t *vp, caddr_t base, uint_t offset, int count, cred_t *cr)
 866 {
 867         rnode_t *rp;
 868         mntinfo_t *mi;
 869         struct nfswriteargs wa;
 870         struct nfsattrstat ns;
 871         int error;
 872         int tsize;
 873         int douprintf;
 874 
 875         douprintf = 1;
 876 
 877         rp = VTOR(vp);
 878         mi = VTOMI(vp);
 879 
 880         ASSERT(nfs_zone() == mi->mi_zone);
 881 
 882         wa.wa_args = &wa.wa_args_buf;
 883         wa.wa_fhandle = *VTOFH(vp);
 884 
 885         do {
 886                 tsize = MIN(mi->mi_curwrite, count);
 887                 wa.wa_data = base;
 888                 wa.wa_begoff = offset;
 889                 wa.wa_totcount = tsize;
 890                 wa.wa_count = tsize;
 891                 wa.wa_offset = offset;
 892 
 893                 if (mi->mi_io_kstats) {
 894                         mutex_enter(&mi->mi_lock);
 895                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 896                         mutex_exit(&mi->mi_lock);
 897                 }
 898                 wa.wa_mblk = NULL;
 899                 do {
 900                         error = rfs2call(mi, RFS_WRITE,
 901                             xdr_writeargs, (caddr_t)&wa,
 902                             xdr_attrstat, (caddr_t)&ns, cr,
 903                             &douprintf, &ns.ns_status, 0, NULL);
 904                 } while (error == ENFS_TRYAGAIN);
 905                 if (mi->mi_io_kstats) {
 906                         mutex_enter(&mi->mi_lock);
 907                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
 908                         mutex_exit(&mi->mi_lock);
 909                 }
 910 
 911                 if (!error) {
 912                         error = geterrno(ns.ns_status);
 913                         /*
 914                          * Can't check for stale fhandle and purge caches
 915                          * here because pages are held by nfs_getpage.
 916                          * Just mark the attribute cache as timed out
 917                          * and set RWRITEATTR to indicate that the file
 918                          * was modified with a WRITE operation.
 919                          */
 920                         if (!error) {
 921                                 count -= tsize;
 922                                 base += tsize;
 923                                 offset += tsize;
 924                                 if (mi->mi_io_kstats) {
 925                                         mutex_enter(&mi->mi_lock);
 926                                         KSTAT_IO_PTR(mi->mi_io_kstats)->
 927                                             writes++;
 928                                         KSTAT_IO_PTR(mi->mi_io_kstats)->
 929                                             nwritten += tsize;
 930                                         mutex_exit(&mi->mi_lock);
 931                                 }
 932                                 lwp_stat_update(LWP_STAT_OUBLK, 1);
 933                                 mutex_enter(&rp->r_statelock);
 934                                 PURGE_ATTRCACHE_LOCKED(rp);
 935                                 rp->r_flags |= RWRITEATTR;
 936                                 mutex_exit(&rp->r_statelock);
 937                         }
 938                 }
 939         } while (!error && count);
 940 
 941         return (error);
 942 }
 943 
 944 /*
 945  * Read from a file.  Reads data in largest chunks our interface can handle.
 946  */
 947 static int
 948 nfsread(vnode_t *vp, caddr_t base, uint_t offset,
 949     int count, size_t *residp, cred_t *cr)
 950 {
 951         mntinfo_t *mi;
 952         struct nfsreadargs ra;
 953         struct nfsrdresult rr;
 954         int tsize;
 955         int error;
 956         int douprintf;
 957         failinfo_t fi;
 958         rnode_t *rp;
 959         struct vattr va;
 960         hrtime_t t;
 961 
 962         rp = VTOR(vp);
 963         mi = VTOMI(vp);
 964 
 965         ASSERT(nfs_zone() == mi->mi_zone);
 966 
 967         douprintf = 1;
 968 
 969         ra.ra_fhandle = *VTOFH(vp);
 970 
 971         fi.vp = vp;
 972         fi.fhp = (caddr_t)&ra.ra_fhandle;
 973         fi.copyproc = nfscopyfh;
 974         fi.lookupproc = nfslookup;
 975         fi.xattrdirproc = acl_getxattrdir2;
 976 
 977         do {
 978                 if (mi->mi_io_kstats) {
 979                         mutex_enter(&mi->mi_lock);
 980                         kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
 981                         mutex_exit(&mi->mi_lock);
 982                 }
 983 
 984                 do {
 985                         tsize = MIN(mi->mi_curread, count);
 986                         rr.rr_data = base;
 987                         ra.ra_offset = offset;
 988                         ra.ra_totcount = tsize;
 989                         ra.ra_count = tsize;
 990                         ra.ra_data = base;
 991                         t = gethrtime();
 992                         error = rfs2call(mi, RFS_READ,
 993                             xdr_readargs, (caddr_t)&ra,
 994                             xdr_rdresult, (caddr_t)&rr, cr,
 995                             &douprintf, &rr.rr_status, 0, &fi);
 996                 } while (error == ENFS_TRYAGAIN);
 997 
 998                 if (mi->mi_io_kstats) {
 999                         mutex_enter(&mi->mi_lock);
1000                         kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1001                         mutex_exit(&mi->mi_lock);
1002                 }
1003 
1004                 if (!error) {
1005                         error = geterrno(rr.rr_status);
1006                         if (!error) {
1007                                 count -= rr.rr_count;
1008                                 base += rr.rr_count;
1009                                 offset += rr.rr_count;
1010                                 if (mi->mi_io_kstats) {
1011                                         mutex_enter(&mi->mi_lock);
1012                                         KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
1013                                         KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
1014                                             rr.rr_count;
1015                                         mutex_exit(&mi->mi_lock);
1016                                 }
1017                                 lwp_stat_update(LWP_STAT_INBLK, 1);
1018                         }
1019                 }
1020         } while (!error && count && rr.rr_count == tsize);
1021 
1022         *residp = count;
1023 
1024         if (!error) {
1025                 /*
1026                  * Since no error occurred, we have the current
1027                  * attributes and we need to do a cache check and then
1028                  * potentially update the cached attributes.  We can't
1029                  * use the normal attribute check and cache mechanisms
1030                  * because they might cause a cache flush which would
1031                  * deadlock.  Instead, we just check the cache to see
1032                  * if the attributes have changed.  If it is, then we
1033                  * just mark the attributes as out of date.  The next
1034                  * time that the attributes are checked, they will be
1035                  * out of date, new attributes will be fetched, and
1036                  * the page cache will be flushed.  If the attributes
1037                  * weren't changed, then we just update the cached
1038                  * attributes with these attributes.
1039                  */
1040                 /*
1041                  * If NFS_ACL is supported on the server, then the
1042                  * attributes returned by server may have minimal
1043                  * permissions sometimes denying access to users having
1044                  * proper access.  To get the proper attributes, mark
1045                  * the attributes as expired so that they will be
1046                  * regotten via the NFS_ACL GETATTR2 procedure.
1047                  */
1048                 error = nattr_to_vattr(vp, &rr.rr_attr, &va);
1049                 mutex_enter(&rp->r_statelock);
1050                 if (error || !CACHE_VALID(rp, va.va_mtime, va.va_size) ||
1051                     (mi->mi_flags & MI_ACL)) {
1052                         mutex_exit(&rp->r_statelock);
1053                         PURGE_ATTRCACHE(vp);
1054                 } else {
1055                         if (rp->r_mtime <= t) {
1056                                 nfs_attrcache_va(vp, &va);
1057                         }
1058                         mutex_exit(&rp->r_statelock);
1059                 }
1060         }
1061 
1062         return (error);
1063 }
1064 
1065 /* ARGSUSED */
1066 static int
1067 nfs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
1068         caller_context_t *ct)
1069 {
1070 
1071         if (nfs_zone() != VTOMI(vp)->mi_zone)
1072                 return (EIO);
1073         switch (cmd) {
1074                 case _FIODIRECTIO:
1075                         return (nfs_directio(vp, (int)arg, cr));
1076                 default:
1077                         return (ENOTTY);
1078         }
1079 }
1080 
1081 /* ARGSUSED */
1082 static int
1083 nfs_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1084         caller_context_t *ct)
1085 {
1086         int error;
1087         rnode_t *rp;
1088 
1089         if (nfs_zone() != VTOMI(vp)->mi_zone)
1090                 return (EIO);
1091         /*
1092          * If it has been specified that the return value will
1093          * just be used as a hint, and we are only being asked
1094          * for size, fsid or rdevid, then return the client's
1095          * notion of these values without checking to make sure
1096          * that the attribute cache is up to date.
1097          * The whole point is to avoid an over the wire GETATTR
1098          * call.
1099          */
1100         rp = VTOR(vp);
1101         if (flags & ATTR_HINT) {
1102                 if (vap->va_mask ==
1103                     (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
1104                         mutex_enter(&rp->r_statelock);
1105                         if (vap->va_mask | AT_SIZE)
1106                                 vap->va_size = rp->r_size;
1107                         if (vap->va_mask | AT_FSID)
1108                                 vap->va_fsid = rp->r_attr.va_fsid;
1109                         if (vap->va_mask | AT_RDEV)
1110                                 vap->va_rdev = rp->r_attr.va_rdev;
1111                         mutex_exit(&rp->r_statelock);
1112                         return (0);
1113                 }
1114         }
1115 
1116         /*
1117          * Only need to flush pages if asking for the mtime
1118          * and if there any dirty pages or any outstanding
1119          * asynchronous (write) requests for this file.
1120          */
1121         if (vap->va_mask & AT_MTIME) {
1122                 if (vn_has_cached_data(vp) &&
1123                     ((rp->r_flags & RDIRTY) || rp->r_awcount > 0)) {
1124                         mutex_enter(&rp->r_statelock);
1125                         rp->r_gcount++;
1126                         mutex_exit(&rp->r_statelock);
1127                         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1128                         mutex_enter(&rp->r_statelock);
1129                         if (error && (error == ENOSPC || error == EDQUOT)) {
1130                                 if (!rp->r_error)
1131                                         rp->r_error = error;
1132                         }
1133                         if (--rp->r_gcount == 0)
1134                                 cv_broadcast(&rp->r_cv);
1135                         mutex_exit(&rp->r_statelock);
1136                 }
1137         }
1138 
1139         return (nfsgetattr(vp, vap, cr));
1140 }
1141 
1142 /*ARGSUSED4*/
1143 static int
1144 nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
1145                 caller_context_t *ct)
1146 {
1147         int error;
1148         uint_t mask;
1149         struct vattr va;
1150 
1151         mask = vap->va_mask;
1152 
1153         if (mask & AT_NOSET)
1154                 return (EINVAL);
1155 
1156         if ((mask & AT_SIZE) &&
1157             vap->va_type == VREG &&
1158             vap->va_size > MAXOFF32_T)
1159                 return (EFBIG);
1160 
1161         if (nfs_zone() != VTOMI(vp)->mi_zone)
1162                 return (EIO);
1163 
1164         va.va_mask = AT_UID | AT_MODE;
1165 
1166         error = nfsgetattr(vp, &va, cr);
1167         if (error)
1168                 return (error);
1169 
1170         error = secpolicy_vnode_setattr(cr, vp, vap, &va, flags, nfs_accessx,
1171             vp);
1172 
1173         if (error)
1174                 return (error);
1175 
1176         error = nfssetattr(vp, vap, flags, cr);
1177 
1178         if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0)
1179                 vnevent_truncate(vp, ct);
1180 
1181         return (error);
1182 }
1183 
     /*
      * Send an NFS Version 2 SETATTR for vp using the attributes
      * selected by vap->va_mask.  Called from nfs_setattr() and,
      * recursively, from itself to put the mode back after a uid/gid
      * change.
      *
      * In order, this routine: flushes cached dirty pages, converts
      * vap to its over the wire form, records the current mode,
      * issues the RPC, drops the access and ACL caches when ownership
      * or mode was changed on an MI_ACL mount, and on success
      * invalidates truncated pages, caches the returned attributes
      * and compensates for servers that did not grow the file or
      * that altered the mode.
      *
      * When AT_MTIME is requested without ATTR_UTIME in flags,
      * vap->va_mtime.tv_nsec is overwritten with 1000000000 and is
      * left that way on return.
      *
      * Returns 0 on success, otherwise an error from vattr_to_sattr(),
      * nfsgetattr(), rfs2call(), the reply status, or the one byte
      * extending write.
      */
1184 static int
1185 nfssetattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
1186 {
1187         int error;
1188         uint_t mask;
1189         struct nfssaargs args;
1190         struct nfsattrstat ns;
1191         int douprintf;
1192         rnode_t *rp;
1193         struct vattr va;
1194         mode_t omode;
1195         mntinfo_t *mi;
1196         vsecattr_t *vsp;
1197         hrtime_t t;
1198 
1199         mask = vap->va_mask;
1200 
1201         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
1202 
1203         rp = VTOR(vp);
1204 
1205         /*
1206          * Only need to flush pages if there are any pages and
1207          * if the file is marked as dirty in some fashion.  The
1208          * file must be flushed so that we can accurately
1209          * determine the size of the file and the cached data
1210          * after the SETATTR returns.  A file is considered to
1211          * be dirty if it is either marked with RDIRTY, has
1212          * outstanding i/o's active, or is mmap'd.  In this
1213          * last case, we can't tell whether there are dirty
1214          * pages, so we flush just to be sure.
1215          */
1216         if (vn_has_cached_data(vp) &&
1217             ((rp->r_flags & RDIRTY) ||
1218             rp->r_count > 0 ||
1219             rp->r_mapcnt > 0)) {
1220                 ASSERT(vp->v_type != VCHR);
1221                 error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
                     /*
                      * A flush failure does not stop the SETATTR; only
                      * ENOSPC/EDQUOT is remembered, in r_error, and
                      * only if no earlier error is recorded there.
                      */
1222                 if (error && (error == ENOSPC || error == EDQUOT)) {
1223                         mutex_enter(&rp->r_statelock);
1224                         if (!rp->r_error)
1225                                 rp->r_error = error;
1226                         mutex_exit(&rp->r_statelock);
1227                 }
1228         }
1229 
1230         /*
1231          * If the system call was utime(2) or utimes(2) and the
1232          * application did not specify the times, then set the
1233          * mtime nanosecond field to 1 billion.  This will get
1234          * translated from 1 billion nanoseconds to 1 million
1235          * microseconds in the over the wire request.  The
1236          * server will use 1 million in the microsecond field
1237          * to tell whether both the mtime and atime should be
1238          * set to the server's current time.
1239          *
1240          * This is an overload of the protocol and should be
1241          * documented in the NFS Version 2 protocol specification.
1242          */
1243         if ((mask & AT_MTIME) && !(flags & ATTR_UTIME)) {
1244                 vap->va_mtime.tv_nsec = 1000000000;
1245                 if (NFS_TIME_T_OK(vap->va_mtime.tv_sec) &&
1246                     NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1247                         error = vattr_to_sattr(vap, &args.saa_sa);
1248                 } else {
1249                         /*
1250                          * Use server times. vap time values will not be used.
1251                          * To ensure no time overflow, make sure vap has
1252                          * valid values, but retain the original values.
1253                          */
1254                         timestruc_t     mtime = vap->va_mtime;
1255                         timestruc_t     atime = vap->va_atime;
1256                         time_t          now;
1257 
1258                         now = gethrestime_sec();
1259                         if (NFS_TIME_T_OK(now)) {
1260                                 /* Just in case server does not know of this */
1261                                 vap->va_mtime.tv_sec = now;
1262                                 vap->va_atime.tv_sec = now;
1263                         } else {
1264                                 vap->va_mtime.tv_sec = 0;
1265                                 vap->va_atime.tv_sec = 0;
1266                         }
1267                         error = vattr_to_sattr(vap, &args.saa_sa);
1268                         /* set vap times back on */
1269                         vap->va_mtime = mtime;
1270                         vap->va_atime = atime;
1271                 }
1272         } else {
1273                 /* Either do not set times or use the client specified times */
1274                 error = vattr_to_sattr(vap, &args.saa_sa);
1275         }
1276         if (error) {
1277                 /* req time field(s) overflow - return immediately */
1278                 return (error);
1279         }
1280         args.saa_fh = *VTOFH(vp);
1281 
             /*
              * Record the mode as it is before the SETATTR so that a
              * mode change made by the server as a side effect of a
              * uid/gid change can be detected and undone below.
              */
1282         va.va_mask = AT_MODE;
1283         error = nfsgetattr(vp, &va, cr);
1284         if (error)
1285                 return (error);
1286         omode = va.va_mode;
1287 
1288         mi = VTOMI(vp);
1289 
1290         douprintf = 1;
1291 
             /*
              * Time stamp taken just before the call; it is handed to
              * nfs_cache_fattr() together with the reply attributes.
              */
1292         t = gethrtime();
1293 
1294         error = rfs2call(mi, RFS_SETATTR,
1295             xdr_saargs, (caddr_t)&args,
1296             xdr_attrstat, (caddr_t)&ns, cr,
1297             &douprintf, &ns.ns_status, 0, NULL);
1298 
1299         /*
1300          * Purge the access cache and ACL cache if changing either the
1301          * owner of the file, the group owner, or the mode.  These may
1302          * change the access permissions of the file, so purge old
1303          * information and start over again.
              *
              * This happens before the outcome of the call is looked
              * at, so the caches are dropped even if the SETATTR failed.
1304          */
1305         if ((mask & (AT_UID | AT_GID | AT_MODE)) && (mi->mi_flags & MI_ACL)) {
1306                 (void) nfs_access_purge_rp(rp);
1307                 if (rp->r_secattr != NULL) {
1308                         mutex_enter(&rp->r_statelock);
1309                         vsp = rp->r_secattr;
1310                         rp->r_secattr = NULL;
1311                         mutex_exit(&rp->r_statelock);
1312                         if (vsp != NULL)
1313                                 nfs_acl_free(vsp);
1314                 }
1315         }
1316 
1317         if (!error) {
1318                 error = geterrno(ns.ns_status);
1319                 if (!error) {
1320                         /*
1321                          * If changing the size of the file, invalidate
1322                          * any local cached data which is no longer part
1323                          * of the file.  We also possibly invalidate the
1324                          * last page in the file.  We could use
1325                          * pvn_vpzero(), but this would mark the page as
1326                          * modified and require it to be written back to
1327                          * the server for no particularly good reason.
1328                          * This way, if we access it, then we bring it
1329                          * back in.  A read should be cheaper than a
1330                          * write.
1331                          */
1332                         if (mask & AT_SIZE) {
1333                                 nfs_invalidate_pages(vp,
1334                                     (vap->va_size & PAGEMASK), cr);
1335                         }
1336                         (void) nfs_cache_fattr(vp, &ns.ns_attr, &va, t, cr);
1337                         /*
1338                          * If NFS_ACL is supported on the server, then the
1339                          * attributes returned by server may have minimal
1340                          * permissions sometimes denying access to users having
1341                          * proper access.  To get the proper attributes, mark
1342                          * the attributes as expired so that they will be
1343                          * regotten via the NFS_ACL GETATTR2 procedure.
1344                          */
1345                         if (mi->mi_flags & MI_ACL) {
1346                                 PURGE_ATTRCACHE(vp);
1347                         }
1348                         /*
1349                          * This next check attempts to deal with NFS
1350                          * servers which can not handle increasing
1351                          * the size of the file via setattr.  Most
1352                          * of these servers do not return an error,
1353                          * but do not change the size of the file.
1354                          * Hence, this check and then attempt to set
1355                          * the file size by writing 1 byte at the
1356                          * offset of the end of the file that we need.
1357                          */
1358                         if ((mask & AT_SIZE) &&
1359                             ns.ns_attr.na_size < (uint32_t)vap->va_size) {
1360                                 char zb = '\0';
1361 
1362                                 error = nfswrite(vp, &zb,
1363                                     vap->va_size - sizeof (zb),
1364                                     sizeof (zb), cr);
1365                         }
1366                         /*
1367                          * Some servers will change the mode to clear the setuid
1368                          * and setgid bits when changing the uid or gid.  The
1369                          * client needs to compensate appropriately.
1370                          */
1371                         if (mask & (AT_UID | AT_GID)) {
1372                                 int terror;
1373 
1374                                 va.va_mask = AT_MODE;
1375                                 terror = nfsgetattr(vp, &va, cr);
                                     /*
                                      * The mode that should now be in
                                      * effect is the requested one if
                                      * AT_MODE was set, else the mode
                                      * recorded before the call.  If the
                                      * server reports something else,
                                      * set it again; a failure of that
                                      * second SETATTR is ignored.
                                      */
1376                                 if (!terror &&
1377                                     (((mask & AT_MODE) &&
1378                                     va.va_mode != vap->va_mode) ||
1379                                     (!(mask & AT_MODE) &&
1380                                     va.va_mode != omode))) {
1381                                         va.va_mask = AT_MODE;
1382                                         if (mask & AT_MODE)
1383                                                 va.va_mode = vap->va_mode;
1384                                         else
1385                                                 va.va_mode = omode;
1386                                         (void) nfssetattr(vp, &va, 0, cr);
1387                                 }
1388                         }
1389                 } else {
1390                         PURGE_ATTRCACHE(vp);
1391                         PURGE_STALE_FH(error, vp, cr);
1392                 }
1393         } else {
1394                 PURGE_ATTRCACHE(vp);
1395         }
1396 
1397         return (error);
1398 }
1399 
1400 static int
1401 nfs_accessx(void *vp, int mode, cred_t *cr)
1402 {
1403         ASSERT(nfs_zone() == VTOMI((vnode_t *)vp)->mi_zone);
1404         return (nfs_access(vp, mode, 0, cr, NULL));
1405 }
1406 
     /*
      * Check whether cr may access vp with the permissions given in mode.
      *
      * Returns EIO when called from a zone other than the mount's.  If the
      * mount has MI_ACL set, the decision is delegated to acl_access2() and
      * its result is returned, provided MI_ACL is still set afterwards.
      * Otherwise the check is made against the mode, uid and gid obtained
      * from nfsgetattr():
      *   - EROFS for a write to a non-device vnode that vn_is_readonly()
      *     reports as read-only,
      *   - EACCES when MANDLOCK() is true for the file,
      *   - else the result of secpolicy_vnode_access2() on the owner,
      *     group or other permission bits, whichever applies to cr.
      * The ct argument is not used.
      */
1407 /* ARGSUSED */
1408 static int
1409 nfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
1410 {
1411         struct vattr va;
1412         int error;
1413         mntinfo_t *mi;
1414         int shift = 0;
1415 
1416         mi = VTOMI(vp);
1417 
1418         if (nfs_zone() != mi->mi_zone)
1419                 return (EIO);
             /*
              * MI_ACL is tested again after the call: if it is no longer
              * set, the ACL result is discarded and the mode-bit check
              * below is used instead.  (Presumably acl_access2() clears
              * the flag when the server turns out not to support NFS_ACL
              * -- confirm against acl_access2().)
              */
1420         if (mi->mi_flags & MI_ACL) {
1421                 error = acl_access2(vp, mode, flags, cr);
1422                 if (mi->mi_flags & MI_ACL)
1423                         return (error);
1424         }
1425 
1426         va.va_mask = AT_MODE | AT_UID | AT_GID;
1427         error = nfsgetattr(vp, &va, cr);
1428         if (error)
1429                 return (error);
1430 
1431         /*
1432          * Disallow write attempts on read-only
1433          * file systems, unless the file is a
1434          * device node.
1435          */
1436         if ((mode & VWRITE) && vn_is_readonly(vp) && !IS_DEVVP(vp))
1437                 return (EROFS);
1438 
1439         /*
1440          * Disallow attempts to access mandatory lock files.
1441          */
1442         if ((mode & (VWRITE | VREAD | VEXEC)) &&
1443             MANDLOCK(vp, va.va_mode))
1444                 return (EACCES);
1445 
1446         /*
1447          * Access check is based on only
1448          * one of owner, group, public.
1449          * If not owner, then check group.
1450          * If not a member of the group,
1451          * then check public access.
1452          */
1453         if (crgetuid(cr) != va.va_uid) {
1454                 shift += 3;
1455                 if (!groupmember(va.va_gid, cr))
1456                         shift += 3;
1457         }
1458 
             /*
              * Shifting the mode left by 0, 3 or 6 moves the selected
              * owner, group or public permission bits into the owner
              * position of the mode handed to secpolicy_vnode_access2().
              */
1459         return (secpolicy_vnode_access2(cr, vp, va.va_uid,
1460             va.va_mode << shift, mode));
1461 }
1462 
1463 volatile int nfs_do_symlink_cache = 1;  /* nonzero: cache link text in rnode */
1464 
     /*
      * Read the contents of symbolic link vp into uiop.
      *
      * Returns EINVAL if vp is not a VLNK and EIO if called from a zone
      * other than the mount's.  When nfs_do_symlink_cache is nonzero and
      * the rnode already holds link contents, the caches are validated and
      * the cached text is copied out without an over-the-wire call.
      * Otherwise an RFS_READLINK call is made into a freshly allocated
      * NFS_MAXPATHLEN buffer; on success that buffer either becomes the
      * rnode's cached contents or is freed.  An ENXIO result is mapped to
      * EINVAL.  The ct argument is not used.
      */
1465 /* ARGSUSED */
1466 static int
1467 nfs_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
1468 {
1469         int error;
1470         struct nfsrdlnres rl;
1471         rnode_t *rp;
1472         int douprintf;
1473         failinfo_t fi;
1474 
1475         /*
1476          * We want to be consistent with UFS semantics so we will return
1477          * EINVAL instead of ENXIO. This violates the XNFS spec and
1478          * the RFC 1094, which are wrong any way. BUGID 1138002.
1479          */
1480         if (vp->v_type != VLNK)
1481                 return (EINVAL);
1482 
1483         if (nfs_zone() != VTOMI(vp)->mi_zone)
1484                 return (EIO);
1485 
1486         rp = VTOR(vp);
             /*
              * The first test of r_symlink.contents is made without the
              * lock; it is repeated under r_statelock after the caches
              * have been validated, and the copy-out is done with the
              * lock held.
              */
1487         if (nfs_do_symlink_cache && rp->r_symlink.contents != NULL) {
1488                 error = nfs_validate_caches(vp, cr);
1489                 if (error)
1490                         return (error);
1491                 mutex_enter(&rp->r_statelock);
1492                 if (rp->r_symlink.contents != NULL) {
1493                         error = uiomove(rp->r_symlink.contents,
1494                             rp->r_symlink.len, UIO_READ, uiop);
1495                         mutex_exit(&rp->r_statelock);
1496                         return (error);
1497                 }
1498                 mutex_exit(&rp->r_statelock);
1499         }
1500 
1501 
             /* No usable cached copy: fetch the link text from the server. */
1502         rl.rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
1503 
1504         fi.vp = vp;
1505         fi.fhp = NULL;          /* no need to update, filehandle not copied */
1506         fi.copyproc = nfscopyfh;
1507         fi.lookupproc = nfslookup;
1508         fi.xattrdirproc = acl_getxattrdir2;
1509 
1510         douprintf = 1;
1511 
1512         error = rfs2call(VTOMI(vp), RFS_READLINK,
1513             xdr_readlink, (caddr_t)VTOFH(vp),
1514             xdr_rdlnres, (caddr_t)&rl, cr,
1515             &douprintf, &rl.rl_status, 0, &fi);
1516 
1517         if (error) {
1518 
1519                 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1520                 return (error);
1521         }
1522 
1523         error = geterrno(rl.rl_status);
1524         if (!error) {
1525                 error = uiomove(rl.rl_data, (int)rl.rl_count, UIO_READ, uiop);
                     /*
                      * Install the buffer as the cached link text if nobody
                      * else has done so in the meantime; in that case the
                      * rnode takes over the buffer and it must not be freed
                      * here.  In every other case the buffer is released.
                      */
1526                 if (nfs_do_symlink_cache && rp->r_symlink.contents == NULL) {
1527                         mutex_enter(&rp->r_statelock);
1528                         if (rp->r_symlink.contents == NULL) {
1529                                 rp->r_symlink.contents = rl.rl_data;
1530                                 rp->r_symlink.len = (int)rl.rl_count;
1531                                 rp->r_symlink.size = NFS_MAXPATHLEN;
1532                                 mutex_exit(&rp->r_statelock);
1533                         } else {
1534                                 mutex_exit(&rp->r_statelock);
1535 
1536                                 kmem_free((void *)rl.rl_data,
1537                                     NFS_MAXPATHLEN);
1538                         }
1539                 } else {
1540 
1541                         kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1542                 }
1543         } else {
1544                 PURGE_STALE_FH(error, vp, cr);
1545 
1546                 kmem_free((void *)rl.rl_data, NFS_MAXPATHLEN);
1547         }
1548 
1549         /*
1550          * Conform to UFS semantics (see comment above)
1551          */
1552         return (error == ENXIO ? EINVAL : error);
1553 }
1554 
1555 /*
1556  * Flush local dirty pages to stable storage on the server.
1557  *
1558  * If FNODSYNC is specified, then there is nothing to do because
1559  * metadata changes are not cached on the client before being
1560  * sent to the server.
      *
      * Swap vnodes (IS_SWAPVP) are likewise skipped and 0 is returned.
      * Returns EIO if called from a zone other than the mount's; otherwise
      * returns the nfs_putpage() error or, if that call succeeded, the
      * value of the rnode's r_error field.
1561  */
1562 /* ARGSUSED */
1563 static int
1564 nfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
1565 {
1566         int error;
1567 
1568         if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
1569                 return (0);
1570 
1571         if (nfs_zone() != VTOMI(vp)->mi_zone)
1572                 return (EIO);
1573 
             /*
              * Offset 0 with length 0 and no flags; presumably this asks
              * nfs_putpage() to write the whole file -- confirm there.
              */
1574         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
1575         if (!error)
1576                 error = VTOR(vp)->r_error;
1577         return (error);
1578 }
1579 
1580 
1581 /*
1582  * Weirdness: if the file was removed or the target of a rename
1583  * operation while it was open, it got renamed instead.  Here we
1584  * remove the renamed file.
      *
      * If called from a zone other than the mount's, the work is handed to
      * nfs_async_inactive() instead.  Otherwise any pending unlinked-open
      * rename recorded in the rnode (r_unldvp, r_unlname, r_unlcred) is
      * completed with an RFS_REMOVE call, and the rnode is then passed to
      * rp_addfree().
1585  */
1586 /* ARGSUSED */
1587 static void
1588 nfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
1589 {
1590         rnode_t *rp;
1591 
1592         ASSERT(vp != DNLC_NO_VNODE);
1593 
1594         /*
1595          * If this is coming from the wrong zone, we let someone in the right
1596          * zone take care of it asynchronously.  We can get here due to
1597          * VN_RELE() being called from pageout() or fsflush().  This call may
1598          * potentially turn into an expensive no-op if, for instance, v_count
1599          * gets incremented in the meantime, but it's still correct.
1600          */
1601         if (nfs_zone() != VTOMI(vp)->mi_zone) {
1602                 nfs_async_inactive(vp, cr, nfs_inactive);
1603                 return;
1604         }
1605 
1606         rp = VTOR(vp);
             /*
              * r_unldvp is tested once without the lock and again with
              * r_statelock held.  After each completed remove we jump back
              * here in case the fields were set again in the meantime.
              */
1607 redo:
1608         if (rp->r_unldvp != NULL) {
1609                 /*
1610                  * Save the vnode pointer for the directory where the
1611                  * unlinked-open file got renamed, then set it to NULL
1612                  * to prevent another thread from getting here before
1613                  * we're done with the remove.  While we have the
1614                  * statelock, make local copies of the pertinent rnode
1615                  * fields.  If we weren't to do this in an atomic way, the
1616                  * unl* fields could become inconsistent with respect
1617                  * to each other due to a race condition between this
1618                  * code and nfs_remove().  See bug report 1034328.
1619                  */
1620                 mutex_enter(&rp->r_statelock);
1621                 if (rp->r_unldvp != NULL) {
1622                         vnode_t *unldvp;
1623                         char *unlname;
1624                         cred_t *unlcred;
1625                         struct nfsdiropargs da;
1626                         enum nfsstat status;
1627                         int douprintf;
1628                         int error;
1629 
1630                         unldvp = rp->r_unldvp;
1631                         rp->r_unldvp = NULL;
1632                         unlname = rp->r_unlname;
1633                         rp->r_unlname = NULL;
1634                         unlcred = rp->r_unlcred;
1635                         rp->r_unlcred = NULL;
1636                         mutex_exit(&rp->r_statelock);
1637 
1638                         /*
1639                          * If there are any dirty pages left, then flush
1640                          * them.  This is unfortunate because they just
1641                          * may get thrown away during the remove operation,
1642                          * but we have to do this for correctness.
1643                          */
1644                         if (vn_has_cached_data(vp) &&
1645                             ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
1646                                 ASSERT(vp->v_type != VCHR);
1647                                 error = nfs_putpage(vp, (offset_t)0, 0, 0,
1648                                     cr, ct);
                                     /*
                                      * Record only the first error; an
                                      * r_error that is already set is kept.
                                      */
1649                                 if (error) {
1650                                         mutex_enter(&rp->r_statelock);
1651                                         if (!rp->r_error)
1652                                                 rp->r_error = error;
1653                                         mutex_exit(&rp->r_statelock);
1654                                 }
1655                         }
1656 
1657                         /*
1658                          * Do the remove operation on the renamed file
1659                          */
1660                         setdiropargs(&da, unlname, unldvp);
1661 
1662                         douprintf = 1;
1663 
                             /*
                              * The call uses the credential saved in the
                              * rnode (unlcred), not cr.  Its result is
                              * deliberately ignored.
                              */
1664                         (void) rfs2call(VTOMI(unldvp), RFS_REMOVE,
1665                             xdr_diropargs, (caddr_t)&da,
1666                             xdr_enum, (caddr_t)&status, unlcred,
1667                             &douprintf, &status, 0, NULL);
1668 
1669                         if (HAVE_RDDIR_CACHE(VTOR(unldvp)))
1670                                 nfs_purge_rddir_cache(unldvp);
1671                         PURGE_ATTRCACHE(unldvp);
1672 
1673                         /*
1674                          * Release stuff held for the remove
1675                          */
1676                         VN_RELE(unldvp);
1677                         kmem_free(unlname, MAXNAMELEN);
1678                         crfree(unlcred);
1679                         goto redo;
1680                 }
1681                 mutex_exit(&rp->r_statelock);
1682         }
1683 
             /*
              * Nothing (more) to remove: hand the rnode to rp_addfree().
              * Presumably that places it on the rnode free list -- confirm.
              */
1684         rp_addfree(rp, cr);
1685 }
1686 
1687 /*
1688  * Remote file system operations having to do with directory manipulation.
1689  */
1690 
     /*
      * Look up nm in directory dvp and return the resulting vnode in *vpp.
      *
      * Returns EPERM if called from a zone other than the mount's.  With
      * LOOKUP_XATTR set in flags, the lookup is performed in dvp's hidden
      * extended attribute directory, which is found through the DNLC or,
      * failing that, acl_getxattrdir2(); EINVAL is returned if the mount
      * does not have MI_EXTATTR set.  EINTR is returned if waiting for the
      * directory's r_rwlock is interrupted.  The lookup itself is done by
      * nfslookup() with the lock held as reader.  A device vnode is
      * replaced by the vnode returned from specvp() before returning.
      * The ct, direntflags and realpnp arguments are not used.
      */
1691 /* ARGSUSED */
1692 static int
1693 nfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1694         int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1695         int *direntflags, pathname_t *realpnp)
1696 {
1697         int error;
1698         vnode_t *vp;
1699         vnode_t *avp = NULL;
1700         rnode_t *drp;
1701 
1702         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1703                 return (EPERM);
1704 
1705         drp = VTOR(dvp);
1706 
1707         /*
1708          * Are we looking up extended attributes?  If so, "dvp" is
1709          * the file or directory for which we want attributes, and
1710          * we need a lookup of the hidden attribute directory
1711          * before we lookup the rest of the path.
1712          */
1713         if (flags & LOOKUP_XATTR) {
1714                 bool_t cflag = ((flags & CREATE_XATTR_DIR) != 0);
1715                 mntinfo_t *mi;
1716 
1717                 mi = VTOMI(dvp);
1718                 if (!(mi->mi_flags & MI_EXTATTR))
1719                         return (EINVAL);
1720 
1721                 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp)))
1722                         return (EINTR);
1723 
                     /*
                      * Try the DNLC first; its return value is ignored and
                      * only avp is examined.  On a miss, ask the server.
                      */
1724                 (void) nfslookup_dnlc(dvp, XATTR_DIR_NAME, &avp, cr);
1725                 if (avp == NULL)
1726                         error = acl_getxattrdir2(dvp, &avp, cflag, cr, 0);
1727                 else
1728                         error = 0;
1729 
1730                 nfs_rw_exit(&drp->r_rwlock);
1731 
                     /*
                      * MI_EXTATTR is tested again: if it is no longer set,
                      * EINVAL is reported, as in the check above, rather
                      * than the error from the call.
                      */
1732                 if (error) {
1733                         if (mi->mi_flags & MI_EXTATTR)
1734                                 return (error);
1735                         return (EINVAL);
1736                 }
                     /* From here on the attribute directory is the parent. */
1737                 dvp = avp;
1738                 drp = VTOR(dvp);
1739         }
1740 
1741         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR(dvp))) {
1742                 error = EINTR;
1743                 goto out;
1744         }
1745 
1746         error = nfslookup(dvp, nm, vpp, pnp, flags, rdir, cr, 0);
1747 
1748         nfs_rw_exit(&drp->r_rwlock);
1749 
1750         /*
1751          * If vnode is a device, create special vnode.
1752          */
1753         if (!error && IS_DEVVP(*vpp)) {
1754                 vp = *vpp;
1755                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
1756                 VN_RELE(vp);
1757         }
1758 
1759 out:
             /* avp is only non-NULL on the LOOKUP_XATTR path; release it. */
1760         if (avp != NULL)
1761                 VN_RELE(avp);
1762 
1763         return (error);
1764 }
1765 
1766 volatile int nfs_lookup_neg_cache = 1;  /* nonzero: cache ENOENT in the DNLC */
1767 
1768 #ifdef DEBUG
     /* Counters updated by nfslookup_dnlc(); compiled in only under DEBUG. */
1769 static int nfs_lookup_dnlc_hits = 0;            /* positive entry returned */
1770 static int nfs_lookup_dnlc_misses = 0;          /* first DNLC probe found nothing */
1771 static int nfs_lookup_dnlc_neg_hits = 0;        /* negative entry, ENOENT returned */
1772 static int nfs_lookup_dnlc_disappears = 0;      /* entry gone after cache validation */
1773 static int nfs_lookup_dnlc_lookups = 0;         /* calls to nfslookup_dnlc() */
1774 #endif
1775 
     /*
      * Common lookup of nm in dvp.  Besides being called directly, this
      * function is installed as the failinfo lookupproc by callers in this
      * file.  The caller must be in the mount's zone (asserted).
      *
      * An empty name returns dvp itself with no checks.  "." returns dvp
      * after an nfs_access() check for VEXEC.  Any other name is tried in
      * the DNLC and then over the wire.  When RFSCALL_SOFT is set in
      * rfscall_flags, the "." and DNLC paths are bypassed and only the
      * over-the-wire lookup is made.  Returns ENOTDIR if dvp is not a
      * directory and nm is not empty.  The pnp, flags and rdir arguments
      * are not used.
      */
1776 /* ARGSUSED */
1777 int
1778 nfslookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1779         int flags, vnode_t *rdir, cred_t *cr, int rfscall_flags)
1780 {
1781         int error;
1782 
1783         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1784 
1785         /*
1786          * If lookup is for "", just return dvp.  Don't need
1787          * to send it over the wire, look it up in the dnlc,
1788          * or perform any access checks.
1789          */
1790         if (*nm == '\0') {
1791                 VN_HOLD(dvp);
1792                 *vpp = dvp;
1793                 return (0);
1794         }
1795 
1796         /*
1797          * Can't do lookups in non-directories.
1798          */
1799         if (dvp->v_type != VDIR)
1800                 return (ENOTDIR);
1801 
1802         /*
1803          * If we're called with RFSCALL_SOFT, it's important that
1804          * the only rfscall is one we make directly; if we permit
1805          * an access call because we're looking up "." or validating
1806          * a dnlc hit, we'll deadlock because that rfscall will not
1807          * have the RFSCALL_SOFT set.
1808          */
1809         if (rfscall_flags & RFSCALL_SOFT)
1810                 goto callit;
1811 
1812         /*
1813          * If lookup is for ".", just return dvp.  Don't need
1814          * to send it over the wire or look it up in the dnlc,
1815          * just need to check access.
1816          */
1817         if (strcmp(nm, ".") == 0) {
1818                 error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1819                 if (error)
1820                         return (error);
1821                 VN_HOLD(dvp);
1822                 *vpp = dvp;
1823                 return (0);
1824         }
1825 
1826         /*
1827          * Lookup this name in the DNLC.  If there was a valid entry,
1828          * then return the results of the lookup.
1829          */
1830         error = nfslookup_dnlc(dvp, nm, vpp, cr);
             /* A return of 0 with *vpp still NULL means "not in the DNLC". */
1831         if (error || *vpp != NULL)
1832                 return (error);
1833 
1834 callit:
1835         error = nfslookup_otw(dvp, nm, vpp, cr, rfscall_flags);
1836 
1837         return (error);
1838 }
1839 
     /*
      * Look up nm in the DNLC for directory dvp.  nm must not be empty and
      * the caller must be in the mount's zone (both asserted).
      *
      * Returns:
      *   0 with *vpp set to the vnode      - positive DNLC hit
      *   0 with *vpp set to NULL           - no usable entry; the caller
      *                                       is expected to look elsewhere
      *   ENOENT                            - negative (DNLC_NO_VNODE) hit
      *   other errno                       - from nfs_validate_caches() or
      *                                       the nfs_access() VEXEC check
      * On the error returns *vpp is left untouched.
      */
1840 static int
1841 nfslookup_dnlc(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
1842 {
1843         int error;
1844         vnode_t *vp;
1845 
1846         ASSERT(*nm != '\0');
1847         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1848 
1849         /*
1850          * Lookup this name in the DNLC.  If successful, then validate
1851          * the caches and then recheck the DNLC.  The DNLC is rechecked
1852          * just in case this entry got invalidated during the call
1853          * to nfs_validate_caches.
1854          *
1855          * An assumption is being made that it is safe to say that a
1856          * file exists which may not on the server.  Any operations to
1857          * the server will fail with ESTALE.
1858          */
1859 #ifdef DEBUG
1860         nfs_lookup_dnlc_lookups++;
1861 #endif
1862         vp = dnlc_lookup(dvp, nm);
1863         if (vp != NULL) {
                     /*
                      * The first probe only tells us an entry exists; the
                      * vnode is released right away and fetched again after
                      * validation.  For a negative entry on a file system
                      * that is not read-only, dvp's cached attributes are
                      * purged before validating (presumably to force the
                      * validation to consult the server -- confirm).
                      */
1864                 VN_RELE(vp);
1865                 if (vp == DNLC_NO_VNODE && !vn_is_readonly(dvp)) {
1866                         PURGE_ATTRCACHE(dvp);
1867                 }
1868                 error = nfs_validate_caches(dvp, cr);
1869                 if (error)
1870                         return (error);
1871                 vp = dnlc_lookup(dvp, nm);
1872                 if (vp != NULL) {
                             /* Search permission on dvp is required. */
1873                         error = nfs_access(dvp, VEXEC, 0, cr, NULL);
1874                         if (error) {
1875                                 VN_RELE(vp);
1876                                 return (error);
1877                         }
1878                         if (vp == DNLC_NO_VNODE) {
1879                                 VN_RELE(vp);
1880 #ifdef DEBUG
1881                                 nfs_lookup_dnlc_neg_hits++;
1882 #endif
1883                                 return (ENOENT);
1884                         }
1885                         *vpp = vp;
1886 #ifdef DEBUG
1887                         nfs_lookup_dnlc_hits++;
1888 #endif
1889                         return (0);
1890                 }
1891 #ifdef DEBUG
1892                 nfs_lookup_dnlc_disappears++;
1893 #endif
1894         }
1895 #ifdef DEBUG
1896         else
1897                 nfs_lookup_dnlc_misses++;
1898 #endif
1899 
             /* Miss, or the entry vanished during validation. */
1900         *vpp = NULL;
1901 
1902         return (0);
1903 }
1904 
     /*
      * Look up nm in directory dvp with an RFS_LOOKUP call to the server.
      * nm must not be empty, dvp must be a directory and the caller must
      * be in the mount's zone (all asserted).
      *
      * On success *vpp is set to the vnode returned by makenfsnode() and,
      * unless RFSCALL_SOFT is set in rfscall_flags, the DNLC is updated
      * with it.  If the server answers ENOENT and nfs_lookup_neg_cache is
      * nonzero, a negative DNLC entry is entered.  Returns 0 or an errno
      * value; *vpp is not written on failure.
      */
1905 static int
1906 nfslookup_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr,
1907         int rfscall_flags)
1908 {
1909         int error;
1910         struct nfsdiropargs da;
1911         struct nfsdiropres dr;
1912         int douprintf;
1913         failinfo_t fi;
1914         hrtime_t t;
1915 
1916         ASSERT(*nm != '\0');
1917         ASSERT(dvp->v_type == VDIR);
1918         ASSERT(nfs_zone() == VTOMI(dvp)->mi_zone);
1919 
1920         setdiropargs(&da, nm, dvp);
1921 
1922         fi.vp = dvp;
1923         fi.fhp = NULL;          /* no need to update, filehandle not copied */
1924         fi.copyproc = nfscopyfh;
1925         fi.lookupproc = nfslookup;
1926         fi.xattrdirproc = acl_getxattrdir2;
1927 
1928         douprintf = 1;
1929 
             /*
              * Time stamp taken before the call; it is passed to
              * makenfsnode() together with the returned attributes.
              */
1930         t = gethrtime();
1931 
1932         error = rfs2call(VTOMI(dvp), RFS_LOOKUP,
1933             xdr_diropargs, (caddr_t)&da,
1934             xdr_diropres, (caddr_t)&dr, cr,
1935             &douprintf, &dr.dr_status, rfscall_flags, &fi);
1936 
1937         if (!error) {
                     /* The RPC itself worked; now decode the NFS status. */
1938                 error = geterrno(dr.dr_status);
1939                 if (!error) {
1940                         *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
1941                             dvp->v_vfsp, t, cr, VTOR(dvp)->r_path, nm);
1942                         /*
1943                          * If NFS_ACL is supported on the server, then the
1944                          * attributes returned by server may have minimal
1945                          * permissions sometimes denying access to users having
1946                          * proper access.  To get the proper attributes, mark
1947                          * the attributes as expired so that they will be
1948                          * regotten via the NFS_ACL GETATTR2 procedure.
1949                          */
1950                         if (VTOMI(*vpp)->mi_flags & MI_ACL) {
1951                                 PURGE_ATTRCACHE(*vpp);
1952                         }
1953                         if (!(rfscall_flags & RFSCALL_SOFT))
1954                                 dnlc_update(dvp, nm, *vpp);
1955                 } else {
1956                         PURGE_STALE_FH(error, dvp, cr);
1957                         if (error == ENOENT && nfs_lookup_neg_cache)
1958                                 dnlc_enter(dvp, nm, DNLC_NO_VNODE);
1959                 }
1960         }
1961 
1962         return (error);
1963 }
1964 
     /*
      * Create the file nm in directory dvp with the attributes in va and
      * return its vnode in *vpp.
      *
      * The directory's r_rwlock is held as writer for the duration.  The
      * name is looked up first: "" and "." resolve to dvp itself, any other
      * name is looked up over the wire.  If the name exists, the result is
      * EEXIST for an exclusive create and EISDIR for a directory opened
      * with VWRITE; otherwise access is checked with VOP_ACCESS() and, if
      * AT_SIZE was requested on a regular file, the size is set with
      * nfssetattr().  If the lookup fails for any reason, an RFS_CREATE
      * call is made, with device, fifo and socket types encoded in the
      * mode and size fields of the request.
      *
      * Returns EPERM if called from a zone other than the mount's, EINTR
      * if the wait for the lock is interrupted, EACCES for a new regular
      * file for which MANDMODE() is true, or an error from setdirgid(),
      * vattr_to_sattr() or the RFS_CREATE call.  The lfaware and vsecp
      * arguments are not used.
      */
1965 /* ARGSUSED */
1966 static int
1967 nfs_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
1968         int mode, vnode_t **vpp, cred_t *cr, int lfaware, caller_context_t *ct,
1969         vsecattr_t *vsecp)
1970 {
1971         int error;
1972         struct nfscreatargs args;
1973         struct nfsdiropres dr;
1974         int douprintf;
1975         vnode_t *vp;
1976         rnode_t *rp;
1977         struct vattr vattr;
1978         rnode_t *drp;
1979         vnode_t *tempvp;
1980         hrtime_t t;
1981 
1982         drp = VTOR(dvp);
1983 
1984         if (nfs_zone() != VTOMI(dvp)->mi_zone)
1985                 return (EPERM);
1986         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
1987                 return (EINTR);
1988 
1989         /*
1990          * We make a copy of the attributes because the caller does not
1991          * expect us to change what va points to.
1992          */
1993         vattr = *va;
1994 
1995         /*
1996          * If the pathname is "", just use dvp.  Don't need
1997          * to send it over the wire, look it up in the dnlc,
1998          * or perform any access checks.
1999          */
2000         if (*nm == '\0') {
2001                 error = 0;
2002                 VN_HOLD(dvp);
2003                 vp = dvp;
2004         /*
2005          * If the pathname is ".", just use dvp.  Don't need
2006          * to send it over the wire or look it up in the dnlc,
2007          * just need to check access.
2008          */
2009         } else if (strcmp(nm, ".") == 0) {
2010                 error = nfs_access(dvp, VEXEC, 0, cr, ct);
2011                 if (error) {
2012                         nfs_rw_exit(&drp->r_rwlock);
2013                         return (error);
2014                 }
2015                 VN_HOLD(dvp);
2016                 vp = dvp;
2017         /*
2018          * We need to go over the wire, just to be sure whether the
2019          * file exists or not.  Using the DNLC can be dangerous in
2020          * this case when making a decision regarding existence.
2021          */
2022         } else {
2023                 error = nfslookup_otw(dvp, nm, &vp, cr, 0);
2024         }
             /*
              * The name already exists.  Everything in this branch ends
              * with a return; the create call below is reached only when
              * the lookup failed.
              */
2025         if (!error) {
2026                 if (exclusive == EXCL)
2027                         error = EEXIST;
2028                 else if (vp->v_type == VDIR && (mode & VWRITE))
2029                         error = EISDIR;
2030                 else {
2031                         /*
2032                          * If vnode is a device, create special vnode.
2033                          */
2034                         if (IS_DEVVP(vp)) {
2035                                 tempvp = vp;
2036                                 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2037                                 VN_RELE(tempvp);
2038                         }
2039                         if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
2040                                 if ((vattr.va_mask & AT_SIZE) &&
2041                                     vp->v_type == VREG) {
                                             /* Only the size is changed. */
2042                                         vattr.va_mask = AT_SIZE;
2043                                         error = nfssetattr(vp, &vattr, 0, cr);
2044 
2045                                         if (!error) {
2046                                                 /*
2047                                                  * Existing file was truncated;
2048                                                  * emit a create event.
2049                                                  */
2050                                                 vnevent_create(vp, ct);
2051                                         }
2052                                 }
2053                         }
2054                 }
2055                 nfs_rw_exit(&drp->r_rwlock);
2056                 if (error) {
2057                         VN_RELE(vp);
2058                 } else {
2059                         *vpp = vp;
2060                 }
2061                 return (error);
2062         }
2063 
             /*
              * The lookup failed (with whatever error), so try to create
              * the file.  The lookup error is not returned; it is
              * overwritten below.
              */
2064         ASSERT(vattr.va_mask & AT_TYPE);
2065         if (vattr.va_type == VREG) {
2066                 ASSERT(vattr.va_mask & AT_MODE);
2067                 if (MANDMODE(vattr.va_mode)) {
2068                         nfs_rw_exit(&drp->r_rwlock);
2069                         return (EACCES);
2070                 }
2071         }
2072 
2073         dnlc_remove(dvp, nm);
2074 
2075         setdiropargs(&args.ca_da, nm, dvp);
2076 
2077         /*
2078          * Decide what the group-id of the created file should be.
2079          * Set it in attribute list as advisory...then do a setattr
2080          * if the server didn't get it right the first time.
2081          */
2082         error = setdirgid(dvp, &vattr.va_gid, cr);
2083         if (error) {
2084                 nfs_rw_exit(&drp->r_rwlock);
2085                 return (error);
2086         }
2087         vattr.va_mask |= AT_GID;
2088 
2089         /*
2090          * This is a completely gross hack to make mknod
2091          * work over the wire until we can whack the protocol
2092          */
2093 #define IFCHR           0020000         /* character special */
2094 #define IFBLK           0060000         /* block special */
2095 #define IFSOCK          0140000         /* socket */
2096 
2097         /*
2098          * dev_t is uint_t in 5.x and short in 4.x. 4.x
2099          * supports 8 bit majors. 5.x supports 14 bit majors. 5.x supports 18
2100          * bits in the minor number where 4.x supports 8 bits.  If the 5.x
2101          * minor/major numbers are <= 8 bits long, compress the device
2102          * number before sending it. Otherwise, the 4.x server will not
2103          * create the device with the correct device number and nothing can be
2104          * done about this.
2105          */
2106         if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
2107                 dev_t d = vattr.va_rdev;
2108                 dev32_t dev32;
2109 
2110                 if (vattr.va_type == VCHR)
2111                         vattr.va_mode |= IFCHR;
2112                 else
2113                         vattr.va_mode |= IFBLK;
2114 
                     /*
                      * The device number travels in the size field.  If any
                      * bit falls outside the 4.x major/minor range, the
                      * 32-bit form is sent as is; otherwise the compressed
                      * form from nfsv2_cmpdev() is sent.
                      */
2115                 (void) cmpldev(&dev32, d);
2116                 if (dev32 & ~((SO4_MAXMAJ << L_BITSMINOR32) | SO4_MAXMIN))
2117                         vattr.va_size = (u_offset_t)dev32;
2118                 else
2119                         vattr.va_size = (u_offset_t)nfsv2_cmpdev(d);
2120 
2121                 vattr.va_mask |= AT_MODE|AT_SIZE;
2122         } else if (vattr.va_type == VFIFO) {
2123                 vattr.va_mode |= IFCHR;         /* xtra kludge for namedpipe */
2124                 vattr.va_size = (u_offset_t)NFS_FIFO_DEV;       /* blech */
2125                 vattr.va_mask |= AT_MODE|AT_SIZE;
2126         } else if (vattr.va_type == VSOCK) {
2127                 vattr.va_mode |= IFSOCK;
2128                 /*
2129                  * To avoid triggering bugs in the servers set AT_SIZE
2130                  * (all other RFS_CREATE calls set this).
2131                  */
2132                 vattr.va_size = 0;
2133                 vattr.va_mask |= AT_MODE|AT_SIZE;
2134         }
2135 
2136         args.ca_sa = &args.ca_sa_buf;
2137         error = vattr_to_sattr(&vattr, args.ca_sa);
2138         if (error) {
2139                 /* req time field(s) overflow - return immediately */
2140                 nfs_rw_exit(&drp->r_rwlock);
2141                 return (error);
2142         }
2143 
2144         douprintf = 1;
2145 
             /*
              * Time stamp taken before the call; it is passed to
              * makenfsnode() together with the returned attributes.
              */
2146         t = gethrtime();
2147 
2148         error = rfs2call(VTOMI(dvp), RFS_CREATE,
2149             xdr_creatargs, (caddr_t)&args,
2150             xdr_diropres, (caddr_t)&dr, cr,
2151             &douprintf, &dr.dr_status, 0, NULL);
2152 
2153         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2154 
2155         if (!error) {
2156                 error = geterrno(dr.dr_status);
2157                 if (!error) {
2158                         if (HAVE_RDDIR_CACHE(drp))
2159                                 nfs_purge_rddir_cache(dvp);
2160                         vp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2161                             dvp->v_vfsp, t, cr, NULL, NULL);
2162                         /*
2163                          * If NFS_ACL is supported on the server, then the
2164                          * attributes returned by server may have minimal
2165                          * permissions sometimes denying access to users having
2166                          * proper access.  To get the proper attributes, mark
2167                          * the attributes as expired so that they will be
2168                          * regotten via the NFS_ACL GETATTR2 procedure.
2169                          */
2170                         if (VTOMI(vp)->mi_flags & MI_ACL) {
2171                                 PURGE_ATTRCACHE(vp);
2172                         }
2173                         dnlc_update(dvp, nm, vp);
2174                         rp = VTOR(vp);
                             /*
                              * A requested size of zero: reset the cached
                              * size and drop any cached pages.
                              * NOTE(review): va_size is read here without
                              * testing AT_SIZE in va_mask -- confirm that
                              * callers always initialize it.
                              */
2175                         if (vattr.va_size == 0) {
2176                                 mutex_enter(&rp->r_statelock);
2177                                 rp->r_size = 0;
2178                                 mutex_exit(&rp->r_statelock);
2179                                 if (vn_has_cached_data(vp)) {
2180                                         ASSERT(vp->v_type != VCHR);
2181                                         nfs_invalidate_pages(vp,
2182                                             (u_offset_t)0, cr);
2183                                 }
2184                         }
2185 
2186                         /*
2187                          * Make sure the gid was set correctly.
2188                          * If not, try to set it (but don't lose
2189                          * any sleep over it).
2190                          */
2191                         if (vattr.va_gid != rp->r_attr.va_gid) {
2192                                 vattr.va_mask = AT_GID;
2193                                 (void) nfssetattr(vp, &vattr, 0, cr);
2194                         }
2195 
2196                         /*
2197                          * If vnode is a device create special vnode
2198                          */
2199                         if (IS_DEVVP(vp)) {
2200                                 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
2201                                 VN_RELE(vp);
2202                         } else
2203                                 *vpp = vp;
2204                 } else {
2205                         PURGE_STALE_FH(error, dvp, cr);
2206                 }
2207         }
2208 
2209         nfs_rw_exit(&drp->r_rwlock);
2210 
2211         return (error);
2212 }
2213 
2214 /*
2215  * Weirdness: if the vnode to be removed is open
2216  * we rename it instead of removing it and nfs_inactive
2217  * will remove the new name.
2218  */
2219 /* ARGSUSED */
2220 static int
2221 nfs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
2222 {
2223         int error;
2224         struct nfsdiropargs da;
2225         enum nfsstat status;
2226         vnode_t *vp;
2227         char *tmpname;
2228         int douprintf;
2229         rnode_t *rp;
2230         rnode_t *drp;
2231 
2232         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2233                 return (EPERM);
2234         drp = VTOR(dvp);
2235         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2236                 return (EINTR);
2237 
2238         error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2239         if (error) {
2240                 nfs_rw_exit(&drp->r_rwlock);
2241                 return (error);
2242         }
2243 
2244         if (vp->v_type == VDIR && secpolicy_fs_linkdir(cr, dvp->v_vfsp)) {
2245                 VN_RELE(vp);
2246                 nfs_rw_exit(&drp->r_rwlock);
2247                 return (EPERM);
2248         }
2249 
2250         /*
2251          * First just remove the entry from the name cache, as it
2252          * is most likely the only entry for this vp.
2253          */
2254         dnlc_remove(dvp, nm);
2255 
2256         /*
2257          * If the file has a v_count > 1 then there may be more than one
2258          * entry in the name cache due multiple links or an open file,
2259          * but we don't have the real reference count so flush all
2260          * possible entries.
2261          */
2262         if (vp->v_count > 1)
2263                 dnlc_purge_vp(vp);
2264 
2265         /*
2266          * Now we have the real reference count on the vnode
2267          */
2268         rp = VTOR(vp);
2269         mutex_enter(&rp->r_statelock);
2270         if (vp->v_count > 1 &&
2271             (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
2272                 mutex_exit(&rp->r_statelock);
2273                 tmpname = newname();
2274                 error = nfsrename(dvp, nm, dvp, tmpname, cr, ct);
2275                 if (error)
2276                         kmem_free(tmpname, MAXNAMELEN);
2277                 else {
2278                         mutex_enter(&rp->r_statelock);
2279                         if (rp->r_unldvp == NULL) {
2280                                 VN_HOLD(dvp);
2281                                 rp->r_unldvp = dvp;
2282                                 if (rp->r_unlcred != NULL)
2283                                         crfree(rp->r_unlcred);
2284                                 crhold(cr);
2285                                 rp->r_unlcred = cr;
2286                                 rp->r_unlname = tmpname;
2287                         } else {
2288                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2289                                 rp->r_unlname = tmpname;
2290                         }
2291                         mutex_exit(&rp->r_statelock);
2292                 }
2293         } else {
2294                 mutex_exit(&rp->r_statelock);
2295                 /*
2296                  * We need to flush any dirty pages which happen to
2297                  * be hanging around before removing the file.  This
2298                  * shouldn't happen very often and mostly on file
2299                  * systems mounted "nocto".
2300                  */
2301                 if (vn_has_cached_data(vp) &&
2302                     ((rp->r_flags & RDIRTY) || rp->r_count > 0)) {
2303                         error = nfs_putpage(vp, (offset_t)0, 0, 0, cr, ct);
2304                         if (error && (error == ENOSPC || error == EDQUOT)) {
2305                                 mutex_enter(&rp->r_statelock);
2306                                 if (!rp->r_error)
2307                                         rp->r_error = error;
2308                                 mutex_exit(&rp->r_statelock);
2309                         }
2310                 }
2311 
2312                 setdiropargs(&da, nm, dvp);
2313 
2314                 douprintf = 1;
2315 
2316                 error = rfs2call(VTOMI(dvp), RFS_REMOVE,
2317                     xdr_diropargs, (caddr_t)&da,
2318                     xdr_enum, (caddr_t)&status, cr,
2319                     &douprintf, &status, 0, NULL);
2320 
2321                 /*
2322                  * The xattr dir may be gone after last attr is removed,
2323                  * so flush it from dnlc.
2324                  */
2325                 if (dvp->v_flag & V_XATTRDIR)
2326                         dnlc_purge_vp(dvp);
2327 
2328                 PURGE_ATTRCACHE(dvp);   /* mod time changed */
2329                 PURGE_ATTRCACHE(vp);    /* link count changed */
2330 
2331                 if (!error) {
2332                         error = geterrno(status);
2333                         if (!error) {
2334                                 if (HAVE_RDDIR_CACHE(drp))
2335                                         nfs_purge_rddir_cache(dvp);
2336                         } else {
2337                                 PURGE_STALE_FH(error, dvp, cr);
2338                         }
2339                 }
2340         }
2341 
2342         if (error == 0) {
2343                 vnevent_remove(vp, dvp, nm, ct);
2344         }
2345         VN_RELE(vp);
2346 
2347         nfs_rw_exit(&drp->r_rwlock);
2348 
2349         return (error);
2350 }
2351 
2352 /* ARGSUSED */
2353 static int
2354 nfs_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
2355         caller_context_t *ct, int flags)
2356 {
2357         int error;
2358         struct nfslinkargs args;
2359         enum nfsstat status;
2360         vnode_t *realvp;
2361         int douprintf;
2362         rnode_t *tdrp;
2363 
2364         if (nfs_zone() != VTOMI(tdvp)->mi_zone)
2365                 return (EPERM);
2366         if (VOP_REALVP(svp, &realvp, ct) == 0)
2367                 svp = realvp;
2368 
2369         args.la_from = VTOFH(svp);
2370         setdiropargs(&args.la_to, tnm, tdvp);
2371 
2372         tdrp = VTOR(tdvp);
2373         if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR(tdvp)))
2374                 return (EINTR);
2375 
2376         dnlc_remove(tdvp, tnm);
2377 
2378         douprintf = 1;
2379 
2380         error = rfs2call(VTOMI(svp), RFS_LINK,
2381             xdr_linkargs, (caddr_t)&args,
2382             xdr_enum, (caddr_t)&status, cr,
2383             &douprintf, &status, 0, NULL);
2384 
2385         PURGE_ATTRCACHE(tdvp);  /* mod time changed */
2386         PURGE_ATTRCACHE(svp);   /* link count changed */
2387 
2388         if (!error) {
2389                 error = geterrno(status);
2390                 if (!error) {
2391                         if (HAVE_RDDIR_CACHE(tdrp))
2392                                 nfs_purge_rddir_cache(tdvp);
2393                 }
2394         }
2395 
2396         nfs_rw_exit(&tdrp->r_rwlock);
2397 
2398         if (!error) {
2399                 /*
2400                  * Notify the source file of this link operation.
2401                  */
2402                 vnevent_link(svp, ct);
2403         }
2404         return (error);
2405 }
2406 
2407 /* ARGSUSED */
2408 static int
2409 nfs_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2410         caller_context_t *ct, int flags)
2411 {
2412         vnode_t *realvp;
2413 
2414         if (nfs_zone() != VTOMI(odvp)->mi_zone)
2415                 return (EPERM);
2416         if (VOP_REALVP(ndvp, &realvp, ct) == 0)
2417                 ndvp = realvp;
2418 
2419         return (nfsrename(odvp, onm, ndvp, nnm, cr, ct));
2420 }
2421 
2422 /*
2423  * nfsrename does the real work of renaming in NFS Version 2.
2424  */
2425 static int
2426 nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
2427     caller_context_t *ct)
2428 {
2429         int error;
2430         enum nfsstat status;
2431         struct nfsrnmargs args;
2432         int douprintf;
2433         vnode_t *nvp = NULL;
2434         vnode_t *ovp = NULL;
2435         char *tmpname;
2436         rnode_t *rp;
2437         rnode_t *odrp;
2438         rnode_t *ndrp;
2439 
2440         ASSERT(nfs_zone() == VTOMI(odvp)->mi_zone);
2441         if (strcmp(onm, ".") == 0 || strcmp(onm, "..") == 0 ||
2442             strcmp(nnm, ".") == 0 || strcmp(nnm, "..") == 0)
2443                 return (EINVAL);
2444 
2445         odrp = VTOR(odvp);
2446         ndrp = VTOR(ndvp);
2447         if ((intptr_t)odrp < (intptr_t)ndrp) {
2448                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp)))
2449                         return (EINTR);
2450                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp))) {
2451                         nfs_rw_exit(&odrp->r_rwlock);
2452                         return (EINTR);
2453                 }
2454         } else {
2455                 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR(ndvp)))
2456                         return (EINTR);
2457                 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR(odvp))) {
2458                         nfs_rw_exit(&ndrp->r_rwlock);
2459                         return (EINTR);
2460                 }
2461         }
2462 
2463         /*
2464          * Lookup the target file.  If it exists, it needs to be
2465          * checked to see whether it is a mount point and whether
2466          * it is active (open).
2467          */
2468         error = nfslookup(ndvp, nnm, &nvp, NULL, 0, NULL, cr, 0);
2469         if (!error) {
2470                 /*
2471                  * If this file has been mounted on, then just
2472                  * return busy because renaming to it would remove
2473                  * the mounted file system from the name space.
2474                  */
2475                 if (vn_mountedvfs(nvp) != NULL) {
2476                         VN_RELE(nvp);
2477                         nfs_rw_exit(&odrp->r_rwlock);
2478                         nfs_rw_exit(&ndrp->r_rwlock);
2479                         return (EBUSY);
2480                 }
2481 
2482                 /*
2483                  * Purge the name cache of all references to this vnode
2484                  * so that we can check the reference count to infer
2485                  * whether it is active or not.
2486                  */
2487                 /*
2488                  * First just remove the entry from the name cache, as it
2489                  * is most likely the only entry for this vp.
2490                  */
2491                 dnlc_remove(ndvp, nnm);
2492                 /*
2493                  * If the file has a v_count > 1 then there may be more
2494                  * than one entry in the name cache due multiple links
2495                  * or an open file, but we don't have the real reference
2496                  * count so flush all possible entries.
2497                  */
2498                 if (nvp->v_count > 1)
2499                         dnlc_purge_vp(nvp);
2500 
2501                 /*
2502                  * If the vnode is active and is not a directory,
2503                  * arrange to rename it to a
2504                  * temporary file so that it will continue to be
2505                  * accessible.  This implements the "unlink-open-file"
2506                  * semantics for the target of a rename operation.
2507                  * Before doing this though, make sure that the
2508                  * source and target files are not already the same.
2509                  */
2510                 if (nvp->v_count > 1 && nvp->v_type != VDIR) {
2511                         /*
2512                          * Lookup the source name.
2513                          */
2514                         error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL,
2515                             cr, 0);
2516 
2517                         /*
2518                          * The source name *should* already exist.
2519                          */
2520                         if (error) {
2521                                 VN_RELE(nvp);
2522                                 nfs_rw_exit(&odrp->r_rwlock);
2523                                 nfs_rw_exit(&ndrp->r_rwlock);
2524                                 return (error);
2525                         }
2526 
2527                         /*
2528                          * Compare the two vnodes.  If they are the same,
2529                          * just release all held vnodes and return success.
2530                          */
2531                         if (ovp == nvp) {
2532                                 VN_RELE(ovp);
2533                                 VN_RELE(nvp);
2534                                 nfs_rw_exit(&odrp->r_rwlock);
2535                                 nfs_rw_exit(&ndrp->r_rwlock);
2536                                 return (0);
2537                         }
2538 
2539                         /*
2540                          * Can't mix and match directories and non-
2541                          * directories in rename operations.  We already
2542                          * know that the target is not a directory.  If
2543                          * the source is a directory, return an error.
2544                          */
2545                         if (ovp->v_type == VDIR) {
2546                                 VN_RELE(ovp);
2547                                 VN_RELE(nvp);
2548                                 nfs_rw_exit(&odrp->r_rwlock);
2549                                 nfs_rw_exit(&ndrp->r_rwlock);
2550                                 return (ENOTDIR);
2551                         }
2552 
2553                         /*
2554                          * The target file exists, is not the same as
2555                          * the source file, and is active.  Link it
2556                          * to a temporary filename to avoid having
2557                          * the server removing the file completely.
2558                          */
2559                         tmpname = newname();
2560                         error = nfs_link(ndvp, nvp, tmpname, cr, NULL, 0);
2561                         if (error == EOPNOTSUPP) {
2562                                 error = nfs_rename(ndvp, nnm, ndvp, tmpname,
2563                                     cr, NULL, 0);
2564                         }
2565                         if (error) {
2566                                 kmem_free(tmpname, MAXNAMELEN);
2567                                 VN_RELE(ovp);
2568                                 VN_RELE(nvp);
2569                                 nfs_rw_exit(&odrp->r_rwlock);
2570                                 nfs_rw_exit(&ndrp->r_rwlock);
2571                                 return (error);
2572                         }
2573                         rp = VTOR(nvp);
2574                         mutex_enter(&rp->r_statelock);
2575                         if (rp->r_unldvp == NULL) {
2576                                 VN_HOLD(ndvp);
2577                                 rp->r_unldvp = ndvp;
2578                                 if (rp->r_unlcred != NULL)
2579                                         crfree(rp->r_unlcred);
2580                                 crhold(cr);
2581                                 rp->r_unlcred = cr;
2582                                 rp->r_unlname = tmpname;
2583                         } else {
2584                                 kmem_free(rp->r_unlname, MAXNAMELEN);
2585                                 rp->r_unlname = tmpname;
2586                         }
2587                         mutex_exit(&rp->r_statelock);
2588                 }
2589         }
2590 
2591         if (ovp == NULL) {
2592                 /*
2593                  * When renaming directories to be a subdirectory of a
2594                  * different parent, the dnlc entry for ".." will no
2595                  * longer be valid, so it must be removed.
2596                  *
2597                  * We do a lookup here to determine whether we are renaming
2598                  * a directory and we need to check if we are renaming
2599                  * an unlinked file.  This might have already been done
2600                  * in previous code, so we check ovp == NULL to avoid
2601                  * doing it twice.
2602                  */
2603 
2604                 error = nfslookup(odvp, onm, &ovp, NULL, 0, NULL, cr, 0);
2605 
2606                 /*
2607                  * The source name *should* already exist.
2608                  */
2609                 if (error) {
2610                         nfs_rw_exit(&odrp->r_rwlock);
2611                         nfs_rw_exit(&ndrp->r_rwlock);
2612                         if (nvp) {
2613                                 VN_RELE(nvp);
2614                         }
2615                         return (error);
2616                 }
2617                 ASSERT(ovp != NULL);
2618         }
2619 
2620         dnlc_remove(odvp, onm);
2621         dnlc_remove(ndvp, nnm);
2622 
2623         setdiropargs(&args.rna_from, onm, odvp);
2624         setdiropargs(&args.rna_to, nnm, ndvp);
2625 
2626         douprintf = 1;
2627 
2628         error = rfs2call(VTOMI(odvp), RFS_RENAME,
2629             xdr_rnmargs, (caddr_t)&args,
2630             xdr_enum, (caddr_t)&status, cr,
2631             &douprintf, &status, 0, NULL);
2632 
2633         PURGE_ATTRCACHE(odvp);  /* mod time changed */
2634         PURGE_ATTRCACHE(ndvp);  /* mod time changed */
2635 
2636         if (!error) {
2637                 error = geterrno(status);
2638                 if (!error) {
2639                         if (HAVE_RDDIR_CACHE(odrp))
2640                                 nfs_purge_rddir_cache(odvp);
2641                         if (HAVE_RDDIR_CACHE(ndrp))
2642                                 nfs_purge_rddir_cache(ndvp);
2643                         /*
2644                          * when renaming directories to be a subdirectory of a
2645                          * different parent, the dnlc entry for ".." will no
2646                          * longer be valid, so it must be removed
2647                          */
2648                         rp = VTOR(ovp);
2649                         if (ndvp != odvp) {
2650                                 if (ovp->v_type == VDIR) {
2651                                         dnlc_remove(ovp, "..");
2652                                         if (HAVE_RDDIR_CACHE(rp))
2653                                                 nfs_purge_rddir_cache(ovp);
2654                                 }
2655                         }
2656 
2657                         /*
2658                          * If we are renaming the unlinked file, update the
2659                          * r_unldvp and r_unlname as needed.
2660                          */
2661                         mutex_enter(&rp->r_statelock);
2662                         if (rp->r_unldvp != NULL) {
2663                                 if (strcmp(rp->r_unlname, onm) == 0) {
2664                                         (void) strncpy(rp->r_unlname,
2665                                             nnm, MAXNAMELEN);
2666                                         rp->r_unlname[MAXNAMELEN - 1] = '\0';
2667 
2668                                         if (ndvp != rp->r_unldvp) {
2669                                                 VN_RELE(rp->r_unldvp);
2670                                                 rp->r_unldvp = ndvp;
2671                                                 VN_HOLD(ndvp);
2672                                         }
2673                                 }
2674                         }
2675                         mutex_exit(&rp->r_statelock);
2676                 } else {
2677                         /*
2678                          * System V defines rename to return EEXIST, not
2679                          * ENOTEMPTY if the target directory is not empty.
2680                          * Over the wire, the error is NFSERR_ENOTEMPTY
2681                          * which geterrno maps to ENOTEMPTY.
2682                          */
2683                         if (error == ENOTEMPTY)
2684                                 error = EEXIST;
2685                 }
2686         }
2687 
2688         if (error == 0) {
2689                 if (nvp)
2690                         vnevent_rename_dest(nvp, ndvp, nnm, ct);
2691 
2692                 if (odvp != ndvp)
2693                         vnevent_rename_dest_dir(ndvp, ct);
2694 
2695                 ASSERT(ovp != NULL);
2696                 vnevent_rename_src(ovp, odvp, onm, ct);
2697         }
2698 
2699         if (nvp) {
2700                 VN_RELE(nvp);
2701         }
2702         VN_RELE(ovp);
2703 
2704         nfs_rw_exit(&odrp->r_rwlock);
2705         nfs_rw_exit(&ndrp->r_rwlock);
2706 
2707         return (error);
2708 }
2709 
2710 /* ARGSUSED */
2711 static int
2712 nfs_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
2713         caller_context_t *ct, int flags, vsecattr_t *vsecp)
2714 {
2715         int error;
2716         struct nfscreatargs args;
2717         struct nfsdiropres dr;
2718         int douprintf;
2719         rnode_t *drp;
2720         hrtime_t t;
2721 
2722         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2723                 return (EPERM);
2724 
2725         setdiropargs(&args.ca_da, nm, dvp);
2726 
2727         /*
2728          * Decide what the group-id and set-gid bit of the created directory
2729          * should be.  May have to do a setattr to get the gid right.
2730          */
2731         error = setdirgid(dvp, &va->va_gid, cr);
2732         if (error)
2733                 return (error);
2734         error = setdirmode(dvp, &va->va_mode, cr);
2735         if (error)
2736                 return (error);
2737         va->va_mask |= AT_MODE|AT_GID;
2738 
2739         args.ca_sa = &args.ca_sa_buf;
2740         error = vattr_to_sattr(va, args.ca_sa);
2741         if (error) {
2742                 /* req time field(s) overflow - return immediately */
2743                 return (error);
2744         }
2745 
2746         drp = VTOR(dvp);
2747         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2748                 return (EINTR);
2749 
2750         dnlc_remove(dvp, nm);
2751 
2752         douprintf = 1;
2753 
2754         t = gethrtime();
2755 
2756         error = rfs2call(VTOMI(dvp), RFS_MKDIR,
2757             xdr_creatargs, (caddr_t)&args,
2758             xdr_diropres, (caddr_t)&dr, cr,
2759             &douprintf, &dr.dr_status, 0, NULL);
2760 
2761         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2762 
2763         if (!error) {
2764                 error = geterrno(dr.dr_status);
2765                 if (!error) {
2766                         if (HAVE_RDDIR_CACHE(drp))
2767                                 nfs_purge_rddir_cache(dvp);
2768                         /*
2769                          * The attributes returned by RFS_MKDIR can not
2770                          * be depended upon, so mark the attribute cache
2771                          * as purged.  A subsequent GETATTR will get the
2772                          * correct attributes from the server.
2773                          */
2774                         *vpp = makenfsnode(&dr.dr_fhandle, &dr.dr_attr,
2775                             dvp->v_vfsp, t, cr, NULL, NULL);
2776                         PURGE_ATTRCACHE(*vpp);
2777                         dnlc_update(dvp, nm, *vpp);
2778 
2779                         /*
2780                          * Make sure the gid was set correctly.
2781                          * If not, try to set it (but don't lose
2782                          * any sleep over it).
2783                          */
2784                         if (va->va_gid != VTOR(*vpp)->r_attr.va_gid) {
2785                                 va->va_mask = AT_GID;
2786                                 (void) nfssetattr(*vpp, va, 0, cr);
2787                         }
2788                 } else {
2789                         PURGE_STALE_FH(error, dvp, cr);
2790                 }
2791         }
2792 
2793         nfs_rw_exit(&drp->r_rwlock);
2794 
2795         return (error);
2796 }
2797 
2798 /* ARGSUSED */
2799 static int
2800 nfs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
2801         caller_context_t *ct, int flags)
2802 {
2803         int error;
2804         enum nfsstat status;
2805         struct nfsdiropargs da;
2806         vnode_t *vp;
2807         int douprintf;
2808         rnode_t *drp;
2809 
2810         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2811                 return (EPERM);
2812         drp = VTOR(dvp);
2813         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2814                 return (EINTR);
2815 
2816         /*
2817          * Attempt to prevent a rmdir(".") from succeeding.
2818          */
2819         error = nfslookup(dvp, nm, &vp, NULL, 0, NULL, cr, 0);
2820         if (error) {
2821                 nfs_rw_exit(&drp->r_rwlock);
2822                 return (error);
2823         }
2824 
2825         if (vp == cdir) {
2826                 VN_RELE(vp);
2827                 nfs_rw_exit(&drp->r_rwlock);
2828                 return (EINVAL);
2829         }
2830 
2831         setdiropargs(&da, nm, dvp);
2832 
2833         /*
2834          * First just remove the entry from the name cache, as it
2835          * is most likely an entry for this vp.
2836          */
2837         dnlc_remove(dvp, nm);
2838 
2839         /*
2840          * If there vnode reference count is greater than one, then
2841          * there may be additional references in the DNLC which will
2842          * need to be purged.  First, trying removing the entry for
2843          * the parent directory and see if that removes the additional
2844          * reference(s).  If that doesn't do it, then use dnlc_purge_vp
2845          * to completely remove any references to the directory which
2846          * might still exist in the DNLC.
2847          */
2848         if (vp->v_count > 1) {
2849                 dnlc_remove(vp, "..");
2850                 if (vp->v_count > 1)
2851                         dnlc_purge_vp(vp);
2852         }
2853 
2854         douprintf = 1;
2855 
2856         error = rfs2call(VTOMI(dvp), RFS_RMDIR,
2857             xdr_diropargs, (caddr_t)&da,
2858             xdr_enum, (caddr_t)&status, cr,
2859             &douprintf, &status, 0, NULL);
2860 
2861         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2862 
2863         if (error) {
2864                 VN_RELE(vp);
2865                 nfs_rw_exit(&drp->r_rwlock);
2866                 return (error);
2867         }
2868 
2869         error = geterrno(status);
2870         if (!error) {
2871                 if (HAVE_RDDIR_CACHE(drp))
2872                         nfs_purge_rddir_cache(dvp);
2873                 if (HAVE_RDDIR_CACHE(VTOR(vp)))
2874                         nfs_purge_rddir_cache(vp);
2875         } else {
2876                 PURGE_STALE_FH(error, dvp, cr);
2877                 /*
2878                  * System V defines rmdir to return EEXIST, not
2879                  * ENOTEMPTY if the directory is not empty.  Over
2880                  * the wire, the error is NFSERR_ENOTEMPTY which
2881                  * geterrno maps to ENOTEMPTY.
2882                  */
2883                 if (error == ENOTEMPTY)
2884                         error = EEXIST;
2885         }
2886 
2887         if (error == 0) {
2888                 vnevent_rmdir(vp, dvp, nm, ct);
2889         }
2890         VN_RELE(vp);
2891 
2892         nfs_rw_exit(&drp->r_rwlock);
2893 
2894         return (error);
2895 }
2896 
2897 /* ARGSUSED */
2898 static int
2899 nfs_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
2900         caller_context_t *ct, int flags)
2901 {
2902         int error;
2903         struct nfsslargs args;
2904         enum nfsstat status;
2905         int douprintf;
2906         rnode_t *drp;
2907 
2908         if (nfs_zone() != VTOMI(dvp)->mi_zone)
2909                 return (EPERM);
2910         setdiropargs(&args.sla_from, lnm, dvp);
2911         args.sla_sa = &args.sla_sa_buf;
2912         error = vattr_to_sattr(tva, args.sla_sa);
2913         if (error) {
2914                 /* req time field(s) overflow - return immediately */
2915                 return (error);
2916         }
2917         args.sla_tnm = tnm;
2918 
2919         drp = VTOR(dvp);
2920         if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR(dvp)))
2921                 return (EINTR);
2922 
2923         dnlc_remove(dvp, lnm);
2924 
2925         douprintf = 1;
2926 
2927         error = rfs2call(VTOMI(dvp), RFS_SYMLINK,
2928             xdr_slargs, (caddr_t)&args,
2929             xdr_enum, (caddr_t)&status, cr,
2930             &douprintf, &status, 0, NULL);
2931 
2932         PURGE_ATTRCACHE(dvp);   /* mod time changed */
2933 
2934         if (!error) {
2935                 error = geterrno(status);
2936                 if (!error) {
2937                         if (HAVE_RDDIR_CACHE(drp))
2938                                 nfs_purge_rddir_cache(dvp);
2939                 } else {
2940                         PURGE_STALE_FH(error, dvp, cr);
2941                 }
2942         }
2943 
2944         nfs_rw_exit(&drp->r_rwlock);
2945 
2946         return (error);
2947 }
2948 
2949 #ifdef DEBUG	/* readdir statistics, compiled only into DEBUG kernels */
2950 static int nfs_readdir_cache_hits = 0;	/* NOTE(review): update site not in view; presumably cache hits */
2951 static int nfs_readdir_cache_shorts = 0;	/* nfs_readdir returned early at the end-of-directory cookie */
2952 static int nfs_readdir_cache_waits = 0;	/* nfs_readdir waited for a cache entry still being filled */
2953 static int nfs_readdir_cache_misses = 0;	/* NOTE(review): update site not in view; presumably cache misses */
2954 static int nfs_readdir_readahead = 0;	/* NOTE(review): update site not in view; presumably readaheads */
2955 #endif
2956 
2957 volatile int nfs_shrinkreaddir = 0;	/* nonzero: nfs_readdir caps its request size at 0x400 bytes instead of NFS_MAXDATA */
2958 
2959 /*
2960  * Read directory entries.
2961  * There are some weird things to look out for here.  The uio_offset
2962  * field is either 0 or it is the offset returned from a previous
2963  * readdir.  It is an opaque value used by the server to find the
2964  * correct directory block to read. The count field is the number
2965  * of blocks to read on the server.  This is advisory only, the server
2966  * may return only one block's worth of entries.  Entries may be compressed
2967  * on the server.
2968  */
2969 /* ARGSUSED */
2970 static int
2971 nfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
2972         caller_context_t *ct, int flags)
2973 {
2974         int error;
2975         size_t count;
2976         rnode_t *rp;
2977         rddir_cache *rdc;
2978         rddir_cache *nrdc;
2979         rddir_cache *rrdc;
2980 #ifdef DEBUG
2981         int missed;
2982 #endif
2983         rddir_cache srdc;
2984         avl_index_t where;
2985 
2986         rp = VTOR(vp);
2987 
2988         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2989         if (nfs_zone() != VTOMI(vp)->mi_zone)
2990                 return (EIO);
2991         /*
2992          * Make sure that the directory cache is valid.
2993          */
2994         if (HAVE_RDDIR_CACHE(rp)) {
2995                 if (nfs_disable_rddir_cache) {
2996                         /*
2997                          * Setting nfs_disable_rddir_cache in /etc/system
2998                          * allows interoperability with servers that do not
2999                          * properly update the attributes of directories.
3000                          * Any cached information gets purged before an
3001                          * access is made to it.
3002                          */
3003                         nfs_purge_rddir_cache(vp);
3004                 } else {
3005                         error = nfs_validate_caches(vp, cr);
3006                         if (error)
3007                                 return (error);
3008                 }
3009         }
3010 
3011         /*
3012          * UGLINESS: SunOS 3.2 servers apparently cannot always handle an
3013          * RFS_READDIR request with rda_count set to more than 0x400. So
3014          * we reduce the request size here purely for compatibility.
3015          *
3016          * In general, this is no longer required.  However, if a server
3017          * is discovered which can not handle requests larger than 1024,
3018          * nfs_shrinkreaddir can be set to 1 to enable this backwards
3019          * compatibility.
3020          *
3021          * In any case, the request size is limited to NFS_MAXDATA bytes.
3022          */
3023         count = MIN(uiop->uio_iov->iov_len,
3024             nfs_shrinkreaddir ? 0x400 : NFS_MAXDATA);
3025 
3026         nrdc = NULL;
3027 #ifdef DEBUG
3028         missed = 0;
3029 #endif
3030 top:
3031         /*
3032          * Short circuit last readdir which always returns 0 bytes.
3033          * This can be done after the directory has been read through
3034          * completely at least once.  This will set r_direof which
3035          * can be used to find the value of the last cookie.
3036          */
3037         mutex_enter(&rp->r_statelock);
3038         if (rp->r_direof != NULL &&
3039             uiop->uio_offset == rp->r_direof->nfs_ncookie) {
3040                 mutex_exit(&rp->r_statelock);
3041 #ifdef DEBUG
3042                 nfs_readdir_cache_shorts++;
3043 #endif
3044                 if (eofp)
3045                         *eofp = 1;
3046                 if (nrdc != NULL)
3047                         rddir_cache_rele(nrdc);
3048                 return (0);
3049         }
3050         /*
3051          * Look for a cache entry.  Cache entries are identified
3052          * by the NFS cookie value and the byte count requested.
3053          */
3054         srdc.nfs_cookie = uiop->uio_offset;
3055         srdc.buflen = count;
3056         rdc = avl_find(&rp->r_dir, &srdc, &where);
3057         if (rdc != NULL) {
3058                 rddir_cache_hold(rdc);
3059                 /*
3060                  * If the cache entry is in the process of being
3061                  * filled in, wait until this completes.  The
3062                  * RDDIRWAIT bit is set to indicate that someone
3063                  * is waiting and then the thread currently
3064                  * is waiting and when the thread currently
3065                  * cv_broadcast to wakeup all of the threads
3066                  * waiting for it to finish.
3067                  */
3068                 if (rdc->flags & RDDIR) {
3069                         nfs_rw_exit(&rp->r_rwlock);
3070                         rdc->flags |= RDDIRWAIT;
3071 #ifdef DEBUG
3072                         nfs_readdir_cache_waits++;
3073 #endif
3074                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
3075                                 /*
3076                                  * We got interrupted, probably
3077                                  * the user typed ^C or an alarm
3078                                  * fired.  We free the new entry
3079                                  * if we allocated one.
3080                                  */
3081                                 mutex_exit(&rp->r_statelock);
3082                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
3083                                     RW_READER, FALSE);
3084                                 rddir_cache_rele(rdc);
3085                                 if (nrdc != NULL)
3086                                         rddir_cache_rele(nrdc);
3087                                 return (EINTR);
3088                         }
3089                         mutex_exit(&rp->r_statelock);
3090                         (void) nfs_rw_enter_sig(&rp->r_rwlock,
3091                             RW_READER, FALSE);
3092                         rddir_cache_rele(rdc);
3093                         goto top;
3094                 }
3095                 /*
3096                  * Check to see if a readdir is required to
3097                  * fill the entry.  If so, mark this entry
3098                  * as being filled, remove our reference,
3099                  * and branch to the code to fill the entry.
3100                  */
3101                 if (rdc->flags & RDDIRREQ) {
3102                         rdc->flags &= ~RDDIRREQ;
3103                         rdc->flags |= RDDIR;
3104                         if (nrdc != NULL)
3105                                 rddir_cache_rele(nrdc);
3106                         nrdc = rdc;
3107                         mutex_exit(&rp->r_statelock);
3108                         goto bottom;
3109                 }
3110 #ifdef DEBUG
3111                 if (!missed)
3112                         nfs_readdir_cache_hits++;
3113 #endif
3114                 /*
3115                  * If an error occurred while attempting
3116                  * to fill the cache entry, just return it.
3117                  */
3118                 if (rdc->error) {
3119                         error = rdc->error;
3120                         mutex_exit(&rp->r_statelock);
3121                         rddir_cache_rele(rdc);
3122                         if (nrdc != NULL)
3123                                 rddir_cache_rele(nrdc);
3124                         return (error);
3125                 }
3126 
3127                 /*
3128                  * The cache entry is complete and good,
3129                  * copyout the dirent structs to the calling
3130                  * thread.
3131                  */
3132                 error = uiomove(rdc->entries, rdc->entlen, UIO_READ, uiop);
3133 
3134                 /*
3135                  * If no error occurred during the copyout,
3136                  * update the offset in the uio struct to
3137                  * contain the value of the next cookie
3138                  * and set the eof value appropriately.
3139                  */
3140                 if (!error) {
3141                         uiop->uio_offset = rdc->nfs_ncookie;
3142                         if (eofp)
3143                                 *eofp = rdc->eof;
3144                 }
3145 
3146                 /*
3147                  * Decide whether to do readahead.  Don't if
3148                  * have already read to the end of directory.
3149                  */
3150                 if (rdc->eof) {
3151                         rp->r_direof = rdc;
3152                         mutex_exit(&rp->r_statelock);
3153                         rddir_cache_rele(rdc);
3154                         if (nrdc != NULL)
3155                                 rddir_cache_rele(nrdc);
3156                         return (error);
3157                 }
3158 
3159                 /*
3160                  * Check to see whether we found an entry
3161                  * for the readahead.  If so, we don't need
3162                  * to do anything further, so free the new
3163                  * entry if one was allocated.  Otherwise,
3164                  * allocate a new entry, add it to the cache,
3165                  * and then initiate an asynchronous readdir
3166                  * operation to fill it.
3167                  */
3168                 srdc.nfs_cookie = rdc->nfs_ncookie;
3169                 srdc.buflen = count;
3170                 rrdc = avl_find(&rp->r_dir, &srdc, &where);
3171                 if (rrdc != NULL) {
3172                         if (nrdc != NULL)
3173                                 rddir_cache_rele(nrdc);
3174                 } else {
3175                         if (nrdc != NULL)
3176                                 rrdc = nrdc;
3177                         else {
3178                                 rrdc = rddir_cache_alloc(KM_NOSLEEP);
3179                         }
3180                         if (rrdc != NULL) {
3181                                 rrdc->nfs_cookie = rdc->nfs_ncookie;
3182                                 rrdc->buflen = count;
3183                                 avl_insert(&rp->r_dir, rrdc, where);
3184                                 rddir_cache_hold(rrdc);
3185                                 mutex_exit(&rp->r_statelock);
3186                                 rddir_cache_rele(rdc);
3187 #ifdef DEBUG
3188                                 nfs_readdir_readahead++;
3189 #endif
3190                                 nfs_async_readdir(vp, rrdc, cr, nfsreaddir);
3191                                 return (error);
3192                         }
3193                 }
3194 
3195                 mutex_exit(&rp->r_statelock);
3196                 rddir_cache_rele(rdc);
3197                 return (error);
3198         }
3199 
3200         /*
3201          * Didn't find an entry in the cache.  Construct a new empty
3202          * entry and link it into the cache.  Other processes attempting
3203          * to access this entry will need to wait until it is filled in.
3204          *
3205          * Since kmem_alloc may block, another pass through the cache
3206          * will need to be taken to make sure that another process
3207          * hasn't already added an entry to the cache for this request.
3208          */
3209         if (nrdc == NULL) {
3210                 mutex_exit(&rp->r_statelock);
3211                 nrdc = rddir_cache_alloc(KM_SLEEP);
3212                 nrdc->nfs_cookie = uiop->uio_offset;
3213                 nrdc->buflen = count;
3214                 goto top;
3215         }
3216 
3217         /*
3218          * Add this entry to the cache.
3219          */
3220         avl_insert(&rp->r_dir, nrdc, where);
3221         rddir_cache_hold(nrdc);
3222         mutex_exit(&rp->r_statelock);
3223 
3224 bottom:
3225 #ifdef DEBUG
3226         missed = 1;
3227         nfs_readdir_cache_misses++;
3228 #endif
3229         /*
3230          * Do the readdir.
3231          */
3232         error = nfsreaddir(vp, nrdc, cr);
3233 
3234         /*
3235          * If this operation failed, just return the error which occurred.
3236          */
3237         if (error != 0)
3238                 return (error);
3239 
3240         /*
3241          * Since the RPC operation will have taken sometime and blocked
3242          * this process, another pass through the cache will need to be
3243          * taken to find the correct cache entry.  It is possible that
3244          * the correct cache entry will not be there (although one was
3245          * added) because the directory changed during the RPC operation
3246          * and the readdir cache was flushed.  In this case, just start
3247          * over.  It is hoped that this will not happen too often... :-)
3248          */
3249         nrdc = NULL;
3250         goto top;
3251         /* NOTREACHED */
3252 }
3253 
/*
 * Fill the readdir cache entry rdc for directory vp by issuing an
 * RFS_READDIR call through rfs2call(), starting at cookie
 * rdc->nfs_cookie and asking for at most rdc->buflen bytes (further
 * limited by mi->mi_curread).
 *
 * On success the returned entries are copied into a newly allocated
 * rdc->entries buffer and nfs_ncookie, eof, entlen and error are set
 * in rdc.  On failure rdc->entries is set to NULL, the error is saved
 * in rdc->error and RDDIRREQ is set again in rdc->flags.
 *
 * In both cases RDDIR is cleared under r_statelock, any thread that
 * set RDDIRWAIT is woken with cv_broadcast(), and rddir_cache_rele()
 * is called on rdc before returning (presumably dropping the hold the
 * caller took for the fill -- confirm against the callers).
 *
 * The entry must have RDDIR set on entry (see the ASSERT below).
 * Returns 0 on success, otherwise the error from the call.
 */
3254 static int
3255 nfsreaddir(vnode_t *vp, rddir_cache *rdc, cred_t *cr)
3256 {
3257         int error;
3258         struct nfsrddirargs rda;
3259         struct nfsrddirres rd;
3260         rnode_t *rp;
3261         mntinfo_t *mi;
3262         uint_t count;
3263         int douprintf;
3264         failinfo_t fi, *fip;
3265 
3266         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3267         count = rdc->buflen;
3268 
3269         rp = VTOR(vp);
3270         mi = VTOMI(vp);
3271 
3272         rda.rda_fh = *VTOFH(vp);
3273         rda.rda_offset = rdc->nfs_cookie;
3274 
3275         /*
3276          * NFS client failover support
3277          * suppress failover unless we have a zero cookie
3278          */
3279         if (rdc->nfs_cookie == (off_t)0) {
3280                 fi.vp = vp;
3281                 fi.fhp = (caddr_t)&rda.rda_fh;
3282                 fi.copyproc = nfscopyfh;
3283                 fi.lookupproc = nfslookup;
3284                 fi.xattrdirproc = acl_getxattrdir2;
3285                 fip = &fi;
3286         } else {
3287                 fip = NULL;
3288         }
3289 
             /*
              * Scratch buffer for the reply.  rd_size is primed with the
              * buffer size here and read back below as the length of the
              * returned entries.  The buffer is freed on every path once
              * the results have been copied into the cache entry.
              */
3290         rd.rd_dirents = kmem_alloc(rdc->buflen, KM_SLEEP);
3291         rd.rd_size = count;
3292         rd.rd_offset = rda.rda_offset;
3293 
3294         douprintf = 1;
3295 
3296         if (mi->mi_io_kstats) {
3297                 mutex_enter(&mi->mi_lock);
3298                 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3299                 mutex_exit(&mi->mi_lock);
3300         }
3301 
             /*
              * rda_count is recomputed on every pass because it depends on
              * mi->mi_curread, which is re-read each time the call is
              * retried for ENFS_TRYAGAIN.
              */
3302         do {
3303                 rda.rda_count = MIN(count, mi->mi_curread);
3304                 error = rfs2call(mi, RFS_READDIR,
3305                     xdr_rddirargs, (caddr_t)&rda,
3306                     xdr_getrddirres, (caddr_t)&rd, cr,
3307                     &douprintf, &rd.rd_status, 0, fip);
3308         } while (error == ENFS_TRYAGAIN);
3309 
3310         if (mi->mi_io_kstats) {
3311                 mutex_enter(&mi->mi_lock);
3312                 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3313                 mutex_exit(&mi->mi_lock);
3314         }
3315 
3316         /*
3317          * Since we are actually doing a READDIR RPC, we must have
3318          * exclusive access to the cache entry being filled.  Thus,
3319          * it is safe to update all fields except for the flags
3320          * field.  The r_statelock in the rnode must be held to
3321          * prevent two different threads from simultaneously
3322          * attempting to update the flags field.  This can happen
3323          * if we are turning off RDDIR and the other thread is
3324          * trying to set RDDIRWAIT.
3325          */
3326         ASSERT(rdc->flags & RDDIR);
3327         if (!error) {
                     /* The call itself worked; now map the reply status. */
3328                 error = geterrno(rd.rd_status);
3329                 if (!error) {
3330                         rdc->nfs_ncookie = rd.rd_offset;
3331                         rdc->eof = rd.rd_eof ? 1 : 0;
3332                         rdc->entlen = rd.rd_size;
3333                         ASSERT(rdc->entlen <= rdc->buflen);
                             /*
                              * The entries buffer is sized by buflen, not
                              * entlen; only entlen bytes are copied in.
                              */
3334 #ifdef DEBUG
3335                         rdc->entries = rddir_cache_buf_alloc(rdc->buflen,
3336                             KM_SLEEP);
3337 #else
3338                         rdc->entries = kmem_alloc(rdc->buflen, KM_SLEEP);
3339 #endif
3340                         bcopy(rd.rd_dirents, rdc->entries, rdc->entlen);
3341                         rdc->error = 0;
3342                         if (mi->mi_io_kstats) {
3343                                 mutex_enter(&mi->mi_lock);
3344                                 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3345                                 KSTAT_IO_PTR(mi->mi_io_kstats)->nread +=
3346                                     rd.rd_size;
3347                                 mutex_exit(&mi->mi_lock);
3348                         }
3349                 } else {
3350                         PURGE_STALE_FH(error, vp, cr);
3351                 }
3352         }
             /* Covers both a failed call and a bad reply status. */
3353         if (error) {
3354                 rdc->entries = NULL;
3355                 rdc->error = error;
3356         }
3357         kmem_free(rd.rd_dirents, rdc->buflen);
3358 
             /*
              * Publish the result: the entry is no longer being filled, so
              * wake any waiters.  On failure RDDIRREQ is set again, which
              * marks the entry as still needing a readdir.
              */
3359         mutex_enter(&rp->r_statelock);
3360         rdc->flags &= ~RDDIR;
3361         if (rdc->flags & RDDIRWAIT) {
3362                 rdc->flags &= ~RDDIRWAIT;
3363                 cv_broadcast(&rdc->cv);
3364         }
3365         if (error)
3366                 rdc->flags |= RDDIRREQ;
3367         mutex_exit(&rp->r_statelock);
3368 
3369         rddir_cache_rele(rdc);
3370 
3371         return (error);
3372 }
3373 
/*
 * DEBUG only: when non-zero, nfs_bio() calls debug_enter() after
 * warning about a zero length write.
 */
3374 #ifdef DEBUG
3375 static int nfs_bio_do_stop = 0;
3376 #endif
3377 
     /*
      * Perform the read or write described by bp against the server.
      *
      * B_READ set in bp->b_flags selects nfsread(); otherwise nfswrite()
      * is used, unless the rnode is already marked RSTALE, in which case
      * no I/O is attempted and r_error (or ESTALE if r_error is zero) is
      * returned.
      *
      * Credentials: the credential cached in rp->r_cred is used when one
      * exists; otherwise cr is installed as r_cred and used.  If the
      * operation fails with EACCES while using a cached credential other
      * than cr, r_cred is replaced with cr and the operation is retried
      * once with cr.
      *
      * Returns 0, an errno value, or NFS_EOF when a read returned no data
      * at an offset at or beyond r_size.  B_ERROR is set in bp->b_flags
      * for any non-zero result other than NFS_EOF.
      */
3378 static int
3379 nfs_bio(struct buf *bp, cred_t *cr)
3380 {
3381         rnode_t *rp = VTOR(bp->b_vp);
3382         int count;
3383         int error;
3384         cred_t *cred;
3385         uint_t offset;
3386 
3387         DTRACE_IO1(start, struct buf *, bp);
3388 
3389         ASSERT(nfs_zone() == VTOMI(bp->b_vp)->mi_zone);
             /* Byte offset of the transfer; note it is kept in a uint_t. */
3390         offset = dbtob(bp->b_blkno);
3391 
3392         if (bp->b_flags & B_READ) {
3393                 mutex_enter(&rp->r_statelock);
3394                 if (rp->r_cred != NULL) {
3395                         cred = rp->r_cred;
3396                         crhold(cred);
3397                 } else {
                             /*
                              * Two holds are taken on cr: one for the
                              * reference stored in r_cred and one for
                              * the local cred used for this request.
                              */
3398                         rp->r_cred = cr;
3399                         crhold(cr);
3400                         cred = cr;
3401                         crhold(cred);
3402                 }
3403                 mutex_exit(&rp->r_statelock);
3404         read_again:
3405                 error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
3406                     offset, bp->b_bcount, &bp->b_resid, cred);
3407 
                     /*
                      * The local hold is dropped here.  The EACCES path
                      * below only compares the pointer value of cred and
                      * takes a fresh hold on cr before retrying.
                      */
3408                 crfree(cred);
3409                 if (!error) {
3410                         if (bp->b_resid) {
3411                                 /*
3412                                  * Didn't get it all because we hit EOF,
3413                                  * zero all the memory beyond the EOF.
3414                                  */
3415                                 /* bzero(rdaddr + */
3416                                 bzero(bp->b_un.b_addr +
3417                                     bp->b_bcount - bp->b_resid, bp->b_resid);
3418                         }
3419                         mutex_enter(&rp->r_statelock);
3420                         if (bp->b_resid == bp->b_bcount &&
3421                             offset >= rp->r_size) {
3422                                 /*
3423                                  * We didn't read anything at all as we are
3424                                  * past EOF.  Return an error indicator back
3425                                  * but don't destroy the pages (yet).
3426                                  */
3427                                 error = NFS_EOF;
3428                         }
3429                         mutex_exit(&rp->r_statelock);
3430                 } else if (error == EACCES) {
3431                         mutex_enter(&rp->r_statelock);
                             /*
                              * The cached credential was refused; switch
                              * r_cred to the caller's cr and try again.
                              * Once cred == cr there is no further retry.
                              */
3432                         if (cred != cr) {
3433                                 if (rp->r_cred != NULL)
3434                                         crfree(rp->r_cred);
3435                                 rp->r_cred = cr;
3436                                 crhold(cr);
3437                                 cred = cr;
3438                                 crhold(cred);
3439                                 mutex_exit(&rp->r_statelock);
3440                                 goto read_again;
3441                         }
3442                         mutex_exit(&rp->r_statelock);
3443                 }
3444         } else {
3445                 if (!(rp->r_flags & RSTALE)) {
3446                         mutex_enter(&rp->r_statelock);
3447                         if (rp->r_cred != NULL) {
3448                                 cred = rp->r_cred;
3449                                 crhold(cred);
3450                         } else {
3451                                 rp->r_cred = cr;
3452                                 crhold(cr);
3453                                 cred = cr;
3454                                 crhold(cred);
3455                         }
3456                         mutex_exit(&rp->r_statelock);
3457                 write_again:
                             /*
                              * Limit the write to the part of the buffer
                              * that lies below the current r_size.
                              */
3458                         mutex_enter(&rp->r_statelock);
3459                         count = MIN(bp->b_bcount, rp->r_size - offset);
3460                         mutex_exit(&rp->r_statelock);
3461                         if (count < 0)
3462                                 cmn_err(CE_PANIC, "nfs_bio: write count < 0");
3463 #ifdef DEBUG
3464                         if (count == 0) {
3465                                 zcmn_err(getzoneid(), CE_WARN,
3466                                     "nfs_bio: zero length write at %d",
3467                                     offset);
3468                                 nfs_printfhandle(&rp->r_fh);
3469                                 if (nfs_bio_do_stop)
3470                                         debug_enter("nfs_bio");
3471                         }
3472 #endif
3473                         error = nfswrite(bp->b_vp, bp->b_un.b_addr, offset,
3474                             count, cred);
3475                         if (error == EACCES) {
3476                                 mutex_enter(&rp->r_statelock);
                                     /*
                                      * Same credential fallback as the read
                                      * path.  Here the local hold on the old
                                      * cred is still held, so it is released
                                      * before cred is switched to cr.
                                      */
3477                                 if (cred != cr) {
3478                                         if (rp->r_cred != NULL)
3479                                                 crfree(rp->r_cred);
3480                                         rp->r_cred = cr;
3481                                         crhold(cr);
3482                                         crfree(cred);
3483                                         cred = cr;
3484                                         crhold(cred);
3485                                         mutex_exit(&rp->r_statelock);
3486                                         goto write_again;
3487                                 }
3488                                 mutex_exit(&rp->r_statelock);
3489                         }
3490                         bp->b_error = error;
3491                         if (error && error != EINTR) {
3492                                 /*
3493                                  * Don't print EDQUOT errors on the console.
3494                                  * Don't print asynchronous EACCES errors.
3495                                  * Don't print EFBIG errors.
3496                                  * Print all other write errors.
3497                                  */
3498                                 if (error != EDQUOT && error != EFBIG &&
3499                                     (error != EACCES ||
3500                                     !(bp->b_flags & B_ASYNC)))
3501                                         nfs_write_error(bp->b_vp, error, cred);
3502                                 /*
3503                                  * Update r_error and r_flags as appropriate.
3504                                  * If the error was ESTALE, then mark the
3505                                  * rnode as not being writeable and save
3506                                  * the error status.  Otherwise, save any
3507                                  * errors which occur from asynchronous
3508                                  * page invalidations.  Any errors occurring
3509                                  * from other operations should be saved
3510                                  * by the caller.
3511                                  */
3512                                 mutex_enter(&rp->r_statelock);
3513                                 if (error == ESTALE) {
3514                                         rp->r_flags |= RSTALE;
3515                                         if (!rp->r_error)
3516                                                 rp->r_error = error;
3517                                 } else if (!rp->r_error &&
3518                                     (bp->b_flags &
3519                                     (B_INVAL|B_FORCE|B_ASYNC)) ==
3520                                     (B_INVAL|B_FORCE|B_ASYNC)) {
3521                                         rp->r_error = error;
3522                                 }
3523                                 mutex_exit(&rp->r_statelock);
3524                         }
                             /* Release the local hold taken for this write. */
3525                         crfree(cred);
3526                 } else {
3527                         error = rp->r_error;
3528                         /*
3529                          * A close may have cleared r_error, if so,
3530                          * propagate ESTALE error return properly
3531                          */
3532                         if (error == 0)
3533                                 error = ESTALE;
3534                 }
3535         }
3536 
             /* NFS_EOF is an indicator for the caller, not a buffer error. */
3537         if (error != 0 && error != NFS_EOF)
3538                 bp->b_flags |= B_ERROR;
3539 
3540         DTRACE_IO1(done, struct buf *, bp);
3541 
3542         return (error);
3543 }
3544 
3545 /* ARGSUSED */
3546 static int
3547 nfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3548 {
3549         struct nfs_fid *fp;
3550         rnode_t *rp;
3551 
3552         rp = VTOR(vp);
3553 
3554         if (fidp->fid_len < (sizeof (struct nfs_fid) - sizeof (short))) {
3555                 fidp->fid_len = sizeof (struct nfs_fid) - sizeof (short);
3556                 return (ENOSPC);
3557         }
3558         fp = (struct nfs_fid *)fidp;
3559         fp->nf_pad = 0;
3560         fp->nf_len = sizeof (struct nfs_fid) - sizeof (short);
3561         bcopy(rp->r_fh.fh_buf, fp->nf_data, NFS_FHSIZE);
3562         return (0);
3563 }
3564 
3565 /* ARGSUSED2 */
3566 static int
3567 nfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3568 {
3569         rnode_t *rp = VTOR(vp);
3570 
3571         if (!write_lock) {
3572                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3573                 return (V_WRITELOCK_FALSE);
3574         }
3575 
3576         if ((rp->r_flags & RDIRECTIO) || (VTOMI(vp)->mi_flags & MI_DIRECTIO)) {
3577                 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
3578                 if (rp->r_mapcnt == 0 && !vn_has_cached_data(vp))
3579                         return (V_WRITELOCK_FALSE);
3580                 nfs_rw_exit(&rp->r_rwlock);
3581         }
3582 
3583         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
3584         return (V_WRITELOCK_TRUE);
3585 }
3586 
3587 /* ARGSUSED */
3588 static void
3589 nfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
3590 {
3591         rnode_t *rp = VTOR(vp);
3592 
3593         nfs_rw_exit(&rp->r_rwlock);
3594 }
3595 
3596 /* ARGSUSED */
3597 static int
3598 nfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
3599 {
3600 
3601         /*
3602          * Because we stuff the readdir cookie into the offset field
3603          * someone may attempt to do an lseek with the cookie which
3604          * we want to succeed.
3605          */
3606         if (vp->v_type == VDIR)
3607                 return (0);
3608         if (*noffp < 0 || *noffp > MAXOFF32_T)
3609                 return (EINVAL);
3610         return (0);
3611 }
3612 
3613 /*
3614  * number of NFS_MAXDATA blocks to read ahead
3615  * optimized for 100 base-T.
      *
      * nfs_getapage() uses this value as the upper bound on the number
      * of asynchronous readaheads it queues for a single request.
3616  */
3617 volatile int nfs_nra = 4;
3618 
3619 #ifdef DEBUG
3620 static int nfs_lostpage = 0;    /* number of times we lost original page */
3621 #endif
3622 
3623 /*
3624  * Return all the pages from [off..off+len) in file
3625  */
3626 /* ARGSUSED */
3627 static int
3628 nfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
3629         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3630         enum seg_rw rw, cred_t *cr, caller_context_t *ct)
3631 {
3632         rnode_t *rp;
3633         int error;
3634         mntinfo_t *mi;
3635 
3636         if (vp->v_flag & VNOMAP)
3637                 return (ENOSYS);
3638 
3639         ASSERT(off <= MAXOFF32_T);
3640         if (nfs_zone() != VTOMI(vp)->mi_zone)
3641                 return (EIO);
3642         if (protp != NULL)
3643                 *protp = PROT_ALL;
3644 
3645         /*
3646          * Now valididate that the caches are up to date.
3647          */
3648         error = nfs_validate_caches(vp, cr);
3649         if (error)
3650                 return (error);
3651 
3652         rp = VTOR(vp);
3653         mi = VTOMI(vp);
3654 retry:
3655         mutex_enter(&rp->r_statelock);
3656 
3657         /*
3658          * Don't create dirty pages faster than they
3659          * can be cleaned so that the system doesn't
3660          * get imbalanced.  If the async queue is
3661          * maxed out, then wait for it to drain before
3662          * creating more dirty pages.  Also, wait for
3663          * any threads doing pagewalks in the vop_getattr
3664          * entry points so that they don't block for
3665          * long periods.
3666          */
3667         if (rw == S_CREATE) {
3668                 while ((mi->mi_max_threads != 0 &&
3669                     rp->r_awcount > 2 * mi->mi_max_threads) ||
3670                     rp->r_gcount > 0)
3671                         cv_wait(&rp->r_cv, &rp->r_statelock);
3672         }
3673 
3674         /*
3675          * If we are getting called as a side effect of an nfs_write()
3676          * operation the local file size might not be extended yet.
3677          * In this case we want to be able to return pages of zeroes.
3678          */
3679         if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
3680                 mutex_exit(&rp->r_statelock);
3681                 return (EFAULT);                /* beyond EOF */
3682         }
3683 
3684         mutex_exit(&rp->r_statelock);
3685 
3686         error = pvn_getpages(nfs_getapage, vp, off, len, protp, pl, plsz,
3687             seg, addr, rw, cr);
3688 
3689         switch (error) {
3690         case NFS_EOF:
3691                 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
3692                 goto retry;
3693         case ESTALE:
3694                 PURGE_STALE_FH(error, vp, cr);
3695         }
3696 
3697         return (error);
3698 }
3699 
3700 /*
3701  * Called from pvn_getpages to get a particular page.
3702  */
3703 /* ARGSUSED */
3704 static int
3705 nfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
3706         page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
3707         enum seg_rw rw, cred_t *cr)
3708 {
3709         rnode_t *rp;
3710         uint_t bsize;
3711         struct buf *bp;
3712         page_t *pp;
3713         u_offset_t lbn;
3714         u_offset_t io_off;
3715         u_offset_t blkoff;
3716         u_offset_t rablkoff;
3717         size_t io_len;
3718         uint_t blksize;
3719         int error;
3720         int readahead;
3721         int readahead_issued = 0;
3722         int ra_window; /* readahead window */
3723         page_t *pagefound;
3724 
3725         if (nfs_zone() != VTOMI(vp)->mi_zone)
3726                 return (EIO);
3727         rp = VTOR(vp);
3728         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3729 
3730 reread:
3731         bp = NULL;
3732         pp = NULL;
3733         pagefound = NULL;
3734 
3735         if (pl != NULL)
3736                 pl[0] = NULL;
3737 
3738         error = 0;
3739         lbn = off / bsize;
3740         blkoff = lbn * bsize;
3741 
3742         /*
3743          * Queueing up the readahead before doing the synchronous read
3744          * results in a significant increase in read throughput because
3745          * of the increased parallelism between the async threads and
3746          * the process context.
3747          */
3748         if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
3749             rw != S_CREATE &&
3750             !(vp->v_flag & VNOCACHE)) {
3751                 mutex_enter(&rp->r_statelock);
3752 
3753                 /*
3754                  * Calculate the number of readaheads to do.
3755                  * a) No readaheads at offset = 0.
3756                  * b) Do maximum(nfs_nra) readaheads when the readahead
3757                  *    window is closed.
3758                  * c) Do readaheads between 1 to (nfs_nra - 1) depending
3759                  *    upon how far the readahead window is open or close.
3760                  * d) No readaheads if rp->r_nextr is not within the scope
3761                  *    of the readahead window (random i/o).
3762                  */
3763 
3764                 if (off == 0)
3765                         readahead = 0;
3766                 else if (blkoff == rp->r_nextr)
3767                         readahead = nfs_nra;
3768                 else if (rp->r_nextr > blkoff &&
3769                     ((ra_window = (rp->r_nextr - blkoff) / bsize)
3770                     <= (nfs_nra - 1)))
3771                         readahead = nfs_nra - ra_window;
3772                 else
3773                         readahead = 0;
3774 
3775                 rablkoff = rp->r_nextr;
3776                 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
3777                         mutex_exit(&rp->r_statelock);
3778                         if (nfs_async_readahead(vp, rablkoff + bsize,
3779                             addr + (rablkoff + bsize - off), seg, cr,
3780                             nfs_readahead) < 0) {
3781                                 mutex_enter(&rp->r_statelock);
3782                                 break;
3783                         }
3784                         readahead--;
3785                         rablkoff += bsize;
3786                         /*
3787                          * Indicate that we did a readahead so
3788                          * readahead offset is not updated
3789                          * by the synchronous read below.
3790                          */
3791                         readahead_issued = 1;
3792                         mutex_enter(&rp->r_statelock);
3793                         /*
3794                          * set readahead offset to
3795                          * offset of last async readahead
3796                          * request.
3797                          */
3798                         rp->r_nextr = rablkoff;
3799                 }
3800                 mutex_exit(&rp->r_statelock);
3801         }
3802 
3803 again:
3804         if ((pagefound = page_exists(vp, off)) == NULL) {
3805                 if (pl == NULL) {
3806                         (void) nfs_async_readahead(vp, blkoff, addr, seg, cr,
3807                             nfs_readahead);
3808                 } else if (rw == S_CREATE) {
3809                         /*
3810                          * Block for this page is not allocated, or the offset
3811                          * is beyond the current allocation size, or we're
3812                          * allocating a swap slot and the page was not found,
3813                          * so allocate it and return a zero page.
3814                          */
3815                         if ((pp = page_create_va(vp, off,
3816                             PAGESIZE, PG_WAIT, seg, addr)) == NULL)
3817                                 cmn_err(CE_PANIC, "nfs_getapage: page_create");
3818                         io_len = PAGESIZE;
3819                         mutex_enter(&rp->r_statelock);
3820                         rp->r_nextr = off + PAGESIZE;
3821                         mutex_exit(&rp->r_statelock);
3822                 } else {
3823                         /*
3824                          * Need to go to server to get a BLOCK, exception to
3825                          * that being while reading at offset = 0 or doing
3826                          * random i/o, in that case read only a PAGE.
3827                          */
3828                         mutex_enter(&rp->r_statelock);
3829                         if (blkoff < rp->r_size &&
3830                             blkoff + bsize >= rp->r_size) {
3831                                 /*
3832                                  * If only a block or less is left in
3833                                  * the file, read all that is remaining.
3834                                  */
3835                                 if (rp->r_size <= off) {
3836                                         /*
3837                                          * Trying to access beyond EOF,
3838                                          * set up to get at least one page.
3839                                          */
3840                                         blksize = off + PAGESIZE - blkoff;
3841                                 } else
3842                                         blksize = rp->r_size - blkoff;
3843                         } else if ((off == 0) ||
3844                             (off != rp->r_nextr && !readahead_issued)) {
3845                                 blksize = PAGESIZE;
3846                                 blkoff = off; /* block = page here */
3847                         } else
3848                                 blksize = bsize;
3849                         mutex_exit(&rp->r_statelock);
3850 
3851                         pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
3852                             &io_len, blkoff, blksize, 0);
3853 
3854                         /*
3855                          * Some other thread has entered the page,
3856                          * so just use it.
3857                          */
3858                         if (pp == NULL)
3859                                 goto again;
3860 
3861                         /*
3862                          * Now round the request size up to page boundaries.
3863                          * This ensures that the entire page will be
3864                          * initialized to zeroes if EOF is encountered.
3865                          */
3866                         io_len = ptob(btopr(io_len));
3867 
3868                         bp = pageio_setup(pp, io_len, vp, B_READ);
3869                         ASSERT(bp != NULL);
3870 
3871                         /*
3872                          * pageio_setup should have set b_addr to 0.  This
3873                          * is correct since we want to do I/O on a page
3874                          * boundary.  bp_mapin will use this addr to calculate
3875                          * an offset, and then set b_addr to the kernel virtual
3876                          * address it allocated for us.
3877                          */
3878                         ASSERT(bp->b_un.b_addr == 0);
3879 
3880                         bp->b_edev = 0;
3881                         bp->b_dev = 0;
3882                         bp->b_lblkno = lbtodb(io_off);
3883                         bp->b_file = vp;
3884                         bp->b_offset = (offset_t)off;
3885                         bp_mapin(bp);
3886 
3887                         /*
3888                          * If doing a write beyond what we believe is EOF,
3889                          * don't bother trying to read the pages from the
3890                          * server, we'll just zero the pages here.  We
3891                          * don't check that the rw flag is S_WRITE here
3892                          * because some implementations may attempt a
3893                          * read access to the buffer before copying data.
3894                          */
3895                         mutex_enter(&rp->r_statelock);
3896                         if (io_off >= rp->r_size && seg == segkmap) {
3897                                 mutex_exit(&rp->r_statelock);
3898                                 bzero(bp->b_un.b_addr, io_len);
3899                         } else {
3900                                 mutex_exit(&rp->r_statelock);
3901                                 error = nfs_bio(bp, cr);
3902                         }
3903 
3904                         /*
3905                          * Unmap the buffer before freeing it.
3906                          */
3907                         bp_mapout(bp);
3908                         pageio_done(bp);
3909 
3910                         if (error == NFS_EOF) {
3911                                 /*
3912                                  * If doing a write system call just return
3913                                  * zeroed pages, else user tried to get pages
3914                                  * beyond EOF, return error.  We don't check
3915                                  * that the rw flag is S_WRITE here because
3916                                  * some implementations may attempt a read
3917                                  * access to the buffer before copying data.
3918                                  */
3919                                 if (seg == segkmap)
3920                                         error = 0;
3921                                 else
3922                                         error = EFAULT;
3923                         }
3924 
3925                         if (!readahead_issued && !error) {
3926                                 mutex_enter(&rp->r_statelock);
3927                                 rp->r_nextr = io_off + io_len;
3928                                 mutex_exit(&rp->r_statelock);
3929                         }
3930                 }
3931         }
3932 
3933 out:
3934         if (pl == NULL)
3935                 return (error);
3936 
3937         if (error) {
3938                 if (pp != NULL)
3939                         pvn_read_done(pp, B_ERROR);
3940                 return (error);
3941         }
3942 
3943         if (pagefound) {
3944                 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
3945 
3946                 /*
3947                  * Page exists in the cache, acquire the appropriate lock.
3948                  * If this fails, start all over again.
3949                  */
3950                 if ((pp = page_lookup(vp, off, se)) == NULL) {
3951 #ifdef DEBUG
3952                         nfs_lostpage++;
3953 #endif
3954                         goto reread;
3955                 }
3956                 pl[0] = pp;
3957                 pl[1] = NULL;
3958                 return (0);
3959         }
3960 
3961         if (pp != NULL)
3962                 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
3963 
3964         return (error);
3965 }
3966 
3967 static void
3968 nfs_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
3969         cred_t *cr)
3970 {
3971         int error;
3972         page_t *pp;
3973         u_offset_t io_off;
3974         size_t io_len;
3975         struct buf *bp;
3976         uint_t bsize, blksize;
3977         rnode_t *rp = VTOR(vp);
3978 
3979         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
3980 
3981         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
3982 
3983         mutex_enter(&rp->r_statelock);
3984         if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
3985                 /*
3986                  * If less than a block left in file read less
3987                  * than a block.
3988                  */
3989                 blksize = rp->r_size - blkoff;
3990         } else
3991                 blksize = bsize;
3992         mutex_exit(&rp->r_statelock);
3993 
3994         pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
3995             &io_off, &io_len, blkoff, blksize, 1);
3996         /*
3997          * The isra flag passed to the kluster function is 1, we may have
3998          * gotten a return value of NULL for a variety of reasons (# of free
3999          * pages < minfree, someone entered the page on the vnode etc). In all
4000          * cases, we want to punt on the readahead.
4001          */
4002         if (pp == NULL)
4003                 return;
4004 
4005         /*
4006          * Now round the request size up to page boundaries.
4007          * This ensures that the entire page will be
4008          * initialized to zeroes if EOF is encountered.
4009          */
4010         io_len = ptob(btopr(io_len));
4011 
4012         bp = pageio_setup(pp, io_len, vp, B_READ);
4013         ASSERT(bp != NULL);
4014 
4015         /*
4016          * pageio_setup should have set b_addr to 0.  This is correct since
4017          * we want to do I/O on a page boundary. bp_mapin() will use this addr
4018          * to calculate an offset, and then set b_addr to the kernel virtual
4019          * address it allocated for us.
4020          */
4021         ASSERT(bp->b_un.b_addr == 0);
4022 
4023         bp->b_edev = 0;
4024         bp->b_dev = 0;
4025         bp->b_lblkno = lbtodb(io_off);
4026         bp->b_file = vp;
4027         bp->b_offset = (offset_t)blkoff;
4028         bp_mapin(bp);
4029 
4030         /*
4031          * If doing a write beyond what we believe is EOF, don't bother trying
4032          * to read the pages from the server, we'll just zero the pages here.
4033          * We don't check that the rw flag is S_WRITE here because some
4034          * implementations may attempt a read access to the buffer before
4035          * copying data.
4036          */
4037         mutex_enter(&rp->r_statelock);
4038         if (io_off >= rp->r_size && seg == segkmap) {
4039                 mutex_exit(&rp->r_statelock);
4040                 bzero(bp->b_un.b_addr, io_len);
4041                 error = 0;
4042         } else {
4043                 mutex_exit(&rp->r_statelock);
4044                 error = nfs_bio(bp, cr);
4045                 if (error == NFS_EOF)
4046                         error = 0;
4047         }
4048 
4049         /*
4050          * Unmap the buffer before freeing it.
4051          */
4052         bp_mapout(bp);
4053         pageio_done(bp);
4054 
4055         pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
4056 
4057         /*
4058          * In case of error set readahead offset
4059          * to the lowest offset.
4060          * pvn_read_done() calls VN_DISPOSE to destroy the pages
4061          */
4062         if (error && rp->r_nextr > io_off) {
4063                 mutex_enter(&rp->r_statelock);
4064                 if (rp->r_nextr > io_off)
4065                         rp->r_nextr = io_off;
4066                 mutex_exit(&rp->r_statelock);
4067         }
4068 }
4069 
4070 /*
4071  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
4072  * If len == 0, do from off to EOF.
4073  *
4074  * The normal cases should be len == 0 && off == 0 (entire vp list),
4075  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
4076  * (from pageout).
4077  */
4078 /* ARGSUSED */
4079 static int
4080 nfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4081         caller_context_t *ct)
4082 {
4083         int error;
4084         rnode_t *rp;
4085 
4086         ASSERT(cr != NULL);
4087 
4088         /*
4089          * XXX - Why should this check be made here?
4090          */
4091         if (vp->v_flag & VNOMAP)
4092                 return (ENOSYS);
4093 
4094         if (len == 0 && !(flags & B_INVAL) && vn_is_readonly(vp))
4095                 return (0);
4096 
4097         if (!(flags & B_ASYNC) && nfs_zone() != VTOMI(vp)->mi_zone)
4098                 return (EIO);
4099         ASSERT(off <= MAXOFF32_T);
4100 
4101         rp = VTOR(vp);
4102         mutex_enter(&rp->r_statelock);
4103         rp->r_count++;
4104         mutex_exit(&rp->r_statelock);
4105         error = nfs_putpages(vp, off, len, flags, cr);
4106         mutex_enter(&rp->r_statelock);
4107         rp->r_count--;
4108         cv_broadcast(&rp->r_cv);
4109         mutex_exit(&rp->r_statelock);
4110 
4111         return (error);
4112 }
4113 
4114 /*
4115  * Write out a single page, possibly klustering adjacent dirty pages.
4116  */
4117 int
4118 nfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
4119         int flags, cred_t *cr)
4120 {
4121         u_offset_t io_off;
4122         u_offset_t lbn_off;
4123         u_offset_t lbn;
4124         size_t io_len;
4125         uint_t bsize;
4126         int error;
4127         rnode_t *rp;
4128 
4129         ASSERT(!vn_is_readonly(vp));
4130         ASSERT(pp != NULL);
4131         ASSERT(cr != NULL);
4132         ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI(vp)->mi_zone);
4133 
4134         rp = VTOR(vp);
4135         ASSERT(rp->r_count > 0);
4136 
4137         ASSERT(pp->p_offset <= MAXOFF32_T);
4138 
4139         bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
4140         lbn = pp->p_offset / bsize;
4141         lbn_off = lbn * bsize;
4142 
4143         /*
4144          * Find a kluster that fits in one block, or in
4145          * one page if pages are bigger than blocks.  If
4146          * there is less file space allocated than a whole
4147          * page, we'll shorten the i/o request below.
4148          */
4149         pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
4150             roundup(bsize, PAGESIZE), flags);
4151 
4152         /*
4153          * pvn_write_kluster shouldn't have returned a page with offset
4154          * behind the original page we were given.  Verify that.
4155          */
4156         ASSERT((pp->p_offset / bsize) >= lbn);
4157 
4158         /*
4159          * Now pp will have the list of kept dirty pages marked for
4160          * write back.  It will also handle invalidation and freeing
4161          * of pages that are not dirty.  Check for page length rounding
4162          * problems.
4163          */
4164         if (io_off + io_len > lbn_off + bsize) {
4165                 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
4166                 io_len = lbn_off + bsize - io_off;
4167         }
4168         /*
4169          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4170          * consistent value of r_size. RMODINPROGRESS is set in writerp().
4171          * When RMODINPROGRESS is set it indicates that a uiomove() is in
4172          * progress and the r_size has not been made consistent with the
4173          * new size of the file. When the uiomove() completes the r_size is
4174          * updated and the RMODINPROGRESS flag is cleared.
4175          *
4176          * The RMODINPROGRESS flag makes sure that nfs(3)_bio() sees a
4177          * consistent value of r_size. Without this handshaking, it is
4178          * possible that nfs(3)_bio() picks  up the old value of r_size
4179          * before the uiomove() in writerp() completes. This will result
4180          * in the write through nfs(3)_bio() being dropped.
4181          *
4182          * More precisely, there is a window between the time the uiomove()
4183          * completes and the time the r_size is updated. If a VOP_PUTPAGE()
4184          * operation intervenes in this window, the page will be picked up,
4185          * because it is dirty (it will be unlocked, unless it was
4186          * pagecreate'd). When the page is picked up as dirty, the dirty
4187          * bit is reset (pvn_getdirty()). In nfs(3)write(), r_size is
4188          * checked. This will still be the old size. Therefore the page will
4189          * not be written out. When segmap_release() calls VOP_PUTPAGE(),
4190          * the page will be found to be clean and the write will be dropped.
4191          */
4192         if (rp->r_flags & RMODINPROGRESS) {
4193                 mutex_enter(&rp->r_statelock);
4194                 if ((rp->r_flags & RMODINPROGRESS) &&
4195                     rp->r_modaddr + MAXBSIZE > io_off &&
4196                     rp->r_modaddr < io_off + io_len) {
4197                         page_t *plist;
4198                         /*
4199                          * A write is in progress for this region of the file.
4200                          * If we did not detect RMODINPROGRESS here then this
4201                          * path through nfs_putapage() would eventually go to
4202                          * nfs(3)_bio() and may not write out all of the data
4203                          * in the pages. We end up losing data. So we decide
4204                          * to set the modified bit on each page in the page
4205                          * list and mark the rnode with RDIRTY. This write
4206                          * will be restarted at some later time.
4207                          */
4208                         plist = pp;
4209                         while (plist != NULL) {
4210                                 pp = plist;
4211                                 page_sub(&plist, pp);
4212                                 hat_setmod(pp);
4213                                 page_io_unlock(pp);
4214                                 page_unlock(pp);
4215                         }
4216                         rp->r_flags |= RDIRTY;
4217                         mutex_exit(&rp->r_statelock);
4218                         if (offp)
4219                                 *offp = io_off;
4220                         if (lenp)
4221                                 *lenp = io_len;
4222                         return (0);
4223                 }
4224                 mutex_exit(&rp->r_statelock);
4225         }
4226 
4227         if (flags & B_ASYNC) {
4228                 error = nfs_async_putapage(vp, pp, io_off, io_len, flags, cr,
4229                     nfs_sync_putapage);
4230         } else
4231                 error = nfs_sync_putapage(vp, pp, io_off, io_len, flags, cr);
4232 
4233         if (offp)
4234                 *offp = io_off;
4235         if (lenp)
4236                 *lenp = io_len;
4237         return (error);
4238 }
4239 
4240 static int
4241 nfs_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4242         int flags, cred_t *cr)
4243 {
4244         int error;
4245         rnode_t *rp;
4246 
4247         flags |= B_WRITE;
4248 
4249         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4250         error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4251 
4252         rp = VTOR(vp);
4253 
4254         if ((error == ENOSPC || error == EDQUOT || error == EACCES) &&
4255             (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
4256                 if (!(rp->r_flags & ROUTOFSPACE)) {
4257                         mutex_enter(&rp->r_statelock);
4258                         rp->r_flags |= ROUTOFSPACE;
4259                         mutex_exit(&rp->r_statelock);
4260                 }
4261                 flags |= B_ERROR;
4262                 pvn_write_done(pp, flags);
4263                 /*
4264                  * If this was not an async thread, then try again to
4265                  * write out the pages, but this time, also destroy
4266                  * them whether or not the write is successful.  This
4267                  * will prevent memory from filling up with these
4268                  * pages and destroying them is the only alternative
4269                  * if they can't be written out.
4270                  *
4271                  * Don't do this if this is an async thread because
4272                  * when the pages are unlocked in pvn_write_done,
4273                  * some other thread could have come along, locked
4274                  * them, and queued for an async thread.  It would be
4275                  * possible for all of the async threads to be tied
4276                  * up waiting to lock the pages again and they would
4277                  * all already be locked and waiting for an async
4278                  * thread to handle them.  Deadlock.
4279                  */
4280                 if (!(flags & B_ASYNC)) {
4281                         error = nfs_putpage(vp, io_off, io_len,
4282                             B_INVAL | B_FORCE, cr, NULL);
4283                 }
4284         } else {
4285                 if (error)
4286                         flags |= B_ERROR;
4287                 else if (rp->r_flags & ROUTOFSPACE) {
4288                         mutex_enter(&rp->r_statelock);
4289                         rp->r_flags &= ~ROUTOFSPACE;
4290                         mutex_exit(&rp->r_statelock);
4291                 }
4292                 pvn_write_done(pp, flags);
4293         }
4294 
4295         return (error);
4296 }
4297 
4298 /* ARGSUSED */
4299 static int
4300 nfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4301         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4302         caller_context_t *ct)
4303 {
4304         struct segvn_crargs vn_a;
4305         int error;
4306         rnode_t *rp;
4307         struct vattr va;
4308 
4309         if (nfs_zone() != VTOMI(vp)->mi_zone)
4310                 return (EIO);
4311 
4312         if (vp->v_flag & VNOMAP)
4313                 return (ENOSYS);
4314 
4315         if (off > MAXOFF32_T)
4316                 return (EFBIG);
4317 
4318         if (off < 0 || off + len < 0)
4319                 return (ENXIO);
4320 
4321         if (vp->v_type != VREG)
4322                 return (ENODEV);
4323 
4324         /*
4325          * If there is cached data and if close-to-open consistency
4326          * checking is not turned off and if the file system is not
4327          * mounted readonly, then force an over the wire getattr.
4328          * Otherwise, just invoke nfsgetattr to get a copy of the
4329          * attributes.  The attribute cache will be used unless it
4330          * is timed out and if it is, then an over the wire getattr
4331          * will be issued.
4332          */
4333         va.va_mask = AT_ALL;
4334         if (vn_has_cached_data(vp) &&
4335             !(VTOMI(vp)->mi_flags & MI_NOCTO) && !vn_is_readonly(vp))
4336                 error = nfs_getattr_otw(vp, &va, cr);
4337         else
4338                 error = nfsgetattr(vp, &va, cr);
4339         if (error)
4340                 return (error);
4341 
4342         /*
4343          * Check to see if the vnode is currently marked as not cachable.
4344          * This means portions of the file are locked (through VOP_FRLOCK).
4345          * In this case the map request must be refused.  We use
4346          * rp->r_lkserlock to avoid a race with concurrent lock requests.
4347          */
4348         rp = VTOR(vp);
4349 
4350         /*
4351          * Atomically increment r_inmap after acquiring r_rwlock. The
4352          * idea here is to acquire r_rwlock to block read/write and
4353          * not to protect r_inmap. r_inmap will inform nfs_read/write()
4354          * that we are in nfs_map(). Now, r_rwlock is acquired in order
4355          * and we can prevent the deadlock that would have occurred
4356          * when nfs_addmap() would have acquired it out of order.
4357          *
4358          * Since we are not protecting r_inmap by any lock, we do not
4359          * hold any lock when we decrement it. We atomically decrement
4360          * r_inmap after we release r_lkserlock.
4361          */
4362 
4363         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp)))
4364                 return (EINTR);
4365         atomic_inc_uint(&rp->r_inmap);
4366         nfs_rw_exit(&rp->r_rwlock);
4367 
4368         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR(vp))) {
4369                 atomic_dec_uint(&rp->r_inmap);
4370                 return (EINTR);
4371         }
4372         if (vp->v_flag & VNOCACHE) {
4373                 error = EAGAIN;
4374                 goto done;
4375         }
4376 
4377         /*
4378          * Don't allow concurrent locks and mapping if mandatory locking is
4379          * enabled.
4380          */
4381         if ((flk_has_remote_locks(vp) || lm_has_sleep(vp)) &&
4382             MANDLOCK(vp, va.va_mode)) {
4383                 error = EAGAIN;
4384                 goto done;
4385         }
4386 
4387         as_rangelock(as);
4388         error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4389         if (error != 0) {
4390                 as_rangeunlock(as);
4391                 goto done;
4392         }
4393 
4394         vn_a.vp = vp;
4395         vn_a.offset = off;
4396         vn_a.type = (flags & MAP_TYPE);
4397         vn_a.prot = (uchar_t)prot;
4398         vn_a.maxprot = (uchar_t)maxprot;
4399         vn_a.flags = (flags & ~MAP_TYPE);
4400         vn_a.cred = cr;
4401         vn_a.amp = NULL;
4402         vn_a.szc = 0;
4403         vn_a.lgrp_mem_policy_flags = 0;
4404 
4405         error = as_map(as, *addrp, len, segvn_create, &vn_a);
4406         as_rangeunlock(as);
4407 
4408 done:
4409         nfs_rw_exit(&rp->r_lkserlock);
4410         atomic_dec_uint(&rp->r_inmap);
4411         return (error);
4412 }
4413 
4414 /* ARGSUSED */
4415 static int
4416 nfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4417         size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4418         caller_context_t *ct)
4419 {
4420         rnode_t *rp;
4421 
4422         if (vp->v_flag & VNOMAP)
4423                 return (ENOSYS);
4424         if (nfs_zone() != VTOMI(vp)->mi_zone)
4425                 return (EIO);
4426 
4427         rp = VTOR(vp);
4428         atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
4429 
4430         return (0);
4431 }
4432 
4433 /* ARGSUSED */
4434 static int
4435 nfs_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset,
4436         struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct)
4437 {
4438         netobj lm_fh;
4439         int rc;
4440         u_offset_t start, end;
4441         rnode_t *rp;
4442         int error = 0, intr = INTR(vp);
4443 
4444         /* check for valid cmd parameter */
4445         if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
4446                 return (EINVAL);
4447         if (nfs_zone() != VTOMI(vp)->mi_zone)
4448                 return (EIO);
4449 
4450         /* Verify l_type. */
4451         switch (bfp->l_type) {
4452         case F_RDLCK:
4453                 if (cmd != F_GETLK && !(flag & FREAD))
4454                         return (EBADF);
4455                 break;
4456         case F_WRLCK:
4457                 if (cmd != F_GETLK && !(flag & FWRITE))
4458                         return (EBADF);
4459                 break;
4460         case F_UNLCK:
4461                 intr = 0;
4462                 break;
4463 
4464         default:
4465                 return (EINVAL);
4466         }
4467 
4468         /* check the validity of the lock range */
4469         if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
4470                 return (rc);
4471         if (rc = flk_check_lock_data(start, end, MAXOFF32_T))
4472                 return (rc);
4473 
4474         /*
4475          * If the filesystem is mounted using local locking, pass the
4476          * request off to the local locking code.
4477          */
4478         if (VTOMI(vp)->mi_flags & MI_LLOCK) {
4479                 if (offset > MAXOFF32_T)
4480                         return (EFBIG);
4481                 if (cmd == F_SETLK || cmd == F_SETLKW) {
4482                         /*
4483                          * For complete safety, we should be holding
4484                          * r_lkserlock.  However, we can't call
4485                          * lm_safelock and then fs_frlock while
4486                          * holding r_lkserlock, so just invoke
4487                          * lm_safelock and expect that this will
4488                          * catch enough of the cases.
4489                          */
4490                         if (!lm_safelock(vp, bfp, cr))
4491                                 return (EAGAIN);
4492                 }
4493                 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4494         }
4495 
4496         rp = VTOR(vp);
4497 
4498         /*
4499          * Check whether the given lock request can proceed, given the
4500          * current file mappings.
4501          */
4502         if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
4503                 return (EINTR);
4504         if (cmd == F_SETLK || cmd == F_SETLKW) {
4505                 if (!lm_safelock(vp, bfp, cr)) {
4506                         rc = EAGAIN;
4507                         goto done;
4508                 }
4509         }
4510 
4511         /*
4512          * Flush the cache after waiting for async I/O to finish.  For new
4513          * locks, this is so that the process gets the latest bits from the
4514          * server.  For unlocks, this is so that other clients see the
4515          * latest bits once the file has been unlocked.  If currently dirty
4516          * pages can't be flushed, then don't allow a lock to be set.  But
4517          * allow unlocks to succeed, to avoid having orphan locks on the
4518          * server.
4519          */
4520         if (cmd != F_GETLK) {
4521                 mutex_enter(&rp->r_statelock);
4522                 while (rp->r_count > 0) {
4523                         if (intr) {
4524                                 klwp_t *lwp = ttolwp(curthread);
4525 
4526                                 if (lwp != NULL)
4527                                         lwp->lwp_nostop++;
4528                                 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock)
4529                                     == 0) {
4530                                         if (lwp != NULL)
4531                                                 lwp->lwp_nostop--;
4532                                         rc = EINTR;
4533                                         break;
4534                                 }
4535                                 if (lwp != NULL)
4536                                         lwp->lwp_nostop--;
4537                         } else
4538                         cv_wait(&rp->r_cv, &rp->r_statelock);
4539                 }
4540                 mutex_exit(&rp->r_statelock);
4541                 if (rc != 0)
4542                         goto done;
4543                 error = nfs_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
4544                 if (error) {
4545                         if (error == ENOSPC || error == EDQUOT) {
4546                                 mutex_enter(&rp->r_statelock);
4547                                 if (!rp->r_error)
4548                                         rp->r_error = error;
4549                                 mutex_exit(&rp->r_statelock);
4550                         }
4551                         if (bfp->l_type != F_UNLCK) {
4552                                 rc = ENOLCK;
4553                                 goto done;
4554                         }
4555                 }
4556         }
4557 
4558         lm_fh.n_len = sizeof (fhandle_t);
4559         lm_fh.n_bytes = (char *)VTOFH(vp);
4560 
4561         /*
4562          * Call the lock manager to do the real work of contacting
4563          * the server and obtaining the lock.
4564          */
4565         rc = lm_frlock(vp, cmd, bfp, flag, offset, cr, &lm_fh, flk_cbp);
4566 
4567         if (rc == 0)
4568                 nfs_lockcompletion(vp, cmd);
4569 
4570 done:
4571         nfs_rw_exit(&rp->r_lkserlock);
4572         return (rc);
4573 }
4574 
4575 /*
4576  * Free storage space associated with the specified vnode.  The portion
4577  * to be freed is specified by bfp->l_start and bfp->l_len (already
4578  * normalized to a "whence" of 0).
4579  *
4580  * This is an experimental facility whose continued existence is not
4581  * guaranteed.  Currently, we only support the special case
4582  * of l_len == 0, meaning free to end of file.
4583  */
4584 /* ARGSUSED */
4585 static int
4586 nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
4587         offset_t offset, cred_t *cr, caller_context_t *ct)
4588 {
4589         int error;
4590 
4591         ASSERT(vp->v_type == VREG);
4592         if (cmd != F_FREESP)
4593                 return (EINVAL);
4594 
4595         if (offset > MAXOFF32_T)
4596                 return (EFBIG);
4597 
4598         if ((bfp->l_start > MAXOFF32_T) || (bfp->l_end > MAXOFF32_T) ||
4599             (bfp->l_len > MAXOFF32_T))
4600                 return (EFBIG);
4601 
4602         if (nfs_zone() != VTOMI(vp)->mi_zone)
4603                 return (EIO);
4604 
4605         error = convoff(vp, bfp, 0, offset);
4606         if (!error) {
4607                 ASSERT(bfp->l_start >= 0);
4608                 if (bfp->l_len == 0) {
4609                         struct vattr va;
4610 
4611                         /*
4612                          * ftruncate should not change the ctime and
4613                          * mtime if we truncate the file to its
4614                          * previous size.
4615                          */
4616                         va.va_mask = AT_SIZE;
4617                         error = nfsgetattr(vp, &va, cr);
4618                         if (error || va.va_size == bfp->l_start)
4619                                 return (error);
4620                         va.va_mask = AT_SIZE;
4621                         va.va_size = bfp->l_start;
4622                         error = nfssetattr(vp, &va, 0, cr);
4623 
4624                         if (error == 0 && bfp->l_start == 0)
4625                                 vnevent_truncate(vp, ct);
4626                 } else
4627                         error = EINVAL;
4628         }
4629 
4630         return (error);
4631 }
4632 
4633 /* ARGSUSED */
4634 static int
4635 nfs_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
4636 {
4637 
4638         return (EINVAL);
4639 }
4640 
4641 /*
4642  * Setup and add an address space callback to do the work of the delmap call.
4643  * The callback will (and must be) deleted in the actual callback function.
4644  *
4645  * This is done in order to take care of the problem that we have with holding
4646  * the address space's a_lock for a long period of time (e.g. if the NFS server
4647  * is down).  Callbacks will be executed in the address space code while the
4648  * a_lock is not held.  Holding the address space's a_lock causes things such
4649  * as ps and fork to hang because they are trying to acquire this lock as well.
4650  */
4651 /* ARGSUSED */
4652 static int
4653 nfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4654         size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4655         caller_context_t *ct)
4656 {
4657         int                     caller_found;
4658         int                     error;
4659         rnode_t                 *rp;
4660         nfs_delmap_args_t       *dmapp;
4661         nfs_delmapcall_t        *delmap_call;
4662 
4663         if (vp->v_flag & VNOMAP)
4664                 return (ENOSYS);
4665         /*
4666          * A process may not change zones if it has NFS pages mmap'ed
4667          * in, so we can't legitimately get here from the wrong zone.
4668          */
4669         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4670 
4671         rp = VTOR(vp);
4672 
4673         /*
4674          * The way that the address space of this process deletes its mapping
4675          * of this file is via the following call chains:
4676          * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4677          * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs_delmap()
4678          *
4679          * With the use of address space callbacks we are allowed to drop the
4680          * address space lock, a_lock, while executing the NFS operations that
4681          * need to go over the wire.  Returning EAGAIN to the caller of this
4682          * function is what drives the execution of the callback that we add
4683          * below.  The callback will be executed by the address space code
4684          * after dropping the a_lock.  When the callback is finished, since
4685          * we dropped the a_lock, it must be re-acquired and segvn_unmap()
4686          * is called again on the same segment to finish the rest of the work
4687          * that needs to happen during unmapping.
4688          *
4689          * This action of calling back into the segment driver causes
4690          * nfs_delmap() to get called again, but since the callback was
4691          * already executed at this point, it already did the work and there
4692          * is nothing left for us to do.
4693          *
4694          * To Summarize:
4695          * - The first time nfs_delmap is called by the current thread is when
4696          * we add the caller associated with this delmap to the delmap caller
4697          * list, add the callback, and return EAGAIN.
4698          * - The second time in this call chain when nfs_delmap is called we
4699          * will find this caller in the delmap caller list and realize there
4700          * is no more work to do thus removing this caller from the list and
4701          * returning the error that was set in the callback execution.
4702          */
4703         caller_found = nfs_find_and_delete_delmapcall(rp, &error);
4704         if (caller_found) {
4705                 /*
4706                  * 'error' is from the actual delmap operations.  To avoid
4707                  * hangs, we need to handle the return of EAGAIN differently
4708                  * since this is what drives the callback execution.
4709                  * In this case, we don't want to return EAGAIN and do the
4710                  * callback execution because there are none to execute.
4711                  */
4712                 if (error == EAGAIN)
4713                         return (0);
4714                 else
4715                         return (error);
4716         }
4717 
4718         /* current caller was not in the list */
4719         delmap_call = nfs_init_delmapcall();
4720 
4721         mutex_enter(&rp->r_statelock);
4722         list_insert_tail(&rp->r_indelmap, delmap_call);
4723         mutex_exit(&rp->r_statelock);
4724 
4725         dmapp = kmem_alloc(sizeof (nfs_delmap_args_t), KM_SLEEP);
4726 
4727         dmapp->vp = vp;
4728         dmapp->off = off;
4729         dmapp->addr = addr;
4730         dmapp->len = len;
4731         dmapp->prot = prot;
4732         dmapp->maxprot = maxprot;
4733         dmapp->flags = flags;
4734         dmapp->cr = cr;
4735         dmapp->caller = delmap_call;
4736 
4737         error = as_add_callback(as, nfs_delmap_callback, dmapp,
4738             AS_UNMAP_EVENT, addr, len, KM_SLEEP);
4739 
4740         return (error ? error : EAGAIN);
4741 }
4742 
4743 /*
4744  * Remove some pages from an mmap'd vnode.  Just update the
4745  * count of pages.  If doing close-to-open, then flush all
4746  * of the pages associated with this file.  Otherwise, start
4747  * an asynchronous page flush to write out any dirty pages.
4748  * This will also associate a credential with the rnode which
4749  * can be used to write the pages.
4750  */
4751 /* ARGSUSED */
4752 static void
4753 nfs_delmap_callback(struct as *as, void *arg, uint_t event)
4754 {
4755         int                     error;
4756         rnode_t                 *rp;
4757         mntinfo_t               *mi;
4758         nfs_delmap_args_t       *dmapp = (nfs_delmap_args_t *)arg;
4759 
4760         rp = VTOR(dmapp->vp);
4761         mi = VTOMI(dmapp->vp);
4762 
4763         atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
4764         ASSERT(rp->r_mapcnt >= 0);
4765 
4766         /*
4767          * Initiate a page flush if there are pages, the file system
4768          * was not mounted readonly, the segment was mapped shared, and
4769          * the pages themselves were writeable.
4770          */
4771         if (vn_has_cached_data(dmapp->vp) && !vn_is_readonly(dmapp->vp) &&
4772             dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
4773                 mutex_enter(&rp->r_statelock);
4774                 rp->r_flags |= RDIRTY;
4775                 mutex_exit(&rp->r_statelock);
4776                 /*
4777                  * If this is a cross-zone access a sync putpage won't work, so
4778                  * the best we can do is try an async putpage.  That seems
4779                  * better than something more draconian such as discarding the
4780                  * dirty pages.
4781                  */
4782                 if ((mi->mi_flags & MI_NOCTO) ||
4783                     nfs_zone() != mi->mi_zone)
4784                         error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4785                             B_ASYNC, dmapp->cr, NULL);
4786                 else
4787                         error = nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4788                             0, dmapp->cr, NULL);
4789                 if (!error) {
4790                         mutex_enter(&rp->r_statelock);
4791                         error = rp->r_error;
4792                         rp->r_error = 0;
4793                         mutex_exit(&rp->r_statelock);
4794                 }
4795         } else
4796                 error = 0;
4797 
4798         if ((rp->r_flags & RDIRECTIO) || (mi->mi_flags & MI_DIRECTIO))
4799                 (void) nfs_putpage(dmapp->vp, dmapp->off, dmapp->len,
4800                     B_INVAL, dmapp->cr, NULL);
4801 
4802         dmapp->caller->error = error;
4803         (void) as_delete_callback(as, arg);
4804         kmem_free(dmapp, sizeof (nfs_delmap_args_t));
4805 }
4806 
4807 /* ARGSUSED */
4808 static int
4809 nfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4810         caller_context_t *ct)
4811 {
4812         int error = 0;
4813 
4814         if (nfs_zone() != VTOMI(vp)->mi_zone)
4815                 return (EIO);
4816         /*
4817          * This looks a little weird because it's written in a general
4818          * manner but we make little use of cases.  If cntl() ever gets
4819          * widely used, the outer switch will make more sense.
4820          */
4821 
4822         switch (cmd) {
4823 
4824         /*
4825          * Large file spec - need to base answer new query with
4826          * hardcoded constant based on the protocol.
4827          */
4828         case _PC_FILESIZEBITS:
4829                 *valp = 32;
4830                 return (0);
4831 
4832         case _PC_LINK_MAX:
4833         case _PC_NAME_MAX:
4834         case _PC_PATH_MAX:
4835         case _PC_SYMLINK_MAX:
4836         case _PC_CHOWN_RESTRICTED:
4837         case _PC_NO_TRUNC: {
4838                 mntinfo_t *mi;
4839                 struct pathcnf *pc;
4840 
4841                 if ((mi = VTOMI(vp)) == NULL || (pc = mi->mi_pathconf) == NULL)
4842                         return (EINVAL);
4843                 error = _PC_ISSET(cmd, pc->pc_mask);    /* error or bool */
4844                 switch (cmd) {
4845                 case _PC_LINK_MAX:
4846                         *valp = pc->pc_link_max;
4847                         break;
4848                 case _PC_NAME_MAX:
4849                         *valp = pc->pc_name_max;
4850                         break;
4851                 case _PC_PATH_MAX:
4852                 case _PC_SYMLINK_MAX:
4853                         *valp = pc->pc_path_max;
4854                         break;
4855                 case _PC_CHOWN_RESTRICTED:
4856                         /*
4857                          * if we got here, error is really a boolean which
4858                          * indicates whether cmd is set or not.
4859                          */
4860                         *valp = error ? 1 : 0;  /* see above */
4861                         error = 0;
4862                         break;
4863                 case _PC_NO_TRUNC:
4864                         /*
4865                          * if we got here, error is really a boolean which
4866                          * indicates whether cmd is set or not.
4867                          */
4868                         *valp = error ? 1 : 0;  /* see above */
4869                         error = 0;
4870                         break;
4871                 }
4872                 return (error ? EINVAL : 0);
4873                 }
4874 
4875         case _PC_XATTR_EXISTS:
4876                 *valp = 0;
4877                 if (vp->v_vfsp->vfs_flag & VFS_XATTR) {
4878                         vnode_t *avp;
4879                         rnode_t *rp;
4880                         mntinfo_t *mi = VTOMI(vp);
4881 
4882                         if (!(mi->mi_flags & MI_EXTATTR))
4883                                 return (0);
4884 
4885                         rp = VTOR(vp);
4886                         if (nfs_rw_enter_sig(&rp->r_rwlock, RW_READER,
4887                             INTR(vp)))
4888                                 return (EINTR);
4889 
4890                         error = nfslookup_dnlc(vp, XATTR_DIR_NAME, &avp, cr);
4891                         if (error || avp == NULL)
4892                                 error = acl_getxattrdir2(vp, &avp, 0, cr, 0);
4893 
4894                         nfs_rw_exit(&rp->r_rwlock);
4895 
4896                         if (error == 0 && avp != NULL) {
4897                                 error = do_xattr_exists_check(avp, valp, cr);
4898                                 VN_RELE(avp);
4899                         }
4900                 }
4901                 return (error ? EINVAL : 0);
4902 
4903         case _PC_ACL_ENABLED:
4904                 *valp = _ACL_ACLENT_ENABLED;
4905                 return (0);
4906 
4907         default:
4908                 return (EINVAL);
4909         }
4910 }
4911 
4912 /*
4913  * Called by async thread to do synchronous pageio. Do the i/o, wait
4914  * for it to complete, and cleanup the page list when done.
4915  */
4916 static int
4917 nfs_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4918         int flags, cred_t *cr)
4919 {
4920         int error;
4921 
4922         ASSERT(nfs_zone() == VTOMI(vp)->mi_zone);
4923         error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4924         if (flags & B_READ)
4925                 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
4926         else
4927                 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
4928         return (error);
4929 }
4930 
4931 /* ARGSUSED */
4932 static int
4933 nfs_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
4934         int flags, cred_t *cr, caller_context_t *ct)
4935 {
4936         int error;
4937         rnode_t *rp;
4938 
4939         if (pp == NULL)
4940                 return (EINVAL);
4941 
4942         if (io_off > MAXOFF32_T)
4943                 return (EFBIG);
4944         if (nfs_zone() != VTOMI(vp)->mi_zone)
4945                 return (EIO);
4946         rp = VTOR(vp);
4947         mutex_enter(&rp->r_statelock);
4948         rp->r_count++;
4949         mutex_exit(&rp->r_statelock);
4950 
4951         if (flags & B_ASYNC) {
4952                 error = nfs_async_pageio(vp, pp, io_off, io_len, flags, cr,
4953                     nfs_sync_pageio);
4954         } else
4955                 error = nfs_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
4956         mutex_enter(&rp->r_statelock);
4957         rp->r_count--;
4958         cv_broadcast(&rp->r_cv);
4959         mutex_exit(&rp->r_statelock);
4960         return (error);
4961 }
4962 
4963 /* ARGSUSED */
4964 static int
4965 nfs_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4966         caller_context_t *ct)
4967 {
4968         int error;
4969         mntinfo_t *mi;
4970 
4971         mi = VTOMI(vp);
4972 
4973         if (nfs_zone() != mi->mi_zone)
4974                 return (EIO);
4975         if (mi->mi_flags & MI_ACL) {
4976                 error = acl_setacl2(vp, vsecattr, flag, cr);
4977                 if (mi->mi_flags & MI_ACL)
4978                         return (error);
4979         }
4980 
4981         return (ENOSYS);
4982 }
4983 
4984 /* ARGSUSED */
4985 static int
4986 nfs_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
4987         caller_context_t *ct)
4988 {
4989         int error;
4990         mntinfo_t *mi;
4991 
4992         mi = VTOMI(vp);
4993 
4994         if (nfs_zone() != mi->mi_zone)
4995                 return (EIO);
4996         if (mi->mi_flags & MI_ACL) {
4997                 error = acl_getacl2(vp, vsecattr, flag, cr);
4998                 if (mi->mi_flags & MI_ACL)
4999                         return (error);
5000         }
5001 
5002         return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
5003 }
5004 
5005 /* ARGSUSED */
5006 static int
5007 nfs_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
5008         caller_context_t *ct)
5009 {
5010         int error;
5011         struct shrlock nshr;
5012         struct nfs_owner nfs_owner;
5013         netobj lm_fh;
5014 
5015         if (nfs_zone() != VTOMI(vp)->mi_zone)
5016                 return (EIO);
5017 
5018         /*
5019          * check for valid cmd parameter
5020          */
5021         if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
5022                 return (EINVAL);
5023 
5024         /*
5025          * Check access permissions
5026          */
5027         if (cmd == F_SHARE &&
5028             (((shr->s_access & F_RDACC) && !(flag & FREAD)) ||
5029             ((shr->s_access & F_WRACC) && !(flag & FWRITE))))
5030                 return (EBADF);
5031 
5032         /*
5033          * If the filesystem is mounted using local locking, pass the
5034          * request off to the local share code.
5035          */
5036         if (VTOMI(vp)->mi_flags & MI_LLOCK)
5037                 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
5038 
5039         switch (cmd) {
5040         case F_SHARE:
5041         case F_UNSHARE:
5042                 lm_fh.n_len = sizeof (fhandle_t);
5043                 lm_fh.n_bytes = (char *)VTOFH(vp);
5044 
5045                 /*
5046                  * If passed an owner that is too large to fit in an
5047                  * nfs_owner it is likely a recursive call from the
5048                  * lock manager client and pass it straight through.  If
5049                  * it is not a nfs_owner then simply return an error.
5050                  */
5051                 if (shr->s_own_len > sizeof (nfs_owner.lowner)) {
5052                         if (((struct nfs_owner *)shr->s_owner)->magic !=
5053                             NFS_OWNER_MAGIC)
5054                                 return (EINVAL);
5055 
5056                         if (error = lm_shrlock(vp, cmd, shr, flag, &lm_fh)) {
5057                                 error = set_errno(error);
5058                         }
5059                         return (error);
5060                 }
5061                 /*
5062                  * Remote share reservations owner is a combination of
5063                  * a magic number, hostname, and the local owner
5064                  */
5065                 bzero(&nfs_owner, sizeof (nfs_owner));
5066                 nfs_owner.magic = NFS_OWNER_MAGIC;
5067                 (void) strncpy(nfs_owner.hname, uts_nodename(),
5068                     sizeof (nfs_owner.hname));
5069                 bcopy(shr->s_owner, nfs_owner.lowner, shr->s_own_len);
5070                 nshr.s_access = shr->s_access;
5071                 nshr.s_deny = shr->s_deny;
5072                 nshr.s_sysid = 0;
5073                 nshr.s_pid = ttoproc(curthread)->p_pid;
5074                 nshr.s_own_len = sizeof (nfs_owner);
5075                 nshr.s_owner = (caddr_t)&nfs_owner;
5076 
5077                 if (error = lm_shrlock(vp, cmd, &nshr, flag, &lm_fh)) {
5078                         error = set_errno(error);
5079                 }
5080 
5081                 break;
5082 
5083         case F_HASREMOTELOCKS:
5084                 /*
5085                  * NFS client can't store remote locks itself
5086                  */
5087                 shr->s_access = 0;
5088                 error = 0;
5089                 break;
5090 
5091         default:
5092                 error = EINVAL;
5093                 break;
5094         }
5095 
5096         return (error);
5097 }