8040 NFSv4 client: 3-way deadlock between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages()
Reviewed by: Arne Jansen <arne@die-jansens.de>
Reviewed by: Vitaliy Gusev <gusev.vitaliy@icloud.com>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
--- old/usr/src/uts/common/fs/nfs/nfs4_vnops.c
+++ new/usr/src/uts/common/fs/nfs/nfs4_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2016 STRATO AG. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28 28 */
29 29
30 30 /*
31 31 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
32 32 * Use is subject to license terms.
33 33 */
34 34
35 35 /*
36 36 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
37 37 * All Rights Reserved
38 38 */
39 39
40 40 /*
41 41 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
42 42 */
43 43
44 44 #include <sys/param.h>
45 45 #include <sys/types.h>
46 46 #include <sys/systm.h>
47 47 #include <sys/cred.h>
48 48 #include <sys/time.h>
49 49 #include <sys/vnode.h>
50 50 #include <sys/vfs.h>
51 51 #include <sys/vfs_opreg.h>
52 52 #include <sys/file.h>
53 53 #include <sys/filio.h>
54 54 #include <sys/uio.h>
55 55 #include <sys/buf.h>
56 56 #include <sys/mman.h>
57 57 #include <sys/pathname.h>
58 58 #include <sys/dirent.h>
59 59 #include <sys/debug.h>
60 60 #include <sys/vmsystm.h>
61 61 #include <sys/fcntl.h>
62 62 #include <sys/flock.h>
63 63 #include <sys/swap.h>
64 64 #include <sys/errno.h>
65 65 #include <sys/strsubr.h>
66 66 #include <sys/sysmacros.h>
67 67 #include <sys/kmem.h>
68 68 #include <sys/cmn_err.h>
69 69 #include <sys/pathconf.h>
70 70 #include <sys/utsname.h>
71 71 #include <sys/dnlc.h>
72 72 #include <sys/acl.h>
73 73 #include <sys/systeminfo.h>
74 74 #include <sys/policy.h>
75 75 #include <sys/sdt.h>
76 76 #include <sys/list.h>
77 77 #include <sys/stat.h>
78 78 #include <sys/zone.h>
79 79
80 80 #include <rpc/types.h>
81 81 #include <rpc/auth.h>
82 82 #include <rpc/clnt.h>
83 83
84 84 #include <nfs/nfs.h>
85 85 #include <nfs/nfs_clnt.h>
86 86 #include <nfs/nfs_acl.h>
87 87 #include <nfs/lm.h>
88 88 #include <nfs/nfs4.h>
89 89 #include <nfs/nfs4_kprot.h>
90 90 #include <nfs/rnode4.h>
91 91 #include <nfs/nfs4_clnt.h>
92 92
93 93 #include <vm/hat.h>
94 94 #include <vm/as.h>
95 95 #include <vm/page.h>
96 96 #include <vm/pvn.h>
97 97 #include <vm/seg.h>
98 98 #include <vm/seg_map.h>
99 99 #include <vm/seg_kpm.h>
100 100 #include <vm/seg_vn.h>
101 101
102 102 #include <fs/fs_subr.h>
103 103
104 104 #include <sys/ddi.h>
105 105 #include <sys/int_fmtio.h>
106 106 #include <sys/fs/autofs.h>
107 107
108 108 typedef struct {
109 109 nfs4_ga_res_t *di_garp;
110 110 cred_t *di_cred;
111 111 hrtime_t di_time_call;
112 112 } dirattr_info_t;
113 113
114 114 typedef enum nfs4_acl_op {
115 115 NFS4_ACL_GET,
116 116 NFS4_ACL_SET
117 117 } nfs4_acl_op_t;
118 118
119 119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *);
120 120
121 121 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
122 122 char *, dirattr_info_t *);
123 123
124 124 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
125 125 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
126 126 nfs4_error_t *, int *);
127 127 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
128 128 cred_t *);
129 129 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
130 130 stable_how4 *);
131 131 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
132 132 cred_t *, bool_t, struct uio *);
133 133 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
134 134 vsecattr_t *);
135 135 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
136 136 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
137 137 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
138 138 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
139 139 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
140 140 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
141 141 int, vnode_t **, cred_t *);
142 142 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
143 143 cred_t *, int, int, enum createmode4, int);
144 144 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
145 145 caller_context_t *);
146 146 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
147 147 vnode_t *, char *, cred_t *, nfsstat4 *);
148 148 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
149 149 vnode_t *, char *, cred_t *, nfsstat4 *);
150 150 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
151 151 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
152 152 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
153 153 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
154 154 page_t *[], size_t, struct seg *, caddr_t,
155 155 enum seg_rw, cred_t *);
156 156 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
157 157 cred_t *);
158 158 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
159 159 int, cred_t *);
160 160 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
161 161 int, cred_t *);
162 162 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *);
163 163 static void nfs4_set_mod(vnode_t *);
164 164 static void nfs4_get_commit(vnode_t *);
165 165 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
166 166 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
167 167 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
168 168 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
169 169 cred_t *);
170 170 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
171 171 cred_t *);
172 172 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
173 173 hrtime_t, vnode_t *, cred_t *);
174 174 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
175 175 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
176 176 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
177 177 u_offset_t);
178 178 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
179 179 static int nfs4_block_and_wait(clock_t *, rnode4_t *);
180 180 static cred_t *state_to_cred(nfs4_open_stream_t *);
181 181 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
182 182 static pid_t lo_to_pid(lock_owner4 *);
183 183 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
184 184 cred_t *, nfs4_lock_owner_t *);
185 185 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
186 186 nfs4_lock_owner_t *);
187 187 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
188 188 static void nfs4_delmap_callback(struct as *, void *, uint_t);
189 189 static void nfs4_free_delmapcall(nfs4_delmapcall_t *);
190 190 static nfs4_delmapcall_t *nfs4_init_delmapcall();
191 191 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
192 192 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
193 193 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
194 194 uid_t, gid_t, int);
195 195
196 196 /*
197 197 * Routines that implement the setting of v4 args for the misc. ops
198 198 */
199 199 static void nfs4args_lock_free(nfs_argop4 *);
200 200 static void nfs4args_lockt_free(nfs_argop4 *);
201 201 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
202 202 int, rnode4_t *, cred_t *, bitmap4, int *,
203 203 nfs4_stateid_types_t *);
204 204 static void nfs4args_setattr_free(nfs_argop4 *);
205 205 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
206 206 bitmap4);
207 207 static void nfs4args_verify_free(nfs_argop4 *);
208 208 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
209 209 WRITE4args **, nfs4_stateid_types_t *);
210 210
211 211 /*
212 212 * These are the vnode ops functions that implement the vnode interface to
213 213 * the networked file system. See more comments below at nfs4_vnodeops.
214 214 */
215 215 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
216 216 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
217 217 caller_context_t *);
218 218 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *,
219 219 caller_context_t *);
220 220 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *,
221 221 caller_context_t *);
222 222 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
223 223 caller_context_t *);
224 224 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
225 225 caller_context_t *);
226 226 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
227 227 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *,
228 228 caller_context_t *);
229 229 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
230 230 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
231 231 int, vnode_t **, cred_t *, int, caller_context_t *,
232 232 vsecattr_t *);
233 233 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
234 234 int);
235 235 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
236 236 caller_context_t *, int);
237 237 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
238 238 caller_context_t *, int);
239 239 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
240 240 cred_t *, caller_context_t *, int, vsecattr_t *);
241 241 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
242 242 caller_context_t *, int);
243 243 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
244 244 cred_t *, caller_context_t *, int);
245 245 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
246 246 caller_context_t *, int);
247 247 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
248 248 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
249 249 page_t *[], size_t, struct seg *, caddr_t,
250 250 enum seg_rw, cred_t *, caller_context_t *);
251 251 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
252 252 caller_context_t *);
253 253 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
254 254 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
255 255 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
256 256 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
257 257 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
258 258 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
259 259 struct flk_callback *, cred_t *, caller_context_t *);
260 260 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
261 261 cred_t *, caller_context_t *);
262 262 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
263 263 uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
264 264 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
265 265 cred_t *, caller_context_t *);
266 266 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
267 267 caller_context_t *);
268 268 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
269 269 caller_context_t *);
270 270 /*
271 271 * These vnode ops are required to be called from outside this source file,
272 272 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
273 273 * as static.
274 274 */
275 275 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
276 276 caller_context_t *);
277 277 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
278 278 int nfs4_lookup(vnode_t *, char *, vnode_t **,
279 279 struct pathname *, int, vnode_t *, cred_t *,
280 280 caller_context_t *, int *, pathname_t *);
281 281 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
282 282 int nfs4_rwlock(vnode_t *, int, caller_context_t *);
283 283 void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
284 284 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
285 285 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
286 286 caller_context_t *);
287 287 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
288 288 caller_context_t *);
289 289 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
290 290 caller_context_t *);
291 291
292 292 /*
293 293 * Used for nfs4_commit_vp() to indicate if we should
294 294 * wait on pending writes.
295 295 */
296 296 #define NFS4_WRITE_NOWAIT 0
297 297 #define NFS4_WRITE_WAIT 1
298 298
299 299 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */
300 300
301 301 /*
302 302 * Error flags used to pass information about certain special errors
303 303	 * that require special handling.
304 304 */
305 305 #define NFS_EOF -98
306 306 #define NFS_VERF_MISMATCH -97
307 307
308 308 /*
309 309	 * Flags used to differentiate which operation drove the
310 310 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
311 311 */
312 312 #define NFS4_CLOSE_OP 0x1
313 313 #define NFS4_DELMAP_OP 0x2
314 314 #define NFS4_INACTIVE_OP 0x3
315 315
316 316 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
317 317
318 318	/* ALIGN64 aligns the given buffer and adjusts the buffer size to 64 bits */
319 319 #define ALIGN64(x, ptr, sz) \
320 320 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \
321 321 if (x) { \
322 322 x = sizeof (uint64_t) - (x); \
323 323 sz -= (x); \
324 324 ptr += (x); \
325 325 }
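/*
 * A minimal usage sketch (hypothetical caller, not from this file): align
 * a buffer pointer before storing 64-bit quantities through it.
 *
 *	int skew;
 *	ALIGN64(skew, ptr, sz);
 *
 * On return, ptr has been advanced to the next 64-bit boundary, sz has
 * been reduced by the bytes skipped, and skew holds that adjustment
 * (zero if ptr was already aligned).
 */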
326 326
327 327 #ifdef DEBUG
328 328 int nfs4_client_attr_debug = 0;
329 329 int nfs4_client_state_debug = 0;
330 330 int nfs4_client_shadow_debug = 0;
331 331 int nfs4_client_lock_debug = 0;
332 332 int nfs4_seqid_sync = 0;
333 333 int nfs4_client_map_debug = 0;
334 334 static int nfs4_pageio_debug = 0;
335 335 int nfs4_client_inactive_debug = 0;
336 336 int nfs4_client_recov_debug = 0;
337 337 int nfs4_client_failover_debug = 0;
338 338 int nfs4_client_call_debug = 0;
339 339 int nfs4_client_lookup_debug = 0;
340 340 int nfs4_client_zone_debug = 0;
341 341 int nfs4_lost_rqst_debug = 0;
342 342 int nfs4_rdattrerr_debug = 0;
343 343 int nfs4_open_stream_debug = 0;
344 344
345 345 int nfs4read_error_inject;
346 346
347 347 static int nfs4_create_misses = 0;
348 348
349 349 static int nfs4_readdir_cache_shorts = 0;
350 350 static int nfs4_readdir_readahead = 0;
351 351
352 352 static int nfs4_bio_do_stop = 0;
353 353
354 354 static int nfs4_lostpage = 0; /* number of times we lost original page */
355 355
356 356 int nfs4_mmap_debug = 0;
357 357
358 358 static int nfs4_pathconf_cache_hits = 0;
359 359 static int nfs4_pathconf_cache_misses = 0;
360 360
361 361 int nfs4close_all_cnt;
362 362 int nfs4close_one_debug = 0;
363 363 int nfs4close_notw_debug = 0;
364 364
365 365 int denied_to_flk_debug = 0;
366 366 void *lockt_denied_debug;
367 367
368 368 #endif
369 369
370 370 /*
371 371 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
372 372 * or NFS4ERR_RESOURCE.
373 373 */
374 374 static int confirm_retry_sec = 30;
375 375
376 376 static int nfs4_lookup_neg_cache = 1;
377 377
378 378 /*
379 379	 * Number of pages to read ahead,
380 380	 * optimized for 100Base-T.
381 381 */
382 382 static int nfs4_nra = 4;
383 383
384 384 static int nfs4_do_symlink_cache = 1;
385 385
386 386 static int nfs4_pathconf_disable_cache = 0;
387 387
388 388 /*
389 389 * These are the vnode ops routines which implement the vnode interface to
390 390 * the networked file system. These routines just take their parameters,
391 391 * make them look networkish by putting the right info into interface structs,
392 392 * and then calling the appropriate remote routine(s) to do the work.
393 393 *
394 394	 * Note on directory name lookup caching: If we detect a stale fhandle,
395 395 * we purge the directory cache relative to that vnode. This way, the
396 396 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for
397 397 * more details on rnode locking.
398 398 */
399 399
400 400 struct vnodeops *nfs4_vnodeops;
401 401
402 402 const fs_operation_def_t nfs4_vnodeops_template[] = {
403 403 VOPNAME_OPEN, { .vop_open = nfs4_open },
404 404 VOPNAME_CLOSE, { .vop_close = nfs4_close },
405 405 VOPNAME_READ, { .vop_read = nfs4_read },
406 406 VOPNAME_WRITE, { .vop_write = nfs4_write },
407 407 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl },
408 408 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr },
409 409 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr },
410 410 VOPNAME_ACCESS, { .vop_access = nfs4_access },
411 411 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup },
412 412 VOPNAME_CREATE, { .vop_create = nfs4_create },
413 413 VOPNAME_REMOVE, { .vop_remove = nfs4_remove },
414 414 VOPNAME_LINK, { .vop_link = nfs4_link },
415 415 VOPNAME_RENAME, { .vop_rename = nfs4_rename },
416 416 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir },
417 417 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir },
418 418 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir },
419 419 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink },
420 420 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink },
421 421 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync },
422 422 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive },
423 423 VOPNAME_FID, { .vop_fid = nfs4_fid },
424 424 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock },
425 425 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock },
426 426 VOPNAME_SEEK, { .vop_seek = nfs4_seek },
427 427 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock },
428 428 VOPNAME_SPACE, { .vop_space = nfs4_space },
429 429 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp },
430 430 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage },
431 431 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage },
432 432 VOPNAME_MAP, { .vop_map = nfs4_map },
433 433 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap },
434 434 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap },
435 435 /* no separate nfs4_dump */
436 436 VOPNAME_DUMP, { .vop_dump = nfs_dump },
437 437 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf },
438 438 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio },
439 439 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose },
440 440 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr },
441 441 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr },
442 442 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock },
443 443 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
444 444 NULL, NULL
445 445 };
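/*
 * Not visible in this file: the template above is consumed by the VFS
 * framework to populate nfs4_vnodeops. A sketch of the registration,
 * assuming the stock illumos vn_make_ops() interface (the actual call
 * lives in the nfs4 module init code):
 *
 *	(void) vn_make_ops("nfs4", nfs4_vnodeops_template, &nfs4_vnodeops);
 */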
446 446
447 447 /*
448 448 * The following are subroutines and definitions to set args or get res
449 449 * for the different nfsv4 ops
450 450 */
451 451
452 452 void
453 453 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
454 454 {
455 455 int i;
456 456
457 457 for (i = 0; i < arglen; i++) {
458 458 if (argop[i].argop == OP_LOOKUP) {
459 459 kmem_free(
460 460 argop[i].nfs_argop4_u.oplookup.
461 461 objname.utf8string_val,
462 462 argop[i].nfs_argop4_u.oplookup.
463 463 objname.utf8string_len);
464 464 }
465 465 }
466 466 }
467 467
468 468 static void
469 469 nfs4args_lock_free(nfs_argop4 *argop)
470 470 {
471 471 locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
472 472
473 473 if (locker->new_lock_owner == TRUE) {
474 474 open_to_lock_owner4 *open_owner;
475 475
476 476 open_owner = &locker->locker4_u.open_owner;
477 477 if (open_owner->lock_owner.owner_val != NULL) {
478 478 kmem_free(open_owner->lock_owner.owner_val,
479 479 open_owner->lock_owner.owner_len);
480 480 }
481 481 }
482 482 }
483 483
484 484 static void
485 485 nfs4args_lockt_free(nfs_argop4 *argop)
486 486 {
487 487 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
488 488
489 489 if (lowner->owner_val != NULL) {
490 490 kmem_free(lowner->owner_val, lowner->owner_len);
491 491 }
492 492 }
493 493
494 494 static void
495 495 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
496 496 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
497 497 nfs4_stateid_types_t *sid_types)
498 498 {
499 499 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
500 500 mntinfo4_t *mi;
501 501
502 502 argop->argop = OP_SETATTR;
503 503 /*
504 504 * The stateid is set to 0 if client is not modifying the size
505 505 * and otherwise to whatever nfs4_get_stateid() returns.
506 506 *
507 507 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
508 508 * state struct could be found for the process/file pair. We may
509 509 * want to change this in the future (by OPENing the file). See
510 510 * bug # 4474852.
511 511 */
512 512 if (vap->va_mask & AT_SIZE) {
513 513
514 514 ASSERT(rp != NULL);
515 515 mi = VTOMI4(RTOV4(rp));
516 516
517 517 argop->nfs_argop4_u.opsetattr.stateid =
518 518 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
519 519 OP_SETATTR, sid_types, FALSE);
520 520 } else {
521 521 bzero(&argop->nfs_argop4_u.opsetattr.stateid,
522 522 sizeof (stateid4));
523 523 }
524 524
525 525 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
526 526 if (*error)
527 527 bzero(attr, sizeof (*attr));
528 528 }
529 529
530 530 static void
531 531 nfs4args_setattr_free(nfs_argop4 *argop)
532 532 {
533 533 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
534 534 }
535 535
536 536 static int
537 537 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
538 538 bitmap4 supp)
539 539 {
540 540 fattr4 *attr;
541 541 int error = 0;
542 542
543 543 argop->argop = op;
544 544 switch (op) {
545 545 case OP_VERIFY:
546 546 attr = &argop->nfs_argop4_u.opverify.obj_attributes;
547 547 break;
548 548 case OP_NVERIFY:
549 549 attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
550 550 break;
551 551 default:
552 552 return (EINVAL);
553 553 }
554 554 if (!error)
555 555 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
556 556 if (error)
557 557 bzero(attr, sizeof (*attr));
558 558 return (error);
559 559 }
560 560
561 561 static void
562 562 nfs4args_verify_free(nfs_argop4 *argop)
563 563 {
564 564 switch (argop->argop) {
565 565 case OP_VERIFY:
566 566 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
567 567 break;
568 568 case OP_NVERIFY:
569 569 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
570 570 break;
571 571 default:
572 572 break;
573 573 }
574 574 }
575 575
576 576 static void
577 577 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
578 578 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
579 579 {
580 580 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
581 581 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
582 582
583 583 argop->argop = OP_WRITE;
584 584 wargs->stable = stable;
585 585 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
586 586 mi, OP_WRITE, sid_tp);
587 587 wargs->mblk = NULL;
588 588 *wargs_pp = wargs;
589 589 }
590 590
591 591 void
592 592 nfs4args_copen_free(OPEN4cargs *open_args)
593 593 {
594 594 if (open_args->owner.owner_val) {
595 595 kmem_free(open_args->owner.owner_val,
596 596 open_args->owner.owner_len);
597 597 }
598 598 if ((open_args->opentype == OPEN4_CREATE) &&
599 599 (open_args->mode != EXCLUSIVE4)) {
600 600 nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
601 601 }
602 602 }
603 603
604 604 /*
605 605 * XXX: This is referenced in modstubs.s
606 606 */
607 607 struct vnodeops *
608 608 nfs4_getvnodeops(void)
609 609 {
610 610 return (nfs4_vnodeops);
611 611 }
612 612
613 613 /*
614 614 * The OPEN operation opens a regular file.
615 615 */
616 616 /*ARGSUSED3*/
617 617 static int
618 618 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
619 619 {
620 620 vnode_t *dvp = NULL;
621 621 rnode4_t *rp, *drp;
622 622 int error;
623 623 int just_been_created;
624 624 char fn[MAXNAMELEN];
625 625
626 626 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
627 627 if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
628 628 return (EIO);
629 629 rp = VTOR4(*vpp);
630 630
631 631 /*
632 632 * Check to see if opening something besides a regular file;
633 633 * if so skip the OTW call
634 634 */
635 635 if ((*vpp)->v_type != VREG) {
636 636 error = nfs4_open_non_reg_file(vpp, flag, cr);
637 637 return (error);
638 638 }
639 639
640 640 /*
641 641 * XXX - would like a check right here to know if the file is
642 642 * executable or not, so as to skip OTW
643 643 */
644 644
645 645 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
646 646 return (error);
647 647
648 648 drp = VTOR4(dvp);
649 649 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
650 650 return (EINTR);
651 651
652 652 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
653 653 nfs_rw_exit(&drp->r_rwlock);
654 654 return (error);
655 655 }
656 656
657 657 /*
658 658 * See if this file has just been CREATEd.
659 659 * If so, clear the flag and update the dnlc, which was previously
660 660 * skipped in nfs4_create.
661 661	 * XXX need better serialization on this.
662 662	 * XXX move this into the nfs4open_otw call, after we have
663 663 * XXX acquired the open owner seqid sync.
664 664 */
665 665 mutex_enter(&rp->r_statev4_lock);
666 666 if (rp->created_v4) {
667 667 rp->created_v4 = 0;
668 668 mutex_exit(&rp->r_statev4_lock);
669 669
670 670 dnlc_update(dvp, fn, *vpp);
671 671 /* This is needed so we don't bump the open ref count */
672 672 just_been_created = 1;
673 673 } else {
674 674 mutex_exit(&rp->r_statev4_lock);
675 675 just_been_created = 0;
676 676 }
677 677
678 678 /*
679 679 * If caller specified O_TRUNC/FTRUNC, then be sure to set
680 680 * FWRITE (to drive successful setattr(size=0) after open)
681 681 */
682 682 if (flag & FTRUNC)
683 683 flag |= FWRITE;
684 684
685 685 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
686 686 just_been_created);
687 687
688 688 if (!error && !((*vpp)->v_flag & VROOT))
689 689 dnlc_update(dvp, fn, *vpp);
690 690
691 691 nfs_rw_exit(&drp->r_rwlock);
692 692
693 693 /* release the hold from vtodv */
694 694 VN_RELE(dvp);
695 695
696 696 /* exchange the shadow for the master vnode, if needed */
697 697
698 698 if (error == 0 && IS_SHADOW(*vpp, rp))
699 699 sv_exchange(vpp);
700 700
701 701 return (error);
702 702 }
703 703
704 704 /*
705 705 * See if there's a "lost open" request to be saved and recovered.
706 706 */
707 707 static void
708 708 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
709 709 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
710 710 vnode_t *dvp, OPEN4cargs *open_args)
711 711 {
712 712 vfs_t *vfsp;
713 713 char *srccfp;
714 714
715 715 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
716 716
717 717 if (error != ETIMEDOUT && error != EINTR &&
718 718 !NFS4_FRC_UNMT_ERR(error, vfsp)) {
719 719 lost_rqstp->lr_op = 0;
720 720 return;
721 721 }
722 722
723 723 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
724 724 "nfs4open_save_lost_rqst: error %d", error));
725 725
726 726 lost_rqstp->lr_op = OP_OPEN;
727 727
728 728 /*
729 729 * The vp (if it is not NULL) and dvp are held and rele'd via
730 730 * the recovery code. See nfs4_save_lost_rqst.
731 731 */
732 732 lost_rqstp->lr_vp = vp;
733 733 lost_rqstp->lr_dvp = dvp;
734 734 lost_rqstp->lr_oop = oop;
735 735 lost_rqstp->lr_osp = NULL;
736 736 lost_rqstp->lr_lop = NULL;
737 737 lost_rqstp->lr_cr = cr;
738 738 lost_rqstp->lr_flk = NULL;
739 739 lost_rqstp->lr_oacc = open_args->share_access;
740 740 lost_rqstp->lr_odeny = open_args->share_deny;
741 741 lost_rqstp->lr_oclaim = open_args->claim;
742 742 if (open_args->claim == CLAIM_DELEGATE_CUR) {
743 743 lost_rqstp->lr_ostateid =
744 744 open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
745 745 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
746 746 } else {
747 747 srccfp = open_args->open_claim4_u.cfile;
748 748 }
749 749 lost_rqstp->lr_ofile.utf8string_len = 0;
750 750 lost_rqstp->lr_ofile.utf8string_val = NULL;
751 751 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
752 752 lost_rqstp->lr_putfirst = FALSE;
753 753 }
754 754
755 755 struct nfs4_excl_time {
756 756 uint32 seconds;
757 757 uint32 nseconds;
758 758 };
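/*
 * The two 32-bit fields above are copied bit-for-bit into the 64-bit
 * exclusive-create verifier when nfs4open_otw() builds the OPEN args:
 *
 *	open_args->createhow4_u.createverf = *(uint64_t *)&verf;
 */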
759 759
760 760 /*
761 761 * The OPEN operation creates and/or opens a regular file
762 762 *
763 763 * ARGSUSED
764 764 */
765 765 static int
766 766 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
767 767 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
768 768 enum createmode4 createmode, int file_just_been_created)
769 769 {
770 770 rnode4_t *rp;
771 771 rnode4_t *drp = VTOR4(dvp);
772 772 vnode_t *vp = NULL;
773 773 vnode_t *vpi = *vpp;
774 774 bool_t needrecov = FALSE;
775 775
776 776 int doqueue = 1;
777 777
778 778 COMPOUND4args_clnt args;
779 779 COMPOUND4res_clnt res;
780 780 nfs_argop4 *argop;
781 781 nfs_resop4 *resop;
782 782 int argoplist_size;
783 783 int idx_open, idx_fattr;
784 784
785 785 GETFH4res *gf_res = NULL;
786 786 OPEN4res *op_res = NULL;
787 787 nfs4_ga_res_t *garp;
788 788 fattr4 *attr = NULL;
789 789 struct nfs4_excl_time verf;
790 790 bool_t did_excl_setup = FALSE;
791 791 int created_osp;
792 792
793 793 OPEN4cargs *open_args;
794 794 nfs4_open_owner_t *oop = NULL;
795 795 nfs4_open_stream_t *osp = NULL;
796 796 seqid4 seqid = 0;
797 797 bool_t retry_open = FALSE;
798 798 nfs4_recov_state_t recov_state;
799 799 nfs4_lost_rqst_t lost_rqst;
800 800 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
801 801 hrtime_t t;
802 802 int acc = 0;
803 803 cred_t *cred_otw = NULL; /* cred used to do the RPC call */
804 804 cred_t *ncr = NULL;
805 805
806 806 nfs4_sharedfh_t *otw_sfh;
807 807 nfs4_sharedfh_t *orig_sfh;
808 808 int fh_differs = 0;
809 809 int numops, setgid_flag;
810 810 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
811 811
812 812 /*
813 813 * Make sure we properly deal with setting the right gid on
814 814 * a newly created file to reflect the parent's setgid bit
815 815 */
816 816 setgid_flag = 0;
817 817 if (create_flag && in_va) {
818 818
819 819 /*
820 820 * If there is grpid mount flag used or
821 821 * the parent's directory has the setgid bit set
822 822 * _and_ the client was able to get a valid mapping
823 823 * for the parent dir's owner_group, we want to
824 824 * append NVERIFY(owner_group == dva.va_gid) and
825 825 * SETATTR to the CREATE compound.
826 826 */
827 827 mutex_enter(&drp->r_statelock);
828 828 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
829 829 drp->r_attr.va_mode & VSGID) &&
830 830 drp->r_attr.va_gid != GID_NOBODY) {
831 831 in_va->va_mask |= AT_GID;
832 832 in_va->va_gid = drp->r_attr.va_gid;
833 833 setgid_flag = 1;
834 834 }
835 835 mutex_exit(&drp->r_statelock);
836 836 }
837 837
838 838 /*
839 839 * Normal/non-create compound:
840 840 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
841 841 *
842 842 * Open(create) compound no setgid:
843 843 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
844 844 * RESTOREFH + GETATTR
845 845 *
846 846 * Open(create) setgid:
847 847 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
848 848 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
849 849 * NVERIFY(grp) + SETATTR
850 850 */
851 851 if (setgid_flag) {
852 852 numops = 10;
853 853 idx_open = 1;
854 854 idx_fattr = 3;
855 855 } else if (create_flag) {
856 856 numops = 7;
857 857 idx_open = 2;
858 858 idx_fattr = 4;
859 859 } else {
860 860 numops = 4;
861 861 idx_open = 1;
862 862 idx_fattr = 3;
863 863 }
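/*
 * For illustration, the argop[] layouts these indices produce (derived
 * from the op setup below):
 *
 *	non-create:	0:PUTFH 1:OPEN 2:GETFH 3:GETATTR
 *	create:		0:PUTFH 1:SAVEFH 2:OPEN 3:GETFH 4:GETATTR
 *			5:RESTOREFH 6:GETATTR
 *	create+setgid:	0:PUTFH 1:OPEN 2:GETFH 3:GETATTR 4:SAVEFH
 *			5:PUTFH 6:GETATTR 7:RESTOREFH 8:NVERIFY 9:SETATTR
 */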
864 864
865 865 args.array_len = numops;
866 866 argoplist_size = numops * sizeof (nfs_argop4);
867 867 argop = kmem_alloc(argoplist_size, KM_SLEEP);
868 868
869 869 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
870 870 "open %s open flag 0x%x cred %p", file_name, open_flag,
871 871 (void *)cr));
872 872
873 873 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
874 874 if (create_flag) {
875 875 /*
876 876 * We are to create a file. Initialize the passed in vnode
877 877 * pointer.
878 878 */
879 879 vpi = NULL;
880 880 } else {
881 881 /*
882 882 * Check to see if the client owns a read delegation and is
883 883 * trying to open for write. If so, then return the delegation
884 884 * to avoid the server doing a cb_recall and returning DELAY.
885 885 * NB - we don't use the statev4_lock here because we'd have
886 886 * to drop the lock anyway and the result would be stale.
887 887 */
888 888 if ((open_flag & FWRITE) &&
889 889 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
890 890 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
891 891
892 892 /*
893 893 * If the file has a delegation, then do an access check up
894 894	 * front. This avoids having to do an access check later after
895 895 * we've already done start_op, which could deadlock.
896 896 */
897 897 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
898 898 if (open_flag & FREAD &&
899 899 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
900 900 acc |= VREAD;
901 901 if (open_flag & FWRITE &&
902 902 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
903 903 acc |= VWRITE;
904 904 }
905 905 }
906 906
907 907 drp = VTOR4(dvp);
908 908
909 909 recov_state.rs_flags = 0;
910 910 recov_state.rs_num_retry_despite_err = 0;
911 911 cred_otw = cr;
912 912
913 913 recov_retry:
914 914 fh_differs = 0;
915 915 nfs4_error_zinit(&e);
916 916
917 917 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
918 918 if (e.error) {
919 919 if (ncr != NULL)
920 920 crfree(ncr);
921 921 kmem_free(argop, argoplist_size);
922 922 return (e.error);
923 923 }
924 924
925 925 args.ctag = TAG_OPEN;
926 926 args.array_len = numops;
927 927 args.array = argop;
928 928
929 929 /* putfh directory fh */
930 930 argop[0].argop = OP_CPUTFH;
931 931 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
932 932
933 933 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */
934 934 argop[idx_open].argop = OP_COPEN;
935 935 open_args = &argop[idx_open].nfs_argop4_u.opcopen;
936 936 open_args->claim = CLAIM_NULL;
937 937
938 938 /* name of file */
939 939 open_args->open_claim4_u.cfile = file_name;
940 940 open_args->owner.owner_len = 0;
941 941 open_args->owner.owner_val = NULL;
942 942
943 943 if (create_flag) {
944 944 /* CREATE a file */
945 945 open_args->opentype = OPEN4_CREATE;
946 946 open_args->mode = createmode;
947 947 if (createmode == EXCLUSIVE4) {
948 948 if (did_excl_setup == FALSE) {
949 949 verf.seconds = zone_get_hostid(NULL);
950 950 if (verf.seconds != 0)
951 951 verf.nseconds = newnum();
952 952 else {
953 953 timestruc_t now;
954 954
955 955 gethrestime(&now);
956 956 verf.seconds = now.tv_sec;
957 957 verf.nseconds = now.tv_nsec;
958 958 }
959 959 /*
960 960 * Since the server will use this value for the
961 961 * mtime, make sure that it can't overflow. Zero
962 962 * out the MSB. The actual value does not matter
963 963	 * here, only its uniqueness.
964 964 */
965 965 verf.seconds &= INT32_MAX;
966 966 did_excl_setup = TRUE;
967 967 }
968 968
969 969 /* Now copy over verifier to OPEN4args. */
970 970 open_args->createhow4_u.createverf = *(uint64_t *)&verf;
971 971 } else {
972 972 int v_error;
973 973 bitmap4 supp_attrs;
974 974 servinfo4_t *svp;
975 975
976 976 attr = &open_args->createhow4_u.createattrs;
977 977
978 978 svp = drp->r_server;
979 979 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
980 980 supp_attrs = svp->sv_supp_attrs;
981 981 nfs_rw_exit(&svp->sv_lock);
982 982
983 983 /* GUARDED4 or UNCHECKED4 */
984 984 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
985 985 supp_attrs);
986 986 if (v_error) {
987 987 bzero(attr, sizeof (*attr));
988 988 nfs4args_copen_free(open_args);
989 989 nfs4_end_op(VTOMI4(dvp), dvp, vpi,
990 990 &recov_state, FALSE);
991 991 if (ncr != NULL)
992 992 crfree(ncr);
993 993 kmem_free(argop, argoplist_size);
994 994 return (v_error);
995 995 }
996 996 }
997 997 } else {
998 998 /* NO CREATE */
999 999 open_args->opentype = OPEN4_NOCREATE;
1000 1000 }
1001 1001
1002 1002 if (recov_state.rs_sp != NULL) {
1003 1003 mutex_enter(&recov_state.rs_sp->s_lock);
1004 1004 open_args->owner.clientid = recov_state.rs_sp->clientid;
1005 1005 mutex_exit(&recov_state.rs_sp->s_lock);
1006 1006 } else {
1007 1007 /* XXX should we just fail here? */
1008 1008 open_args->owner.clientid = 0;
1009 1009 }
1010 1010
1011 1011 /*
1012 1012 * This increments oop's ref count or creates a temporary 'just_created'
1013 1013 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1014 1014 * completes.
1015 1015 */
1016 1016 mutex_enter(&VTOMI4(dvp)->mi_lock);
1017 1017
1018 1018 /* See if a permanent or just created open owner exists */
1019 1019 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1020 1020 if (!oop) {
1021 1021 /*
1022 1022 * This open owner does not exist so create a temporary
1023 1023 * just created one.
1024 1024 */
1025 1025 oop = create_open_owner(cr, VTOMI4(dvp));
1026 1026 ASSERT(oop != NULL);
1027 1027 }
1028 1028 mutex_exit(&VTOMI4(dvp)->mi_lock);
1029 1029
1030 1030 /* this length never changes, do alloc before seqid sync */
1031 1031 open_args->owner.owner_len = sizeof (oop->oo_name);
1032 1032 open_args->owner.owner_val =
1033 1033 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1034 1034
1035 1035 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1036 1036 if (e.error == EAGAIN) {
1037 1037 open_owner_rele(oop);
1038 1038 nfs4args_copen_free(open_args);
1039 1039 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1040 1040 if (ncr != NULL) {
1041 1041 crfree(ncr);
1042 1042 ncr = NULL;
1043 1043 }
1044 1044 goto recov_retry;
1045 1045 }
1046 1046
1047 1047 /* Check to see if we need to do the OTW call */
1048 1048 if (!create_flag) {
1049 1049 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1050 1050 file_just_been_created, &e.error, acc, &recov_state)) {
1051 1051
1052 1052 /*
1053 1053 * The OTW open is not necessary. Either
1054 1054 * the open can succeed without it (eg.
1055 1055 * delegation, error == 0) or the open
1056 1056 * must fail due to an access failure
1057 1057 * (error != 0). In either case, tidy
1058 1058 * up and return.
1059 1059 */
1060 1060
1061 1061 nfs4_end_open_seqid_sync(oop);
1062 1062 open_owner_rele(oop);
1063 1063 nfs4args_copen_free(open_args);
1064 1064 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1065 1065 if (ncr != NULL)
1066 1066 crfree(ncr);
1067 1067 kmem_free(argop, argoplist_size);
1068 1068 return (e.error);
1069 1069 }
1070 1070 }
1071 1071
1072 1072 bcopy(&oop->oo_name, open_args->owner.owner_val,
1073 1073 open_args->owner.owner_len);
1074 1074
1075 1075 seqid = nfs4_get_open_seqid(oop) + 1;
1076 1076 open_args->seqid = seqid;
1077 1077 open_args->share_access = 0;
1078 1078 if (open_flag & FREAD)
1079 1079 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1080 1080 if (open_flag & FWRITE)
1081 1081 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1082 1082 open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1083 1083
1084 1084
1085 1085
1086 1086 /*
1087 1087 * getfh w/sanity check for idx_open/idx_fattr
1088 1088 */
1089 1089 ASSERT((idx_open + 1) == (idx_fattr - 1));
1090 1090 argop[idx_open + 1].argop = OP_GETFH;
1091 1091
1092 1092 /* getattr */
1093 1093 argop[idx_fattr].argop = OP_GETATTR;
1094 1094 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1095 1095 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1096 1096
1097 1097 if (setgid_flag) {
1098 1098 vattr_t _v;
1099 1099 servinfo4_t *svp;
1100 1100 bitmap4 supp_attrs;
1101 1101
1102 1102 svp = drp->r_server;
1103 1103 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1104 1104 supp_attrs = svp->sv_supp_attrs;
1105 1105 nfs_rw_exit(&svp->sv_lock);
1106 1106
1107 1107 /*
1108 1108 * For setgid case, we need to:
1109 1109 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1110 1110 */
1111 1111 argop[4].argop = OP_SAVEFH;
1112 1112
1113 1113 argop[5].argop = OP_CPUTFH;
1114 1114 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1115 1115
1116 1116 argop[6].argop = OP_GETATTR;
1117 1117 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1118 1118 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1119 1119
1120 1120 argop[7].argop = OP_RESTOREFH;
1121 1121
1122 1122 /*
1123 1123 * nverify
1124 1124 */
1125 1125 _v.va_mask = AT_GID;
1126 1126 _v.va_gid = in_va->va_gid;
1127 1127 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1128 1128 supp_attrs))) {
1129 1129
1130 1130 /*
1131 1131 * setattr
1132 1132 *
1133 1133 * We _know_ we're not messing with AT_SIZE or
1134 1134 * AT_XTIME, so no need for stateid or flags.
1135 1135 * Also we specify NULL rp since we're only
1136 1136 * interested in setting owner_group attributes.
1137 1137 */
1138 1138 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1139 1139 supp_attrs, &e.error, 0);
1140 1140 if (e.error)
1141 1141 nfs4args_verify_free(&argop[8]);
1142 1142 }
1143 1143
1144 1144 if (e.error) {
1145 1145 /*
1146 1146 * XXX - Revisit the last argument to nfs4_end_op()
1147 1147 * once 5020486 is fixed.
1148 1148 */
1149 1149 nfs4_end_open_seqid_sync(oop);
1150 1150 open_owner_rele(oop);
1151 1151 nfs4args_copen_free(open_args);
1152 1152 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1153 1153 if (ncr != NULL)
1154 1154 crfree(ncr);
1155 1155 kmem_free(argop, argoplist_size);
1156 1156 return (e.error);
1157 1157 }
1158 1158 } else if (create_flag) {
1159 1159 argop[1].argop = OP_SAVEFH;
1160 1160
1161 1161 argop[5].argop = OP_RESTOREFH;
1162 1162
1163 1163 argop[6].argop = OP_GETATTR;
1164 1164 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1165 1165 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1166 1166 }
1167 1167
1168 1168 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1169 1169 "nfs4open_otw: %s call, nm %s, rp %s",
1170 1170 needrecov ? "recov" : "first", file_name,
1171 1171 rnode4info(VTOR4(dvp))));
1172 1172
1173 1173 t = gethrtime();
1174 1174
1175 1175 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1176 1176
1177 1177 if (!e.error && nfs4_need_to_bump_seqid(&res))
1178 1178 nfs4_set_open_seqid(seqid, oop, args.ctag);
1179 1179
1180 1180 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1181 1181
1182 1182 if (e.error || needrecov) {
1183 1183 bool_t abort = FALSE;
1184 1184
1185 1185 if (needrecov) {
1186 1186 nfs4_bseqid_entry_t *bsep = NULL;
1187 1187
1188 1188 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1189 1189 cred_otw, vpi, dvp, open_args);
1190 1190
1191 1191 if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1192 1192 bsep = nfs4_create_bseqid_entry(oop, NULL,
1193 1193 vpi, 0, args.ctag, open_args->seqid);
1194 1194 num_bseqid_retry--;
1195 1195 }
1196 1196
1197 1197 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1198 1198 NULL, lost_rqst.lr_op == OP_OPEN ?
1199 1199 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1200 1200
1201 1201 if (bsep)
1202 1202 kmem_free(bsep, sizeof (*bsep));
1203 1203 /* give up if we keep getting BAD_SEQID */
1204 1204 if (num_bseqid_retry == 0)
1205 1205 abort = TRUE;
1206 1206 if (abort == TRUE && e.error == 0)
1207 1207 e.error = geterrno4(res.status);
1208 1208 }
1209 1209 nfs4_end_open_seqid_sync(oop);
1210 1210 open_owner_rele(oop);
1211 1211 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1212 1212 nfs4args_copen_free(open_args);
1213 1213 if (setgid_flag) {
1214 1214 nfs4args_verify_free(&argop[8]);
1215 1215 nfs4args_setattr_free(&argop[9]);
1216 1216 }
1217 1217 if (!e.error)
1218 1218 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1219 1219 if (ncr != NULL) {
1220 1220 crfree(ncr);
1221 1221 ncr = NULL;
1222 1222 }
1223 1223 if (!needrecov || abort == TRUE || e.error == EINTR ||
1224 1224 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1225 1225 kmem_free(argop, argoplist_size);
1226 1226 return (e.error);
1227 1227 }
1228 1228 goto recov_retry;
1229 1229 }
1230 1230
1231 1231 /*
1232 1232 * Will check and update lease after checking the rflag for
1233 1233 * OPEN_CONFIRM in the successful OPEN call.
1234 1234 */
1235 1235 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1236 1236
1237 1237 /*
1238 1238 * XXX what if we're crossing mount points from server1:/drp
1239 1239 * to server2:/drp/rp.
1240 1240 */
1241 1241
1242 1242 /* Signal our end of use of the open seqid */
1243 1243 nfs4_end_open_seqid_sync(oop);
1244 1244
1245 1245 /*
1246 1246 * This will destroy the open owner if it was just created,
1247 1247 * and no one else has put a reference on it.
1248 1248 */
1249 1249 open_owner_rele(oop);
1250 1250 if (create_flag && (createmode != EXCLUSIVE4) &&
1251 1251 res.status == NFS4ERR_BADOWNER)
1252 1252 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1253 1253
1254 1254 e.error = geterrno4(res.status);
1255 1255 nfs4args_copen_free(open_args);
1256 1256 if (setgid_flag) {
1257 1257 nfs4args_verify_free(&argop[8]);
1258 1258 nfs4args_setattr_free(&argop[9]);
1259 1259 }
1260 1260 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1261 1261 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1262 1262 /*
1263 1263 * If the reply is NFS4ERR_ACCESS, it may be because
1264 1264 * we are root (no root net access). If the real uid
1265 1265 * is not root, then retry with the real uid instead.
1266 1266 */
1267 1267 if (ncr != NULL) {
1268 1268 crfree(ncr);
1269 1269 ncr = NULL;
1270 1270 }
1271 1271 if (res.status == NFS4ERR_ACCESS &&
1272 1272 (ncr = crnetadjust(cred_otw)) != NULL) {
1273 1273 cred_otw = ncr;
1274 1274 goto recov_retry;
1275 1275 }
1276 1276 kmem_free(argop, argoplist_size);
1277 1277 return (e.error);
1278 1278 }
1279 1279
1280 1280 resop = &res.array[idx_open]; /* open res */
1281 1281 op_res = &resop->nfs_resop4_u.opopen;
1282 1282
1283 1283 #ifdef DEBUG
1284 1284 /*
1285 1285 * verify attrset bitmap
1286 1286 */
1287 1287 if (create_flag &&
1288 1288 (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1289 1289 /* make sure attrset returned is what we asked for */
1290 1290 /* XXX Ignore this 'error' for now */
1291 1291 if (attr->attrmask != op_res->attrset)
1292 1292 /* EMPTY */;
1293 1293 }
1294 1294 #endif
1295 1295
1296 1296 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1297 1297 mutex_enter(&VTOMI4(dvp)->mi_lock);
1298 1298 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1299 1299 mutex_exit(&VTOMI4(dvp)->mi_lock);
1300 1300 }
1301 1301
1302 1302 resop = &res.array[idx_open + 1]; /* getfh res */
1303 1303 gf_res = &resop->nfs_resop4_u.opgetfh;
1304 1304
1305 1305 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1306 1306
1307 1307 /*
1308 1308 * The open stateid has been updated on the server but not
1309 1309 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache->
1310 1310 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1311 1311 * WRITE call. That, however, will use the old stateid, so go ahead
1312 1312	 * and update the open stateid now, before any call to makenfs4node.
1313 1313 */
1314 1314 if (vpi) {
1315 1315 nfs4_open_stream_t *tmp_osp;
1316 1316 rnode4_t *tmp_rp = VTOR4(vpi);
1317 1317
1318 1318 tmp_osp = find_open_stream(oop, tmp_rp);
1319 1319 if (tmp_osp) {
1320 1320 tmp_osp->open_stateid = op_res->stateid;
1321 1321 mutex_exit(&tmp_osp->os_sync_lock);
1322 1322 open_stream_rele(tmp_osp, tmp_rp);
1323 1323 }
1324 1324
1325 1325 /*
1326 1326 * We must determine if the file handle given by the otw open
1327 1327 * is the same as the file handle which was passed in with
1328 1328 * *vpp. This case can be reached if the file we are trying
1329 1329 * to open has been removed and another file has been created
1330 1330 * having the same file name. The passed in vnode is released
1331 1331 * later.
1332 1332 */
1333 1333 orig_sfh = VTOR4(vpi)->r_fh;
1334 1334 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1335 1335 }
1336 1336
1337 1337 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1338 1338
1339 1339 if (create_flag || fh_differs) {
1340 1340 int rnode_err = 0;
1341 1341
1342 1342 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1343 1343 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1344 1344
1345 1345 if (e.error)
1346 1346 PURGE_ATTRCACHE4(vp);
1347 1347 /*
1348 1348 * For the newly created vp case, make sure the rnode
1349 1349 * isn't bad before using it.
1350 1350 */
1351 1351 mutex_enter(&(VTOR4(vp))->r_statelock);
1352 1352 if (VTOR4(vp)->r_flags & R4RECOVERR)
1353 1353 rnode_err = EIO;
1354 1354 mutex_exit(&(VTOR4(vp))->r_statelock);
1355 1355
1356 1356 if (rnode_err) {
1357 1357 nfs4_end_open_seqid_sync(oop);
1358 1358 nfs4args_copen_free(open_args);
1359 1359 if (setgid_flag) {
1360 1360 nfs4args_verify_free(&argop[8]);
1361 1361 nfs4args_setattr_free(&argop[9]);
1362 1362 }
1363 1363 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1364 1364 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1365 1365 needrecov);
1366 1366 open_owner_rele(oop);
1367 1367 VN_RELE(vp);
1368 1368 if (ncr != NULL)
1369 1369 crfree(ncr);
1370 1370 sfh4_rele(&otw_sfh);
1371 1371 kmem_free(argop, argoplist_size);
1372 1372 return (EIO);
1373 1373 }
1374 1374 } else {
1375 1375 vp = vpi;
1376 1376 }
1377 1377 sfh4_rele(&otw_sfh);
1378 1378
1379 1379 /*
1380 1380 * It seems odd to get a full set of attrs and then not update
1381 1381 * the object's attrcache in the non-create case. Create case uses
1382 1382 * the attrs since makenfs4node checks to see if the attrs need to
1383 1383 * be updated (and then updates them). The non-create case should
1384 1384 * update attrs also.
1385 1385 */
1386 1386 if (! create_flag && ! fh_differs && !e.error) {
1387 1387 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1388 1388 }
1389 1389
1390 1390 nfs4_error_zinit(&e);
1391 1391 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1392 1392 /* This does not do recovery for vp explicitly. */
1393 1393 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1394 1394 &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1395 1395
1396 1396 if (e.error || e.stat) {
1397 1397 nfs4_end_open_seqid_sync(oop);
1398 1398 nfs4args_copen_free(open_args);
1399 1399 if (setgid_flag) {
1400 1400 nfs4args_verify_free(&argop[8]);
1401 1401 nfs4args_setattr_free(&argop[9]);
1402 1402 }
1403 1403 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1404 1404 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1405 1405 needrecov);
1406 1406 open_owner_rele(oop);
1407 1407 if (create_flag || fh_differs) {
1408 1408 /* rele the makenfs4node */
1409 1409 VN_RELE(vp);
1410 1410 }
1411 1411 if (ncr != NULL) {
1412 1412 crfree(ncr);
1413 1413 ncr = NULL;
1414 1414 }
1415 1415 if (retry_open == TRUE) {
1416 1416 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1417 1417 "nfs4open_otw: retry the open since OPEN "
1418 1418 "CONFIRM failed with error %d stat %d",
1419 1419 e.error, e.stat));
1420 1420 if (create_flag && createmode == GUARDED4) {
1421 1421 NFS4_DEBUG(nfs4_client_recov_debug,
1422 1422 (CE_NOTE, "nfs4open_otw: switch "
1423 1423 "createmode from GUARDED4 to "
1424 1424 "UNCHECKED4"));
1425 1425 createmode = UNCHECKED4;
1426 1426 }
1427 1427 goto recov_retry;
1428 1428 }
1429 1429 if (!e.error) {
1430 1430 if (create_flag && (createmode != EXCLUSIVE4) &&
1431 1431 e.stat == NFS4ERR_BADOWNER)
1432 1432 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1433 1433
1434 1434 e.error = geterrno4(e.stat);
1435 1435 }
1436 1436 kmem_free(argop, argoplist_size);
1437 1437 return (e.error);
1438 1438 }
1439 1439 }
1440 1440
1441 1441 rp = VTOR4(vp);
1442 1442
1443 1443 mutex_enter(&rp->r_statev4_lock);
1444 1444 if (create_flag)
1445 1445 rp->created_v4 = 1;
1446 1446 mutex_exit(&rp->r_statev4_lock);
1447 1447
1448 1448 mutex_enter(&oop->oo_lock);
1449 1449	/* Doesn't matter if 'oo_just_created' was already set; make it permanent */
1450 1450 oop->oo_just_created = NFS4_PERM_CREATED;
1451 1451 if (oop->oo_cred_otw)
1452 1452 crfree(oop->oo_cred_otw);
1453 1453 oop->oo_cred_otw = cred_otw;
1454 1454 crhold(oop->oo_cred_otw);
1455 1455 mutex_exit(&oop->oo_lock);
1456 1456
1457 1457 /* returns with 'os_sync_lock' held */
1458 1458 osp = find_or_create_open_stream(oop, rp, &created_osp);
1459 1459 if (!osp) {
1460 1460 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1461 1461 "nfs4open_otw: failed to create an open stream"));
1462 1462 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1463 1463 "signal our end of use of the open seqid"));
1464 1464
1465 1465 nfs4_end_open_seqid_sync(oop);
1466 1466 open_owner_rele(oop);
1467 1467 nfs4args_copen_free(open_args);
1468 1468 if (setgid_flag) {
1469 1469 nfs4args_verify_free(&argop[8]);
1470 1470 nfs4args_setattr_free(&argop[9]);
1471 1471 }
1472 1472 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1473 1473 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1474 1474 if (create_flag || fh_differs)
1475 1475 VN_RELE(vp);
1476 1476 if (ncr != NULL)
1477 1477 crfree(ncr);
1478 1478
1479 1479 kmem_free(argop, argoplist_size);
1480 1480 return (EINVAL);
1481 1481
1482 1482 }
1483 1483
1484 1484 osp->open_stateid = op_res->stateid;
1485 1485
1486 1486 if (open_flag & FREAD)
1487 1487 osp->os_share_acc_read++;
1488 1488 if (open_flag & FWRITE)
1489 1489 osp->os_share_acc_write++;
1490 1490 osp->os_share_deny_none++;
1491 1491
1492 1492 /*
1493 1493 * Need to reset this bitfield for the possible case where we were
1494 1494 * going to OTW CLOSE the file, got a non-recoverable error, and before
1495 1495 * we could retry the CLOSE, OPENed the file again.
1496 1496 */
1497 1497 ASSERT(osp->os_open_owner->oo_seqid_inuse);
1498 1498 osp->os_final_close = 0;
1499 1499 osp->os_force_close = 0;
1500 1500 #ifdef DEBUG
1501 1501 if (osp->os_failed_reopen)
1502 1502 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1503 1503 " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1504 1504 (void *)osp, (void *)cr, rnode4info(rp)));
1505 1505 #endif
1506 1506 osp->os_failed_reopen = 0;
1507 1507
1508 1508 mutex_exit(&osp->os_sync_lock);
1509 1509
1510 1510 nfs4_end_open_seqid_sync(oop);
1511 1511
1512 1512 if (created_osp && recov_state.rs_sp != NULL) {
1513 1513 mutex_enter(&recov_state.rs_sp->s_lock);
1514 1514 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1515 1515 mutex_exit(&recov_state.rs_sp->s_lock);
1516 1516 }
1517 1517
1518 1518 /* get rid of our reference to find oop */
1519 1519 open_owner_rele(oop);
1520 1520
1521 1521 open_stream_rele(osp, rp);
1522 1522
1523 1523 /* accept delegation, if any */
1524 1524 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1525 1525
1526 1526 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1527 1527
1528 1528 if (createmode == EXCLUSIVE4 &&
1529 1529 (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1530 1530 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1531 1531 " EXCLUSIVE4: sending a SETATTR"));
1532 1532 /*
1533 1533 * If doing an exclusive create, then generate
1534 1534 * a SETATTR to set the initial attributes.
1535 1535 * Try to set the mtime and the atime to the
1536 1536 * server's current time. It is somewhat
1537 1537 * expected that these fields will be used to
1538 1538 * store the exclusive create cookie. If not,
1539 1539 * server implementors will need to know that
1540 1540 * a SETATTR will follow an exclusive create
1541 1541 * and the cookie should be destroyed if
1542 1542 * appropriate.
1543 1543 *
1544 1544 * The AT_GID and AT_SIZE bits are turned off
1545 1545 * so that the SETATTR request will not attempt
1546 1546 * to process these. The gid will be set
1547 1547 * separately if appropriate. The size is turned
1548 1548 * off because it is assumed that a new file will
1549 1549 * be created empty and if the file wasn't empty,
1550 1550 * then the exclusive create will have failed
1551 1551 * because the file must have existed already.
1552 1552 * Therefore, no truncate operation is needed.
1553 1553 */
1554 1554 in_va->va_mask &= ~(AT_GID | AT_SIZE);
1555 1555 in_va->va_mask |= (AT_MTIME | AT_ATIME);
1556 1556
1557 1557 e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1558 1558 if (e.error) {
1559 1559 nfs4_error_t err;
1560 1560
1561 1561 /*
1562 1562 * Couldn't correct the attributes of
1563 1563 * the newly created file and the
1564 1564 * attributes are wrong. Remove the
1565 1565 * file and return an error to the
1566 1566 * application.
1567 1567 */
1568 1568 /* XXX will this take care of client state ? */
1569 1569 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1570 1570 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1571 1571 " remove file", e.error));
1572 1572
1573 1573 /*
1574 1574 * The file is currently open so try to close it first.
1575 1575 *
1576 1576 * If we do not close the file explicitly here then the
1577 1577 * VN_RELE() would do an (implicit and asynchronous)
1578 1578 * close for us. But such async close could race with
1579 1579 * the nfs4_remove() below. If the async close is
1580 1580 * slower than nfs4_remove() then nfs4_remove()
1581 1581 * wouldn't remove the file but rename it to .nfsXXXX
1582 1582 * instead.
1583 1583 */
1584 1584 nfs4close_one(vp, NULL, cr, open_flag, NULL, &err,
1585 1585 CLOSE_NORM, 0, 0, 0);
1586 1586 VN_RELE(vp);
1587 1587 (void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1588 1588
1589 1589 /*
1590 1590 * Since we've reled the vnode and removed
1591 1591 * the file we now need to return the error.
1592 1592 * At this point we don't want to update the
1593 1593 * dircaches, call nfs4_waitfor_purge_complete
1594 1594 * or set vpp to vp so we need to skip these
1595 1595 * as well.
1596 1596 */
1597 1597 goto skip_update_dircaches;
1598 1598 }
1599 1599 }
1600 1600
1601 1601 /*
1602 1602 * If we created or found the correct vnode, due to create_flag or
1603 1603 * fh_differs being set, then update directory cache attribute, readdir
1604 1604 * and dnlc caches.
1605 1605 */
1606 1606 if (create_flag || fh_differs) {
1607 1607 dirattr_info_t dinfo, *dinfop;
1608 1608
1609 1609 /*
1610 1610 * Make sure getattr succeeded before using results.
1611 1611 * note: op 7 is getattr(dir) for both flavors of
1612 1612 * open(create).
1613 1613 */
1614 1614 if (create_flag && res.status == NFS4_OK) {
1615 1615 dinfo.di_time_call = t;
1616 1616 dinfo.di_cred = cr;
1617 1617 dinfo.di_garp =
1618 1618 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1619 1619 dinfop = &dinfo;
1620 1620 } else {
1621 1621 dinfop = NULL;
1622 1622 }
1623 1623
1624 1624 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1625 1625 dinfop);
1626 1626 }
1627 1627
1628 1628 /*
1629 1629 * If the page cache for this file was flushed from actions
1630 1630 * above, it was done asynchronously and if that is true,
1631 1631 * there is a need to wait here for it to complete. This must
1632 1632 * be done outside of start_fop/end_fop.
1633 1633 */
1634 1634 (void) nfs4_waitfor_purge_complete(vp);
1635 1635
1636 1636 /*
1637 1637 * It is implicit that we are in the open case (create_flag == 0) since
1638 1638 * fh_differs can only be set to a non-zero value in the open case.
1639 1639 */
1640 1640 if (fh_differs != 0 && vpi != NULL)
1641 1641 VN_RELE(vpi);
1642 1642
1643 1643 /*
1644 1644 * Be sure to set *vpp to the correct value before returning.
1645 1645 */
1646 1646 *vpp = vp;
1647 1647
1648 1648 skip_update_dircaches:
1649 1649
1650 1650 nfs4args_copen_free(open_args);
1651 1651 if (setgid_flag) {
1652 1652 nfs4args_verify_free(&argop[8]);
1653 1653 nfs4args_setattr_free(&argop[9]);
1654 1654 }
1655 1655 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1656 1656
1657 1657 if (ncr)
1658 1658 crfree(ncr);
1659 1659 kmem_free(argop, argoplist_size);
1660 1660 return (e.error);
1661 1661 }
1662 1662
1663 1663 /*
1664 1664 * Reopen an open instance. cf. nfs4open_otw().
1665 1665 *
1666 1666 * Errors are returned by the nfs4_error_t parameter.
1667 1667 * - ep->error contains an errno value or zero.
1668 1668 * - if it is zero, ep->stat is set to an NFS status code, if any.
1669 1669 * If the file could not be reopened, but the caller should continue, the
1670 1670 * file is marked dead and no error values are returned. If the caller
1671 1671 * should stop recovering open files and start over, either the ep->error
1672 1672 * value or ep->stat will indicate an error (either something that requires
1673 1673 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile
1674 1674 * filehandles) may be handled silently by this routine.
1675 1675 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1676 1676 * will be started, so the caller should not do it.
1677 1677 *
1678 1678 * Gotos:
1679 1679  *   - kill_file : reopen failed in such a fashion as to warrant marking the
1680 1680  *	file dead and setting the open stream's 'os_failed_reopen' to 1.  This
1681 1681  *	is for cases where recovery is not possible.
1682 1682 * - failed_reopen : same as above, except that the file has already been
1683 1683 * marked dead, so no need to do it again.
1684 1684 * - bailout : reopen failed but we are able to recover and retry the reopen -
1685 1685 * either within this function immediately or via the calling function.
1686 1686 */
1687 1687
1688 1688 void
1689 1689 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1690 1690 open_claim_type4 claim, bool_t frc_use_claim_previous,
1691 1691 bool_t is_recov)
1692 1692 {
1693 1693 COMPOUND4args_clnt args;
1694 1694 COMPOUND4res_clnt res;
1695 1695 nfs_argop4 argop[4];
1696 1696 nfs_resop4 *resop;
1697 1697 OPEN4res *op_res = NULL;
1698 1698 OPEN4cargs *open_args;
1699 1699 GETFH4res *gf_res;
1700 1700 rnode4_t *rp = VTOR4(vp);
1701 1701 int doqueue = 1;
1702 1702 cred_t *cr = NULL, *cred_otw = NULL;
1703 1703 nfs4_open_owner_t *oop = NULL;
1704 1704 seqid4 seqid;
1705 1705 nfs4_ga_res_t *garp;
1706 1706 char fn[MAXNAMELEN];
1707 1707 nfs4_recov_state_t recov = {NULL, 0};
1708 1708 nfs4_lost_rqst_t lost_rqst;
1709 1709 mntinfo4_t *mi = VTOMI4(vp);
1710 1710 bool_t abort;
1711 1711 char *failed_msg = "";
1712 1712 int fh_different;
1713 1713 hrtime_t t;
1714 1714 nfs4_bseqid_entry_t *bsep = NULL;
1715 1715
1716 1716 ASSERT(nfs4_consistent_type(vp));
1717 1717 ASSERT(nfs_zone() == mi->mi_zone);
1718 1718
1719 1719 nfs4_error_zinit(ep);
1720 1720
1721 1721 /* this is the cred used to find the open owner */
1722 1722 cr = state_to_cred(osp);
1723 1723 if (cr == NULL) {
1724 1724 failed_msg = "Couldn't reopen: no cred";
1725 1725 goto kill_file;
1726 1726 }
1727 1727 /* use this cred for OTW operations */
1728 1728 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1729 1729
1730 1730 top:
1731 1731 nfs4_error_zinit(ep);
1732 1732
1733 1733 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1734 1734 /* File system has been unmounted, quit */
1735 1735 ep->error = EIO;
1736 1736 failed_msg = "Couldn't reopen: file system has been unmounted";
1737 1737 goto kill_file;
1738 1738 }
1739 1739
1740 1740 oop = osp->os_open_owner;
1741 1741
1742 1742 ASSERT(oop != NULL);
1743 1743 if (oop == NULL) { /* be defensive in non-DEBUG */
1744 1744 failed_msg = "can't reopen: no open owner";
1745 1745 goto kill_file;
1746 1746 }
1747 1747 open_owner_hold(oop);
1748 1748
1749 1749 ep->error = nfs4_start_open_seqid_sync(oop, mi);
1750 1750 if (ep->error) {
1751 1751 open_owner_rele(oop);
1752 1752 oop = NULL;
1753 1753 goto bailout;
1754 1754 }
1755 1755
1756 1756 /*
1757 1757 * If the rnode has a delegation and the delegation has been
1758 1758 * recovered and the server didn't request a recall and the caller
1759 1759 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1760 1760 * recovery) and the rnode hasn't been marked dead, then install
1761 1761 * the delegation stateid in the open stream. Otherwise, proceed
1762 1762 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1763 1763 */
1764 1764 mutex_enter(&rp->r_statev4_lock);
1765 1765 if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1766 1766 !rp->r_deleg_return_pending &&
1767 1767 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1768 1768 !rp->r_deleg_needs_recall &&
1769 1769 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1770 1770 !(rp->r_flags & R4RECOVERR)) {
1771 1771 mutex_enter(&osp->os_sync_lock);
1772 1772 osp->os_delegation = 1;
1773 1773 osp->open_stateid = rp->r_deleg_stateid;
1774 1774 mutex_exit(&osp->os_sync_lock);
1775 1775 mutex_exit(&rp->r_statev4_lock);
1776 1776 goto bailout;
1777 1777 }
1778 1778 mutex_exit(&rp->r_statev4_lock);
1779 1779
1780 1780 /*
1781 1781 * If the file failed recovery, just quit. This failure need not
1782 1782 * affect other reopens, so don't return an error.
1783 1783 */
1784 1784 mutex_enter(&rp->r_statelock);
1785 1785 if (rp->r_flags & R4RECOVERR) {
1786 1786 mutex_exit(&rp->r_statelock);
1787 1787 ep->error = 0;
1788 1788 goto failed_reopen;
1789 1789 }
1790 1790 mutex_exit(&rp->r_statelock);
1791 1791
1792 1792 /*
1793 1793 * argop is empty here
1794 1794 *
1795 1795 	 * PUTFH, OPEN, GETFH, GETATTR
1796 1796 */
1797 1797 args.ctag = TAG_REOPEN;
1798 1798 args.array_len = 4;
1799 1799 args.array = argop;
1800 1800
1801 1801 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1802 1802 "nfs4_reopen: file is type %d, id %s",
1803 1803 vp->v_type, rnode4info(VTOR4(vp))));
1804 1804
1805 1805 argop[0].argop = OP_CPUTFH;
1806 1806
1807 1807 if (claim != CLAIM_PREVIOUS) {
1808 1808 /*
1809 1809 * if this is a file mount then
1810 1810 * use the mntinfo parentfh
1811 1811 */
1812 1812 argop[0].nfs_argop4_u.opcputfh.sfh =
1813 1813 (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1814 1814 VTOSV(vp)->sv_dfh;
1815 1815 } else {
1816 1816 /* putfh fh to reopen */
1817 1817 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1818 1818 }
1819 1819
1820 1820 argop[1].argop = OP_COPEN;
1821 1821 open_args = &argop[1].nfs_argop4_u.opcopen;
1822 1822 open_args->claim = claim;
1823 1823
1824 1824 if (claim == CLAIM_NULL) {
1825 1825
1826 1826 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1827 1827 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1828 1828 "failed for vp 0x%p for CLAIM_NULL with %m",
1829 1829 (void *)vp);
1830 1830 failed_msg = "Couldn't reopen: vtoname failed for "
1831 1831 "CLAIM_NULL";
1832 1832 /* nothing allocated yet */
1833 1833 goto kill_file;
1834 1834 }
1835 1835
1836 1836 open_args->open_claim4_u.cfile = fn;
1837 1837 } else if (claim == CLAIM_PREVIOUS) {
1838 1838
1839 1839 /*
1840 1840 * We have two cases to deal with here:
1841 1841 * 1) We're being called to reopen files in order to satisfy
1842 1842 * a lock operation request which requires us to explicitly
1843 1843 * reopen files which were opened under a delegation. If
1844 1844 * we're in recovery, we *must* use CLAIM_PREVIOUS. In
1845 1845 * that case, frc_use_claim_previous is TRUE and we must
1846 1846 * use the rnode's current delegation type (r_deleg_type).
1847 1847 * 2) We're reopening files during some form of recovery.
1848 1848 * In this case, frc_use_claim_previous is FALSE and we
1849 1849 * use the delegation type appropriate for recovery
1850 1850 * (r_deleg_needs_recovery).
1851 1851 */
1852 1852 mutex_enter(&rp->r_statev4_lock);
1853 1853 open_args->open_claim4_u.delegate_type =
1854 1854 frc_use_claim_previous ?
1855 1855 rp->r_deleg_type :
1856 1856 rp->r_deleg_needs_recovery;
1857 1857 mutex_exit(&rp->r_statev4_lock);
1858 1858
1859 1859 } else if (claim == CLAIM_DELEGATE_CUR) {
1860 1860
1861 1861 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1862 1862 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1863 1863 "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1864 1864 "with %m", (void *)vp);
1865 1865 failed_msg = "Couldn't reopen: vtoname failed for "
1866 1866 "CLAIM_DELEGATE_CUR";
1867 1867 /* nothing allocated yet */
1868 1868 goto kill_file;
1869 1869 }
1870 1870
1871 1871 mutex_enter(&rp->r_statev4_lock);
1872 1872 open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1873 1873 rp->r_deleg_stateid;
1874 1874 mutex_exit(&rp->r_statev4_lock);
1875 1875
1876 1876 open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1877 1877 }
1878 1878 open_args->opentype = OPEN4_NOCREATE;
1879 1879 open_args->owner.clientid = mi2clientid(mi);
1880 1880 open_args->owner.owner_len = sizeof (oop->oo_name);
1881 1881 open_args->owner.owner_val =
1882 1882 kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1883 1883 bcopy(&oop->oo_name, open_args->owner.owner_val,
1884 1884 open_args->owner.owner_len);
1885 1885 open_args->share_access = 0;
1886 1886 open_args->share_deny = 0;
1887 1887
1888 1888 mutex_enter(&osp->os_sync_lock);
1889 1889 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1890 1890 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1891 1891 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1892 1892 (void *)osp, (void *)rp, osp->os_share_acc_read,
1893 1893 osp->os_share_acc_write, osp->os_open_ref_count,
1894 1894 osp->os_mmap_read, osp->os_mmap_write, claim));
1895 1895
1896 1896 if (osp->os_share_acc_read || osp->os_mmap_read)
1897 1897 open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1898 1898 if (osp->os_share_acc_write || osp->os_mmap_write)
1899 1899 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1900 1900 if (osp->os_share_deny_read)
1901 1901 open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1902 1902 if (osp->os_share_deny_write)
1903 1903 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1904 1904 mutex_exit(&osp->os_sync_lock);
1905 1905
1906 1906 seqid = nfs4_get_open_seqid(oop) + 1;
1907 1907 open_args->seqid = seqid;
1908 1908
1909 1909 /* Construct the getfh part of the compound */
1910 1910 argop[2].argop = OP_GETFH;
1911 1911
1912 1912 /* Construct the getattr part of the compound */
1913 1913 argop[3].argop = OP_GETATTR;
1914 1914 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1915 1915 argop[3].nfs_argop4_u.opgetattr.mi = mi;
1916 1916
1917 1917 t = gethrtime();
1918 1918
1919 1919 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1920 1920
1921 1921 if (ep->error) {
1922 1922 if (!is_recov && !frc_use_claim_previous &&
1923 1923 (ep->error == EINTR || ep->error == ETIMEDOUT ||
1924 1924 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1925 1925 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1926 1926 cred_otw, vp, NULL, open_args);
1927 1927 abort = nfs4_start_recovery(ep,
1928 1928 VTOMI4(vp), vp, NULL, NULL,
1929 1929 lost_rqst.lr_op == OP_OPEN ?
1930 1930 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1931 1931 nfs4args_copen_free(open_args);
1932 1932 goto bailout;
1933 1933 }
1934 1934
1935 1935 nfs4args_copen_free(open_args);
1936 1936
1937 1937 if (ep->error == EACCES && cred_otw != cr) {
1938 1938 crfree(cred_otw);
1939 1939 cred_otw = cr;
1940 1940 crhold(cred_otw);
1941 1941 nfs4_end_open_seqid_sync(oop);
1942 1942 open_owner_rele(oop);
1943 1943 oop = NULL;
1944 1944 goto top;
1945 1945 }
1946 1946 if (ep->error == ETIMEDOUT)
1947 1947 goto bailout;
1948 1948 failed_msg = "Couldn't reopen: rpc error";
1949 1949 goto kill_file;
1950 1950 }
1951 1951
1952 1952 if (nfs4_need_to_bump_seqid(&res))
1953 1953 nfs4_set_open_seqid(seqid, oop, args.ctag);
1954 1954
1955 1955 switch (res.status) {
1956 1956 case NFS4_OK:
1957 1957 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1958 1958 mutex_enter(&rp->r_statelock);
1959 1959 rp->r_delay_interval = 0;
1960 1960 mutex_exit(&rp->r_statelock);
1961 1961 }
1962 1962 break;
1963 1963 case NFS4ERR_BAD_SEQID:
1964 1964 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1965 1965 args.ctag, open_args->seqid);
1966 1966
1967 1967 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1968 1968 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1969 1969 NULL, OP_OPEN, bsep, NULL, NULL);
1970 1970
1971 1971 nfs4args_copen_free(open_args);
1972 1972 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1973 1973 nfs4_end_open_seqid_sync(oop);
1974 1974 open_owner_rele(oop);
1975 1975 oop = NULL;
1976 1976 kmem_free(bsep, sizeof (*bsep));
1977 1977
1978 1978 goto kill_file;
1979 1979 case NFS4ERR_NO_GRACE:
1980 1980 nfs4args_copen_free(open_args);
1981 1981 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1982 1982 nfs4_end_open_seqid_sync(oop);
1983 1983 open_owner_rele(oop);
1984 1984 oop = NULL;
1985 1985 if (claim == CLAIM_PREVIOUS) {
1986 1986 /*
1987 1987 * Retry as a plain open. We don't need to worry about
1988 1988 * checking the changeinfo: it is acceptable for a
1989 1989 * client to re-open a file and continue processing
1990 1990 * (in the absence of locks).
1991 1991 */
1992 1992 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1993 1993 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1994 1994 "will retry as CLAIM_NULL"));
1995 1995 claim = CLAIM_NULL;
1996 1996 nfs4_mi_kstat_inc_no_grace(mi);
1997 1997 goto top;
1998 1998 }
1999 1999 failed_msg =
2000 2000 "Couldn't reopen: tried reclaim outside grace period. ";
2001 2001 goto kill_file;
2002 2002 case NFS4ERR_GRACE:
2003 2003 nfs4_set_grace_wait(mi);
2004 2004 nfs4args_copen_free(open_args);
2005 2005 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2006 2006 nfs4_end_open_seqid_sync(oop);
2007 2007 open_owner_rele(oop);
2008 2008 oop = NULL;
2009 2009 ep->error = nfs4_wait_for_grace(mi, &recov);
2010 2010 if (ep->error != 0)
2011 2011 goto bailout;
2012 2012 goto top;
2013 2013 case NFS4ERR_DELAY:
2014 2014 nfs4_set_delay_wait(vp);
2015 2015 nfs4args_copen_free(open_args);
2016 2016 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2017 2017 nfs4_end_open_seqid_sync(oop);
2018 2018 open_owner_rele(oop);
2019 2019 oop = NULL;
2020 2020 ep->error = nfs4_wait_for_delay(vp, &recov);
2021 2021 nfs4_mi_kstat_inc_delay(mi);
2022 2022 if (ep->error != 0)
2023 2023 goto bailout;
2024 2024 goto top;
2025 2025 case NFS4ERR_FHEXPIRED:
2026 2026 /* recover filehandle and retry */
2027 2027 abort = nfs4_start_recovery(ep,
2028 2028 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2029 2029 nfs4args_copen_free(open_args);
2030 2030 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2031 2031 nfs4_end_open_seqid_sync(oop);
2032 2032 open_owner_rele(oop);
2033 2033 oop = NULL;
2034 2034 if (abort == FALSE)
2035 2035 goto top;
2036 2036 failed_msg = "Couldn't reopen: recovery aborted";
2037 2037 goto kill_file;
2038 2038 case NFS4ERR_RESOURCE:
2039 2039 case NFS4ERR_STALE_CLIENTID:
2040 2040 case NFS4ERR_WRONGSEC:
2041 2041 case NFS4ERR_EXPIRED:
2042 2042 /*
2043 2043 * Do not mark the file dead and let the calling
2044 2044 * function initiate recovery.
2045 2045 */
2046 2046 nfs4args_copen_free(open_args);
2047 2047 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2048 2048 nfs4_end_open_seqid_sync(oop);
2049 2049 open_owner_rele(oop);
2050 2050 oop = NULL;
2051 2051 goto bailout;
2052 2052 case NFS4ERR_ACCESS:
2053 2053 if (cred_otw != cr) {
2054 2054 crfree(cred_otw);
2055 2055 cred_otw = cr;
2056 2056 crhold(cred_otw);
2057 2057 nfs4args_copen_free(open_args);
2058 2058 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2059 2059 nfs4_end_open_seqid_sync(oop);
2060 2060 open_owner_rele(oop);
2061 2061 oop = NULL;
2062 2062 goto top;
2063 2063 }
2064 2064 /* fall through */
2065 2065 default:
2066 2066 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2067 2067 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2068 2068 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2069 2069 rnode4info(VTOR4(vp))));
2070 2070 failed_msg = "Couldn't reopen: NFSv4 error";
2071 2071 nfs4args_copen_free(open_args);
2072 2072 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2073 2073 goto kill_file;
2074 2074 }
2075 2075
2076 2076 resop = &res.array[1]; /* open res */
2077 2077 op_res = &resop->nfs_resop4_u.opopen;
2078 2078
2079 2079 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2080 2080
2081 2081 /*
2082 2082 * Check if the path we reopened really is the same
2083 2083 * file. We could end up in a situation where the file
2084 2084 * was removed and a new file created with the same name.
2085 2085 */
2086 2086 resop = &res.array[2];
2087 2087 gf_res = &resop->nfs_resop4_u.opgetfh;
2088 2088 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2089 2089 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2090 2090 if (fh_different) {
2091 2091 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2092 2092 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2093 2093 /* Oops, we don't have the same file */
2094 2094 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2095 2095 failed_msg = "Couldn't reopen: Persistent "
2096 2096 "file handle changed";
2097 2097 else
2098 2098 failed_msg = "Couldn't reopen: Volatile "
2099 2099 "(no expire on open) file handle changed";
2100 2100
2101 2101 nfs4args_copen_free(open_args);
2102 2102 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2103 2103 nfs_rw_exit(&mi->mi_fh_lock);
2104 2104 goto kill_file;
2105 2105
2106 2106 } else {
2107 2107 /*
2108 2108 * We have volatile file handles that don't compare.
2109 2109 * If the fids are the same then we assume that the
2110 2110 * file handle expired but the rnode still refers to
2111 2111 * the same file object.
2112 2112 *
2113 2113 			 * First check whether we have fids at all.
2114 2114 			 * If we don't, we have a dumb server, so we will
2115 2115 			 * just assume everything is OK for now.
2116 2116 */
2117 2117 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2118 2118 rp->r_attr.va_mask & AT_NODEID &&
2119 2119 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2120 2120 /*
2121 2121 * We have fids, but they don't
2122 2122 * compare. So kill the file.
2123 2123 */
2124 2124 failed_msg =
2125 2125 "Couldn't reopen: file handle changed"
2126 2126 " due to mismatched fids";
2127 2127 nfs4args_copen_free(open_args);
2128 2128 xdr_free(xdr_COMPOUND4res_clnt,
2129 2129 (caddr_t)&res);
2130 2130 nfs_rw_exit(&mi->mi_fh_lock);
2131 2131 goto kill_file;
2132 2132 } else {
2133 2133 				 * We have volatile file handles that refer
2134 2134 * We have volatile file handles that refers
2135 2135 * to the same file (at least they have the
2136 2136 * same fid) or we don't have fids so we
2137 2137 * can't tell. :(. We'll be a kind and accepting
2138 2138 * client so we'll update the rnode's file
2139 2139 * handle with the otw handle.
2140 2140 *
2141 2141 * We need to drop mi->mi_fh_lock since
2142 2142 				 * sfh4_update acquires it. Since there is
2143 2143 * only one recovery thread there is no
2144 2144 * race.
2145 2145 */
2146 2146 nfs_rw_exit(&mi->mi_fh_lock);
2147 2147 sfh4_update(rp->r_fh, &gf_res->object);
2148 2148 }
2149 2149 }
2150 2150 } else {
2151 2151 nfs_rw_exit(&mi->mi_fh_lock);
2152 2152 }
2153 2153
2154 2154 ASSERT(nfs4_consistent_type(vp));
2155 2155
2156 2156 /*
2157 2157 * If the server wanted an OPEN_CONFIRM but that fails, just start
2158 2158 * over. Presumably if there is a persistent error it will show up
2159 2159 * when we resend the OPEN.
2160 2160 */
2161 2161 if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2162 2162 bool_t retry_open = FALSE;
2163 2163
2164 2164 nfs4open_confirm(vp, &seqid, &op_res->stateid,
2165 2165 cred_otw, is_recov, &retry_open,
2166 2166 oop, FALSE, ep, NULL);
2167 2167 if (ep->error || ep->stat) {
2168 2168 nfs4args_copen_free(open_args);
2169 2169 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2170 2170 nfs4_end_open_seqid_sync(oop);
2171 2171 open_owner_rele(oop);
2172 2172 oop = NULL;
2173 2173 goto top;
2174 2174 }
2175 2175 }
2176 2176
2177 2177 mutex_enter(&osp->os_sync_lock);
2178 2178 osp->open_stateid = op_res->stateid;
2179 2179 osp->os_delegation = 0;
2180 2180 /*
2181 2181 * Need to reset this bitfield for the possible case where we were
2182 2182 * going to OTW CLOSE the file, got a non-recoverable error, and before
2183 2183 * we could retry the CLOSE, OPENed the file again.
2184 2184 */
2185 2185 ASSERT(osp->os_open_owner->oo_seqid_inuse);
2186 2186 osp->os_final_close = 0;
2187 2187 osp->os_force_close = 0;
2188 2188 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2189 2189 osp->os_dc_openacc = open_args->share_access;
2190 2190 mutex_exit(&osp->os_sync_lock);
2191 2191
2192 2192 nfs4_end_open_seqid_sync(oop);
2193 2193
2194 2194 /* accept delegation, if any */
2195 2195 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2196 2196
2197 2197 nfs4args_copen_free(open_args);
2198 2198
2199 2199 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2200 2200
2201 2201 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2202 2202
2203 2203 ASSERT(nfs4_consistent_type(vp));
2204 2204
2205 2205 open_owner_rele(oop);
2206 2206 crfree(cr);
2207 2207 crfree(cred_otw);
2208 2208 return;
2209 2209
2210 2210 kill_file:
2211 2211 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2212 2212 failed_reopen:
2213 2213 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2214 2214 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2215 2215 (void *)osp, (void *)cr, rnode4info(rp)));
2216 2216 mutex_enter(&osp->os_sync_lock);
2217 2217 osp->os_failed_reopen = 1;
2218 2218 mutex_exit(&osp->os_sync_lock);
2219 2219 bailout:
2220 2220 if (oop != NULL) {
2221 2221 nfs4_end_open_seqid_sync(oop);
2222 2222 open_owner_rele(oop);
2223 2223 }
2224 2224 if (cr != NULL)
2225 2225 crfree(cr);
2226 2226 if (cred_otw != NULL)
2227 2227 crfree(cred_otw);
2228 2228 }
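
A note on consuming the contract described in the block comment above nfs4_reopen(): ep->error == 0 with ep->stat == NFS4_OK can mean either that the reopen succeeded or that the file was marked dead and the caller should simply move on. A minimal, hypothetical caller-side sketch (the helper name and the FALSE/TRUE arguments are illustrative assumptions, not taken from this file):

	/*
	 * Hypothetical illustration only -- not part of nfs4_vnops.c.
	 * Shows how a recovery loop might interpret nfs4_reopen() results.
	 */
	static int
	example_reopen_one(vnode_t *vp, nfs4_open_stream_t *osp)
	{
		nfs4_error_t e;

		nfs4_error_zinit(&e);
		nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, TRUE);

		if (e.error == EINTR || e.error == ETIMEDOUT)
			return (e.error);	/* lost-state recovery already started */
		if (e.error != 0 || e.stat != NFS4_OK)
			return (EAGAIN);	/* stop and restart recovery */
		/*
		 * Zero error and stat: either the reopen worked or the file
		 * was marked dead; either way, continue with the next file.
		 */
		return (0);
	}
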
2229 2229
2230 2230 /* for . and .. OPENs */
2231 2231 /* ARGSUSED */
2232 2232 static int
2233 2233 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2234 2234 {
2235 2235 rnode4_t *rp;
2236 2236 nfs4_ga_res_t gar;
2237 2237
2238 2238 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2239 2239
2240 2240 /*
2241 2241 * If close-to-open consistency checking is turned off or
2242 2242 * if there is no cached data, we can avoid
2243 2243 * the over the wire getattr. Otherwise, force a
2244 2244 * call to the server to get fresh attributes and to
2245 2245 * check caches. This is required for close-to-open
2246 2246 * consistency.
2247 2247 */
2248 2248 rp = VTOR4(*vpp);
2249 2249 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2250 2250 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2251 2251 return (0);
2252 2252
2253 2253 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2254 2254 }
2255 2255
2256 2256 /*
2257 2257 * CLOSE a file
2258 2258 */
2259 2259 /* ARGSUSED */
2260 2260 static int
2261 2261 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2262 2262 caller_context_t *ct)
2263 2263 {
2264 2264 rnode4_t *rp;
2265 2265 int error = 0;
2266 2266 int r_error = 0;
2267 2267 int n4error = 0;
2268 2268 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2269 2269
2270 2270 /*
2271 2271 * Remove client state for this (lockowner, file) pair.
2272 2272 * Issue otw v4 call to have the server do the same.
2273 2273 */
2274 2274
2275 2275 rp = VTOR4(vp);
2276 2276
2277 2277 /*
2278 2278 * zone_enter(2) prevents processes from changing zones with NFS files
2279 2279 * open; if we happen to get here from the wrong zone we can't do
2280 2280 * anything over the wire.
2281 2281 */
2282 2282 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2283 2283 /*
2284 2284 * We could attempt to clean up locks, except we're sure
2285 2285 * that the current process didn't acquire any locks on
2286 2286 		 * the file: any attempt to lock a file belonging to another zone
2287 2287 * will fail, and one can't lock an NFS file and then change
2288 2288 * zones, as that fails too.
2289 2289 *
2290 2290 * Returning an error here is the sane thing to do. A
2291 2291 * subsequent call to VN_RELE() which translates to a
2292 2292 * nfs4_inactive() will clean up state: if the zone of the
2293 2293 * vnode's origin is still alive and kicking, the inactive
2294 2294 * thread will handle the request (from the correct zone), and
2295 2295 * everything (minus the OTW close call) should be OK. If the
2296 2296 * zone is going away nfs4_async_inactive() will throw away
2297 2297 * delegations, open streams and cached pages inline.
2298 2298 */
2299 2299 return (EIO);
2300 2300 }
2301 2301
2302 2302 /*
2303 2303 * If we are using local locking for this filesystem, then
2304 2304 * release all of the SYSV style record locks. Otherwise,
2305 2305 * we are doing network locking and we need to release all
2306 2306 * of the network locks. All of the locks held by this
2307 2307 * process on this file are released no matter what the
2308 2308 * incoming reference count is.
2309 2309 */
2310 2310 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2311 2311 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2312 2312 cleanshares(vp, ttoproc(curthread)->p_pid);
2313 2313 } else
2314 2314 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2315 2315
2316 2316 if (e.error) {
2317 2317 struct lm_sysid *lmsid;
2318 2318 lmsid = nfs4_find_sysid(VTOMI4(vp));
2319 2319 if (lmsid == NULL) {
2320 2320 DTRACE_PROBE2(unknown__sysid, int, e.error,
2321 2321 vnode_t *, vp);
2322 2322 } else {
2323 2323 cleanlocks(vp, ttoproc(curthread)->p_pid,
2324 2324 (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2325 2325
2326 2326 lm_rel_sysid(lmsid);
2327 2327 }
2328 2328 return (e.error);
2329 2329 }
2330 2330
2331 2331 if (count > 1)
2332 2332 return (0);
2333 2333
2334 2334 /*
2335 2335 * If the file has been `unlinked', then purge the
2336 2336 	 * DNLC so that this vnode will get recycled quicker
2337 2337 * and the .nfs* file on the server will get removed.
2338 2338 */
2339 2339 if (rp->r_unldvp != NULL)
2340 2340 dnlc_purge_vp(vp);
2341 2341
2342 2342 /*
2343 2343 * If the file was open for write and there are pages,
2344 2344 * do a synchronous flush and commit of all of the
2345 2345 * dirty and uncommitted pages.
2346 2346 */
2347 2347 ASSERT(!e.error);
2348 2348 if ((flag & FWRITE) && nfs4_has_pages(vp))
2349 2349 error = nfs4_putpage_commit(vp, 0, 0, cr);
2350 2350
2351 2351 mutex_enter(&rp->r_statelock);
2352 2352 r_error = rp->r_error;
2353 2353 rp->r_error = 0;
2354 2354 mutex_exit(&rp->r_statelock);
2355 2355
2356 2356 /*
2357 2357 * If this file type is one for which no explicit 'open' was
2358 2358 * done, then bail now (ie. no need for protocol 'close'). If
2359 2359 * there was an error w/the vm subsystem, return _that_ error,
2360 2360 * otherwise, return any errors that may've been reported via
2361 2361 * the rnode.
2362 2362 */
2363 2363 if (vp->v_type != VREG)
2364 2364 return (error ? error : r_error);
2365 2365
2366 2366 /*
2367 2367 * The sync putpage commit may have failed above, but since
2368 2368 * we're working w/a regular file, we need to do the protocol
2369 2369 * 'close' (nfs4close_one will figure out if an otw close is
2370 2370 * needed or not). Report any errors _after_ doing the protocol
2371 2371 * 'close'.
2372 2372 */
2373 2373 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2374 2374 n4error = e.error ? e.error : geterrno4(e.stat);
2375 2375
2376 2376 /*
2377 2377 * Error reporting prio (Hi -> Lo)
2378 2378 *
2379 2379 * i) nfs4_putpage_commit (error)
2380 2380 * ii) rnode's (r_error)
2381 2381 * iii) nfs4close_one (n4error)
2382 2382 */
2383 2383 return (error ? error : (r_error ? r_error : n4error));
2384 2384 }
2385 2385
2386 2386 /*
2387 2387 * Initialize *lost_rqstp.
2388 2388 */
2389 2389
2390 2390 static void
2391 2391 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2392 2392 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2393 2393 vnode_t *vp)
2394 2394 {
2395 2395 if (error != ETIMEDOUT && error != EINTR &&
2396 2396 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2397 2397 lost_rqstp->lr_op = 0;
2398 2398 return;
2399 2399 }
2400 2400
2401 2401 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2402 2402 "nfs4close_save_lost_rqst: error %d", error));
2403 2403
2404 2404 lost_rqstp->lr_op = OP_CLOSE;
2405 2405 /*
2406 2406 * The vp is held and rele'd via the recovery code.
2407 2407 * See nfs4_save_lost_rqst.
2408 2408 */
2409 2409 lost_rqstp->lr_vp = vp;
2410 2410 lost_rqstp->lr_dvp = NULL;
2411 2411 lost_rqstp->lr_oop = oop;
2412 2412 lost_rqstp->lr_osp = osp;
2413 2413 ASSERT(osp != NULL);
2414 2414 ASSERT(mutex_owned(&osp->os_sync_lock));
2415 2415 osp->os_pending_close = 1;
2416 2416 lost_rqstp->lr_lop = NULL;
2417 2417 lost_rqstp->lr_cr = cr;
2418 2418 lost_rqstp->lr_flk = NULL;
2419 2419 lost_rqstp->lr_putfirst = FALSE;
2420 2420 }
2421 2421
2422 2422 /*
2423 2423 * Assumes you already have the open seqid sync grabbed as well as the
2424 2424 * 'os_sync_lock'. Note: this will release the open seqid sync and
2425 2425 * 'os_sync_lock' if client recovery starts. Calling functions have to
2426 2426 * be prepared to handle this.
2427 2427 *
2428 2428 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2429 2429 * was needed and was started, and that the calling function should retry
2430 2430 * this function; otherwise it is returned as 0.
2431 2431 *
2432 2432 * Errors are returned via the nfs4_error_t parameter.
2433 2433 */
2434 2434 static void
2435 2435 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2436 2436 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2437 2437 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2438 2438 {
2439 2439 COMPOUND4args_clnt args;
2440 2440 COMPOUND4res_clnt res;
2441 2441 CLOSE4args *close_args;
2442 2442 nfs_resop4 *resop;
2443 2443 nfs_argop4 argop[3];
2444 2444 int doqueue = 1;
2445 2445 mntinfo4_t *mi;
2446 2446 seqid4 seqid;
2447 2447 vnode_t *vp;
2448 2448 bool_t needrecov = FALSE;
2449 2449 nfs4_lost_rqst_t lost_rqst;
2450 2450 hrtime_t t;
2451 2451
2452 2452 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2453 2453
2454 2454 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2455 2455
2456 2456 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2457 2457
2458 2458 /* Only set this to 1 if recovery is started */
2459 2459 *recov = 0;
2460 2460
2461 2461 /* do the OTW call to close the file */
2462 2462
2463 2463 if (close_type == CLOSE_RESEND)
2464 2464 args.ctag = TAG_CLOSE_LOST;
2465 2465 else if (close_type == CLOSE_AFTER_RESEND)
2466 2466 args.ctag = TAG_CLOSE_UNDO;
2467 2467 else
2468 2468 args.ctag = TAG_CLOSE;
2469 2469
2470 2470 args.array_len = 3;
2471 2471 args.array = argop;
2472 2472
2473 2473 vp = RTOV4(rp);
2474 2474
2475 2475 mi = VTOMI4(vp);
2476 2476
2477 2477 /* putfh target fh */
2478 2478 argop[0].argop = OP_CPUTFH;
2479 2479 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2480 2480
2481 2481 argop[1].argop = OP_GETATTR;
2482 2482 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2483 2483 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2484 2484
2485 2485 argop[2].argop = OP_CLOSE;
2486 2486 close_args = &argop[2].nfs_argop4_u.opclose;
2487 2487
2488 2488 seqid = nfs4_get_open_seqid(oop) + 1;
2489 2489
2490 2490 close_args->seqid = seqid;
2491 2491 close_args->open_stateid = osp->open_stateid;
2492 2492
2493 2493 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2494 2494 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2495 2495 rnode4info(rp)));
2496 2496
2497 2497 t = gethrtime();
2498 2498
2499 2499 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2500 2500
2501 2501 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2502 2502 nfs4_set_open_seqid(seqid, oop, args.ctag);
2503 2503 }
2504 2504
2505 2505 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2506 2506 if (ep->error && !needrecov) {
2507 2507 /*
2508 2508 * if there was an error and no recovery is to be done
2509 2509 		 * then set up the file to flush its cache if
2510 2510 * needed for the next caller.
2511 2511 */
2512 2512 mutex_enter(&rp->r_statelock);
2513 2513 PURGE_ATTRCACHE4_LOCKED(rp);
2514 2514 rp->r_flags &= ~R4WRITEMODIFIED;
2515 2515 mutex_exit(&rp->r_statelock);
2516 2516 return;
2517 2517 }
2518 2518
2519 2519 if (needrecov) {
2520 2520 bool_t abort;
2521 2521 nfs4_bseqid_entry_t *bsep = NULL;
2522 2522
2523 2523 if (close_type != CLOSE_RESEND)
2524 2524 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2525 2525 osp, cred_otw, vp);
2526 2526
2527 2527 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2528 2528 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2529 2529 0, args.ctag, close_args->seqid);
2530 2530
2531 2531 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2532 2532 "nfs4close_otw: initiating recovery. error %d "
2533 2533 "res.status %d", ep->error, res.status));
2534 2534
2535 2535 /*
2536 2536 * Drop the 'os_sync_lock' here so we don't hit
2537 2537 * a potential recursive mutex_enter via an
2538 2538 * 'open_stream_hold()'.
2539 2539 */
2540 2540 mutex_exit(&osp->os_sync_lock);
2541 2541 *have_sync_lockp = 0;
2542 2542 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2543 2543 (close_type != CLOSE_RESEND &&
2544 2544 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2545 2545 OP_CLOSE, bsep, NULL, NULL);
2546 2546
2547 2547 /* drop open seq sync, and let the calling function regrab it */
2548 2548 nfs4_end_open_seqid_sync(oop);
2549 2549 *did_start_seqid_syncp = 0;
2550 2550
2551 2551 if (bsep)
2552 2552 kmem_free(bsep, sizeof (*bsep));
2553 2553 /*
2554 2554 * For signals, the caller wants to quit, so don't say to
2555 2555 * retry. For forced unmount, if it's a user thread, it
2556 2556 * wants to quit. If it's a recovery thread, the retry
2557 2557 * will happen higher-up on the call stack. Either way,
2558 2558 * don't say to retry.
2559 2559 */
2560 2560 if (abort == FALSE && ep->error != EINTR &&
2561 2561 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2562 2562 close_type != CLOSE_RESEND &&
2563 2563 close_type != CLOSE_AFTER_RESEND)
2564 2564 *recov = 1;
2565 2565 else
2566 2566 *recov = 0;
2567 2567
2568 2568 if (!ep->error)
2569 2569 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2570 2570 return;
2571 2571 }
2572 2572
2573 2573 if (res.status) {
2574 2574 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2575 2575 return;
2576 2576 }
2577 2577
2578 2578 mutex_enter(&rp->r_statev4_lock);
2579 2579 rp->created_v4 = 0;
2580 2580 mutex_exit(&rp->r_statev4_lock);
2581 2581
2582 2582 resop = &res.array[2];
2583 2583 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2584 2584 osp->os_valid = 0;
2585 2585
2586 2586 /*
2587 2587 * This removes the reference obtained at OPEN; ie, when the
2588 2588 * open stream structure was created.
2589 2589 *
2590 2590 * We don't have to worry about calling 'open_stream_rele'
2591 2591 	 * since we are currently holding a reference to the open
2592 2592 * stream which means the count cannot go to 0 with this
2593 2593 * decrement.
2594 2594 */
2595 2595 ASSERT(osp->os_ref_count >= 2);
2596 2596 osp->os_ref_count--;
2597 2597
2598 2598 if (ep->error == 0) {
2599 - /*
2600 - * Avoid a deadlock with the r_serial thread waiting for
2601 - * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be
2602 - * held by us. We will wait in nfs4_attr_cache() for the
2603 - * completion of the r_serial thread.
2604 - */
2605 2599 mutex_exit(&osp->os_sync_lock);
2606 2600 *have_sync_lockp = 0;
2607 2601
2608 2602 nfs4_attr_cache(vp,
2609 2603 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2610 2604 t, cred_otw, TRUE, NULL);
2611 2605 }
2612 2606
2613 2607 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2614 2608 " returning %d", ep->error));
2615 2609
2616 2610 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2617 2611 }
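
The locking contract in the block comment above nfs4close_otw() (open seqid sync and 'os_sync_lock' held on entry, possibly dropped on recovery) is easy to get wrong. A hedged sketch of the retry loop a caller such as nfs4close_one() might run; the setup of vp, rp, oop, osp, and cred_otw is assumed, not shown:

	/*
	 * Hypothetical caller retry loop for nfs4close_otw();
	 * illustrative only.
	 */
	for (;;) {
		int recov = 0, did_seqid_sync = 0, have_sync_lock = 0;
		nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

		if (nfs4_start_open_seqid_sync(oop, VTOMI4(vp)) != 0)
			break;			/* interrupted; give up */
		did_seqid_sync = 1;

		mutex_enter(&osp->os_sync_lock);
		have_sync_lock = 1;

		nfs4close_otw(rp, cred_otw, oop, osp, &recov,
		    &did_seqid_sync, CLOSE_NORM, &e, &have_sync_lock);

		/* nfs4close_otw() may have dropped either lock for us */
		if (have_sync_lock)
			mutex_exit(&osp->os_sync_lock);
		if (did_seqid_sync)
			nfs4_end_open_seqid_sync(oop);

		if (!recov)
			break;	/* done, or a non-retryable failure */
		/* recovery was started; retry the CLOSE from the top */
	}
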
2618 2612
2619 2613 /* ARGSUSED */
2620 2614 static int
2621 2615 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2622 2616 caller_context_t *ct)
2623 2617 {
2624 2618 rnode4_t *rp;
2625 2619 u_offset_t off;
2626 2620 offset_t diff;
2627 2621 uint_t on;
2628 2622 uint_t n;
2629 2623 caddr_t base;
2630 2624 uint_t flags;
2631 2625 int error;
2632 2626 mntinfo4_t *mi;
2633 2627
2634 2628 rp = VTOR4(vp);
2635 2629
2636 2630 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2637 2631
2638 2632 if (IS_SHADOW(vp, rp))
2639 2633 vp = RTOV4(rp);
2640 2634
2641 2635 if (vp->v_type != VREG)
2642 2636 return (EISDIR);
2643 2637
2644 2638 mi = VTOMI4(vp);
2645 2639
2646 2640 if (nfs_zone() != mi->mi_zone)
2647 2641 return (EIO);
2648 2642
2649 2643 if (uiop->uio_resid == 0)
2650 2644 return (0);
2651 2645
2652 2646 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2653 2647 return (EINVAL);
2654 2648
2655 2649 mutex_enter(&rp->r_statelock);
2656 2650 if (rp->r_flags & R4RECOVERRP)
2657 2651 error = (rp->r_error ? rp->r_error : EIO);
2658 2652 else
2659 2653 error = 0;
2660 2654 mutex_exit(&rp->r_statelock);
2661 2655 if (error)
2662 2656 return (error);
2663 2657
2664 2658 /*
2665 2659 * Bypass VM if caching has been disabled (e.g., locking) or if
2666 2660 * using client-side direct I/O and the file is not mmap'd and
2667 2661 * there are no cached pages.
2668 2662 */
2669 2663 if ((vp->v_flag & VNOCACHE) ||
2670 2664 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2671 2665 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2672 2666 size_t resid = 0;
2673 2667
2674 2668 return (nfs4read(vp, NULL, uiop->uio_loffset,
2675 2669 uiop->uio_resid, &resid, cr, FALSE, uiop));
2676 2670 }
2677 2671
2678 2672 error = 0;
2679 2673
2680 2674 do {
2681 2675 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2682 2676 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2683 2677 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2684 2678
2685 2679 if (error = nfs4_validate_caches(vp, cr))
2686 2680 break;
2687 2681
2688 2682 mutex_enter(&rp->r_statelock);
2689 2683 while (rp->r_flags & R4INCACHEPURGE) {
2690 2684 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2691 2685 mutex_exit(&rp->r_statelock);
2692 2686 return (EINTR);
2693 2687 }
2694 2688 }
2695 2689 diff = rp->r_size - uiop->uio_loffset;
2696 2690 mutex_exit(&rp->r_statelock);
2697 2691 if (diff <= 0)
2698 2692 break;
2699 2693 if (diff < n)
2700 2694 n = (uint_t)diff;
2701 2695
2702 2696 if (vpm_enable) {
2703 2697 /*
2704 2698 * Copy data.
2705 2699 */
2706 2700 error = vpm_data_copy(vp, off + on, n, uiop,
2707 2701 1, NULL, 0, S_READ);
2708 2702 } else {
2709 2703 base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2710 2704 S_READ);
2711 2705
2712 2706 error = uiomove(base + on, n, UIO_READ, uiop);
2713 2707 }
2714 2708
2715 2709 if (!error) {
2716 2710 /*
2717 2711 * If read a whole block or read to eof,
2718 2712 * won't need this buffer again soon.
2719 2713 */
2720 2714 mutex_enter(&rp->r_statelock);
2721 2715 if (n + on == MAXBSIZE ||
2722 2716 uiop->uio_loffset == rp->r_size)
2723 2717 flags = SM_DONTNEED;
2724 2718 else
2725 2719 flags = 0;
2726 2720 mutex_exit(&rp->r_statelock);
2727 2721 if (vpm_enable) {
2728 2722 error = vpm_sync_pages(vp, off, n, flags);
2729 2723 } else {
2730 2724 error = segmap_release(segkmap, base, flags);
2731 2725 }
2732 2726 } else {
2733 2727 if (vpm_enable) {
2734 2728 (void) vpm_sync_pages(vp, off, n, 0);
2735 2729 } else {
2736 2730 (void) segmap_release(segkmap, base, 0);
2737 2731 }
2738 2732 }
2739 2733 } while (!error && uiop->uio_resid > 0);
2740 2734
2741 2735 return (error);
2742 2736 }
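
The off/on/n arithmetic in the read loop above maps each pass onto one MAXBSIZE-aligned segmap window. A worked example, assuming MAXBSIZE is 8192 (so MAXBMASK == ~8191 and MAXBOFFSET == 8191):

	/*
	 * uio_loffset = 12000, uio_resid = 10000
	 * off = 12000 & MAXBMASK   = 8192	(window base)
	 * on  = 12000 & MAXBOFFSET = 3808	(offset within the window)
	 * n   = MIN(8192 - 3808, 10000) = 4384	(bytes moved this pass)
	 *
	 * The next pass starts at uio_loffset 16384, i.e., window
	 * aligned, so at most the first and last passes are partial.
	 */

The same computation appears in the nfs4_write() loop below.
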
2743 2737
2744 2738 /* ARGSUSED */
2745 2739 static int
2746 2740 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2747 2741 caller_context_t *ct)
2748 2742 {
2749 2743 rlim64_t limit = uiop->uio_llimit;
2750 2744 rnode4_t *rp;
2751 2745 u_offset_t off;
2752 2746 caddr_t base;
2753 2747 uint_t flags;
2754 2748 int remainder;
2755 2749 size_t n;
2756 2750 int on;
2757 2751 int error;
2758 2752 int resid;
2759 2753 u_offset_t offset;
2760 2754 mntinfo4_t *mi;
2761 2755 uint_t bsize;
2762 2756
2763 2757 rp = VTOR4(vp);
2764 2758
2765 2759 if (IS_SHADOW(vp, rp))
2766 2760 vp = RTOV4(rp);
2767 2761
2768 2762 if (vp->v_type != VREG)
2769 2763 return (EISDIR);
2770 2764
2771 2765 mi = VTOMI4(vp);
2772 2766
2773 2767 if (nfs_zone() != mi->mi_zone)
2774 2768 return (EIO);
2775 2769
2776 2770 if (uiop->uio_resid == 0)
2777 2771 return (0);
2778 2772
2779 2773 mutex_enter(&rp->r_statelock);
2780 2774 if (rp->r_flags & R4RECOVERRP)
2781 2775 error = (rp->r_error ? rp->r_error : EIO);
2782 2776 else
2783 2777 error = 0;
2784 2778 mutex_exit(&rp->r_statelock);
2785 2779 if (error)
2786 2780 return (error);
2787 2781
2788 2782 if (ioflag & FAPPEND) {
2789 2783 struct vattr va;
2790 2784
2791 2785 /*
2792 2786 * Must serialize if appending.
2793 2787 */
2794 2788 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2795 2789 nfs_rw_exit(&rp->r_rwlock);
2796 2790 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2797 2791 INTR4(vp)))
2798 2792 return (EINTR);
2799 2793 }
2800 2794
2801 2795 va.va_mask = AT_SIZE;
2802 2796 error = nfs4getattr(vp, &va, cr);
2803 2797 if (error)
2804 2798 return (error);
2805 2799 uiop->uio_loffset = va.va_size;
2806 2800 }
2807 2801
2808 2802 offset = uiop->uio_loffset + uiop->uio_resid;
2809 2803
2810 2804 if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2811 2805 return (EINVAL);
2812 2806
2813 2807 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2814 2808 limit = MAXOFFSET_T;
2815 2809
2816 2810 /*
2817 2811 * Check to make sure that the process will not exceed
2818 2812 * its limit on file size. It is okay to write up to
2819 2813 * the limit, but not beyond. Thus, the write which
2820 2814 * reaches the limit will be short and the next write
2821 2815 * will return an error.
2822 2816 */
2823 2817 remainder = 0;
2824 2818 if (offset > uiop->uio_llimit) {
2825 2819 remainder = offset - uiop->uio_llimit;
2826 2820 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2827 2821 if (uiop->uio_resid <= 0) {
2828 2822 proc_t *p = ttoproc(curthread);
2829 2823
2830 2824 uiop->uio_resid += remainder;
2831 2825 mutex_enter(&p->p_lock);
2832 2826 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2833 2827 p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2834 2828 mutex_exit(&p->p_lock);
2835 2829 return (EFBIG);
2836 2830 }
2837 2831 }
2838 2832
2839 2833 /* update the change attribute, if we have a write delegation */
2840 2834
2841 2835 mutex_enter(&rp->r_statev4_lock);
2842 2836 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2843 2837 rp->r_deleg_change++;
2844 2838
2845 2839 mutex_exit(&rp->r_statev4_lock);
2846 2840
2847 2841 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2848 2842 return (EINTR);
2849 2843
2850 2844 /*
2851 2845 * Bypass VM if caching has been disabled (e.g., locking) or if
2852 2846 * using client-side direct I/O and the file is not mmap'd and
2853 2847 * there are no cached pages.
2854 2848 */
2855 2849 if ((vp->v_flag & VNOCACHE) ||
2856 2850 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2857 2851 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2858 2852 size_t bufsize;
2859 2853 int count;
2860 2854 u_offset_t org_offset;
2861 2855 stable_how4 stab_comm;
2862 2856 nfs4_fwrite:
2863 2857 if (rp->r_flags & R4STALE) {
2864 2858 resid = uiop->uio_resid;
2865 2859 offset = uiop->uio_loffset;
2866 2860 error = rp->r_error;
2867 2861 /*
2868 2862 * A close may have cleared r_error, if so,
2869 2863 * propagate ESTALE error return properly
2870 2864 */
2871 2865 if (error == 0)
2872 2866 error = ESTALE;
2873 2867 goto bottom;
2874 2868 }
2875 2869
2876 2870 bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2877 2871 base = kmem_alloc(bufsize, KM_SLEEP);
2878 2872 do {
2879 2873 if (ioflag & FDSYNC)
2880 2874 stab_comm = DATA_SYNC4;
2881 2875 else
2882 2876 stab_comm = FILE_SYNC4;
2883 2877 resid = uiop->uio_resid;
2884 2878 offset = uiop->uio_loffset;
2885 2879 count = MIN(uiop->uio_resid, bufsize);
2886 2880 org_offset = uiop->uio_loffset;
2887 2881 error = uiomove(base, count, UIO_WRITE, uiop);
2888 2882 if (!error) {
2889 2883 error = nfs4write(vp, base, org_offset,
2890 2884 count, cr, &stab_comm);
2891 2885 if (!error) {
2892 2886 mutex_enter(&rp->r_statelock);
2893 2887 if (rp->r_size < uiop->uio_loffset)
2894 2888 rp->r_size = uiop->uio_loffset;
2895 2889 mutex_exit(&rp->r_statelock);
2896 2890 }
2897 2891 }
2898 2892 } while (!error && uiop->uio_resid > 0);
2899 2893 kmem_free(base, bufsize);
2900 2894 goto bottom;
2901 2895 }
2902 2896
2903 2897 bsize = vp->v_vfsp->vfs_bsize;
2904 2898
2905 2899 do {
2906 2900 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2907 2901 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2908 2902 n = MIN(MAXBSIZE - on, uiop->uio_resid);
2909 2903
2910 2904 resid = uiop->uio_resid;
2911 2905 offset = uiop->uio_loffset;
2912 2906
2913 2907 if (rp->r_flags & R4STALE) {
2914 2908 error = rp->r_error;
2915 2909 /*
2916 2910 * A close may have cleared r_error, if so,
2917 2911 * propagate ESTALE error return properly
2918 2912 */
2919 2913 if (error == 0)
2920 2914 error = ESTALE;
2921 2915 break;
2922 2916 }
2923 2917
2924 2918 /*
2925 2919 * Don't create dirty pages faster than they
2926 2920 * can be cleaned so that the system doesn't
2927 2921 * get imbalanced. If the async queue is
2928 2922 * maxed out, then wait for it to drain before
2929 2923 * creating more dirty pages. Also, wait for
2930 2924 * any threads doing pagewalks in the vop_getattr
2931 2925 * entry points so that they don't block for
2932 2926 * long periods.
2933 2927 */
2934 2928 mutex_enter(&rp->r_statelock);
2935 2929 while ((mi->mi_max_threads != 0 &&
2936 2930 rp->r_awcount > 2 * mi->mi_max_threads) ||
2937 2931 rp->r_gcount > 0) {
2938 2932 if (INTR4(vp)) {
2939 2933 klwp_t *lwp = ttolwp(curthread);
2940 2934
2941 2935 if (lwp != NULL)
2942 2936 lwp->lwp_nostop++;
2943 2937 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2944 2938 mutex_exit(&rp->r_statelock);
2945 2939 if (lwp != NULL)
2946 2940 lwp->lwp_nostop--;
2947 2941 error = EINTR;
2948 2942 goto bottom;
2949 2943 }
2950 2944 if (lwp != NULL)
2951 2945 lwp->lwp_nostop--;
2952 2946 } else
2953 2947 cv_wait(&rp->r_cv, &rp->r_statelock);
2954 2948 }
2955 2949 mutex_exit(&rp->r_statelock);
2956 2950
2957 2951 /*
2958 2952 * Touch the page and fault it in if it is not in core
2959 2953 * before segmap_getmapflt or vpm_data_copy can lock it.
2960 2954 * This is to avoid the deadlock if the buffer is mapped
2961 2955 * to the same file through mmap which we want to write.
2962 2956 */
2963 2957 uio_prefaultpages((long)n, uiop);
2964 2958
2965 2959 if (vpm_enable) {
2966 2960 /*
2967 2961 * It will use kpm mappings, so no need to
2968 2962 * pass an address.
2969 2963 */
2970 2964 error = writerp4(rp, NULL, n, uiop, 0);
2971 2965 } else {
2972 2966 if (segmap_kpm) {
2973 2967 int pon = uiop->uio_loffset & PAGEOFFSET;
2974 2968 size_t pn = MIN(PAGESIZE - pon,
2975 2969 uiop->uio_resid);
2976 2970 int pagecreate;
2977 2971
2978 2972 mutex_enter(&rp->r_statelock);
2979 2973 pagecreate = (pon == 0) && (pn == PAGESIZE ||
2980 2974 uiop->uio_loffset + pn >= rp->r_size);
2981 2975 mutex_exit(&rp->r_statelock);
2982 2976
2983 2977 base = segmap_getmapflt(segkmap, vp, off + on,
2984 2978 pn, !pagecreate, S_WRITE);
2985 2979
2986 2980 error = writerp4(rp, base + pon, n, uiop,
2987 2981 pagecreate);
2988 2982
2989 2983 } else {
2990 2984 base = segmap_getmapflt(segkmap, vp, off + on,
2991 2985 n, 0, S_READ);
2992 2986 error = writerp4(rp, base + on, n, uiop, 0);
2993 2987 }
2994 2988 }
2995 2989
2996 2990 if (!error) {
2997 2991 if (mi->mi_flags & MI4_NOAC)
2998 2992 flags = SM_WRITE;
2999 2993 else if ((uiop->uio_loffset % bsize) == 0 ||
3000 2994 IS_SWAPVP(vp)) {
3001 2995 /*
3002 2996 * Have written a whole block.
3003 2997 * Start an asynchronous write
3004 2998 * and mark the buffer to
3005 2999 * indicate that it won't be
3006 3000 * needed again soon.
3007 3001 */
3008 3002 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
3009 3003 } else
3010 3004 flags = 0;
3011 3005 if ((ioflag & (FSYNC|FDSYNC)) ||
3012 3006 (rp->r_flags & R4OUTOFSPACE)) {
3013 3007 flags &= ~SM_ASYNC;
3014 3008 flags |= SM_WRITE;
3015 3009 }
3016 3010 if (vpm_enable) {
3017 3011 error = vpm_sync_pages(vp, off, n, flags);
3018 3012 } else {
3019 3013 error = segmap_release(segkmap, base, flags);
3020 3014 }
3021 3015 } else {
3022 3016 if (vpm_enable) {
3023 3017 (void) vpm_sync_pages(vp, off, n, 0);
3024 3018 } else {
3025 3019 (void) segmap_release(segkmap, base, 0);
3026 3020 }
3027 3021 /*
3028 3022 * In the event that we got an access error while
3029 3023 * faulting in a page for a write-only file just
3030 3024 * force a write.
3031 3025 */
3032 3026 if (error == EACCES)
3033 3027 goto nfs4_fwrite;
3034 3028 }
3035 3029 } while (!error && uiop->uio_resid > 0);
3036 3030
3037 3031 bottom:
3038 3032 if (error) {
3039 3033 uiop->uio_resid = resid + remainder;
3040 3034 uiop->uio_loffset = offset;
3041 3035 } else {
3042 3036 uiop->uio_resid += remainder;
3043 3037
3044 3038 mutex_enter(&rp->r_statev4_lock);
3045 3039 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3046 3040 gethrestime(&rp->r_attr.va_mtime);
3047 3041 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3048 3042 }
3049 3043 mutex_exit(&rp->r_statev4_lock);
3050 3044 }
3051 3045
3052 3046 nfs_rw_exit(&rp->r_lkserlock);
3053 3047
3054 3048 return (error);
3055 3049 }
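
The RLIMIT_FSIZE clamp near the top of nfs4_write() deliberately allows one short write up to the limit before failing. A worked example with hypothetical numbers (uio_llimit = 1000):

	/*
	 * First write:  uio_loffset = 900, uio_resid = 300
	 *   offset = 1200 > 1000, remainder = 200, uio_resid clamped
	 *   to 100 -> 100 bytes are written; at "bottom:" uio_resid
	 *   is restored to 200 so the caller sees a short write.
	 * Second write: uio_loffset = 1000, uio_resid = 200
	 *   offset = 1200 > 1000, clamped uio_resid = 0 -> the
	 *   RLIMIT_FSIZE rctl fires (SIGXFSZ) and EFBIG is returned.
	 */
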
3056 3050
3057 3051 /*
3058 3052 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3059 3053 */
3060 3054 static int
3061 3055 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3062 3056 int flags, cred_t *cr)
3063 3057 {
3064 3058 struct buf *bp;
3065 3059 int error;
3066 3060 page_t *savepp;
3067 3061 uchar_t fsdata;
3068 3062 stable_how4 stab_comm;
3069 3063
3070 3064 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3071 3065 bp = pageio_setup(pp, len, vp, flags);
3072 3066 ASSERT(bp != NULL);
3073 3067
3074 3068 /*
3075 3069 * pageio_setup should have set b_addr to 0. This
3076 3070 * is correct since we want to do I/O on a page
3077 3071 * boundary. bp_mapin will use this addr to calculate
3078 3072 * an offset, and then set b_addr to the kernel virtual
3079 3073 * address it allocated for us.
3080 3074 */
3081 3075 ASSERT(bp->b_un.b_addr == 0);
3082 3076
3083 3077 bp->b_edev = 0;
3084 3078 bp->b_dev = 0;
3085 3079 bp->b_lblkno = lbtodb(off);
3086 3080 bp->b_file = vp;
3087 3081 bp->b_offset = (offset_t)off;
3088 3082 bp_mapin(bp);
3089 3083
3090 3084 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3091 3085 freemem > desfree)
3092 3086 stab_comm = UNSTABLE4;
3093 3087 else
3094 3088 stab_comm = FILE_SYNC4;
3095 3089
3096 3090 error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3097 3091
3098 3092 bp_mapout(bp);
3099 3093 pageio_done(bp);
3100 3094
3101 3095 if (stab_comm == UNSTABLE4)
3102 3096 fsdata = C_DELAYCOMMIT;
3103 3097 else
3104 3098 fsdata = C_NOCOMMIT;
3105 3099
3106 3100 savepp = pp;
3107 3101 do {
3108 3102 pp->p_fsdata = fsdata;
3109 3103 } while ((pp = pp->p_next) != savepp);
3110 3104
3111 3105 return (error);
3112 3106 }
3113 3107
3114 3108 /* If this open stream was created under a delegation, reopen it OTW. */
3116 3110 static int
3117 3111 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3118 3112 {
3119 3113 nfs4_open_owner_t *oop;
3120 3114 nfs4_open_stream_t *osp;
3121 3115 rnode4_t *rp = VTOR4(vp);
3122 3116 mntinfo4_t *mi = VTOMI4(vp);
3123 3117 int reopen_needed;
3124 3118
3125 3119 ASSERT(nfs_zone() == mi->mi_zone);
3126 3120
3127 3121
3128 3122 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3129 3123 if (!oop)
3130 3124 return (EIO);
3131 3125
3132 3126 /* returns with 'os_sync_lock' held */
3133 3127 osp = find_open_stream(oop, rp);
3134 3128 if (!osp) {
3135 3129 open_owner_rele(oop);
3136 3130 return (EIO);
3137 3131 }
3138 3132
3139 3133 if (osp->os_failed_reopen) {
3140 3134 mutex_exit(&osp->os_sync_lock);
3141 3135 open_stream_rele(osp, rp);
3142 3136 open_owner_rele(oop);
3143 3137 return (EIO);
3144 3138 }
3145 3139
3146 3140 /*
3147 3141 * Determine whether a reopen is needed. If this
3148 3142 * is a delegation open stream, then the os_delegation bit
3149 3143 * should be set.
3150 3144 */
3151 3145
3152 3146 reopen_needed = osp->os_delegation;
3153 3147
3154 3148 mutex_exit(&osp->os_sync_lock);
3155 3149 open_owner_rele(oop);
3156 3150
3157 3151 if (reopen_needed) {
3158 3152 nfs4_error_zinit(ep);
3159 3153 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
3160 3154 mutex_enter(&osp->os_sync_lock);
3161 3155 if (ep->error || ep->stat || osp->os_failed_reopen) {
3162 3156 mutex_exit(&osp->os_sync_lock);
3163 3157 open_stream_rele(osp, rp);
3164 3158 return (EIO);
3165 3159 }
3166 3160 mutex_exit(&osp->os_sync_lock);
3167 3161 }
3168 3162 open_stream_rele(osp, rp);
3169 3163
3170 3164 return (0);
3171 3165 }
3172 3166
3173 3167 /*
3174 3168 * Write to file. Writes to remote server in largest size
3175 3169 * chunks that the server can handle. Write is synchronous.
3176 3170 */
3177 3171 static int
3178 3172 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3179 3173 stable_how4 *stab_comm)
3180 3174 {
3181 3175 mntinfo4_t *mi;
3182 3176 COMPOUND4args_clnt args;
3183 3177 COMPOUND4res_clnt res;
3184 3178 WRITE4args *wargs;
3185 3179 WRITE4res *wres;
3186 3180 nfs_argop4 argop[2];
3187 3181 nfs_resop4 *resop;
3188 3182 int tsize;
3189 3183 stable_how4 stable;
3190 3184 rnode4_t *rp;
3191 3185 int doqueue = 1;
3192 3186 bool_t needrecov;
3193 3187 nfs4_recov_state_t recov_state;
3194 3188 nfs4_stateid_types_t sid_types;
3195 3189 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3196 3190 int recov;
3197 3191
3198 3192 rp = VTOR4(vp);
3199 3193 mi = VTOMI4(vp);
3200 3194
3201 3195 ASSERT(nfs_zone() == mi->mi_zone);
3202 3196
3203 3197 stable = *stab_comm;
3204 3198 *stab_comm = FILE_SYNC4;
3205 3199
3206 3200 needrecov = FALSE;
3207 3201 recov_state.rs_flags = 0;
3208 3202 recov_state.rs_num_retry_despite_err = 0;
3209 3203 nfs4_init_stateid_types(&sid_types);
3210 3204
3211 3205 /* Is curthread the recovery thread? */
3212 3206 mutex_enter(&mi->mi_lock);
3213 3207 recov = (mi->mi_recovthread == curthread);
3214 3208 mutex_exit(&mi->mi_lock);
3215 3209
3216 3210 recov_retry:
3217 3211 args.ctag = TAG_WRITE;
3218 3212 args.array_len = 2;
3219 3213 args.array = argop;
3220 3214
3221 3215 if (!recov) {
3222 3216 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3223 3217 &recov_state, NULL);
3224 3218 if (e.error)
3225 3219 return (e.error);
3226 3220 }
3227 3221
3228 3222 /* 0. putfh target fh */
3229 3223 argop[0].argop = OP_CPUTFH;
3230 3224 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3231 3225
3232 3226 /* 1. write */
3233 3227 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3234 3228
3235 3229 do {
3236 3230
3237 3231 wargs->offset = (offset4)offset;
3238 3232 wargs->data_val = base;
3239 3233
3240 3234 if (mi->mi_io_kstats) {
3241 3235 mutex_enter(&mi->mi_lock);
3242 3236 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3243 3237 mutex_exit(&mi->mi_lock);
3244 3238 }
3245 3239
3246 3240 if ((vp->v_flag & VNOCACHE) ||
3247 3241 (rp->r_flags & R4DIRECTIO) ||
3248 3242 (mi->mi_flags & MI4_DIRECTIO))
3249 3243 tsize = MIN(mi->mi_stsize, count);
3250 3244 else
3251 3245 tsize = MIN(mi->mi_curwrite, count);
3252 3246 wargs->data_len = (uint_t)tsize;
3253 3247 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3254 3248
3255 3249 if (mi->mi_io_kstats) {
3256 3250 mutex_enter(&mi->mi_lock);
3257 3251 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3258 3252 mutex_exit(&mi->mi_lock);
3259 3253 }
3260 3254
3261 3255 if (!recov) {
3262 3256 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3263 3257 if (e.error && !needrecov) {
3264 3258 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3265 3259 &recov_state, needrecov);
3266 3260 return (e.error);
3267 3261 }
3268 3262 } else {
3269 3263 if (e.error)
3270 3264 return (e.error);
3271 3265 }
3272 3266
3273 3267 /*
3274 3268 * Do handling of OLD_STATEID outside
3275 3269 * of the normal recovery framework.
3276 3270 *
3277 3271 * If write receives a BAD stateid error while using a
3278 3272 * delegation stateid, retry using the open stateid (if it
3279 3273 * exists). If it doesn't have an open stateid, reopen the
3280 3274 * file first, then retry.
3281 3275 */
3282 3276 if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3283 3277 sid_types.cur_sid_type != SPEC_SID) {
3284 3278 nfs4_save_stateid(&wargs->stateid, &sid_types);
3285 3279 if (!recov)
3286 3280 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3287 3281 &recov_state, needrecov);
3288 3282 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3289 3283 goto recov_retry;
3290 3284 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3291 3285 sid_types.cur_sid_type == DEL_SID) {
3292 3286 nfs4_save_stateid(&wargs->stateid, &sid_types);
3293 3287 mutex_enter(&rp->r_statev4_lock);
3294 3288 rp->r_deleg_return_pending = TRUE;
3295 3289 mutex_exit(&rp->r_statev4_lock);
3296 3290 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3297 3291 if (!recov)
3298 3292 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3299 3293 &recov_state, needrecov);
3300 3294 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3301 3295 return (EIO);
3302 3296 }
3303 3297 if (!recov)
3304 3298 nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3305 3299 &recov_state, needrecov);
3306 3300 /* hold needed for nfs4delegreturn_thread */
3307 3301 VN_HOLD(vp);
3308 3302 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3309 3303 NFS4_DR_DISCARD), FALSE);
3310 3304 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3311 3305 goto recov_retry;
3312 3306 }
3313 3307
3314 3308 if (needrecov) {
3315 3309 bool_t abort;
3316 3310
3317 3311 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3318 3312 "nfs4write: client got error %d, res.status %d"
3319 3313 ", so start recovery", e.error, res.status));
3320 3314
3321 3315 abort = nfs4_start_recovery(&e,
3322 3316 VTOMI4(vp), vp, NULL, &wargs->stateid,
3323 3317 NULL, OP_WRITE, NULL, NULL, NULL);
3324 3318 if (!e.error) {
3325 3319 e.error = geterrno4(res.status);
3326 3320 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3327 3321 }
3328 3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3329 3323 &recov_state, needrecov);
3330 3324 if (abort == FALSE)
3331 3325 goto recov_retry;
3332 3326 return (e.error);
3333 3327 }
3334 3328
3335 3329 if (res.status) {
3336 3330 e.error = geterrno4(res.status);
3337 3331 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3338 3332 if (!recov)
3339 3333 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3340 3334 &recov_state, needrecov);
3341 3335 return (e.error);
3342 3336 }
3343 3337
3344 3338 resop = &res.array[1]; /* write res */
3345 3339 wres = &resop->nfs_resop4_u.opwrite;
3346 3340
3347 3341 if ((int)wres->count > tsize) {
3348 3342 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3349 3343
3350 3344 zcmn_err(getzoneid(), CE_WARN,
3351 3345 "nfs4write: server wrote %u, requested was %u",
3352 3346 (int)wres->count, tsize);
3353 3347 if (!recov)
3354 3348 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3355 3349 &recov_state, needrecov);
3356 3350 return (EIO);
3357 3351 }
3358 3352 if (wres->committed == UNSTABLE4) {
3359 3353 *stab_comm = UNSTABLE4;
3360 3354 if (wargs->stable == DATA_SYNC4 ||
3361 3355 wargs->stable == FILE_SYNC4) {
3362 3356 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3363 3357 zcmn_err(getzoneid(), CE_WARN,
3364 3358 "nfs4write: server %s did not commit "
3365 3359 "to stable storage",
3366 3360 rp->r_server->sv_hostname);
3367 3361 if (!recov)
3368 3362 nfs4_end_fop(VTOMI4(vp), vp, NULL,
3369 3363 OH_WRITE, &recov_state, needrecov);
3370 3364 return (EIO);
3371 3365 }
3372 3366 }
3373 3367
3374 3368 tsize = (int)wres->count;
3375 3369 count -= tsize;
3376 3370 base += tsize;
3377 3371 offset += tsize;
3378 3372 if (mi->mi_io_kstats) {
3379 3373 mutex_enter(&mi->mi_lock);
3380 3374 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3381 3375 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3382 3376 tsize;
3383 3377 mutex_exit(&mi->mi_lock);
3384 3378 }
3385 3379 lwp_stat_update(LWP_STAT_OUBLK, 1);
3386 3380 mutex_enter(&rp->r_statelock);
3387 3381 if (rp->r_flags & R4HAVEVERF) {
3388 3382 if (rp->r_writeverf != wres->writeverf) {
3389 3383 nfs4_set_mod(vp);
3390 3384 rp->r_writeverf = wres->writeverf;
3391 3385 }
3392 3386 } else {
3393 3387 rp->r_writeverf = wres->writeverf;
3394 3388 rp->r_flags |= R4HAVEVERF;
3395 3389 }
3396 3390 PURGE_ATTRCACHE4_LOCKED(rp);
3397 3391 rp->r_flags |= R4WRITEMODIFIED;
3398 3392 gethrestime(&rp->r_attr.va_mtime);
3399 3393 rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3400 3394 mutex_exit(&rp->r_statelock);
3401 3395 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3402 3396 } while (count);
3403 3397
3404 3398 if (!recov)
3405 3399 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3406 3400 needrecov);
3407 3401
3408 3402 return (e.error);
3409 3403 }
3410 3404
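/*
 * Sketch only (hypothetical helper name, guarded so it is not built):
 * the per-iteration transfer-size choice nfs4write() makes above.
 * Direct I/O uses the server's maximum transfer size (mi_stsize);
 * cached I/O uses the dynamically tuned current size (mi_curwrite).
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_pick_wsize(vnode_t *vp, int count)
{
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);

	/* Any of these flags means the request bypasses the page cache. */
	if ((vp->v_flag & VNOCACHE) ||
	    (rp->r_flags & R4DIRECTIO) ||
	    (mi->mi_flags & MI4_DIRECTIO))
		return (MIN(mi->mi_stsize, count));
	return (MIN(mi->mi_curwrite, count));
}
#endif	/* NFS4_EXAMPLE_SKETCH */
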
3411 3405 /*
3412 3406 * Read from a file. Reads data in largest chunks our interface can handle.
3413 3407 */
3414 3408 static int
3415 3409 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
3416 3410 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
3417 3411 {
3418 3412 mntinfo4_t *mi;
3419 3413 COMPOUND4args_clnt args;
3420 3414 COMPOUND4res_clnt res;
3421 3415 READ4args *rargs;
3422 3416 nfs_argop4 argop[2];
3423 3417 int tsize;
3424 3418 int doqueue;
3425 3419 rnode4_t *rp;
3426 3420 int data_len;
3427 3421 bool_t is_eof;
3428 3422 bool_t needrecov = FALSE;
3429 3423 nfs4_recov_state_t recov_state;
3430 3424 nfs4_stateid_types_t sid_types;
3431 3425 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3432 3426
3433 3427 rp = VTOR4(vp);
3434 3428 mi = VTOMI4(vp);
3435 3429 doqueue = 1;
3436 3430
3437 3431 ASSERT(nfs_zone() == mi->mi_zone);
3438 3432
3439 3433 args.ctag = async ? TAG_READAHEAD : TAG_READ;
3440 3434
3441 3435 args.array_len = 2;
3442 3436 args.array = argop;
3443 3437
3444 3438 nfs4_init_stateid_types(&sid_types);
3445 3439
3446 3440 recov_state.rs_flags = 0;
3447 3441 recov_state.rs_num_retry_despite_err = 0;
3448 3442
3449 3443 recov_retry:
3450 3444 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
3451 3445 &recov_state, NULL);
3452 3446 if (e.error)
3453 3447 return (e.error);
3454 3448
3455 3449 /* putfh target fh */
3456 3450 argop[0].argop = OP_CPUTFH;
3457 3451 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3458 3452
3459 3453 /* read */
3460 3454 argop[1].argop = OP_READ;
3461 3455 rargs = &argop[1].nfs_argop4_u.opread;
3462 3456 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
3463 3457 OP_READ, &sid_types, async);
3464 3458
3465 3459 do {
3466 3460 if (mi->mi_io_kstats) {
3467 3461 mutex_enter(&mi->mi_lock);
3468 3462 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3469 3463 mutex_exit(&mi->mi_lock);
3470 3464 }
3471 3465
3472 3466 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3473 3467 "nfs4read: %s call, rp %s",
3474 3468 needrecov ? "recov" : "first",
3475 3469 rnode4info(rp)));
3476 3470
3477 3471 if ((vp->v_flag & VNOCACHE) ||
3478 3472 (rp->r_flags & R4DIRECTIO) ||
3479 3473 (mi->mi_flags & MI4_DIRECTIO))
3480 3474 tsize = MIN(mi->mi_tsize, count);
3481 3475 else
3482 3476 tsize = MIN(mi->mi_curread, count);
3483 3477
3484 3478 rargs->offset = (offset4)offset;
3485 3479 rargs->count = (count4)tsize;
3486 3480 rargs->res_data_val_alt = NULL;
3487 3481 rargs->res_mblk = NULL;
3488 3482 rargs->res_uiop = NULL;
3489 3483 rargs->res_maxsize = 0;
3490 3484 rargs->wlist = NULL;
3491 3485
3492 3486 if (uiop)
3493 3487 rargs->res_uiop = uiop;
3494 3488 else
3495 3489 rargs->res_data_val_alt = base;
3496 3490 rargs->res_maxsize = tsize;
3497 3491
3498 3492 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3499 3493 #ifdef DEBUG
3500 3494 if (nfs4read_error_inject) {
3501 3495 res.status = nfs4read_error_inject;
3502 3496 nfs4read_error_inject = 0;
3503 3497 }
3504 3498 #endif
3505 3499
3506 3500 if (mi->mi_io_kstats) {
3507 3501 mutex_enter(&mi->mi_lock);
3508 3502 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3509 3503 mutex_exit(&mi->mi_lock);
3510 3504 }
3511 3505
3512 3506 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3513 3507 if (e.error != 0 && !needrecov) {
3514 3508 nfs4_end_fop(mi, vp, NULL, OH_READ,
3515 3509 &recov_state, needrecov);
3516 3510 return (e.error);
3517 3511 }
3518 3512
3519 3513 /*
3520 3514 * Do proper retry for OLD and BAD stateid errors outside
3521 3515 * of the normal recovery framework. There are two differences
3522 3516 * between async and sync reads. The first is that we allow
3523 3517 * retry on BAD_STATEID for async reads, but not sync reads.
3524 3518 * The second is that we mark the file dead for a failed
3525 3519 * attempt with a special stateid for sync reads, but just
3526 3520 * return EIO for async reads.
3527 3521 *
3528 3522 * If a sync read receives a BAD stateid error while using a
3529 3523 * delegation stateid, retry using the open stateid (if it
3530 3524 * exists). If it doesn't have an open stateid, reopen the
3531 3525 * file first, then retry.
3532 3526 */
3533 3527 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
3534 3528 res.status == NFS4ERR_BAD_STATEID) && async) {
3535 3529 nfs4_end_fop(mi, vp, NULL, OH_READ,
3536 3530 &recov_state, needrecov);
3537 3531 if (sid_types.cur_sid_type == SPEC_SID) {
3538 3532 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3539 3533 return (EIO);
3540 3534 }
3541 3535 nfs4_save_stateid(&rargs->stateid, &sid_types);
3542 3536 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3543 3537 goto recov_retry;
3544 3538 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3545 3539 !async && sid_types.cur_sid_type != SPEC_SID) {
3546 3540 nfs4_save_stateid(&rargs->stateid, &sid_types);
3547 3541 nfs4_end_fop(mi, vp, NULL, OH_READ,
3548 3542 &recov_state, needrecov);
3549 3543 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3550 3544 goto recov_retry;
3551 3545 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3552 3546 sid_types.cur_sid_type == DEL_SID) {
3553 3547 nfs4_save_stateid(&rargs->stateid, &sid_types);
3554 3548 mutex_enter(&rp->r_statev4_lock);
3555 3549 rp->r_deleg_return_pending = TRUE;
3556 3550 mutex_exit(&rp->r_statev4_lock);
3557 3551 if (nfs4rdwr_check_osid(vp, &e, cr)) {
3558 3552 nfs4_end_fop(mi, vp, NULL, OH_READ,
3559 3553 &recov_state, needrecov);
3560 3554 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3561 3555 return (EIO);
3562 3556 }
3563 3557 nfs4_end_fop(mi, vp, NULL, OH_READ,
3564 3558 &recov_state, needrecov);
3565 3559 /* hold needed for nfs4delegreturn_thread */
3566 3560 VN_HOLD(vp);
3567 3561 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3568 3562 NFS4_DR_DISCARD), FALSE);
3569 3563 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3570 3564 goto recov_retry;
3571 3565 }
3572 3566 if (needrecov) {
3573 3567 bool_t abort;
3574 3568
3575 3569 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3576 3570 "nfs4read: initiating recovery\n"));
3577 3571 abort = nfs4_start_recovery(&e,
3578 3572 mi, vp, NULL, &rargs->stateid,
3579 3573 NULL, OP_READ, NULL, NULL, NULL);
3580 3574 nfs4_end_fop(mi, vp, NULL, OH_READ,
3581 3575 &recov_state, needrecov);
3582 3576 /*
3583 3577 * Do not retry if we got OLD_STATEID using a special
3584 3578 * stateid. This avoids looping with a broken server.
3585 3579 */
3586 3580 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3587 3581 sid_types.cur_sid_type == SPEC_SID)
3588 3582 abort = TRUE;
3589 3583
3590 3584 if (abort == FALSE) {
3591 3585 /*
3592 3586 * Need to retry all possible stateids in
3593 3587 * case the recovery error wasn't stateid
3594 3588 * related or the stateids have become
3595 3589 * stale (server reboot).
3596 3590 */
3597 3591 nfs4_init_stateid_types(&sid_types);
3598 3592 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3599 3593 goto recov_retry;
3600 3594 }
3601 3595
3602 3596 if (!e.error) {
3603 3597 e.error = geterrno4(res.status);
3604 3598 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3605 3599 }
3606 3600 return (e.error);
3607 3601 }
3608 3602
3609 3603 if (res.status) {
3610 3604 e.error = geterrno4(res.status);
3611 3605 nfs4_end_fop(mi, vp, NULL, OH_READ,
3612 3606 &recov_state, needrecov);
3613 3607 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3614 3608 return (e.error);
3615 3609 }
3616 3610
3617 3611 data_len = res.array[1].nfs_resop4_u.opread.data_len;
3618 3612 count -= data_len;
3619 3613 if (base)
3620 3614 base += data_len;
3621 3615 offset += data_len;
3622 3616 if (mi->mi_io_kstats) {
3623 3617 mutex_enter(&mi->mi_lock);
3624 3618 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3625 3619 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3626 3620 mutex_exit(&mi->mi_lock);
3627 3621 }
3628 3622 lwp_stat_update(LWP_STAT_INBLK, 1);
3629 3623 is_eof = res.array[1].nfs_resop4_u.opread.eof;
3630 3624 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3631 3625
3632 3626 } while (count && !is_eof);
3633 3627
3634 3628 *residp = count;
3635 3629
3636 3630 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3637 3631
3638 3632 return (e.error);
3639 3633 }
3640 3634
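/*
 * Caller-side sketch (hypothetical helper, compiled out): nfs4read()
 * reports the bytes it could not transfer through residp, so a short
 * read at end-of-file is detected by the caller like this.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_read_fully(vnode_t *vp, caddr_t buf, offset_t off, int len, cred_t *cr)
{
	size_t resid;
	int error;

	error = nfs4read(vp, buf, off, len, &resid, cr, FALSE, NULL);
	if (error == 0 && resid != 0) {
		/* EOF was reached before len bytes were transferred. */
	}
	return (error);
}
#endif	/* NFS4_EXAMPLE_SKETCH */
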
3641 3635 /* ARGSUSED */
3642 3636 static int
3643 3637 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3644 3638 caller_context_t *ct)
3645 3639 {
3646 3640 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3647 3641 return (EIO);
3648 3642 switch (cmd) {
3649 3643 case _FIODIRECTIO:
3650 3644 return (nfs4_directio(vp, (int)arg, cr));
3651 3645 default:
3652 3646 return (ENOTTY);
3653 3647 }
3654 3648 }
3655 3649
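/*
 * For reference, the only ioctl handled above is _FIODIRECTIO, which
 * is what directio(3C) issues from user land. A hypothetical
 * application-side sketch (not part of this file):
 *
 *	#include <sys/types.h>
 *	#include <sys/fcntl.h>
 *
 *	if (directio(fd, DIRECTIO_ON) != 0)
 *		perror("directio");
 */
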
3656 3650 /* ARGSUSED */
3657 3651 int
3658 3652 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3659 3653 caller_context_t *ct)
3660 3654 {
3661 3655 int error;
3662 3656 rnode4_t *rp = VTOR4(vp);
3663 3657
3664 3658 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3665 3659 return (EIO);
3666 3660 /*
3667 3661 * If it has been specified that the return value will
3668 3662 * just be used as a hint, and we are only being asked
3669 3663 * for size, fsid or rdevid, then return the client's
3670 3664 * notion of these values without checking to make sure
3671 3665 * that the attribute cache is up to date.
3672 3666 * The whole point is to avoid an over the wire GETATTR
3673 3667 * call.
3674 3668 */
3675 3669 if (flags & ATTR_HINT) {
3676 3670 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3677 3671 mutex_enter(&rp->r_statelock);
3678 3672 if (vap->va_mask & AT_SIZE)
3679 3673 vap->va_size = rp->r_size;
3680 3674 if (vap->va_mask & AT_FSID)
3681 3675 vap->va_fsid = rp->r_attr.va_fsid;
3682 3676 if (vap->va_mask & AT_RDEV)
3683 3677 vap->va_rdev = rp->r_attr.va_rdev;
3684 3678 mutex_exit(&rp->r_statelock);
3685 3679 return (0);
3686 3680 }
3687 3681 }
3688 3682
3689 3683 /*
3690 3684 * Only need to flush pages if asking for the mtime
3691 3685 * and if there are any dirty pages or any outstanding
3692 3686 * asynchronous (write) requests for this file.
3693 3687 */
3694 3688 if (vap->va_mask & AT_MTIME) {
3695 3689 rp = VTOR4(vp);
3696 3690 if (nfs4_has_pages(vp)) {
3697 3691 mutex_enter(&rp->r_statev4_lock);
3698 3692 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3699 3693 mutex_exit(&rp->r_statev4_lock);
3700 3694 if (rp->r_flags & R4DIRTY ||
3701 3695 rp->r_awcount > 0) {
3702 3696 mutex_enter(&rp->r_statelock);
3703 3697 rp->r_gcount++;
3704 3698 mutex_exit(&rp->r_statelock);
3705 3699 error =
3706 3700 nfs4_putpage(vp, (u_offset_t)0,
3707 3701 0, 0, cr, NULL);
3708 3702 mutex_enter(&rp->r_statelock);
3709 3703 if (error && (error == ENOSPC ||
3710 3704 error == EDQUOT)) {
3711 3705 if (!rp->r_error)
3712 3706 rp->r_error = error;
3713 3707 }
3714 3708 if (--rp->r_gcount == 0)
3715 3709 cv_broadcast(&rp->r_cv);
3716 3710 mutex_exit(&rp->r_statelock);
3717 3711 }
3718 3712 } else {
3719 3713 mutex_exit(&rp->r_statev4_lock);
3720 3714 }
3721 3715 }
3722 3716 }
3723 3717 return (nfs4getattr(vp, vap, cr));
3724 3718 }
3725 3719
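/*
 * Usage sketch (hypothetical helper, compiled out): with ATTR_HINT and
 * a mask limited to AT_SIZE/AT_FSID/AT_RDEV, nfs4_getattr() above
 * answers from the client's cached attributes and never goes over the
 * wire.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_cached_size(vnode_t *vp, u_offset_t *sizep, cred_t *cr)
{
	vattr_t va;
	int error;

	va.va_mask = AT_SIZE;
	error = nfs4_getattr(vp, &va, ATTR_HINT, cr, NULL);
	if (error == 0)
		*sizep = va.va_size;
	return (error);
}
#endif	/* NFS4_EXAMPLE_SKETCH */
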
3726 3720 int
3727 3721 nfs4_compare_modes(mode_t from_server, mode_t on_client)
3728 3722 {
3729 3723 /*
3730 3724 * If the setuid and setgid bits are the only two
3731 3725 * bits cleared on the server then return 0 (OK),
3732 3726 * else return 1 (BAD).
3733 3727 */
3734 3728 on_client &= ~(S_ISUID|S_ISGID);
3735 3729 if (on_client == from_server)
3736 3730 return (0);
3737 3731 else
3738 3732 return (1);
3739 3733 }
3740 3734
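/*
 * Worked example for nfs4_compare_modes() (octal, purely
 * illustrative): nfs4_compare_modes(0755, 04755) returns 0, since the
 * server merely cleared S_ISUID, while nfs4_compare_modes(0700, 0755)
 * returns 1, a genuine mode change the caller must repair.
 */
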
3741 3735 /*ARGSUSED4*/
3742 3736 static int
3743 3737 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3744 3738 caller_context_t *ct)
3745 3739 {
3746 3740 int error;
3747 3741
3748 3742 if (vap->va_mask & AT_NOSET)
3749 3743 return (EINVAL);
3750 3744
3751 3745 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3752 3746 return (EIO);
3753 3747
3754 3748 /*
3755 3749 * Don't call secpolicy_vnode_setattr, the client cannot
3756 3750 * use its cached attributes to make security decisions
3757 3751 * as the server may be faking mode bits or mapping uid/gid.
3758 3752 * Always just let the server do the checking.
3759 3753 * If we provide the ability to remove basic privileges
3760 3754 * to setattr (e.g. basic without chmod) then we will
3761 3755 * need to add a check here before calling the server.
3762 3756 */
3763 3757 error = nfs4setattr(vp, vap, flags, cr, NULL);
3764 3758
3765 3759 if (error == 0 && (vap->va_mask & AT_SIZE)) {
3766 3760 if (vap->va_size == 0) {
3767 3761 vnevent_truncate(vp, ct);
3768 3762 } else {
3769 3763 vnevent_resize(vp, ct);
3770 3764 }
3771 3765 }
3772 3766
3773 3767 return (error);
3774 3768 }
3775 3769
3776 3770 /*
3777 3771 * To replace the "guarded" version 3 setattr, we use two types of compound
3778 3772 * setattr requests:
3779 3773 * 1. The "normal" setattr, used when the size of the file isn't being
3780 3774 * changed - { Putfh <fh>; Setattr; Getattr }.
3781 3775 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3782 3776 * with only ctime as the argument. If the server ctime differs from
3783 3777 * what is cached on the client, the verify will fail, but we would
3784 3778 * already have the ctime from the preceding getattr, so just set it
3785 3779 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3786 3780 * Setattr; Getattr }.
3787 3781 *
3788 3782 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3789 3783 * this setattr and NULL if they are not.
3790 3784 */
3791 3785 static int
3792 3786 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3793 3787 vsecattr_t *vsap)
3794 3788 {
3795 3789 COMPOUND4args_clnt args;
3796 3790 COMPOUND4res_clnt res, *resp = NULL;
3797 3791 nfs4_ga_res_t *garp = NULL;
3798 3792 int numops = 3; /* { Putfh; Setattr; Getattr } */
3799 3793 nfs_argop4 argop[5];
3800 3794 int verify_argop = -1;
3801 3795 int setattr_argop = 1;
3802 3796 nfs_resop4 *resop;
3803 3797 vattr_t va;
3804 3798 rnode4_t *rp;
3805 3799 int doqueue = 1;
3806 3800 uint_t mask = vap->va_mask;
3807 3801 mode_t omode;
3808 3802 vsecattr_t *vsp;
3809 3803 timestruc_t ctime;
3810 3804 bool_t needrecov = FALSE;
3811 3805 nfs4_recov_state_t recov_state;
3812 3806 nfs4_stateid_types_t sid_types;
3813 3807 stateid4 stateid;
3814 3808 hrtime_t t;
3815 3809 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3816 3810 servinfo4_t *svp;
3817 3811 bitmap4 supp_attrs;
3818 3812
3819 3813 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3820 3814 rp = VTOR4(vp);
3821 3815 nfs4_init_stateid_types(&sid_types);
3822 3816
3823 3817 /*
3824 3818 * Only need to flush pages if there are any pages and
3825 3819 * if the file is marked as dirty in some fashion. The
3826 3820 * file must be flushed so that we can accurately
3827 3821 * determine the size of the file and the cached data
3828 3822 * after the SETATTR returns. A file is considered to
3829 3823 * be dirty if it is either marked with R4DIRTY, has
3830 3824 * outstanding i/o's active, or is mmap'd. In this
3831 3825 * last case, we can't tell whether there are dirty
3832 3826 * pages, so we flush just to be sure.
3833 3827 */
3834 3828 if (nfs4_has_pages(vp) &&
3835 3829 ((rp->r_flags & R4DIRTY) ||
3836 3830 rp->r_count > 0 ||
3837 3831 rp->r_mapcnt > 0)) {
3838 3832 ASSERT(vp->v_type != VCHR);
3839 3833 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
3840 3834 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3841 3835 mutex_enter(&rp->r_statelock);
3842 3836 if (!rp->r_error)
3843 3837 rp->r_error = e.error;
3844 3838 mutex_exit(&rp->r_statelock);
3845 3839 }
3846 3840 }
3847 3841
3848 3842 if (mask & AT_SIZE) {
3849 3843 /*
3850 3844 * Verification setattr compound for non-deleg AT_SIZE:
3851 3845 * { Putfh; Getattr; Verify; Setattr; Getattr }
3852 3846 * Set ctime local here (outside the do_again label)
3853 3847 * so that subsequent retries (after failed VERIFY)
3854 3848 * will use ctime from GETATTR results (from failed
3855 3849 * verify compound) as VERIFY arg.
3856 3850 * If file has delegation, then VERIFY(time_metadata)
3857 3851 * is of little added value, so don't bother.
3858 3852 */
3859 3853 mutex_enter(&rp->r_statev4_lock);
3860 3854 if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3861 3855 rp->r_deleg_return_pending) {
3862 3856 numops = 5;
3863 3857 ctime = rp->r_attr.va_ctime;
3864 3858 }
3865 3859 mutex_exit(&rp->r_statev4_lock);
3866 3860 }
3867 3861
3868 3862 recov_state.rs_flags = 0;
3869 3863 recov_state.rs_num_retry_despite_err = 0;
3870 3864
3871 3865 args.ctag = TAG_SETATTR;
3872 3866 do_again:
3873 3867 recov_retry:
3874 3868 setattr_argop = numops - 2;
3875 3869
3876 3870 args.array = argop;
3877 3871 args.array_len = numops;
3878 3872
3879 3873 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3880 3874 if (e.error)
3881 3875 return (e.error);
3882 3876
3884 3878 /* putfh target fh */
3885 3879 argop[0].argop = OP_CPUTFH;
3886 3880 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3887 3881
3888 3882 if (numops == 5) {
3889 3883 /*
3890 3884 * We only care about the ctime, but need to get mtime
3891 3885 * and size for proper cache update.
3892 3886 */
3893 3887 /* getattr */
3894 3888 argop[1].argop = OP_GETATTR;
3895 3889 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3896 3890 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3897 3891
3898 3892 /* verify - set later in loop */
3899 3893 verify_argop = 2;
3900 3894 }
3901 3895
3902 3896 /* setattr */
3903 3897 svp = rp->r_server;
3904 3898 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3905 3899 supp_attrs = svp->sv_supp_attrs;
3906 3900 nfs_rw_exit(&svp->sv_lock);
3907 3901
3908 3902 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3909 3903 supp_attrs, &e.error, &sid_types);
3910 3904 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3911 3905 if (e.error) {
3912 3906 /* req time field(s) overflow - return immediately */
3913 3907 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3914 3908 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3915 3909 opsetattr.obj_attributes);
3916 3910 return (e.error);
3917 3911 }
3918 3912 omode = rp->r_attr.va_mode;
3919 3913
3920 3914 /* getattr */
3921 3915 argop[numops-1].argop = OP_GETATTR;
3922 3916 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3923 3917 /*
3924 3918 * If we are setting the ACL (indicated only by vsap != NULL), request
3925 3919 * the ACL in this getattr. The ACL returned from this getattr will be
3926 3920 * used in updating the ACL cache.
3927 3921 */
3928 3922 if (vsap != NULL)
3929 3923 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3930 3924 FATTR4_ACL_MASK;
3931 3925 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3932 3926
3933 3927 /*
3934 3928 * setattr iterates if the object size is set and the cached ctime
3935 3929 * does not match the file ctime. In that case, verify the ctime first.
3936 3930 */
3937 3931
3938 3932 do {
3939 3933 if (verify_argop != -1) {
3940 3934 /*
3941 3935 * Verify that the ctime match before doing setattr.
3942 3936 */
3943 3937 va.va_mask = AT_CTIME;
3944 3938 va.va_ctime = ctime;
3945 3939 svp = rp->r_server;
3946 3940 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3947 3941 supp_attrs = svp->sv_supp_attrs;
3948 3942 nfs_rw_exit(&svp->sv_lock);
3949 3943 e.error = nfs4args_verify(&argop[verify_argop], &va,
3950 3944 OP_VERIFY, supp_attrs);
3951 3945 if (e.error) {
3952 3946 /* req time field(s) overflow - return */
3953 3947 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3954 3948 needrecov);
3955 3949 break;
3956 3950 }
3957 3951 }
3958 3952
3959 3953 doqueue = 1;
3960 3954
3961 3955 t = gethrtime();
3962 3956
3963 3957 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3964 3958
3965 3959 /*
3966 3960 * Purge the access cache and ACL cache if changing either the
3967 3961 * owner of the file, the group owner, or the mode. These may
3968 3962 * change the access permissions of the file, so purge old
3969 3963 * information and start over again.
3970 3964 */
3971 3965 if (mask & (AT_UID | AT_GID | AT_MODE)) {
3972 3966 (void) nfs4_access_purge_rp(rp);
3973 3967 if (rp->r_secattr != NULL) {
3974 3968 mutex_enter(&rp->r_statelock);
3975 3969 vsp = rp->r_secattr;
3976 3970 rp->r_secattr = NULL;
3977 3971 mutex_exit(&rp->r_statelock);
3978 3972 if (vsp != NULL)
3979 3973 nfs4_acl_free_cache(vsp);
3980 3974 }
3981 3975 }
3982 3976
3983 3977 /*
3984 3978 * If res.array_len == numops, then everything succeeded,
3985 3979 * except for possibly the final getattr. If only the
3986 3980 * last getattr failed, give up, and don't try recovery.
3987 3981 */
3988 3982 if (res.array_len == numops) {
3989 3983 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3990 3984 needrecov);
3991 3985 if (! e.error)
3992 3986 resp = &res;
3993 3987 break;
3994 3988 }
3995 3989
3996 3990 /*
3997 3991 * if either rpc call failed or completely succeeded - done
3998 3992 */
3999 3993 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4000 3994 if (e.error) {
4001 3995 PURGE_ATTRCACHE4(vp);
4002 3996 if (!needrecov) {
4003 3997 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4004 3998 needrecov);
4005 3999 break;
4006 4000 }
4007 4001 }
4008 4002
4009 4003 /*
4010 4004 * Do proper retry for OLD_STATEID outside of the normal
4011 4005 * recovery framework.
4012 4006 */
4013 4007 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4014 4008 sid_types.cur_sid_type != SPEC_SID &&
4015 4009 sid_types.cur_sid_type != NO_SID) {
4016 4010 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4017 4011 needrecov);
4018 4012 nfs4_save_stateid(&stateid, &sid_types);
4019 4013 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4020 4014 opsetattr.obj_attributes);
4021 4015 if (verify_argop != -1) {
4022 4016 nfs4args_verify_free(&argop[verify_argop]);
4023 4017 verify_argop = -1;
4024 4018 }
4025 4019 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4026 4020 goto recov_retry;
4027 4021 }
4028 4022
4029 4023 if (needrecov) {
4030 4024 bool_t abort;
4031 4025
4032 4026 abort = nfs4_start_recovery(&e,
4033 4027 VTOMI4(vp), vp, NULL, NULL, NULL,
4034 4028 OP_SETATTR, NULL, NULL, NULL);
4035 4029 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4036 4030 needrecov);
4037 4031 /*
4038 4032 * Do not retry if we failed with OLD_STATEID using
4039 4033 * a special stateid. This is done to avoid looping
4040 4034 * with a broken server.
4041 4035 */
4042 4036 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4043 4037 (sid_types.cur_sid_type == SPEC_SID ||
4044 4038 sid_types.cur_sid_type == NO_SID))
4045 4039 abort = TRUE;
4046 4040 if (!e.error) {
4047 4041 if (res.status == NFS4ERR_BADOWNER)
4048 4042 nfs4_log_badowner(VTOMI4(vp),
4049 4043 OP_SETATTR);
4050 4044
4051 4045 e.error = geterrno4(res.status);
4052 4046 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4053 4047 }
4054 4048 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4055 4049 opsetattr.obj_attributes);
4056 4050 if (verify_argop != -1) {
4057 4051 nfs4args_verify_free(&argop[verify_argop]);
4058 4052 verify_argop = -1;
4059 4053 }
4060 4054 if (abort == FALSE) {
4061 4055 /*
4062 4056 * Need to retry all possible stateids in
4063 4057 * case the recovery error wasn't stateid
4064 4058 * related or the stateids have become
4065 4059 * stale (server reboot).
4066 4060 */
4067 4061 nfs4_init_stateid_types(&sid_types);
4068 4062 goto recov_retry;
4069 4063 }
4070 4064 return (e.error);
4071 4065 }
4072 4066
4073 4067 /*
4074 4068 * Need to call nfs4_end_op before nfs4getattr to
4075 4069 * avoid potential nfs4_start_op deadlock. See RFE
4076 4070 * 4777612. Calls to nfs4_invalidate_pages() and
4077 4071 * nfs4_purge_stale_fh() might also generate over the
4078 4072 * wire calls which may cause nfs4_start_op() deadlock.
4079 4073 */
4080 4074 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4081 4075
4082 4076 /*
4083 4077 * Check to update lease.
4084 4078 */
4085 4079 resp = &res;
4086 4080 if (res.status == NFS4_OK) {
4087 4081 break;
4088 4082 }
4089 4083
4090 4084 /*
4091 4085 * Check whether the verify failed, to decide if we should try again
4092 4086 */
4093 4087 if ((verify_argop == -1) || (res.array_len != 3)) {
4094 4088 /*
4095 4089 * can't continue...
4096 4090 */
4097 4091 if (res.status == NFS4ERR_BADOWNER)
4098 4092 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
4099 4093
4100 4094 e.error = geterrno4(res.status);
4101 4095 } else {
4102 4096 /*
4103 4097 * When the verify request fails, the client ctime is
4104 4098 * not in sync with the server. This is the same as
4105 4099 * the version 3 "not synchronized" error, and we
4106 4100 * handle it in a similar manner (XXX do we need to???).
4107 4101 * Use the ctime returned in the first getattr for
4108 4102 * the input to the next verify.
4109 4103 * If we couldn't get the attributes, then we give up
4110 4104 * because we can't complete the operation as required.
4111 4105 */
4112 4106 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
4113 4107 }
4114 4108 if (e.error) {
4115 4109 PURGE_ATTRCACHE4(vp);
4116 4110 nfs4_purge_stale_fh(e.error, vp, cr);
4117 4111 } else {
4118 4112 /*
4119 4113 * retry with a new verify value
4120 4114 */
4121 4115 ctime = garp->n4g_va.va_ctime;
4122 4116 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4123 4117 resp = NULL;
4124 4118 }
4125 4119 if (!e.error) {
4126 4120 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4127 4121 opsetattr.obj_attributes);
4128 4122 if (verify_argop != -1) {
4129 4123 nfs4args_verify_free(&argop[verify_argop]);
4130 4124 verify_argop = -1;
4131 4125 }
4132 4126 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4133 4127 goto do_again;
4134 4128 }
4135 4129 } while (!e.error);
4136 4130
4137 4131 if (e.error) {
4138 4132 /*
4139 4133 * If we are here, rfs4call has an irrecoverable error - return
4140 4134 */
4141 4135 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4142 4136 opsetattr.obj_attributes);
4143 4137 if (verify_argop != -1) {
4144 4138 nfs4args_verify_free(&argop[verify_argop]);
4145 4139 verify_argop = -1;
4146 4140 }
4147 4141 if (resp)
4148 4142 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4149 4143 return (e.error);
4150 4144 }
4151 4145
4154 4148 /*
4155 4149 * If changing the size of the file, invalidate
4156 4150 * any local cached data which is no longer part
4157 4151 * of the file. We also possibly invalidate the
4158 4152 * last page in the file. We could use
4159 4153 * pvn_vpzero(), but this would mark the page as
4160 4154 * modified and require it to be written back to
4161 4155 * the server for no particularly good reason.
4162 4156 * This way, if we access it, then we bring it
4163 4157 * back in. A read should be cheaper than a
4164 4158 * write.
4165 4159 */
4166 4160 if (mask & AT_SIZE) {
4167 4161 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4168 4162 }
4169 4163
4170 4164 /* either no error or one of the postop getattrs failed */
4171 4165
4172 4166 /*
4173 4167 * XXX Perform a simplified version of wcc checking. Instead of
4174 4168 * having another getattr to get pre-op, just purge cache if
4175 4169 * any of the ops prior to and including the getattr failed.
4176 4170 * If the getattr succeeded then update the attrcache accordingly.
4177 4171 */
4178 4172
4179 4173 garp = NULL;
4180 4174 if (res.status == NFS4_OK) {
4181 4175 /*
4182 4176 * Last getattr
4183 4177 */
4184 4178 resop = &res.array[numops - 1];
4185 4179 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4186 4180 }
4187 4181 /*
4188 4182 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4189 4183 * rather than filling it. See the function itself for details.
4190 4184 */
4191 4185 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4192 4186 if (garp != NULL) {
4193 4187 if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4194 4188 nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4195 4189 vs_ace4_destroy(&garp->n4g_vsa);
4196 4190 } else {
4197 4191 if (vsap != NULL) {
4198 4192 /*
4199 4193 * The ACL was supposed to be set and to be
4200 4194 * returned in the last getattr of this
4201 4195 * compound, but for some reason the getattr
4202 4196 * result doesn't contain the ACL. In this
4203 4197 * case, purge the ACL cache.
4204 4198 */
4205 4199 if (rp->r_secattr != NULL) {
4206 4200 mutex_enter(&rp->r_statelock);
4207 4201 vsp = rp->r_secattr;
4208 4202 rp->r_secattr = NULL;
4209 4203 mutex_exit(&rp->r_statelock);
4210 4204 if (vsp != NULL)
4211 4205 nfs4_acl_free_cache(vsp);
4212 4206 }
4213 4207 }
4214 4208 }
4215 4209 }
4216 4210
4217 4211 if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4218 4212 /*
4219 4213 * Set the size, rather than relying on getting it updated
4220 4214 * via a GETATTR. With delegations the client tries to
4221 4215 * suppress GETATTR calls.
4222 4216 */
4223 4217 mutex_enter(&rp->r_statelock);
4224 4218 rp->r_size = vap->va_size;
4225 4219 mutex_exit(&rp->r_statelock);
4226 4220 }
4227 4221
4228 4222 /*
4229 4223 * Can free up request args and res
4230 4224 */
4231 4225 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4232 4226 opsetattr.obj_attributes);
4233 4227 if (verify_argop != -1) {
4234 4228 nfs4args_verify_free(&argop[verify_argop]);
4235 4229 verify_argop = -1;
4236 4230 }
4237 4231 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4238 4232
4239 4233 /*
4240 4234 * Some servers will change the mode to clear the setuid
4241 4235 * and setgid bits when changing the uid or gid. The
4242 4236 * client needs to compensate appropriately.
4243 4237 */
4244 4238 if (mask & (AT_UID | AT_GID)) {
4245 4239 int terror, do_setattr;
4246 4240
4247 4241 do_setattr = 0;
4248 4242 va.va_mask = AT_MODE;
4249 4243 terror = nfs4getattr(vp, &va, cr);
4250 4244 if (!terror &&
4251 4245 (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4252 4246 (!(mask & AT_MODE) && va.va_mode != omode))) {
4253 4247 va.va_mask = AT_MODE;
4254 4248 if (mask & AT_MODE) {
4255 4249 /*
4256 4250 * We asked the mode to be changed and what
4257 4251 * we just got from the server in getattr is
4258 4252 * not what we wanted it to be, so set it now.
4259 4253 */
4260 4254 va.va_mode = vap->va_mode;
4261 4255 do_setattr = 1;
4262 4256 } else {
4263 4257 /*
4264 4258 * We did not ask for the mode to be changed.
4265 4259 * Check to see that the server just cleared
4266 4260 * S_ISUID and S_ISGID from it. If not, then
4267 4261 * set mode to omode with setuid/setgid cleared.
4268 4262 */
4269 4263 if (nfs4_compare_modes(va.va_mode, omode)) {
4270 4264 omode &= ~(S_ISUID|S_ISGID);
4271 4265 va.va_mode = omode;
4272 4266 do_setattr = 1;
4273 4267 }
4274 4268 }
4275 4269
4276 4270 if (do_setattr)
4277 4271 (void) nfs4setattr(vp, &va, 0, cr, NULL);
4278 4272 }
4279 4273 }
4280 4274
4281 4275 return (e.error);
4282 4276 }
4283 4277
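/*
 * Usage sketch (hypothetical helper, compiled out): a size-only
 * setattr such as a truncate, which takes the five-op
 * { Putfh; Getattr; Verify; Setattr; Getattr } path above whenever
 * the file carries no write delegation.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static int
nfs4_truncate_sketch(vnode_t *vp, u_offset_t len, cred_t *cr)
{
	vattr_t va;

	va.va_mask = AT_SIZE;
	va.va_size = len;
	return (nfs4setattr(vp, &va, 0, cr, NULL));
}
#endif	/* NFS4_EXAMPLE_SKETCH */
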
4284 4278 /* ARGSUSED */
4285 4279 static int
4286 4280 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
4287 4281 {
4288 4282 COMPOUND4args_clnt args;
4289 4283 COMPOUND4res_clnt res;
4290 4284 int doqueue;
4291 4285 uint32_t acc, resacc, argacc;
4292 4286 rnode4_t *rp;
4293 4287 cred_t *cred, *ncr, *ncrfree = NULL;
4294 4288 nfs4_access_type_t cacc;
4295 4289 int num_ops;
4296 4290 nfs_argop4 argop[3];
4297 4291 nfs_resop4 *resop;
4298 4292 bool_t needrecov = FALSE, do_getattr;
4299 4293 nfs4_recov_state_t recov_state;
4300 4294 int rpc_error;
4301 4295 hrtime_t t;
4302 4296 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4303 4297 mntinfo4_t *mi = VTOMI4(vp);
4304 4298
4305 4299 if (nfs_zone() != mi->mi_zone)
4306 4300 return (EIO);
4307 4301
4308 4302 acc = 0;
4309 4303 if (mode & VREAD)
4310 4304 acc |= ACCESS4_READ;
4311 4305 if (mode & VWRITE) {
4312 4306 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
4313 4307 return (EROFS);
4314 4308 if (vp->v_type == VDIR)
4315 4309 acc |= ACCESS4_DELETE;
4316 4310 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
4317 4311 }
4318 4312 if (mode & VEXEC) {
4319 4313 if (vp->v_type == VDIR)
4320 4314 acc |= ACCESS4_LOOKUP;
4321 4315 else
4322 4316 acc |= ACCESS4_EXECUTE;
4323 4317 }
4324 4318
4325 4319 if (VTOR4(vp)->r_acache != NULL) {
4326 4320 e.error = nfs4_validate_caches(vp, cr);
4327 4321 if (e.error)
4328 4322 return (e.error);
4329 4323 }
4330 4324
4331 4325 rp = VTOR4(vp);
4332 4326 if (vp->v_type == VDIR)
4333 4327 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
4334 4328 ACCESS4_EXTEND | ACCESS4_LOOKUP;
4335 4329 else
4336 4330 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
4337 4331 ACCESS4_EXECUTE;
4338 4332 recov_state.rs_flags = 0;
4339 4333 recov_state.rs_num_retry_despite_err = 0;
4340 4334
4341 4335 cred = cr;
4342 4336 /*
4343 4337 * ncr and ncrfree both initially
4344 4338 * point to the memory area returned
4345 4339 * by crnetadjust();
4346 4340 * if ncrfree is still not NULL when we exit,
4347 4341 * it must be released
4348 4342 */
4349 4343 ncr = crnetadjust(cred);
4350 4344 ncrfree = ncr;
4351 4345
4352 4346 tryagain:
4353 4347 cacc = nfs4_access_check(rp, acc, cred);
4354 4348 if (cacc == NFS4_ACCESS_ALLOWED) {
4355 4349 if (ncrfree != NULL)
4356 4350 crfree(ncrfree);
4357 4351 return (0);
4358 4352 }
4359 4353 if (cacc == NFS4_ACCESS_DENIED) {
4360 4354 /*
4361 4355 * If the cred can be adjusted, try again
4362 4356 * with the new cred.
4363 4357 */
4364 4358 if (ncr != NULL) {
4365 4359 cred = ncr;
4366 4360 ncr = NULL;
4367 4361 goto tryagain;
4368 4362 }
4369 4363 if (ncrfree != NULL)
4370 4364 crfree(ncrfree);
4371 4365 return (EACCES);
4372 4366 }
4373 4367
4374 4368 recov_retry:
4375 4369 /*
4376 4370 * Don't bother taking r_statev4_lock here. r_deleg_type could
4377 4371 * change as soon as lock is released. Since it is an int,
4378 4372 * there is no atomicity issue.
4379 4373 */
4380 4374 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
4381 4375 num_ops = do_getattr ? 3 : 2;
4382 4376
4383 4377 args.ctag = TAG_ACCESS;
4384 4378
4385 4379 args.array_len = num_ops;
4386 4380 args.array = argop;
4387 4381
4388 4382 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
4389 4383 &recov_state, NULL)) {
4390 4384 if (ncrfree != NULL)
4391 4385 crfree(ncrfree);
4392 4386 return (e.error);
4393 4387 }
4394 4388
4395 4389 /* putfh target fh */
4396 4390 argop[0].argop = OP_CPUTFH;
4397 4391 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4398 4392
4399 4393 /* access */
4400 4394 argop[1].argop = OP_ACCESS;
4401 4395 argop[1].nfs_argop4_u.opaccess.access = argacc;
4402 4396
4403 4397 /* getattr */
4404 4398 if (do_getattr) {
4405 4399 argop[2].argop = OP_GETATTR;
4406 4400 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4407 4401 argop[2].nfs_argop4_u.opgetattr.mi = mi;
4408 4402 }
4409 4403
4410 4404 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4411 4405 "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
4412 4406 rnode4info(VTOR4(vp))));
4413 4407
4414 4408 doqueue = 1;
4415 4409 t = gethrtime();
4416 4410 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
4417 4411 rpc_error = e.error;
4418 4412
4419 4413 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4420 4414 if (needrecov) {
4421 4415 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4422 4416 "nfs4_access: initiating recovery\n"));
4423 4417
4424 4418 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4425 4419 NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
4426 4420 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
4427 4421 &recov_state, needrecov);
4428 4422 if (!e.error)
4429 4423 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4430 4424 goto recov_retry;
4431 4425 }
4432 4426 }
4433 4427 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);
4434 4428
4435 4429 if (e.error)
4436 4430 goto out;
4437 4431
4438 4432 if (res.status) {
4439 4433 e.error = geterrno4(res.status);
4440 4434 /*
4441 4435 * This might generate over the wire calls through
4442 4436 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4443 4437 * here to avoid a deadlock.
4444 4438 */
4445 4439 nfs4_purge_stale_fh(e.error, vp, cr);
4446 4440 goto out;
4447 4441 }
4448 4442 resop = &res.array[1]; /* access res */
4449 4443
4450 4444 resacc = resop->nfs_resop4_u.opaccess.access;
4451 4445
4452 4446 if (do_getattr) {
4453 4447 resop++; /* getattr res */
4454 4448 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
4455 4449 t, cr, FALSE, NULL);
4456 4450 }
4457 4451
4458 4452 if (!e.error) {
4459 4453 nfs4_access_cache(rp, argacc, resacc, cred);
4460 4454 /*
4461 4455 * we just cached results with cred; if cred is the
4462 4456 * adjusted credentials from crnetadjust, we do not want
4463 4457 * to release them before exiting: hence setting ncrfree
4464 4458 * to NULL
4465 4459 */
4466 4460 if (cred != cr)
4467 4461 ncrfree = NULL;
4468 4462 /* XXX check the supported bits too? */
4469 4463 if ((acc & resacc) != acc) {
4470 4464 /*
4471 4465 * The following code implements the semantic
4472 4466 * that a setuid root program has *at least* the
4473 4467 * permissions of the user that is running the
4474 4468 * program. See rfs3call() for more portions
4475 4469 * of the implementation of this functionality.
4476 4470 */
4477 4471 /* XXX-LP */
4478 4472 if (ncr != NULL) {
4479 4473 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4480 4474 cred = ncr;
4481 4475 ncr = NULL;
4482 4476 goto tryagain;
4483 4477 }
4484 4478 e.error = EACCES;
4485 4479 }
4486 4480 }
4487 4481
4488 4482 out:
4489 4483 if (!rpc_error)
4490 4484 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4491 4485
4492 4486 if (ncrfree != NULL)
4493 4487 crfree(ncrfree);
4494 4488
4495 4489 return (e.error);
4496 4490 }
4497 4491
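/*
 * Sketch only (hypothetical helper, compiled out): the VREAD/VWRITE/
 * VEXEC to ACCESS4_* mapping performed at the top of nfs4_access()
 * above, minus the read-only file system check done there.
 */
#ifdef NFS4_EXAMPLE_SKETCH
static uint32_t
nfs4_mode_to_access4(vnode_t *vp, int mode)
{
	uint32_t acc = 0;

	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		/* removing directory entries requires DELETE */
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC)
		acc |= (vp->v_type == VDIR) ?
		    ACCESS4_LOOKUP : ACCESS4_EXECUTE;
	return (acc);
}
#endif	/* NFS4_EXAMPLE_SKETCH */
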
4498 4492 /* ARGSUSED */
4499 4493 static int
4500 4494 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4501 4495 {
4502 4496 COMPOUND4args_clnt args;
4503 4497 COMPOUND4res_clnt res;
4504 4498 int doqueue;
4505 4499 rnode4_t *rp;
4506 4500 nfs_argop4 argop[3];
4507 4501 nfs_resop4 *resop;
4508 4502 READLINK4res *lr_res;
4509 4503 nfs4_ga_res_t *garp;
4510 4504 uint_t len;
4511 4505 char *linkdata;
4512 4506 bool_t needrecov = FALSE;
4513 4507 nfs4_recov_state_t recov_state;
4514 4508 hrtime_t t;
4515 4509 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4516 4510
4517 4511 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4518 4512 return (EIO);
4519 4513 /*
4520 4514 * Can't readlink anything other than a symbolic link.
4521 4515 */
4522 4516 if (vp->v_type != VLNK)
4523 4517 return (EINVAL);
4524 4518
4525 4519 rp = VTOR4(vp);
4526 4520 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4527 4521 e.error = nfs4_validate_caches(vp, cr);
4528 4522 if (e.error)
4529 4523 return (e.error);
4530 4524 mutex_enter(&rp->r_statelock);
4531 4525 if (rp->r_symlink.contents != NULL) {
4532 4526 e.error = uiomove(rp->r_symlink.contents,
4533 4527 rp->r_symlink.len, UIO_READ, uiop);
4534 4528 mutex_exit(&rp->r_statelock);
4535 4529 return (e.error);
4536 4530 }
4537 4531 mutex_exit(&rp->r_statelock);
4538 4532 }
4539 4533 recov_state.rs_flags = 0;
4540 4534 recov_state.rs_num_retry_despite_err = 0;
4541 4535
4542 4536 recov_retry:
4543 4537 args.array_len = 3;
4544 4538 args.array = argop;
4545 4539 args.ctag = TAG_READLINK;
4546 4540
4547 4541 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4548 4542 if (e.error) {
4549 4543 return (e.error);
4550 4544 }
4551 4545
4552 4546 /* 0. putfh symlink fh */
4553 4547 argop[0].argop = OP_CPUTFH;
4554 4548 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4555 4549
4556 4550 /* 1. readlink */
4557 4551 argop[1].argop = OP_READLINK;
4558 4552
4559 4553 /* 2. getattr */
4560 4554 argop[2].argop = OP_GETATTR;
4561 4555 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4562 4556 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4563 4557
4564 4558 doqueue = 1;
4565 4559
4566 4560 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4567 4561 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4568 4562 rnode4info(VTOR4(vp))));
4569 4563
4570 4564 t = gethrtime();
4571 4565
4572 4566 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4573 4567
4574 4568 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4575 4569 if (needrecov) {
4576 4570 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4577 4571 "nfs4_readlink: initiating recovery\n"));
4578 4572
4579 4573 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4580 4574 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4581 4575 if (!e.error)
4582 4576 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4583 4577
4584 4578 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4585 4579 needrecov);
4586 4580 goto recov_retry;
4587 4581 }
4588 4582 }
4589 4583
4590 4584 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4591 4585
4592 4586 if (e.error)
4593 4587 return (e.error);
4594 4588
4595 4589 /*
4596 4590 * There is a path in the code below which calls
4597 4591 * nfs4_purge_stale_fh(), which may generate otw calls through
4598 4592 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4599 4593 * here to avoid nfs4_start_op() deadlock.
4600 4594 */
4601 4595
4602 4596 if (res.status && (res.array_len < args.array_len)) {
4603 4597 /*
4604 4598 * either Putfh or Readlink failed
4605 4599 */
4606 4600 e.error = geterrno4(res.status);
4607 4601 nfs4_purge_stale_fh(e.error, vp, cr);
4608 4602 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4609 4603 return (e.error);
4610 4604 }
4611 4605
4612 4606 resop = &res.array[1]; /* readlink res */
4613 4607 lr_res = &resop->nfs_resop4_u.opreadlink;
4614 4608
4615 4609 /*
4616 4610 * treat symlink names as data
4617 4611 */
4618 4612 linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
4619 4613 if (linkdata != NULL) {
4620 4614 int uio_len = len - 1;
4621 4615 /* len includes null byte, which we won't uiomove */
4622 4616 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4623 4617 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4624 4618 mutex_enter(&rp->r_statelock);
4625 4619 if (rp->r_symlink.contents == NULL) {
4626 4620 rp->r_symlink.contents = linkdata;
4627 4621 rp->r_symlink.len = uio_len;
4628 4622 rp->r_symlink.size = len;
4629 4623 mutex_exit(&rp->r_statelock);
4630 4624 } else {
4631 4625 mutex_exit(&rp->r_statelock);
4632 4626 kmem_free(linkdata, len);
4633 4627 }
4634 4628 } else {
4635 4629 kmem_free(linkdata, len);
4636 4630 }
4637 4631 }
4638 4632 if (res.status == NFS4_OK) {
4639 4633 resop++; /* getattr res */
4640 4634 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4641 4635 }
4642 4636 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4643 4637
4644 4638 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4645 4639
4646 4640 /*
4647 4641 * The over the wire error for attempting to readlink something
4648 4642 * other than a symbolic link is ENXIO. However, we need to
4649 4643 * return EINVAL instead of ENXIO, so we map it here.
4650 4644 */
4651 4645 return (e.error == ENXIO ? EINVAL : e.error);
4652 4646 }
4653 4647
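/*
 * Note on the length bookkeeping in nfs4_readlink() above: the len
 * returned by utf8_to_str() counts the terminating NUL, so only
 * uio_len = len - 1 bytes are copied out to the caller, while the
 * cached entry remembers both lengths (r_symlink.len = uio_len,
 * r_symlink.size = len) so the full allocation size is available when
 * the buffer is eventually kmem_free()d.
 */
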
4654 4648 /*
4655 4649 * Flush local dirty pages to stable storage on the server.
4656 4650 *
4657 4651 * If FNODSYNC is specified, then there is nothing to do because
4658 4652 * metadata changes are not cached on the client before being
4659 4653 * sent to the server.
4660 4654 */
4661 4655 /* ARGSUSED */
4662 4656 static int
4663 4657 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4664 4658 {
4665 4659 int error;
4666 4660
4667 4661 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4668 4662 return (0);
4669 4663 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4670 4664 return (EIO);
4671 4665 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4672 4666 if (!error)
4673 4667 error = VTOR4(vp)->r_error;
4674 4668 return (error);
4675 4669 }
4676 4670
4677 4671 /*
4678 4672 * Weirdness: if the file was removed or the target of a rename
4679 4673 * operation while it was open, it got renamed instead. Here we
4680 4674 * remove the renamed file.
4681 4675 */
4682 4676 /* ARGSUSED */
4683 4677 void
4684 4678 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4685 4679 {
4686 4680 rnode4_t *rp;
4687 4681
4688 4682 ASSERT(vp != DNLC_NO_VNODE);
4689 4683
4690 4684 rp = VTOR4(vp);
4691 4685
4692 4686 if (IS_SHADOW(vp, rp)) {
4693 4687 sv_inactive(vp);
4694 4688 return;
4695 4689 }
4696 4690
4697 4691 /*
4698 4692 * If this is coming from the wrong zone, we let someone in the right
4699 4693 * zone take care of it asynchronously. We can get here due to
4700 4694 * VN_RELE() being called from pageout() or fsflush(). This call may
4701 4695 * potentially turn into an expensive no-op if, for instance, v_count
4702 4696 * gets incremented in the meantime, but it's still correct.
4703 4697 */
4704 4698 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4705 4699 nfs4_async_inactive(vp, cr);
4706 4700 return;
4707 4701 }
4708 4702
4709 4703 /*
4710 4704 * Some of the cleanup steps might require over-the-wire
4711 4705 * operations. Since VOP_INACTIVE can get called as a result of
4712 4706 * other over-the-wire operations (e.g., an attribute cache update
4713 4707 * can lead to a DNLC purge), doing those steps now would lead to a
4714 4708 * nested call to the recovery framework, which can deadlock. So
4715 4709 * do any over-the-wire cleanups asynchronously, in a separate
4716 4710 * thread.
4717 4711 */
4718 4712
4719 4713 mutex_enter(&rp->r_os_lock);
4720 4714 mutex_enter(&rp->r_statelock);
4721 4715 mutex_enter(&rp->r_statev4_lock);
4722 4716
4723 4717 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4724 4718 mutex_exit(&rp->r_statev4_lock);
4725 4719 mutex_exit(&rp->r_statelock);
4726 4720 mutex_exit(&rp->r_os_lock);
4727 4721 nfs4_async_inactive(vp, cr);
4728 4722 return;
4729 4723 }
4730 4724
4731 4725 if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4732 4726 rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4733 4727 mutex_exit(&rp->r_statev4_lock);
4734 4728 mutex_exit(&rp->r_statelock);
4735 4729 mutex_exit(&rp->r_os_lock);
4736 4730 nfs4_async_inactive(vp, cr);
4737 4731 return;
4738 4732 }
4739 4733
4740 4734 if (rp->r_unldvp != NULL) {
4741 4735 mutex_exit(&rp->r_statev4_lock);
4742 4736 mutex_exit(&rp->r_statelock);
4743 4737 mutex_exit(&rp->r_os_lock);
4744 4738 nfs4_async_inactive(vp, cr);
4745 4739 return;
4746 4740 }
4747 4741 mutex_exit(&rp->r_statev4_lock);
4748 4742 mutex_exit(&rp->r_statelock);
4749 4743 mutex_exit(&rp->r_os_lock);
4750 4744
4751 4745 rp4_addfree(rp, cr);
4752 4746 }
4753 4747
4754 4748 /*
4755 4749 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4756 4750 * various bits of state. The caller must not refer to vp after this call.
4757 4751 */
4758 4752
4759 4753 void
4760 4754 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4761 4755 {
4762 4756 rnode4_t *rp = VTOR4(vp);
4763 4757 nfs4_recov_state_t recov_state;
4764 4758 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4765 4759 vnode_t *unldvp;
4766 4760 char *unlname;
4767 4761 cred_t *unlcred;
4768 4762 COMPOUND4args_clnt args;
4769 4763 COMPOUND4res_clnt res, *resp;
4770 4764 nfs_argop4 argop[2];
4771 4765 int doqueue;
4772 4766 #ifdef DEBUG
4773 4767 char *name;
4774 4768 #endif
4775 4769
4776 4770 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4777 4771 ASSERT(!IS_SHADOW(vp, rp));
4778 4772
4779 4773 #ifdef DEBUG
4780 4774 name = fn_name(VTOSV(vp)->sv_name);
4781 4775 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4782 4776 "release vnode %s", name));
4783 4777 kmem_free(name, MAXNAMELEN);
4784 4778 #endif
4785 4779
4786 4780 if (vp->v_type == VREG) {
4787 4781 bool_t recov_failed = FALSE;
4788 4782
4789 4783 e.error = nfs4close_all(vp, cr);
4790 4784 if (e.error) {
4791 4785 /* Check to see if recovery failed */
4792 4786 mutex_enter(&(VTOMI4(vp)->mi_lock));
4793 4787 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4794 4788 recov_failed = TRUE;
4795 4789 mutex_exit(&(VTOMI4(vp)->mi_lock));
4796 4790 if (!recov_failed) {
4797 4791 mutex_enter(&rp->r_statelock);
4798 4792 if (rp->r_flags & R4RECOVERR)
4799 4793 recov_failed = TRUE;
4800 4794 mutex_exit(&rp->r_statelock);
4801 4795 }
4802 4796 if (recov_failed) {
4803 4797 NFS4_DEBUG(nfs4_client_recov_debug,
4804 4798 (CE_NOTE, "nfs4_inactive_otw: "
4805 4799 "close failed (recovery failure)"));
4806 4800 }
4807 4801 }
4808 4802 }
4809 4803
4810 4804 redo:
4811 4805 if (rp->r_unldvp == NULL) {
4812 4806 rp4_addfree(rp, cr);
4813 4807 return;
4814 4808 }
4815 4809
4816 4810 /*
4817 4811 * Save the vnode pointer for the directory where the
4818 4812 * unlinked-open file got renamed, then set it to NULL
4819 4813 * to prevent another thread from getting here before
4820 4814 * we're done with the remove. While we have the
4821 4815 * statelock, make local copies of the pertinent rnode
4822 4816 * fields. If we weren't to do this in an atomic way,
4823 4817 * the unl* fields could become inconsistent with respect
4824 4818 * to each other due to a race condition between this
4825 4819 * code and nfs_remove(). See bug report 1034328.
4826 4820 */
4827 4821 mutex_enter(&rp->r_statelock);
4828 4822 if (rp->r_unldvp == NULL) {
4829 4823 mutex_exit(&rp->r_statelock);
4830 4824 rp4_addfree(rp, cr);
4831 4825 return;
4832 4826 }
4833 4827
4834 4828 unldvp = rp->r_unldvp;
4835 4829 rp->r_unldvp = NULL;
4836 4830 unlname = rp->r_unlname;
4837 4831 rp->r_unlname = NULL;
4838 4832 unlcred = rp->r_unlcred;
4839 4833 rp->r_unlcred = NULL;
4840 4834 mutex_exit(&rp->r_statelock);
4841 4835
4842 4836 /*
4843 4837 * If there are any dirty pages left, then flush
4844 4838 * them. This is unfortunate because they just
4845 4839 * may get thrown away during the remove operation,
4846 4840 * but we have to do this for correctness.
4847 4841 */
4848 4842 if (nfs4_has_pages(vp) &&
4849 4843 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4850 4844 ASSERT(vp->v_type != VCHR);
4851 4845 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4852 4846 if (e.error) {
4853 4847 mutex_enter(&rp->r_statelock);
4854 4848 if (!rp->r_error)
4855 4849 rp->r_error = e.error;
4856 4850 mutex_exit(&rp->r_statelock);
4857 4851 }
4858 4852 }
4859 4853
4860 4854 recov_state.rs_flags = 0;
4861 4855 recov_state.rs_num_retry_despite_err = 0;
4862 4856 recov_retry_remove:
4863 4857 /*
4864 4858 * Do the remove operation on the renamed file
4865 4859 */
4866 4860 args.ctag = TAG_INACTIVE;
4867 4861
4868 4862 /*
4869 4863 * Remove ops: putfh dir; remove
4870 4864 */
4871 4865 args.array_len = 2;
4872 4866 args.array = argop;
4873 4867
4874 4868 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4875 4869 if (e.error) {
4876 4870 kmem_free(unlname, MAXNAMELEN);
4877 4871 crfree(unlcred);
4878 4872 VN_RELE(unldvp);
4879 4873 /*
4880 4874 * Try again; this time around r_unldvp will be NULL, so we'll
4881 4875 * just call rp4_addfree() and return.
4882 4876 */
4883 4877 goto redo;
4884 4878 }
4885 4879
4886 4880 /* putfh directory */
4887 4881 argop[0].argop = OP_CPUTFH;
4888 4882 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4889 4883
4890 4884 /* remove */
4891 4885 argop[1].argop = OP_CREMOVE;
4892 4886 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4893 4887
4894 4888 doqueue = 1;
4895 4889 resp = &res;
4896 4890
4897 4891 #if 0 /* notyet */
4898 4892 /*
4899 4893 * Can't do this yet. We may be being called from
4900 4894 * dnlc_purge_XXX while that routine is holding a
4901 4895 * mutex lock to the nc_rele list. The calls to
4902 4896 * nfs3_cache_wcc_data may result in calls to
4903 4897 * dnlc_purge_XXX. This will result in a deadlock.
4904 4898 */
4905 4899 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4906 4900 if (e.error) {
4907 4901 PURGE_ATTRCACHE4(unldvp);
4908 4902 resp = NULL;
4909 4903 } else if (res.status) {
4910 4904 e.error = geterrno4(res.status);
4911 4905 PURGE_ATTRCACHE4(unldvp);
4912 4906 /*
4913 4907 * This code is inactive right now
4914 4908 * but if made active there should
4915 4909 		 * be an nfs4_end_op() call before
4916 4910 * nfs4_purge_stale_fh to avoid start_op()
4917 4911 * deadlock. See BugId: 4948726
4918 4912 */
4919 4913 		nfs4_purge_stale_fh(e.error, unldvp, cr);
4920 4914 } else {
4921 4915 nfs_resop4 *resop;
4922 4916 REMOVE4res *rm_res;
4923 4917
4924 4918 resop = &res.array[1];
4925 4919 rm_res = &resop->nfs_resop4_u.opremove;
4926 4920 /*
4927 4921 * Update directory cache attribute,
4928 4922 * readdir and dnlc caches.
4929 4923 */
4930 4924 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4931 4925 }
4932 4926 #else
4933 4927 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4934 4928
4935 4929 PURGE_ATTRCACHE4(unldvp);
4936 4930 #endif
4937 4931
4938 4932 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4939 4933 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4940 4934 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4941 4935 if (!e.error)
4942 4936 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4943 4937 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4944 4938 &recov_state, TRUE);
4945 4939 goto recov_retry_remove;
4946 4940 }
4947 4941 }
4948 4942 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4949 4943
4950 4944 /*
4951 4945 * Release stuff held for the remove
4952 4946 */
4953 4947 VN_RELE(unldvp);
4954 4948 if (!e.error && resp)
4955 4949 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4956 4950
4957 4951 kmem_free(unlname, MAXNAMELEN);
4958 4952 crfree(unlcred);
4959 4953 goto redo;
4960 4954 }
4961 4955
4962 4956 /*
4963 4957 * Remote file system operations having to do with directory manipulation.
4964 4958 */
4965 4959 /* ARGSUSED3 */
4966 4960 int
4967 4961 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4968 4962 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4969 4963 int *direntflags, pathname_t *realpnp)
4970 4964 {
4971 4965 int error;
4972 4966 vnode_t *vp, *avp = NULL;
4973 4967 rnode4_t *drp;
4974 4968
4975 4969 *vpp = NULL;
4976 4970 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4977 4971 return (EPERM);
4978 4972 /*
4979 4973 * if LOOKUP_XATTR, must replace dvp (object) with
4980 4974 * object's attrdir before continuing with lookup
4981 4975 */
4982 4976 if (flags & LOOKUP_XATTR) {
4983 4977 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4984 4978 if (error)
4985 4979 return (error);
4986 4980
4987 4981 dvp = avp;
4988 4982
4989 4983 /*
4990 4984 * If lookup is for "", just return dvp now. The attrdir
4991 4985 * has already been activated (from nfs4lookup_xattr), and
4992 4986 * the caller will RELE the original dvp -- not
4993 4987 * the attrdir. So, set vpp and return.
4994 4988 * Currently, when the LOOKUP_XATTR flag is
4995 4989 * passed to VOP_LOOKUP, the name is always empty, and
4996 4990 * shortcircuiting here avoids 3 unneeded lock/unlock
4997 4991 * pairs.
4998 4992 *
4999 4993 * If a non-empty name was provided, then it is the
5000 4994 * attribute name, and it will be looked up below.
5001 4995 */
5002 4996 if (*nm == '\0') {
5003 4997 *vpp = dvp;
5004 4998 return (0);
5005 4999 }
5006 5000
5007 5001 /*
5008 5002 * The vfs layer never sends a name when asking for the
5009 5003 * attrdir, so we should never get here (unless of course
5010 5004 	 * a name is passed at some time in the future -- at which time
5011 5005 * we'll blow up here).
5012 5006 */
5013 5007 ASSERT(0);
5014 5008 }
5015 5009
5016 5010 drp = VTOR4(dvp);
5017 5011 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5018 5012 return (EINTR);
5019 5013
5020 5014 error = nfs4lookup(dvp, nm, vpp, cr, 0);
5021 5015 nfs_rw_exit(&drp->r_rwlock);
5022 5016
5023 5017 /*
5024 5018 * If vnode is a device, create special vnode.
5025 5019 */
5026 5020 if (!error && ISVDEV((*vpp)->v_type)) {
5027 5021 vp = *vpp;
5028 5022 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
5029 5023 VN_RELE(vp);
5030 5024 }
5031 5025
5032 5026 return (error);
5033 5027 }
5034 5028
5035 5029 /* ARGSUSED */
5036 5030 static int
5037 5031 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5038 5032 {
5039 5033 int error;
5040 5034 rnode4_t *drp;
5041 5035 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5042 5036 mntinfo4_t *mi;
5043 5037
5044 5038 mi = VTOMI4(dvp);
5045 5039 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5046 5040 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5047 5041 return (EINVAL);
5048 5042
5049 5043 drp = VTOR4(dvp);
5050 5044 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5051 5045 return (EINTR);
5052 5046
5053 5047 mutex_enter(&drp->r_statelock);
5054 5048 /*
5055 5049 * If the server doesn't support xattrs just return EINVAL
5056 5050 */
5057 5051 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5058 5052 mutex_exit(&drp->r_statelock);
5059 5053 nfs_rw_exit(&drp->r_rwlock);
5060 5054 return (EINVAL);
5061 5055 }
5062 5056
5063 5057 /*
5064 5058 * If there is a cached xattr directory entry,
5065 5059 * use it as long as the attributes are valid. If the
5066 5060 * attributes are not valid, take the simple approach and
5067 5061 * free the cached value and re-fetch a new value.
5068 5062 *
5069 5063 	 * We don't cache negative entries for now; if we did, we
5070 5064 	 * would need to check if the file has changed on every
5071 5065 	 * lookup. But xattrs don't exist very often, and failing
5072 5066 	 * an openattr is not much more expensive than an NVERIFY or GETATTR,
5073 5067 	 * so do an openattr over the wire for now.
5074 5068 */
5075 5069 if (drp->r_xattr_dir != NULL) {
5076 5070 if (ATTRCACHE4_VALID(dvp)) {
5077 5071 VN_HOLD(drp->r_xattr_dir);
5078 5072 *vpp = drp->r_xattr_dir;
5079 5073 mutex_exit(&drp->r_statelock);
5080 5074 nfs_rw_exit(&drp->r_rwlock);
5081 5075 return (0);
5082 5076 }
5083 5077 VN_RELE(drp->r_xattr_dir);
5084 5078 drp->r_xattr_dir = NULL;
5085 5079 }
5086 5080 mutex_exit(&drp->r_statelock);
5087 5081
5088 5082 error = nfs4openattr(dvp, vpp, cflag, cr);
5089 5083
5090 5084 nfs_rw_exit(&drp->r_rwlock);
5091 5085
5092 5086 return (error);
5093 5087 }
5094 5088
5095 5089 static int
5096 5090 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
5097 5091 {
5098 5092 int error;
5099 5093 rnode4_t *drp;
5100 5094
5101 5095 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5102 5096
5103 5097 /*
5104 5098 * If lookup is for "", just return dvp. Don't need
5105 5099 * to send it over the wire, look it up in the dnlc,
5106 5100 * or perform any access checks.
5107 5101 */
5108 5102 if (*nm == '\0') {
5109 5103 VN_HOLD(dvp);
5110 5104 *vpp = dvp;
5111 5105 return (0);
5112 5106 }
5113 5107
5114 5108 /*
5115 5109 * Can't do lookups in non-directories.
5116 5110 */
5117 5111 if (dvp->v_type != VDIR)
5118 5112 return (ENOTDIR);
5119 5113
5120 5114 /*
5121 5115 * If lookup is for ".", just return dvp. Don't need
5122 5116 * to send it over the wire or look it up in the dnlc,
5123 5117 * just need to check access.
5124 5118 */
5125 5119 if (nm[0] == '.' && nm[1] == '\0') {
5126 5120 error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5127 5121 if (error)
5128 5122 return (error);
5129 5123 VN_HOLD(dvp);
5130 5124 *vpp = dvp;
5131 5125 return (0);
5132 5126 }
5133 5127
5134 5128 drp = VTOR4(dvp);
5135 5129 if (!(drp->r_flags & R4LOOKUP)) {
5136 5130 mutex_enter(&drp->r_statelock);
5137 5131 drp->r_flags |= R4LOOKUP;
5138 5132 mutex_exit(&drp->r_statelock);
5139 5133 }
5140 5134
5141 5135 *vpp = NULL;
5142 5136 /*
5143 5137 	 * Lookup this name in the DNLC.  If there is no
5144 5138 	 * entry, look it up over the wire.
5145 5139 */
5146 5140 if (!skipdnlc)
5147 5141 *vpp = dnlc_lookup(dvp, nm);
5148 5142 if (*vpp == NULL) {
5149 5143 /*
5150 5144 * We need to go over the wire to lookup the name.
5151 5145 */
5152 5146 return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
5153 5147 }
5154 5148
5155 5149 /*
5156 5150 * We hit on the dnlc
5157 5151 */
5158 5152 if (*vpp != DNLC_NO_VNODE ||
5159 5153 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
5160 5154 /*
5161 5155 * But our attrs may not be valid.
5162 5156 */
5163 5157 if (ATTRCACHE4_VALID(dvp)) {
5164 5158 error = nfs4_waitfor_purge_complete(dvp);
5165 5159 if (error) {
5166 5160 VN_RELE(*vpp);
5167 5161 *vpp = NULL;
5168 5162 return (error);
5169 5163 }
5170 5164
5171 5165 /*
5172 5166 			 * After the purge completes, check to make sure
5173 5167 * our attrs are still valid.
5174 5168 */
5175 5169 if (ATTRCACHE4_VALID(dvp)) {
5176 5170 /*
5177 5171 * If we waited for a purge we may have
5178 5172 * lost our vnode so look it up again.
5179 5173 */
5180 5174 VN_RELE(*vpp);
5181 5175 *vpp = dnlc_lookup(dvp, nm);
5182 5176 if (*vpp == NULL)
5183 5177 return (nfs4lookupnew_otw(dvp,
5184 5178 nm, vpp, cr));
5185 5179
5186 5180 /*
5187 5181 * The access cache should almost always hit
5188 5182 */
5189 5183 error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5190 5184
5191 5185 if (error) {
5192 5186 VN_RELE(*vpp);
5193 5187 *vpp = NULL;
5194 5188 return (error);
5195 5189 }
5196 5190 if (*vpp == DNLC_NO_VNODE) {
5197 5191 VN_RELE(*vpp);
5198 5192 *vpp = NULL;
5199 5193 return (ENOENT);
5200 5194 }
5201 5195 return (0);
5202 5196 }
5203 5197 }
5204 5198 }
5205 5199
5206 5200 ASSERT(*vpp != NULL);
5207 5201
5208 5202 /*
5209 5203 	 * We may have gotten here because we have one of the following cases:
5210 5204 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
5211 5205 * need to validate them.
5212 5206 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
5213 5207 * must validate.
5214 5208 *
5215 5209 	 * Go to the server and check if the directory has changed; if
5216 5210 	 * it hasn't, we are done and can use the dnlc entry.
5217 5211 */
5218 5212 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
5219 5213 }
5220 5214
5221 5215 /*
5222 5216  * Go to the server and check if the directory has changed; if
5223 5217  * it hasn't, we are done and can use the dnlc entry. If it
5224 5218  * has changed, we get a new copy of its attributes and check
5225 5219 * the access for VEXEC, then relookup the filename and
5226 5220 * get its filehandle and attributes.
5227 5221 *
5228 5222 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5229 5223 * if the NVERIFY failed we must
5230 5224 * purge the caches
5231 5225 * cache new attributes (will set r_time_attr_inval)
5232 5226 * cache new access
5233 5227 * recheck VEXEC access
5234 5228 * add name to dnlc, possibly negative
5235 5229 * if LOOKUP succeeded
5236 5230 * cache new attributes
5237 5231 * else
5238 5232 * set a new r_time_attr_inval for dvp
5239 5233 * check to make sure we have access
5240 5234 *
5241 5235 * The vpp returned is the vnode passed in if the directory is valid,
5242 5236 * a new vnode if successful lookup, or NULL on error.
5243 5237 */
5244 5238 static int
5245 5239 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5246 5240 {
5247 5241 COMPOUND4args_clnt args;
5248 5242 COMPOUND4res_clnt res;
5249 5243 fattr4 *ver_fattr;
5250 5244 fattr4_change dchange;
5251 5245 int32_t *ptr;
5252 5246 int argoplist_size = 7 * sizeof (nfs_argop4);
5253 5247 nfs_argop4 *argop;
5254 5248 int doqueue;
5255 5249 mntinfo4_t *mi;
5256 5250 nfs4_recov_state_t recov_state;
5257 5251 hrtime_t t;
5258 5252 int isdotdot;
5259 5253 vnode_t *nvp;
5260 5254 nfs_fh4 *fhp;
5261 5255 nfs4_sharedfh_t *sfhp;
5262 5256 nfs4_access_type_t cacc;
5263 5257 rnode4_t *nrp;
5264 5258 rnode4_t *drp = VTOR4(dvp);
5265 5259 nfs4_ga_res_t *garp = NULL;
5266 5260 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5267 5261
5268 5262 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5269 5263 ASSERT(nm != NULL);
5270 5264 ASSERT(nm[0] != '\0');
5271 5265 ASSERT(dvp->v_type == VDIR);
5272 5266 ASSERT(nm[0] != '.' || nm[1] != '\0');
5273 5267 ASSERT(*vpp != NULL);
5274 5268
5275 5269 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5276 5270 isdotdot = 1;
5277 5271 args.ctag = TAG_LOOKUP_VPARENT;
5278 5272 } else {
5279 5273 /*
5280 5274 * If dvp were a stub, it should have triggered and caused
5281 5275 * a mount for us to get this far.
5282 5276 */
5283 5277 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5284 5278
5285 5279 isdotdot = 0;
5286 5280 args.ctag = TAG_LOOKUP_VALID;
5287 5281 }
5288 5282
5289 5283 mi = VTOMI4(dvp);
5290 5284 recov_state.rs_flags = 0;
5291 5285 recov_state.rs_num_retry_despite_err = 0;
5292 5286
5293 5287 nvp = NULL;
5294 5288
5295 5289 /* Save the original mount point security information */
5296 5290 (void) save_mnt_secinfo(mi->mi_curr_serv);
5297 5291
5298 5292 recov_retry:
5299 5293 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5300 5294 &recov_state, NULL);
5301 5295 if (e.error) {
5302 5296 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5303 5297 VN_RELE(*vpp);
5304 5298 *vpp = NULL;
5305 5299 return (e.error);
5306 5300 }
5307 5301
5308 5302 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5309 5303
5310 5304 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
5311 5305 args.array_len = 7;
5312 5306 args.array = argop;
5313 5307
5314 5308 /* 0. putfh file */
5315 5309 argop[0].argop = OP_CPUTFH;
5316 5310 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5317 5311
5318 5312 /* 1. nverify the change info */
5319 5313 argop[1].argop = OP_NVERIFY;
5320 5314 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
5321 5315 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5322 5316 ver_fattr->attrlist4 = (char *)&dchange;
5323 5317 ptr = (int32_t *)&dchange;
5324 5318 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5325 5319 ver_fattr->attrlist4_len = sizeof (fattr4_change);
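	/*
	 * The NVERIFY op asks the server to compare this cached change
	 * value with the directory's current change attribute.  If they
	 * still match, the compound stops with NFS4ERR_SAME and the
	 * cached dnlc entry can be trusted; any other outcome drives the
	 * full revalidation path below.
	 */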
5326 5320
5327 5321 /* 2. getattr directory */
5328 5322 argop[2].argop = OP_GETATTR;
5329 5323 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5330 5324 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5331 5325
5332 5326 /* 3. access directory */
5333 5327 argop[3].argop = OP_ACCESS;
5334 5328 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5335 5329 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5336 5330
5337 5331 /* 4. lookup name */
5338 5332 if (isdotdot) {
5339 5333 argop[4].argop = OP_LOOKUPP;
5340 5334 } else {
5341 5335 argop[4].argop = OP_CLOOKUP;
5342 5336 argop[4].nfs_argop4_u.opclookup.cname = nm;
5343 5337 }
5344 5338
5345 5339 /* 5. resulting file handle */
5346 5340 argop[5].argop = OP_GETFH;
5347 5341
5348 5342 /* 6. resulting file attributes */
5349 5343 argop[6].argop = OP_GETATTR;
5350 5344 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5351 5345 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5352 5346
5353 5347 doqueue = 1;
5354 5348 t = gethrtime();
5355 5349
5356 5350 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5357 5351
5358 5352 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5359 5353 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5360 5354 if (e.error != 0 && *vpp != NULL)
5361 5355 VN_RELE(*vpp);
5362 5356 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5363 5357 &recov_state, FALSE);
5364 5358 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5365 5359 kmem_free(argop, argoplist_size);
5366 5360 return (e.error);
5367 5361 }
5368 5362
5369 5363 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5370 5364 /*
5371 5365 * For WRONGSEC of a non-dotdot case, send secinfo directly
5372 5366 * from this thread, do not go thru the recovery thread since
5373 5367 * we need the nm information.
5374 5368 *
5375 5369 * Not doing dotdot case because there is no specification
5376 5370 * for (PUTFH, SECINFO "..") yet.
5377 5371 */
5378 5372 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5379 5373 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5380 5374 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5381 5375 &recov_state, FALSE);
5382 5376 else
5383 5377 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5384 5378 &recov_state, TRUE);
5385 5379 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5386 5380 kmem_free(argop, argoplist_size);
5387 5381 if (!e.error)
5388 5382 goto recov_retry;
5389 5383 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5390 5384 VN_RELE(*vpp);
5391 5385 *vpp = NULL;
5392 5386 return (e.error);
5393 5387 }
5394 5388
5395 5389 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5396 5390 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5397 5391 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5398 5392 &recov_state, TRUE);
5399 5393
5400 5394 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5401 5395 kmem_free(argop, argoplist_size);
5402 5396 goto recov_retry;
5403 5397 }
5404 5398 }
5405 5399
5406 5400 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5407 5401
5408 5402 if (e.error || res.array_len == 0) {
5409 5403 /*
5410 5404 * If e.error isn't set, then reply has no ops (or we couldn't
5411 5405 * be here). The only legal way to reply without an op array
5412 5406 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5413 5407 * be in the reply for all other status values.
5414 5408 *
5415 5409 * For valid replies without an ops array, return ENOTSUP
5416 5410 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5417 5411 * return EIO -- don't trust status.
5418 5412 */
5419 5413 if (e.error == 0)
5420 5414 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5421 5415 ENOTSUP : EIO;
5422 5416 VN_RELE(*vpp);
5423 5417 *vpp = NULL;
5424 5418 kmem_free(argop, argoplist_size);
5425 5419 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5426 5420 return (e.error);
5427 5421 }
5428 5422
5429 5423 if (res.status != NFS4ERR_SAME) {
5430 5424 e.error = geterrno4(res.status);
5431 5425
5432 5426 /*
5433 5427 		 * The NVERIFY "failed" so the directory has changed.
5434 5428 * First make sure PUTFH succeeded and NVERIFY "failed"
5435 5429 * cleanly.
5436 5430 */
5437 5431 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5438 5432 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5439 5433 nfs4_purge_stale_fh(e.error, dvp, cr);
5440 5434 VN_RELE(*vpp);
5441 5435 *vpp = NULL;
5442 5436 goto exit;
5443 5437 }
5444 5438
5445 5439 /*
5446 5440 * We know the NVERIFY "failed" so we must:
5447 5441 * purge the caches (access and indirectly dnlc if needed)
5448 5442 */
5449 5443 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5450 5444
5451 5445 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5452 5446 nfs4_purge_stale_fh(e.error, dvp, cr);
5453 5447 VN_RELE(*vpp);
5454 5448 *vpp = NULL;
5455 5449 goto exit;
5456 5450 }
5457 5451
5458 5452 /*
5459 5453 * Install new cached attributes for the directory
5460 5454 */
5461 5455 nfs4_attr_cache(dvp,
5462 5456 &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5463 5457 t, cr, FALSE, NULL);
5464 5458
5465 5459 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5466 5460 nfs4_purge_stale_fh(e.error, dvp, cr);
5467 5461 VN_RELE(*vpp);
5468 5462 *vpp = NULL;
5469 5463 e.error = geterrno4(res.status);
5470 5464 goto exit;
5471 5465 }
5472 5466
5473 5467 /*
5474 5468 * Now we know the directory is valid,
5475 5469 * cache new directory access
5476 5470 */
5477 5471 nfs4_access_cache(drp,
5478 5472 args.array[3].nfs_argop4_u.opaccess.access,
5479 5473 res.array[3].nfs_resop4_u.opaccess.access, cr);
5480 5474
5481 5475 /*
5482 5476 * recheck VEXEC access
5483 5477 */
5484 5478 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5485 5479 if (cacc != NFS4_ACCESS_ALLOWED) {
5486 5480 /*
5487 5481 * Directory permissions might have been revoked
5488 5482 */
5489 5483 if (cacc == NFS4_ACCESS_DENIED) {
5490 5484 e.error = EACCES;
5491 5485 VN_RELE(*vpp);
5492 5486 *vpp = NULL;
5493 5487 goto exit;
5494 5488 }
5495 5489
5496 5490 /*
5497 5491 * Somehow we must not have asked for enough
5498 5492 			 * so try a singleton ACCESS; this should never happen.
5499 5493 */
5500 5494 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5501 5495 if (e.error) {
5502 5496 VN_RELE(*vpp);
5503 5497 *vpp = NULL;
5504 5498 goto exit;
5505 5499 }
5506 5500 }
5507 5501
5508 5502 e.error = geterrno4(res.status);
5509 5503 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5510 5504 /*
5511 5505 * The lookup failed, probably no entry
5512 5506 */
5513 5507 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5514 5508 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5515 5509 } else {
5516 5510 /*
5517 5511 * Might be some other error, so remove
5518 5512 * the dnlc entry to make sure we start all
5519 5513 * over again, next time.
5520 5514 */
5521 5515 dnlc_remove(dvp, nm);
5522 5516 }
5523 5517 VN_RELE(*vpp);
5524 5518 *vpp = NULL;
5525 5519 goto exit;
5526 5520 }
5527 5521
5528 5522 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5529 5523 /*
5530 5524 * The file exists but we can't get its fh for
5531 5525 * some unknown reason. Remove it from the dnlc
5532 5526 * and error out to be safe.
5533 5527 */
5534 5528 dnlc_remove(dvp, nm);
5535 5529 VN_RELE(*vpp);
5536 5530 *vpp = NULL;
5537 5531 goto exit;
5538 5532 }
5539 5533 fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5540 5534 if (fhp->nfs_fh4_len == 0) {
5541 5535 /*
5542 5536 		 * The file exists but has a bogus fh for
5543 5537 * some unknown reason. Remove it from the dnlc
5544 5538 * and error out to be safe.
5545 5539 */
5546 5540 e.error = ENOENT;
5547 5541 dnlc_remove(dvp, nm);
5548 5542 VN_RELE(*vpp);
5549 5543 *vpp = NULL;
5550 5544 goto exit;
5551 5545 }
5552 5546 sfhp = sfh4_get(fhp, mi);
5553 5547
5554 5548 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5555 5549 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5556 5550
5557 5551 /*
5558 5552 * Make the new rnode
5559 5553 */
5560 5554 if (isdotdot) {
5561 5555 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5562 5556 if (e.error) {
5563 5557 sfh4_rele(&sfhp);
5564 5558 VN_RELE(*vpp);
5565 5559 *vpp = NULL;
5566 5560 goto exit;
5567 5561 }
5568 5562 /*
5569 5563 * XXX if nfs4_make_dotdot uses an existing rnode
5570 5564 * XXX it doesn't update the attributes.
5571 5565 * XXX for now just save them again to save an OTW
5572 5566 */
5573 5567 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5574 5568 } else {
5575 5569 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5576 5570 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
5577 5571 /*
5578 5572 * If v_type == VNON, then garp was NULL because
5579 5573 * the last op in the compound failed and makenfs4node
5580 5574 * could not find the vnode for sfhp. It created
5581 5575 * a new vnode, so we have nothing to purge here.
5582 5576 */
5583 5577 if (nvp->v_type == VNON) {
5584 5578 vattr_t vattr;
5585 5579
5586 5580 vattr.va_mask = AT_TYPE;
5587 5581 /*
5588 5582 * N.B. We've already called nfs4_end_fop above.
5589 5583 */
5590 5584 e.error = nfs4getattr(nvp, &vattr, cr);
5591 5585 if (e.error) {
5592 5586 sfh4_rele(&sfhp);
5593 5587 VN_RELE(*vpp);
5594 5588 *vpp = NULL;
5595 5589 VN_RELE(nvp);
5596 5590 goto exit;
5597 5591 }
5598 5592 nvp->v_type = vattr.va_type;
5599 5593 }
5600 5594 }
5601 5595 sfh4_rele(&sfhp);
5602 5596
5603 5597 nrp = VTOR4(nvp);
5604 5598 mutex_enter(&nrp->r_statev4_lock);
5605 5599 if (!nrp->created_v4) {
5606 5600 mutex_exit(&nrp->r_statev4_lock);
5607 5601 dnlc_update(dvp, nm, nvp);
5608 5602 } else
5609 5603 mutex_exit(&nrp->r_statev4_lock);
5610 5604
5611 5605 VN_RELE(*vpp);
5612 5606 *vpp = nvp;
5613 5607 } else {
5614 5608 hrtime_t now;
5615 5609 hrtime_t delta = 0;
5616 5610
5617 5611 e.error = 0;
5618 5612
5619 5613 /*
5620 5614 * Because the NVERIFY "succeeded" we know that the
5621 5615 * directory attributes are still valid
5622 5616 * so update r_time_attr_inval
5623 5617 */
5624 5618 now = gethrtime();
5625 5619 mutex_enter(&drp->r_statelock);
5626 5620 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5627 5621 delta = now - drp->r_time_attr_saved;
5628 5622 if (delta < mi->mi_acdirmin)
5629 5623 delta = mi->mi_acdirmin;
5630 5624 else if (delta > mi->mi_acdirmax)
5631 5625 delta = mi->mi_acdirmax;
5632 5626 }
5633 5627 drp->r_time_attr_inval = now + delta;
5634 5628 mutex_exit(&drp->r_statelock);
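		/*
		 * The clamping above implements an adaptive attribute
		 * cache timeout: the longer the attributes have been
		 * stable (now - r_time_attr_saved), the further out
		 * r_time_attr_inval is pushed, bounded by acdirmin and
		 * acdirmax.  For example (illustrative values), with
		 * acdirmin = 30s and acdirmax = 60s, attributes saved
		 * 5s ago stay valid for another 30s, while attributes
		 * stable for several minutes stay valid for 60s.
		 */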
5635 5629 dnlc_update(dvp, nm, *vpp);
5636 5630
5637 5631 /*
5638 5632 * Even though we have a valid directory attr cache
5639 5633 * and dnlc entry, we may not have access.
5640 5634 * This should almost always hit the cache.
5641 5635 */
5642 5636 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5643 5637 if (e.error) {
5644 5638 VN_RELE(*vpp);
5645 5639 *vpp = NULL;
5646 5640 }
5647 5641
5648 5642 if (*vpp == DNLC_NO_VNODE) {
5649 5643 VN_RELE(*vpp);
5650 5644 *vpp = NULL;
5651 5645 e.error = ENOENT;
5652 5646 }
5653 5647 }
5654 5648
5655 5649 exit:
5656 5650 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5657 5651 kmem_free(argop, argoplist_size);
5658 5652 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5659 5653 return (e.error);
5660 5654 }
5661 5655
5662 5656 /*
5663 5657  * We need to go over the wire to look up the name, but
5664 5658  * while we are there verify that the directory has not
5665 5659  * changed; if it has, get new attributes and check access.
5666 5660 *
5667 5661 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5668 5662 * NVERIFY GETATTR ACCESS
5669 5663 *
5670 5664 * With the results:
5671 5665 * if the NVERIFY failed we must purge the caches, add new attributes,
5672 5666 * and cache new access.
5673 5667 * set a new r_time_attr_inval
5674 5668 * add name to dnlc, possibly negative
5675 5669 * if LOOKUP succeeded
5676 5670 * cache new attributes
5677 5671 */
5678 5672 static int
5679 5673 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5680 5674 {
5681 5675 COMPOUND4args_clnt args;
5682 5676 COMPOUND4res_clnt res;
5683 5677 fattr4 *ver_fattr;
5684 5678 fattr4_change dchange;
5685 5679 int32_t *ptr;
5686 5680 nfs4_ga_res_t *garp = NULL;
5687 5681 int argoplist_size = 9 * sizeof (nfs_argop4);
5688 5682 nfs_argop4 *argop;
5689 5683 int doqueue;
5690 5684 mntinfo4_t *mi;
5691 5685 nfs4_recov_state_t recov_state;
5692 5686 hrtime_t t;
5693 5687 int isdotdot;
5694 5688 vnode_t *nvp;
5695 5689 nfs_fh4 *fhp;
5696 5690 nfs4_sharedfh_t *sfhp;
5697 5691 nfs4_access_type_t cacc;
5698 5692 rnode4_t *nrp;
5699 5693 rnode4_t *drp = VTOR4(dvp);
5700 5694 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5701 5695
5702 5696 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5703 5697 ASSERT(nm != NULL);
5704 5698 ASSERT(nm[0] != '\0');
5705 5699 ASSERT(dvp->v_type == VDIR);
5706 5700 ASSERT(nm[0] != '.' || nm[1] != '\0');
5707 5701 ASSERT(*vpp == NULL);
5708 5702
5709 5703 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5710 5704 isdotdot = 1;
5711 5705 args.ctag = TAG_LOOKUP_PARENT;
5712 5706 } else {
5713 5707 /*
5714 5708 * If dvp were a stub, it should have triggered and caused
5715 5709 * a mount for us to get this far.
5716 5710 */
5717 5711 ASSERT(!RP_ISSTUB(VTOR4(dvp)));
5718 5712
5719 5713 isdotdot = 0;
5720 5714 args.ctag = TAG_LOOKUP;
5721 5715 }
5722 5716
5723 5717 mi = VTOMI4(dvp);
5724 5718 recov_state.rs_flags = 0;
5725 5719 recov_state.rs_num_retry_despite_err = 0;
5726 5720
5727 5721 nvp = NULL;
5728 5722
5729 5723 /* Save the original mount point security information */
5730 5724 (void) save_mnt_secinfo(mi->mi_curr_serv);
5731 5725
5732 5726 recov_retry:
5733 5727 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
5734 5728 &recov_state, NULL);
5735 5729 if (e.error) {
5736 5730 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5737 5731 return (e.error);
5738 5732 }
5739 5733
5740 5734 argop = kmem_alloc(argoplist_size, KM_SLEEP);
5741 5735
5742 5736 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
5743 5737 args.array_len = 9;
5744 5738 args.array = argop;
5745 5739
5746 5740 /* 0. putfh file */
5747 5741 argop[0].argop = OP_CPUTFH;
5748 5742 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
5749 5743
5750 5744 /* 1. savefh for the nverify */
5751 5745 argop[1].argop = OP_SAVEFH;
5752 5746
5753 5747 /* 2. lookup name */
5754 5748 if (isdotdot) {
5755 5749 argop[2].argop = OP_LOOKUPP;
5756 5750 } else {
5757 5751 argop[2].argop = OP_CLOOKUP;
5758 5752 argop[2].nfs_argop4_u.opclookup.cname = nm;
5759 5753 }
5760 5754
5761 5755 /* 3. resulting file handle */
5762 5756 argop[3].argop = OP_GETFH;
5763 5757
5764 5758 /* 4. resulting file attributes */
5765 5759 argop[4].argop = OP_GETATTR;
5766 5760 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5767 5761 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5768 5762
5769 5763 /* 5. restorefh back the directory for the nverify */
5770 5764 argop[5].argop = OP_RESTOREFH;
5771 5765
5772 5766 /* 6. nverify the change info */
5773 5767 argop[6].argop = OP_NVERIFY;
5774 5768 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
5775 5769 ver_fattr->attrmask = FATTR4_CHANGE_MASK;
5776 5770 ver_fattr->attrlist4 = (char *)&dchange;
5777 5771 ptr = (int32_t *)&dchange;
5778 5772 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
5779 5773 ver_fattr->attrlist4_len = sizeof (fattr4_change);
5780 5774
5781 5775 /* 7. getattr directory */
5782 5776 argop[7].argop = OP_GETATTR;
5783 5777 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
5784 5778 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
5785 5779
5786 5780 /* 8. access directory */
5787 5781 argop[8].argop = OP_ACCESS;
5788 5782 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5789 5783 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5790 5784
5791 5785 doqueue = 1;
5792 5786 t = gethrtime();
5793 5787
5794 5788 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5795 5789
5796 5790 if (!isdotdot && res.status == NFS4ERR_MOVED) {
5797 5791 e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
5798 5792 if (e.error != 0 && *vpp != NULL)
5799 5793 VN_RELE(*vpp);
5800 5794 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5801 5795 &recov_state, FALSE);
5802 5796 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5803 5797 kmem_free(argop, argoplist_size);
5804 5798 return (e.error);
5805 5799 }
5806 5800
5807 5801 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5808 5802 /*
5809 5803 * For WRONGSEC of a non-dotdot case, send secinfo directly
5810 5804 * from this thread, do not go thru the recovery thread since
5811 5805 * we need the nm information.
5812 5806 *
5813 5807 * Not doing dotdot case because there is no specification
5814 5808 * for (PUTFH, SECINFO "..") yet.
5815 5809 */
5816 5810 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5817 5811 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
5818 5812 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5819 5813 &recov_state, FALSE);
5820 5814 else
5821 5815 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5822 5816 &recov_state, TRUE);
5823 5817 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5824 5818 kmem_free(argop, argoplist_size);
5825 5819 if (!e.error)
5826 5820 goto recov_retry;
5827 5821 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5828 5822 return (e.error);
5829 5823 }
5830 5824
5831 5825 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5832 5826 OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
5833 5827 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5834 5828 &recov_state, TRUE);
5835 5829
5836 5830 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5837 5831 kmem_free(argop, argoplist_size);
5838 5832 goto recov_retry;
5839 5833 }
5840 5834 }
5841 5835
5842 5836 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5843 5837
5844 5838 if (e.error || res.array_len == 0) {
5845 5839 /*
5846 5840 * If e.error isn't set, then reply has no ops (or we couldn't
5847 5841 * be here). The only legal way to reply without an op array
5848 5842 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5849 5843 * be in the reply for all other status values.
5850 5844 *
5851 5845 * For valid replies without an ops array, return ENOTSUP
5852 5846 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5853 5847 * return EIO -- don't trust status.
5854 5848 */
5855 5849 if (e.error == 0)
5856 5850 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5857 5851 ENOTSUP : EIO;
5858 5852
5859 5853 kmem_free(argop, argoplist_size);
5860 5854 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5861 5855 return (e.error);
5862 5856 }
5863 5857
5864 5858 e.error = geterrno4(res.status);
5865 5859
5866 5860 /*
5867 5861 * The PUTFH and SAVEFH may have failed.
5868 5862 */
5869 5863 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5870 5864 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
5871 5865 nfs4_purge_stale_fh(e.error, dvp, cr);
5872 5866 goto exit;
5873 5867 }
5874 5868
5875 5869 	 * Check if the file exists; if it does, delay entering it
5876 5870 	 * into the dnlc until after we update the directory
5877 5871 	 * attributes, so we don't cause it to get purged immediately.
5878 5872 * attributes so we don't cause it to get purged immediately.
5879 5873 */
5880 5874 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
5881 5875 /*
5882 5876 * The lookup failed, probably no entry
5883 5877 */
5884 5878 if (e.error == ENOENT && nfs4_lookup_neg_cache)
5885 5879 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5886 5880 goto exit;
5887 5881 }
5888 5882
5889 5883 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5890 5884 /*
5891 5885 * The file exists but we can't get its fh for
5892 5886 * some unknown reason. Error out to be safe.
5893 5887 */
5894 5888 goto exit;
5895 5889 }
5896 5890
5897 5891 fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
5898 5892 if (fhp->nfs_fh4_len == 0) {
5899 5893 /*
5900 5894 		 * The file exists but has a bogus fh for
5901 5895 * some unknown reason. Error out to be safe.
5902 5896 */
5903 5897 e.error = EIO;
5904 5898 goto exit;
5905 5899 }
5906 5900 sfhp = sfh4_get(fhp, mi);
5907 5901
5908 5902 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5909 5903 sfh4_rele(&sfhp);
5910 5904 goto exit;
5911 5905 }
5912 5906 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
5913 5907
5914 5908 /*
5915 5909 * The RESTOREFH may have failed
5916 5910 */
5917 5911 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
5918 5912 sfh4_rele(&sfhp);
5919 5913 e.error = EIO;
5920 5914 goto exit;
5921 5915 }
5922 5916
5923 5917 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
5924 5918 /*
5925 5919 		 * First make sure the NVERIFY failed as we expected;
5926 5920 		 * if it didn't, then be conservative and error out,
5927 5921 		 * as we can't trust the directory.
5928 5922 */
5929 5923 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
5930 5924 sfh4_rele(&sfhp);
5931 5925 e.error = EIO;
5932 5926 goto exit;
5933 5927 }
5934 5928
5935 5929 /*
5936 5930 		 * We know the NVERIFY "failed", so the directory has changed
5937 5931 		 * and we must:
5938 5932 * purge the caches (access and indirectly dnlc if needed)
5939 5933 */
5940 5934 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5941 5935
5942 5936 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5943 5937 sfh4_rele(&sfhp);
5944 5938 goto exit;
5945 5939 }
5946 5940 nfs4_attr_cache(dvp,
5947 5941 &res.array[7].nfs_resop4_u.opgetattr.ga_res,
5948 5942 t, cr, FALSE, NULL);
5949 5943
5950 5944 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
5951 5945 nfs4_purge_stale_fh(e.error, dvp, cr);
5952 5946 sfh4_rele(&sfhp);
5953 5947 e.error = geterrno4(res.status);
5954 5948 goto exit;
5955 5949 }
5956 5950
5957 5951 /*
5958 5952 * Now we know the directory is valid,
5959 5953 * cache new directory access
5960 5954 */
5961 5955 nfs4_access_cache(drp,
5962 5956 args.array[8].nfs_argop4_u.opaccess.access,
5963 5957 res.array[8].nfs_resop4_u.opaccess.access, cr);
5964 5958
5965 5959 /*
5966 5960 * recheck VEXEC access
5967 5961 */
5968 5962 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5969 5963 if (cacc != NFS4_ACCESS_ALLOWED) {
5970 5964 /*
5971 5965 * Directory permissions might have been revoked
5972 5966 */
5973 5967 if (cacc == NFS4_ACCESS_DENIED) {
5974 5968 sfh4_rele(&sfhp);
5975 5969 e.error = EACCES;
5976 5970 goto exit;
5977 5971 }
5978 5972
5979 5973 /*
5980 5974 * Somehow we must not have asked for enough
5981 5975 			 * so try a singleton ACCESS; this should never happen.
5982 5976 */
5983 5977 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5984 5978 if (e.error) {
5985 5979 sfh4_rele(&sfhp);
5986 5980 goto exit;
5987 5981 }
5988 5982 }
5989 5983
5990 5984 e.error = geterrno4(res.status);
5991 5985 } else {
5992 5986 hrtime_t now;
5993 5987 hrtime_t delta = 0;
5994 5988
5995 5989 e.error = 0;
5996 5990
5997 5991 /*
5998 5992 * Because the NVERIFY "succeeded" we know that the
5999 5993 * directory attributes are still valid
6000 5994 * so update r_time_attr_inval
6001 5995 */
6002 5996 now = gethrtime();
6003 5997 mutex_enter(&drp->r_statelock);
6004 5998 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
6005 5999 delta = now - drp->r_time_attr_saved;
6006 6000 if (delta < mi->mi_acdirmin)
6007 6001 delta = mi->mi_acdirmin;
6008 6002 else if (delta > mi->mi_acdirmax)
6009 6003 delta = mi->mi_acdirmax;
6010 6004 }
6011 6005 drp->r_time_attr_inval = now + delta;
6012 6006 mutex_exit(&drp->r_statelock);
6013 6007
6014 6008 /*
6015 6009 * Even though we have a valid directory attr cache,
6016 6010 * we may not have access.
6017 6011 * This should almost always hit the cache.
6018 6012 */
6019 6013 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
6020 6014 if (e.error) {
6021 6015 sfh4_rele(&sfhp);
6022 6016 goto exit;
6023 6017 }
6024 6018 }
6025 6019
6026 6020 /*
6027 6021 	 * Now we have successfully completed the lookup; if the
6028 6022 	 * directory has changed, we now have valid attributes.
6029 6023 * We also know we have directory access.
6030 6024 * Create the new rnode and insert it in the dnlc.
6031 6025 */
6032 6026 if (isdotdot) {
6033 6027 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
6034 6028 if (e.error) {
6035 6029 sfh4_rele(&sfhp);
6036 6030 goto exit;
6037 6031 }
6038 6032 /*
6039 6033 * XXX if nfs4_make_dotdot uses an existing rnode
6040 6034 * XXX it doesn't update the attributes.
6041 6035 * XXX for now just save them again to save an OTW
6042 6036 */
6043 6037 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
6044 6038 } else {
6045 6039 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
6046 6040 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
6047 6041 }
6048 6042 sfh4_rele(&sfhp);
6049 6043
6050 6044 nrp = VTOR4(nvp);
6051 6045 mutex_enter(&nrp->r_statev4_lock);
6052 6046 if (!nrp->created_v4) {
6053 6047 mutex_exit(&nrp->r_statev4_lock);
6054 6048 dnlc_update(dvp, nm, nvp);
6055 6049 } else
6056 6050 mutex_exit(&nrp->r_statev4_lock);
6057 6051
6058 6052 *vpp = nvp;
6059 6053
6060 6054 exit:
6061 6055 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6062 6056 kmem_free(argop, argoplist_size);
6063 6057 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
6064 6058 return (e.error);
6065 6059 }
6066 6060
6067 6061 #ifdef DEBUG
6068 6062 void
6069 6063 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
6070 6064 {
6071 6065 uint_t i, len;
6072 6066 zoneid_t zoneid = getzoneid();
6073 6067 char *s;
6074 6068
6075 6069 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
6076 6070 for (i = 0; i < argcnt; i++) {
6077 6071 nfs_argop4 *op = &argbase[i];
6078 6072 switch (op->argop) {
6079 6073 case OP_CPUTFH:
6080 6074 case OP_PUTFH:
6081 6075 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
6082 6076 break;
6083 6077 case OP_PUTROOTFH:
6084 6078 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
6085 6079 break;
6086 6080 case OP_CLOOKUP:
6087 6081 s = op->nfs_argop4_u.opclookup.cname;
6088 6082 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6089 6083 break;
6090 6084 case OP_LOOKUP:
6091 6085 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
6092 6086 &len, NULL);
6093 6087 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
6094 6088 kmem_free(s, len);
6095 6089 break;
6096 6090 case OP_LOOKUPP:
6097 6091 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
6098 6092 break;
6099 6093 case OP_GETFH:
6100 6094 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
6101 6095 break;
6102 6096 case OP_GETATTR:
6103 6097 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
6104 6098 break;
6105 6099 case OP_OPENATTR:
6106 6100 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
6107 6101 break;
6108 6102 default:
6109 6103 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
6110 6104 op->argop);
6111 6105 break;
6112 6106 }
6113 6107 }
6114 6108 }
6115 6109 #endif
6116 6110
6117 6111 /*
6118 6112 * nfs4lookup_setup - constructs a multi-lookup compound request.
6119 6113 *
6120 6114 * Given the path "nm1/nm2/.../nmn", the following compound requests
6121 6115 * may be created:
6122 6116 *
6123 6117  * Note: Getfh is not needed because the filehandle attr is mandatory, but it
6124 6118 * is faster, for now.
6125 6119 *
6126 6120 * l4_getattrs indicates the type of compound requested.
6127 6121 *
6128 6122  * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo):
6129 6123 *
6130 6124 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
6131 6125 *
6132 6126 * total number of ops is n + 1.
6133 6127 *
6134 6128 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6135 6129 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6136 6130 * before the last component, and only get attributes
6137 6131 * for the last component. Note that the second-to-last
6138 6132 * pathname component is XATTR_RPATH, which does NOT go
6139 6133 * over-the-wire as a lookup.
6140 6134 *
6141 6135 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6142 6136 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6143 6137 *
6144 6138 * and total number of ops is n + 5.
6145 6139 *
6146 6140 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6147 6141 * attribute directory: create lookups plus an OPENATTR
6148 6142 * replacing the last lookup. Note that the last pathname
6149 6143 * component is XATTR_RPATH, which does NOT go over-the-wire
6150 6144 * as a lookup.
6151 6145 *
6152 6146 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6153 6147 * Openattr; Getfh; Getattr }
6154 6148 *
6155 6149 * and total number of ops is n + 5.
6156 6150 *
6157 6151 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6158 6152 * nodes too.
6159 6153 *
6160 6154 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6161 6155 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6162 6156 *
6163 6157 * and total number of ops is 3*n + 1.
6164 6158 *
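 * As a worked example (the component names are illustrative): for the
 * path "a/b/c" with LKP4_ALL_ATTRIBUTES, n = 3 and the compound is
 *
 *	compound { Put*fh; Lookup {a}; Getfh; Getattr;
 *		Lookup {b}; Getfh; Getattr;
 *		Lookup {c}; Getfh; Getattr }
 *
 * for a total of 3*3 + 1 = 10 ops.
 *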
6165 6159 * All cases: returns the index in the arg array of the final LOOKUP op, or
6166 6160 * -1 if no LOOKUPs were used.
6167 6161 */
6168 6162 int
6169 6163 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
6170 6164 {
6171 6165 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
6172 6166 nfs_argop4 *argbase, *argop;
6173 6167 int arglen, argcnt;
6174 6168 int n = 1; /* number of components */
6175 6169 int nga = 1; /* number of Getattr's in request */
6176 6170 char c = '\0', *s, *p;
6177 6171 int lookup_idx = -1;
6178 6172 int argoplist_size;
6179 6173
6180 6174 /* set lookuparg response result to 0 */
6181 6175 lookupargp->resp->status = NFS4_OK;
6182 6176
6183 6177 	/* skip leading "/" or "." components, e.g. ".//./", if present */
6184 6178 for (; ; nm++) {
6185 6179 if (*nm != '/' && *nm != '.')
6186 6180 break;
6187 6181
6188 6182 /* ".." is counted as 1 component */
6189 6183 if (*nm == '.' && *(nm + 1) != '/')
6190 6184 break;
6191 6185 }
6192 6186
6193 6187 /*
6194 6188 * Find n = number of components - nm must be null terminated
6195 6189 * Skip "." components.
6196 6190 */
6197 6191 if (*nm != '\0')
6198 6192 for (n = 1, s = nm; *s != '\0'; s++) {
6199 6193 if ((*s == '/') && (*(s + 1) != '/') &&
6200 6194 (*(s + 1) != '\0') &&
6201 6195 !(*(s + 1) == '.' && (*(s + 2) == '/' ||
6202 6196 *(s + 2) == '\0')))
6203 6197 n++;
6204 6198 }
6205 6199 else
6206 6200 n = 0;
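	/*
	 * For example (illustrative), nm = "a/./b//c" yields n = 3: the
	 * "." component and the doubled slash are both rejected by the
	 * tests above, so only the separators before "b" and "c" count
	 * beyond the initial component.
	 */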
6207 6201
6208 6202 /*
6209 6203 * nga is number of components that need Getfh+Getattr
6210 6204 */
6211 6205 switch (l4_getattrs) {
6212 6206 case LKP4_NO_ATTRIBUTES:
6213 6207 nga = 0;
6214 6208 break;
6215 6209 case LKP4_ALL_ATTRIBUTES:
6216 6210 nga = n;
6217 6211 /*
6218 6212 * Always have at least 1 getfh, getattr pair
6219 6213 */
6220 6214 if (nga == 0)
6221 6215 nga++;
6222 6216 break;
6223 6217 case LKP4_LAST_ATTRDIR:
6224 6218 case LKP4_LAST_NAMED_ATTR:
6225 6219 nga = n+1;
6226 6220 break;
6227 6221 }
6228 6222
6229 6223 /*
6230 6224 	 * Each attr fetch is a Getfh+Getattr pair, hence the doubling; if we
6231 6225 	 * change to use the filehandle attr instead of getfh, this can be deleted.
6232 6226 */
6233 6227 nga *= 2;
6234 6228
6235 6229 /*
6236 6230 * calculate number of ops in request as
6237 6231 * header + trailer + lookups + getattrs
6238 6232 */
6239 6233 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
6240 6234
6241 6235 argoplist_size = arglen * sizeof (nfs_argop4);
6242 6236 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
6243 6237 lookupargp->argsp->array = argop;
6244 6238
6245 6239 argcnt = lookupargp->header_len;
6246 6240 argop += argcnt;
6247 6241
6248 6242 /*
6249 6243 * loop and create a lookup op and possibly getattr/getfh for
6250 6244 * each component. Skip "." components.
6251 6245 */
6252 6246 for (s = nm; *s != '\0'; s = p) {
6253 6247 /*
6254 6248 		 * Isolate the next pathname component
6255 6249 */
6256 6250 while (*s == '/')
6257 6251 s++;
6258 6252 if (*s == '\0')
6259 6253 break;
6260 6254
6261 6255 for (p = s; (*p != '/') && (*p != '\0'); p++)
6262 6256 ;
6263 6257 c = *p;
6264 6258 *p = '\0';
6265 6259
6266 6260 if (s[0] == '.' && s[1] == '\0') {
6267 6261 *p = c;
6268 6262 continue;
6269 6263 }
6270 6264 if (l4_getattrs == LKP4_LAST_ATTRDIR &&
6271 6265 strcmp(s, XATTR_RPATH) == 0) {
6272 6266 /* getfh XXX may not be needed in future */
6273 6267 argop->argop = OP_GETFH;
6274 6268 argop++;
6275 6269 argcnt++;
6276 6270
6277 6271 /* getattr */
6278 6272 argop->argop = OP_GETATTR;
6279 6273 argop->nfs_argop4_u.opgetattr.attr_request =
6280 6274 lookupargp->ga_bits;
6281 6275 argop->nfs_argop4_u.opgetattr.mi =
6282 6276 lookupargp->mi;
6283 6277 argop++;
6284 6278 argcnt++;
6285 6279
6286 6280 /* openattr */
6287 6281 argop->argop = OP_OPENATTR;
6288 6282 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
6289 6283 strcmp(s, XATTR_RPATH) == 0) {
6290 6284 /* openattr */
6291 6285 argop->argop = OP_OPENATTR;
6292 6286 argop++;
6293 6287 argcnt++;
6294 6288
6295 6289 /* getfh XXX may not be needed in future */
6296 6290 argop->argop = OP_GETFH;
6297 6291 argop++;
6298 6292 argcnt++;
6299 6293
6300 6294 /* getattr */
6301 6295 argop->argop = OP_GETATTR;
6302 6296 argop->nfs_argop4_u.opgetattr.attr_request =
6303 6297 lookupargp->ga_bits;
6304 6298 argop->nfs_argop4_u.opgetattr.mi =
6305 6299 lookupargp->mi;
6306 6300 argop++;
6307 6301 argcnt++;
6308 6302 *p = c;
6309 6303 continue;
6310 6304 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
6311 6305 /* lookupp */
6312 6306 argop->argop = OP_LOOKUPP;
6313 6307 } else {
6314 6308 /* lookup */
6315 6309 argop->argop = OP_LOOKUP;
6316 6310 (void) str_to_utf8(s,
6317 6311 &argop->nfs_argop4_u.oplookup.objname);
6318 6312 }
6319 6313 lookup_idx = argcnt;
6320 6314 argop++;
6321 6315 argcnt++;
6322 6316
6323 6317 *p = c;
6324 6318
6325 6319 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
6326 6320 /* getfh XXX may not be needed in future */
6327 6321 argop->argop = OP_GETFH;
6328 6322 argop++;
6329 6323 argcnt++;
6330 6324
6331 6325 /* getattr */
6332 6326 argop->argop = OP_GETATTR;
6333 6327 argop->nfs_argop4_u.opgetattr.attr_request =
6334 6328 lookupargp->ga_bits;
6335 6329 argop->nfs_argop4_u.opgetattr.mi =
6336 6330 lookupargp->mi;
6337 6331 argop++;
6338 6332 argcnt++;
6339 6333 }
6340 6334 }
6341 6335
6342 6336 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
6343 6337 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
6344 6338 if (needgetfh) {
6345 6339 /* stick in a post-lookup getfh */
6346 6340 argop->argop = OP_GETFH;
6347 6341 argcnt++;
6348 6342 argop++;
6349 6343 }
6350 6344 /* post-lookup getattr */
6351 6345 argop->argop = OP_GETATTR;
6352 6346 argop->nfs_argop4_u.opgetattr.attr_request =
6353 6347 lookupargp->ga_bits;
6354 6348 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
6355 6349 argcnt++;
6356 6350 }
6357 6351 argcnt += lookupargp->trailer_len; /* actual op count */
6358 6352 lookupargp->argsp->array_len = argcnt;
6359 6353 lookupargp->arglen = arglen;
6360 6354
6361 6355 #ifdef DEBUG
6362 6356 if (nfs4_client_lookup_debug)
6363 6357 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
6364 6358 #endif
6365 6359
6366 6360 return (lookup_idx);
6367 6361 }
6368 6362
6369 6363 static int
6370 6364 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6371 6365 {
6372 6366 COMPOUND4args_clnt args;
6373 6367 COMPOUND4res_clnt res;
6374 6368 GETFH4res *gf_res = NULL;
6375 6369 nfs_argop4 argop[4];
6376 6370 nfs_resop4 *resop = NULL;
6377 6371 nfs4_sharedfh_t *sfhp;
6378 6372 hrtime_t t;
6379 6373 nfs4_error_t e;
6380 6374
6381 6375 rnode4_t *drp;
6382 6376 int doqueue = 1;
6383 6377 vnode_t *vp;
6384 6378 int needrecov = 0;
6385 6379 nfs4_recov_state_t recov_state;
6386 6380
6387 6381 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6388 6382
6389 6383 *avp = NULL;
6390 6384 recov_state.rs_flags = 0;
6391 6385 recov_state.rs_num_retry_despite_err = 0;
6392 6386
6393 6387 recov_retry:
6394 6388 /* COMPOUND: putfh, openattr, getfh, getattr */
6395 6389 args.array_len = 4;
6396 6390 args.array = argop;
6397 6391 args.ctag = TAG_OPENATTR;
6398 6392
6399 6393 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6400 6394 if (e.error)
6401 6395 return (e.error);
6402 6396
6403 6397 drp = VTOR4(dvp);
6404 6398
6405 6399 /* putfh */
6406 6400 argop[0].argop = OP_CPUTFH;
6407 6401 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6408 6402
6409 6403 /* openattr */
6410 6404 argop[1].argop = OP_OPENATTR;
6411 6405 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6412 6406
6413 6407 /* getfh */
6414 6408 argop[2].argop = OP_GETFH;
6415 6409
6416 6410 /* getattr */
6417 6411 argop[3].argop = OP_GETATTR;
6418 6412 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6419 6413 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6420 6414
6421 6415 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6422 6416 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6423 6417 rnode4info(drp)));
6424 6418
6425 6419 t = gethrtime();
6426 6420
6427 6421 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6428 6422
6429 6423 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6430 6424 if (needrecov) {
6431 6425 bool_t abort;
6432 6426
6433 6427 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6434 6428 "nfs4openattr: initiating recovery\n"));
6435 6429
6436 6430 abort = nfs4_start_recovery(&e,
6437 6431 VTOMI4(dvp), dvp, NULL, NULL, NULL,
6438 6432 OP_OPENATTR, NULL, NULL, NULL);
6439 6433 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6440 6434 if (!e.error) {
6441 6435 e.error = geterrno4(res.status);
6442 6436 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6443 6437 }
6444 6438 if (abort == FALSE)
6445 6439 goto recov_retry;
6446 6440 return (e.error);
6447 6441 }
6448 6442
6449 6443 if (e.error) {
6450 6444 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6451 6445 return (e.error);
6452 6446 }
6453 6447
6454 6448 if (res.status) {
6455 6449 /*
6456 6450 		 * If the OTW error is NOTSUPP, then it should be
6457 6451 * translated to EINVAL. All Solaris file system
6458 6452 * implementations return EINVAL to the syscall layer
6459 6453 * when the attrdir cannot be created due to an
6460 6454 * implementation restriction or noxattr mount option.
6461 6455 */
6462 6456 if (res.status == NFS4ERR_NOTSUPP) {
6463 6457 mutex_enter(&drp->r_statelock);
6464 6458 if (drp->r_xattr_dir)
6465 6459 VN_RELE(drp->r_xattr_dir);
6466 6460 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6467 6461 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6468 6462 mutex_exit(&drp->r_statelock);
6469 6463
6470 6464 e.error = EINVAL;
6471 6465 } else {
6472 6466 e.error = geterrno4(res.status);
6473 6467 }
6474 6468
6475 6469 if (e.error) {
6476 6470 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6477 6471 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6478 6472 needrecov);
6479 6473 return (e.error);
6480 6474 }
6481 6475 }
6482 6476
6483 6477 resop = &res.array[0]; /* putfh res */
6484 6478 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6485 6479
6486 6480 resop = &res.array[1]; /* openattr res */
6487 6481 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6488 6482
6489 6483 resop = &res.array[2]; /* getfh res */
6490 6484 gf_res = &resop->nfs_resop4_u.opgetfh;
6491 6485 if (gf_res->object.nfs_fh4_len == 0) {
6492 6486 *avp = NULL;
6493 6487 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6494 6488 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6495 6489 return (ENOENT);
6496 6490 }
6497 6491
6498 6492 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6499 6493 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6500 6494 dvp->v_vfsp, t, cr, dvp,
6501 6495 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
6502 6496 sfh4_rele(&sfhp);
6503 6497
6504 6498 if (e.error)
6505 6499 PURGE_ATTRCACHE4(vp);
6506 6500
6507 6501 mutex_enter(&vp->v_lock);
6508 6502 vp->v_flag |= V_XATTRDIR;
6509 6503 mutex_exit(&vp->v_lock);
6510 6504
6511 6505 *avp = vp;
6512 6506
6513 6507 mutex_enter(&drp->r_statelock);
6514 6508 if (drp->r_xattr_dir)
6515 6509 VN_RELE(drp->r_xattr_dir);
6516 6510 VN_HOLD(vp);
6517 6511 drp->r_xattr_dir = vp;
6518 6512
6519 6513 /*
6520 6514 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6521 6515 * NULL. xattrs could be created at any time, and we have no
6522 6516 * way to update pc4_xattr_exists in the base object if/when
6523 6517 * it happens.
6524 6518 */
6525 6519 drp->r_pathconf.pc4_xattr_valid = 0;
6526 6520
6527 6521 mutex_exit(&drp->r_statelock);
6528 6522
6529 6523 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6530 6524
6531 6525 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6532 6526
6533 6527 return (0);
6534 6528 }
6535 6529
6536 6530 /* ARGSUSED */
6537 6531 static int
6538 6532 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6539 6533 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
6540 6534 vsecattr_t *vsecp)
6541 6535 {
6542 6536 int error;
6543 6537 vnode_t *vp = NULL;
6544 6538 rnode4_t *rp;
6545 6539 struct vattr vattr;
6546 6540 rnode4_t *drp;
6547 6541 vnode_t *tempvp;
6548 6542 enum createmode4 createmode;
6549 6543 bool_t must_trunc = FALSE;
6550 6544 int truncating = 0;
6551 6545
6552 6546 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6553 6547 return (EPERM);
6554 6548 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6555 6549 return (EINVAL);
6556 6550 }
6557 6551
6558 6552 /* . and .. have special meaning in the protocol, reject them. */
6559 6553
6560 6554 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6561 6555 return (EISDIR);
6562 6556
6563 6557 drp = VTOR4(dvp);
6564 6558
6565 6559 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6566 6560 return (EINTR);
6567 6561
6568 6562 top:
6569 6563 /*
6570 6564 * We make a copy of the attributes because the caller does not
6571 6565 * expect us to change what va points to.
6572 6566 */
6573 6567 vattr = *va;
6574 6568
6575 6569 /*
6576 6570 * If the pathname is "", then dvp is the root vnode of
6577 6571 * a remote file mounted over a local directory.
6578 6572 * All that needs to be done is access
6579 6573 * checking and truncation. Note that we avoid doing
6580 6574 * open w/ create because the parent directory might
6581 6575 * be in pseudo-fs and the open would fail.
6582 6576 */
6583 6577 if (*nm == '\0') {
6584 6578 error = 0;
6585 6579 VN_HOLD(dvp);
6586 6580 vp = dvp;
6587 6581 must_trunc = TRUE;
6588 6582 } else {
6589 6583 /*
6590 6584 * We need to go over the wire, just to be sure whether the
6591 6585 * file exists or not. Using the DNLC can be dangerous in
6592 6586 * this case when making a decision regarding existence.
6593 6587 */
6594 6588 error = nfs4lookup(dvp, nm, &vp, cr, 1);
6595 6589 }
6596 6590
6597 6591 if (exclusive)
6598 6592 createmode = EXCLUSIVE4;
6599 6593 else
6600 6594 createmode = GUARDED4;
6601 6595
6602 6596 /*
6603 6597 * error would be set if the file does not exist on the
6604 6598	 * server, so let's go create it.
6605 6599 */
6606 6600 if (error) {
6607 6601 goto create_otw;
6608 6602 }
6609 6603
6610 6604 /*
6611 6605 * File does exist on the server
6612 6606 */
6613 6607 if (exclusive == EXCL)
6614 6608 error = EEXIST;
6615 6609 else if (vp->v_type == VDIR && (mode & VWRITE))
6616 6610 error = EISDIR;
6617 6611 else {
6618 6612 /*
6619 6613 * If vnode is a device, create special vnode.
6620 6614 */
6621 6615 if (ISVDEV(vp->v_type)) {
6622 6616 tempvp = vp;
6623 6617 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6624 6618 VN_RELE(tempvp);
6625 6619 }
6626 6620 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
6627 6621 if ((vattr.va_mask & AT_SIZE) &&
6628 6622 vp->v_type == VREG) {
6629 6623 rp = VTOR4(vp);
6630 6624 /*
6631 6625 * Check here for large file handled
6632 6626 * by LF-unaware process (as
6633 6627 * ufs_create() does)
6634 6628 */
6635 6629 if (!(flags & FOFFMAX)) {
6636 6630 mutex_enter(&rp->r_statelock);
6637 6631 if (rp->r_size > MAXOFF32_T)
6638 6632 error = EOVERFLOW;
6639 6633 mutex_exit(&rp->r_statelock);
6640 6634 }
6641 6635
6642 6636 /* if error is set then we need to return */
6643 6637 if (error) {
6644 6638 nfs_rw_exit(&drp->r_rwlock);
6645 6639 VN_RELE(vp);
6646 6640 return (error);
6647 6641 }
6648 6642
6649 6643 if (must_trunc) {
6650 6644 vattr.va_mask = AT_SIZE;
6651 6645 error = nfs4setattr(vp, &vattr, 0, cr,
6652 6646 NULL);
6653 6647 } else {
6654 6648 /*
6655 6649 * we know we have a regular file that already
6656 6650 * exists and we may end up truncating the file
6657 6651 * as a result of the open_otw, so flush out
6658 6652 * any dirty pages for this file first.
6659 6653 */
6660 6654 if (nfs4_has_pages(vp) &&
6661 6655 ((rp->r_flags & R4DIRTY) ||
6662 6656 rp->r_count > 0 ||
6663 6657 rp->r_mapcnt > 0)) {
6664 6658 error = nfs4_putpage(vp,
6665 6659 (offset_t)0, 0, 0, cr, ct);
6666 6660 if (error && (error == ENOSPC ||
6667 6661 error == EDQUOT)) {
6668 6662 mutex_enter(
6669 6663 &rp->r_statelock);
6670 6664 if (!rp->r_error)
6671 6665 rp->r_error =
6672 6666 error;
6673 6667 mutex_exit(
6674 6668 &rp->r_statelock);
6675 6669 }
6676 6670 }
6677 6671 vattr.va_mask = (AT_SIZE |
6678 6672 AT_TYPE | AT_MODE);
6679 6673 vattr.va_type = VREG;
6680 6674 createmode = UNCHECKED4;
6681 6675 truncating = 1;
6682 6676 goto create_otw;
6683 6677 }
6684 6678 }
6685 6679 }
6686 6680 }
6687 6681 nfs_rw_exit(&drp->r_rwlock);
6688 6682 if (error) {
6689 6683 VN_RELE(vp);
6690 6684 } else {
6691 6685 vnode_t *tvp;
6692 6686 rnode4_t *trp;
6693 6687 tvp = vp;
6694 6688 if (vp->v_type == VREG) {
6695 6689 trp = VTOR4(vp);
6696 6690 if (IS_SHADOW(vp, trp))
6697 6691 tvp = RTOV4(trp);
6698 6692 }
6699 6693
6700 6694 if (must_trunc) {
6701 6695 /*
6702 6696 * existing file got truncated, notify.
6703 6697 */
6704 6698 vnevent_create(tvp, ct);
6705 6699 }
6706 6700
6707 6701 *vpp = vp;
6708 6702 }
6709 6703 return (error);
6710 6704
6711 6705 create_otw:
6712 6706 dnlc_remove(dvp, nm);
6713 6707
6714 6708 ASSERT(vattr.va_mask & AT_TYPE);
6715 6709
6716 6710 /*
6717 6711 * If not a regular file let nfs4mknod() handle it.
6718 6712 */
6719 6713 if (vattr.va_type != VREG) {
6720 6714 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6721 6715 nfs_rw_exit(&drp->r_rwlock);
6722 6716 return (error);
6723 6717 }
6724 6718
6725 6719 /*
6726 6720 * It _is_ a regular file.
6727 6721 */
6728 6722 ASSERT(vattr.va_mask & AT_MODE);
6729 6723 if (MANDMODE(vattr.va_mode)) {
6730 6724 nfs_rw_exit(&drp->r_rwlock);
6731 6725 return (EACCES);
6732 6726 }
6733 6727
6734 6728 /*
6735 6729 * If this happens to be a mknod of a regular file, then flags will
6736 6730	 * have neither FREAD nor FWRITE. However, we must set at least one
6737 6731 * for the call to nfs4open_otw. If it's open(O_CREAT) driving
6738 6732 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6739 6733 * set (based on openmode specified by app).
6740 6734 */
6741 6735 if ((flags & (FREAD|FWRITE)) == 0)
6742 6736 flags |= (FREAD|FWRITE);
6743 6737
6744 6738 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6745 6739
6746 6740 if (vp != NULL) {
6747 6741 /* if create was successful, throw away the file's pages */
6748 6742 if (!error && (vattr.va_mask & AT_SIZE))
6749 6743 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6750 6744 cr);
6751 6745 /* release the lookup hold */
6752 6746 VN_RELE(vp);
6753 6747 vp = NULL;
6754 6748 }
6755 6749
6756 6750 /*
6757 6751 * validate that we opened a regular file. This handles a misbehaving
6758 6752 * server that returns an incorrect FH.
6759 6753 */
6760 6754 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6761 6755 error = EISDIR;
6762 6756 VN_RELE(*vpp);
6763 6757 }
6764 6758
6765 6759 /*
6766 6760 * If this is not an exclusive create, then the CREATE
6767 6761 * request will be made with the GUARDED mode set. This
6768 6762 * means that the server will return EEXIST if the file
6769 6763 * exists. The file could exist because of a retransmitted
6770 6764 * request. In this case, we recover by starting over and
6771 6765 * checking to see whether the file exists. This second
6772 6766 * time through it should and a CREATE request will not be
6773 6767 * sent.
6774 6768 *
6775 6769 * This handles the problem of a dangling CREATE request
6776 6770 * which contains attributes which indicate that the file
6777 6771 * should be truncated. This retransmitted request could
6778 6772 * possibly truncate valid data in the file if not caught
6779 6773 * by the duplicate request mechanism on the server or if
6780 6774 * not caught by other means. The scenario is:
6781 6775 *
6782 6776 * Client transmits CREATE request with size = 0
6783 6777 * Client times out, retransmits request.
6784 6778 * Response to the first request arrives from the server
6785 6779 * and the client proceeds on.
6786 6780 * Client writes data to the file.
6787 6781 * The server now processes retransmitted CREATE request
6788 6782 * and truncates file.
6789 6783 *
6790 6784 * The use of the GUARDED CREATE request prevents this from
6791 6785 * happening because the retransmitted CREATE would fail
6792 6786 * with EEXIST and would not truncate the file.
6793 6787 */
6794 6788 if (error == EEXIST && exclusive == NONEXCL) {
6795 6789 #ifdef DEBUG
6796 6790 nfs4_create_misses++;
6797 6791 #endif
6798 6792 goto top;
6799 6793 }
6800 6794 nfs_rw_exit(&drp->r_rwlock);
6801 6795 if (truncating && !error && *vpp) {
6802 6796 vnode_t *tvp;
6803 6797 rnode4_t *trp;
6804 6798 /*
6805 6799 * existing file got truncated, notify.
6806 6800 */
6807 6801 tvp = *vpp;
6808 6802 trp = VTOR4(tvp);
6809 6803 if (IS_SHADOW(tvp, trp))
6810 6804 tvp = RTOV4(trp);
6811 6805 vnevent_create(tvp, ct);
6812 6806 }
6813 6807 return (error);
6814 6808 }
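
nfs4_create() above picks the NFSv4 create mode from three inputs: the caller's exclusivity flag, whether the lookup found the file, and whether an existing regular file is being reopened for truncation. A condensed sketch of that decision (the enum mirrors the RFC 7530 names; the helper itself is hypothetical):

	/* Mirrors the createmode choice made in nfs4_create() above. */
	enum createmode { UNCHECKED4, GUARDED4, EXCLUSIVE4 };

	static enum createmode
	pick_createmode(int exclusive, int file_exists, int truncating)
	{
		if (exclusive)
			return (EXCLUSIVE4);	/* verifier-based create */
		if (file_exists && truncating)
			return (UNCHECKED4);	/* reopen + truncate */
		return (GUARDED4);	/* retransmits fail with EEXIST */
	}

GUARDED4 is what makes the dangling-CREATE scenario in the comment above harmless: the retransmitted request hits EEXIST instead of truncating live data.
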
6815 6809
6816 6810 /*
6817 6811 * Create compound (for mkdir, mknod, symlink):
6818 6812 * { Putfh <dfh>; Create; Getfh; Getattr }
6819 6813 * It's okay if setattr failed to set gid - this is not considered
6820 6814 * an error, but purge attrs in that case.
6821 6815 */
6822 6816 static int
6823 6817 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6824 6818 vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6825 6819 {
6826 6820 int need_end_op = FALSE;
6827 6821 COMPOUND4args_clnt args;
6828 6822 COMPOUND4res_clnt res, *resp = NULL;
6829 6823 nfs_argop4 *argop;
6830 6824 nfs_resop4 *resop;
6831 6825 int doqueue;
6832 6826 mntinfo4_t *mi;
6833 6827 rnode4_t *drp = VTOR4(dvp);
6834 6828 change_info4 *cinfo;
6835 6829 GETFH4res *gf_res;
6836 6830 struct vattr vattr;
6837 6831 vnode_t *vp;
6838 6832 fattr4 *crattr;
6839 6833 bool_t needrecov = FALSE;
6840 6834 nfs4_recov_state_t recov_state;
6841 6835 nfs4_sharedfh_t *sfhp = NULL;
6842 6836 hrtime_t t;
6843 6837 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6844 6838 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6845 6839 dirattr_info_t dinfo, *dinfop;
6846 6840 servinfo4_t *svp;
6847 6841 bitmap4 supp_attrs;
6848 6842
6849 6843 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6850 6844 type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6851 6845
6852 6846 mi = VTOMI4(dvp);
6853 6847
6854 6848 /*
6855 6849 * Make sure we properly deal with setting the right gid
6856 6850 * on a new directory to reflect the parent's setgid bit
6857 6851 */
6858 6852 setgid_flag = 0;
6859 6853 if (type == NF4DIR) {
6860 6854 struct vattr dva;
6861 6855
6862 6856 va->va_mode &= ~VSGID;
6863 6857 dva.va_mask = AT_MODE | AT_GID;
6864 6858 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6865 6859
6866 6860 /*
6867 6861			 * If the parent directory has the setgid bit set
6868 6862			 * _and_ the client was able to get a valid mapping
6869 6863			 * for the parent dir's owner_group, we want to
6870 6864			 * append NVERIFY(owner_group == dva.va_gid) and
6871 6865			 * SETATTR to the CREATE compound.
6872 6866 */
6873 6867 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6874 6868 setgid_flag = 1;
6875 6869 va->va_mode |= VSGID;
6876 6870 if (dva.va_gid != GID_NOBODY) {
6877 6871 va->va_mask |= AT_GID;
6878 6872 va->va_gid = dva.va_gid;
6879 6873 }
6880 6874 }
6881 6875 }
6882 6876 }
6883 6877
6884 6878 /*
6885 6879 * Create ops:
6886 6880 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6887 6881 * 5:restorefh(dir) 6:getattr(dir)
6888 6882 *
6889 6883 * if (setgid)
6890 6884 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6891 6885 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6892 6886 * 8:nverify 9:setattr
6893 6887 */
6894 6888 if (setgid_flag) {
6895 6889 numops = 10;
6896 6890 idx_create = 1;
6897 6891 idx_fattr = 3;
6898 6892 } else {
6899 6893 numops = 7;
6900 6894 idx_create = 2;
6901 6895 idx_fattr = 4;
6902 6896 }
6903 6897
6904 6898 ASSERT(nfs_zone() == mi->mi_zone);
6905 6899 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6906 6900 return (EINTR);
6907 6901 }
6908 6902 recov_state.rs_flags = 0;
6909 6903 recov_state.rs_num_retry_despite_err = 0;
6910 6904
6911 6905 argoplist_size = numops * sizeof (nfs_argop4);
6912 6906 argop = kmem_alloc(argoplist_size, KM_SLEEP);
6913 6907
6914 6908 recov_retry:
6915 6909 if (type == NF4LNK)
6916 6910 args.ctag = TAG_SYMLINK;
6917 6911 else if (type == NF4DIR)
6918 6912 args.ctag = TAG_MKDIR;
6919 6913 else
6920 6914 args.ctag = TAG_MKNOD;
6921 6915
6922 6916 args.array_len = numops;
6923 6917 args.array = argop;
6924 6918
6925 6919 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6926 6920 nfs_rw_exit(&drp->r_rwlock);
6927 6921 kmem_free(argop, argoplist_size);
6928 6922 return (e.error);
6929 6923 }
6930 6924 need_end_op = TRUE;
6931 6925
6932 6926
6933 6927 /* 0: putfh directory */
6934 6928 argop[0].argop = OP_CPUTFH;
6935 6929 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6936 6930
6937 6931 /* 1/2: Create object */
6938 6932 argop[idx_create].argop = OP_CCREATE;
6939 6933 argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6940 6934 argop[idx_create].nfs_argop4_u.opccreate.type = type;
6941 6935 if (type == NF4LNK) {
6942 6936 /*
6943 6937 * symlink, treat name as data
6944 6938 */
6945 6939 ASSERT(data != NULL);
6946 6940 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6947 6941 (char *)data;
6948 6942 }
6949 6943 if (type == NF4BLK || type == NF4CHR) {
6950 6944 ASSERT(data != NULL);
6951 6945 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6952 6946 *((specdata4 *)data);
6953 6947 }
6954 6948
6955 6949 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6956 6950
6957 6951 svp = drp->r_server;
6958 6952 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6959 6953 supp_attrs = svp->sv_supp_attrs;
6960 6954 nfs_rw_exit(&svp->sv_lock);
6961 6955
6962 6956 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6963 6957 nfs_rw_exit(&drp->r_rwlock);
6964 6958 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6965 6959 e.error = EINVAL;
6966 6960 kmem_free(argop, argoplist_size);
6967 6961 return (e.error);
6968 6962 }
6969 6963
6970 6964 /* 2/3: getfh fh of created object */
6971 6965 ASSERT(idx_create + 1 == idx_fattr - 1);
6972 6966 argop[idx_create + 1].argop = OP_GETFH;
6973 6967
6974 6968 /* 3/4: getattr of new object */
6975 6969 argop[idx_fattr].argop = OP_GETATTR;
6976 6970 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6977 6971 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6978 6972
6979 6973 if (setgid_flag) {
6980 6974 vattr_t _v;
6981 6975
6982 6976 argop[4].argop = OP_SAVEFH;
6983 6977
6984 6978 argop[5].argop = OP_CPUTFH;
6985 6979 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6986 6980
6987 6981 argop[6].argop = OP_GETATTR;
6988 6982 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6989 6983 argop[6].nfs_argop4_u.opgetattr.mi = mi;
6990 6984
6991 6985 argop[7].argop = OP_RESTOREFH;
6992 6986
6993 6987 /*
6994 6988 * nverify
6995 6989 *
6996 6990 * XXX - Revisit the last argument to nfs4_end_op()
6997 6991 * once 5020486 is fixed.
6998 6992 */
6999 6993 _v.va_mask = AT_GID;
7000 6994 _v.va_gid = va->va_gid;
7001 6995 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
7002 6996 supp_attrs)) {
7003 6997 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
7004 6998 nfs_rw_exit(&drp->r_rwlock);
7005 6999 nfs4_fattr4_free(crattr);
7006 7000 kmem_free(argop, argoplist_size);
7007 7001 return (e.error);
7008 7002 }
7009 7003
7010 7004 /*
7011 7005 * setattr
7012 7006 *
7013 7007 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
7014 7008 * so no need for stateid or flags. Also we specify NULL
7015 7009 * rp since we're only interested in setting owner_group
7016 7010 * attributes.
7017 7011 */
7018 7012 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
7019 7013 &e.error, 0);
7020 7014
7021 7015 if (e.error) {
7022 7016 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
7023 7017 nfs_rw_exit(&drp->r_rwlock);
7024 7018 nfs4_fattr4_free(crattr);
7025 7019 nfs4args_verify_free(&argop[8]);
7026 7020 kmem_free(argop, argoplist_size);
7027 7021 return (e.error);
7028 7022 }
7029 7023 } else {
7030 7024 argop[1].argop = OP_SAVEFH;
7031 7025
7032 7026 argop[5].argop = OP_RESTOREFH;
7033 7027
7034 7028 argop[6].argop = OP_GETATTR;
7035 7029 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7036 7030 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7037 7031 }
7038 7032
7039 7033 dnlc_remove(dvp, nm);
7040 7034
7041 7035 doqueue = 1;
7042 7036 t = gethrtime();
7043 7037 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7044 7038
7045 7039 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7046 7040 if (e.error) {
7047 7041 PURGE_ATTRCACHE4(dvp);
7048 7042 if (!needrecov)
7049 7043 goto out;
7050 7044 }
7051 7045
7052 7046 if (needrecov) {
7053 7047 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7054 7048 OP_CREATE, NULL, NULL, NULL) == FALSE) {
7055 7049 nfs4_end_op(mi, dvp, NULL, &recov_state,
7056 7050 needrecov);
7057 7051 need_end_op = FALSE;
7058 7052 nfs4_fattr4_free(crattr);
7059 7053 if (setgid_flag) {
7060 7054 nfs4args_verify_free(&argop[8]);
7061 7055 nfs4args_setattr_free(&argop[9]);
7062 7056 }
7063 7057 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7064 7058 goto recov_retry;
7065 7059 }
7066 7060 }
7067 7061
7068 7062 resp = &res;
7069 7063
7070 7064 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7071 7065
7072 7066 if (res.status == NFS4ERR_BADOWNER)
7073 7067 nfs4_log_badowner(mi, OP_CREATE);
7074 7068
7075 7069 e.error = geterrno4(res.status);
7076 7070
7077 7071 /*
7078 7072 * This check is left over from when create was implemented
7079 7073 * using a setattr op (instead of createattrs). If the
7080 7074 * putfh/create/getfh failed, the error was returned. If
7081 7075 * setattr/getattr failed, we keep going.
7082 7076 *
7083 7077 * It might be better to get rid of the GETFH also, and just
7084 7078 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7085 7079 * Then if any of the operations failed, we could return the
7086 7080 * error now, and remove much of the error code below.
7087 7081 */
7088 7082 if (res.array_len <= idx_fattr) {
7089 7083 /*
7090 7084 * Either Putfh, Create or Getfh failed.
7091 7085 */
7092 7086 PURGE_ATTRCACHE4(dvp);
7093 7087 /*
7094 7088 * nfs4_purge_stale_fh() may generate otw calls through
7095 7089 * nfs4_invalidate_pages. Hence the need to call
7096 7090 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7097 7091 */
7098 7092 nfs4_end_op(mi, dvp, NULL, &recov_state,
7099 7093 needrecov);
7100 7094 need_end_op = FALSE;
7101 7095 nfs4_purge_stale_fh(e.error, dvp, cr);
7102 7096 goto out;
7103 7097 }
7104 7098 }
7105 7099
7106 7100 resop = &res.array[idx_create]; /* create res */
7107 7101 cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7108 7102
7109 7103 resop = &res.array[idx_create + 1]; /* getfh res */
7110 7104 gf_res = &resop->nfs_resop4_u.opgetfh;
7111 7105
7112 7106 sfhp = sfh4_get(&gf_res->object, mi);
7113 7107 if (e.error) {
7114 7108 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7115 7109 fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7116 7110 if (vp->v_type == VNON) {
7117 7111 vattr.va_mask = AT_TYPE;
7118 7112 /*
7119 7113 * Need to call nfs4_end_op before nfs4getattr to avoid
7120 7114 * potential nfs4_start_op deadlock. See RFE 4777612.
7121 7115 */
7122 7116 nfs4_end_op(mi, dvp, NULL, &recov_state,
7123 7117 needrecov);
7124 7118 need_end_op = FALSE;
7125 7119 e.error = nfs4getattr(vp, &vattr, cr);
7126 7120 if (e.error) {
7127 7121 VN_RELE(vp);
7128 7122 *vpp = NULL;
7129 7123 goto out;
7130 7124 }
7131 7125 vp->v_type = vattr.va_type;
7132 7126 }
7133 7127 e.error = 0;
7134 7128 } else {
7135 7129 *vpp = vp = makenfs4node(sfhp,
7136 7130 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7137 7131 dvp->v_vfsp, t, cr,
7138 7132 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7139 7133 }
7140 7134
7141 7135 /*
7142 7136 * If compound succeeded, then update dir attrs
7143 7137 */
7144 7138 if (res.status == NFS4_OK) {
7145 7139 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7146 7140 dinfo.di_cred = cr;
7147 7141 dinfo.di_time_call = t;
7148 7142 dinfop = &dinfo;
7149 7143 } else
7150 7144 dinfop = NULL;
7151 7145
7152 7146 /* Update directory cache attribute, readdir and dnlc caches */
7153 7147 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7154 7148
7155 7149 out:
7156 7150 if (sfhp != NULL)
7157 7151 sfh4_rele(&sfhp);
7158 7152 nfs_rw_exit(&drp->r_rwlock);
7159 7153 nfs4_fattr4_free(crattr);
7160 7154 if (setgid_flag) {
7161 7155 nfs4args_verify_free(&argop[8]);
7162 7156 nfs4args_setattr_free(&argop[9]);
7163 7157 }
7164 7158 if (resp)
7165 7159 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7166 7160 if (need_end_op)
7167 7161 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7168 7162
7169 7163 kmem_free(argop, argoplist_size);
7170 7164 return (e.error);
7171 7165 }
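
As a quick reference, the non-setgid op layout that call_nfs4_create_req() builds above (descriptive strings only, not protocol data; the setgid variant grows to ten ops with NVERIFY and SETATTR appended):

	static const char *create_compound_ops[] = {
		"PUTFH(dir)",		/* 0 */
		"SAVEFH",		/* 1 */
		"CREATE",		/* 2: idx_create */
		"GETFH(new)",		/* 3 */
		"GETATTR(new)",		/* 4: idx_fattr */
		"RESTOREFH",		/* 5 */
		"GETATTR(dir)",		/* 6 */
	};
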
7172 7166
7173 7167 /* ARGSUSED */
7174 7168 static int
7175 7169 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7176 7170 int mode, vnode_t **vpp, cred_t *cr)
7177 7171 {
7178 7172 int error;
7179 7173 vnode_t *vp;
7180 7174 nfs_ftype4 type;
7181 7175 specdata4 spec, *specp = NULL;
7182 7176
7183 7177 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7184 7178
7185 7179 switch (va->va_type) {
7186 7180 case VCHR:
7187 7181 case VBLK:
7188 7182 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7189 7183 spec.specdata1 = getmajor(va->va_rdev);
7190 7184 spec.specdata2 = getminor(va->va_rdev);
7191 7185 specp = &spec;
7192 7186 break;
7193 7187
7194 7188 case VFIFO:
7195 7189 type = NF4FIFO;
7196 7190 break;
7197 7191 case VSOCK:
7198 7192 type = NF4SOCK;
7199 7193 break;
7200 7194
7201 7195 default:
7202 7196 return (EINVAL);
7203 7197 }
7204 7198
7205 7199 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7206 7200 if (error) {
7207 7201 return (error);
7208 7202 }
7209 7203
7210 7204 /*
7211 7205 * This might not be needed any more; special case to deal
7212 7206 * with problematic v2/v3 servers. Since create was unable
7213 7207 * to set group correctly, not sure what hope setattr has.
7214 7208 */
7215 7209 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7216 7210 va->va_mask = AT_GID;
7217 7211 (void) nfs4setattr(vp, va, 0, cr, NULL);
7218 7212 }
7219 7213
7220 7214 /*
7221 7215 * If vnode is a device create special vnode
7222 7216 */
7223 7217 if (ISVDEV(vp->v_type)) {
7224 7218 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7225 7219 VN_RELE(vp);
7226 7220 } else {
7227 7221 *vpp = vp;
7228 7222 }
7229 7223 return (error);
7230 7224 }
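
For VCHR/VBLK nodes, nfs4mknod() above splits va_rdev into the two 32-bit words of a specdata4 using getmajor()/getminor(). A user-space equivalent, assuming an illumos-style <sys/mkdev.h> (Linux keeps major()/minor() in <sys/sysmacros.h>):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/mkdev.h>	/* illumos; Linux: <sys/sysmacros.h> */

	int
	main(void)
	{
		dev_t rdev = makedev(27, 5);	/* arbitrary example device */

		/* NFSv4 CREATE carries the device as two 32-bit words. */
		printf("specdata1=%u specdata2=%u\n",
		    (unsigned int)major(rdev), (unsigned int)minor(rdev));
		return (0);
	}
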
7231 7225
7232 7226 /*
7233 7227 * Remove requires that the current fh be the target directory.
7234 7228 * After the operation, the current fh is unchanged.
7235 7229 * The compound op structure is:
7236 7230 * PUTFH(targetdir), REMOVE
7237 7231 *
7238 7232 * Weirdness: if the vnode to be removed is open
7239 7233 * we rename it instead of removing it and nfs_inactive
7240 7234 * will remove the new name.
7241 7235 */
7242 7236 /* ARGSUSED */
7243 7237 static int
7244 7238 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7245 7239 {
7246 7240 COMPOUND4args_clnt args;
7247 7241 COMPOUND4res_clnt res, *resp = NULL;
7248 7242 REMOVE4res *rm_res;
7249 7243 nfs_argop4 argop[3];
7250 7244 nfs_resop4 *resop;
7251 7245 vnode_t *vp;
7252 7246 char *tmpname;
7253 7247 int doqueue;
7254 7248 mntinfo4_t *mi;
7255 7249 rnode4_t *rp;
7256 7250 rnode4_t *drp;
7257 7251 int needrecov = 0;
7258 7252 nfs4_recov_state_t recov_state;
7259 7253 int isopen;
7260 7254 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7261 7255 dirattr_info_t dinfo;
7262 7256
7263 7257 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7264 7258 return (EPERM);
7265 7259 drp = VTOR4(dvp);
7266 7260 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7267 7261 return (EINTR);
7268 7262
7269 7263 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7270 7264 if (e.error) {
7271 7265 nfs_rw_exit(&drp->r_rwlock);
7272 7266 return (e.error);
7273 7267 }
7274 7268
7275 7269 if (vp->v_type == VDIR) {
7276 7270 VN_RELE(vp);
7277 7271 nfs_rw_exit(&drp->r_rwlock);
7278 7272 return (EISDIR);
7279 7273 }
7280 7274
7281 7275 /*
7282 7276 * First just remove the entry from the name cache, as it
7283 7277 * is most likely the only entry for this vp.
7284 7278 */
7285 7279 dnlc_remove(dvp, nm);
7286 7280
7287 7281 rp = VTOR4(vp);
7288 7282
7289 7283 /*
7290 7284 * For regular file types, check to see if the file is open by looking
7291 7285 * at the open streams.
7292 7286 * For all other types, check the reference count on the vnode. Since
7293 7287 * they are not opened OTW they never have an open stream.
7294 7288 *
7295 7289 * If the file is open, rename it to .nfsXXXX.
7296 7290 */
7297 7291 if (vp->v_type != VREG) {
7298 7292 /*
7299 7293 * If the file has a v_count > 1 then there may be more than one
7300 7294		 * entry in the name cache due to multiple links or an open file,
7301 7295 * but we don't have the real reference count so flush all
7302 7296 * possible entries.
7303 7297 */
7304 7298 if (vp->v_count > 1)
7305 7299 dnlc_purge_vp(vp);
7306 7300
7307 7301 /*
7308 7302 * Now we have the real reference count.
7309 7303 */
7310 7304 isopen = vp->v_count > 1;
7311 7305 } else {
7312 7306 mutex_enter(&rp->r_os_lock);
7313 7307 isopen = list_head(&rp->r_open_streams) != NULL;
7314 7308 mutex_exit(&rp->r_os_lock);
7315 7309 }
7316 7310
7317 7311 mutex_enter(&rp->r_statelock);
7318 7312 if (isopen &&
7319 7313 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7320 7314 mutex_exit(&rp->r_statelock);
7321 7315 tmpname = newname();
7322 7316 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7323 7317 if (e.error)
7324 7318 kmem_free(tmpname, MAXNAMELEN);
7325 7319 else {
7326 7320 mutex_enter(&rp->r_statelock);
7327 7321 if (rp->r_unldvp == NULL) {
7328 7322 VN_HOLD(dvp);
7329 7323 rp->r_unldvp = dvp;
7330 7324 if (rp->r_unlcred != NULL)
7331 7325 crfree(rp->r_unlcred);
7332 7326 crhold(cr);
7333 7327 rp->r_unlcred = cr;
7334 7328 rp->r_unlname = tmpname;
7335 7329 } else {
7336 7330 kmem_free(rp->r_unlname, MAXNAMELEN);
7337 7331 rp->r_unlname = tmpname;
7338 7332 }
7339 7333 mutex_exit(&rp->r_statelock);
7340 7334 }
7341 7335 VN_RELE(vp);
7342 7336 nfs_rw_exit(&drp->r_rwlock);
7343 7337 return (e.error);
7344 7338 }
7345 7339 /*
7346 7340 * Actually remove the file/dir
7347 7341 */
7348 7342 mutex_exit(&rp->r_statelock);
7349 7343
7350 7344 /*
7351 7345 * We need to flush any dirty pages which happen to
7352 7346 * be hanging around before removing the file.
7353 7347 * This shouldn't happen very often since in NFSv4
7354 7348	 * we should be close-to-open consistent.
7355 7349 */
7356 7350 if (nfs4_has_pages(vp) &&
7357 7351 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7358 7352 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7359 7353 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7360 7354 mutex_enter(&rp->r_statelock);
7361 7355 if (!rp->r_error)
7362 7356 rp->r_error = e.error;
7363 7357 mutex_exit(&rp->r_statelock);
7364 7358 }
7365 7359 }
7366 7360
7367 7361 mi = VTOMI4(dvp);
7368 7362
7369 7363 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7370 7364 recov_state.rs_flags = 0;
7371 7365 recov_state.rs_num_retry_despite_err = 0;
7372 7366
7373 7367 recov_retry:
7374 7368 /*
7375 7369	 * Remove ops: putfh dir; remove; getattr dir
7376 7370 */
7377 7371 args.ctag = TAG_REMOVE;
7378 7372 args.array_len = 3;
7379 7373 args.array = argop;
7380 7374
7381 7375 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7382 7376 if (e.error) {
7383 7377 nfs_rw_exit(&drp->r_rwlock);
7384 7378 VN_RELE(vp);
7385 7379 return (e.error);
7386 7380 }
7387 7381
7388 7382 /* putfh directory */
7389 7383 argop[0].argop = OP_CPUTFH;
7390 7384 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7391 7385
7392 7386 /* remove */
7393 7387 argop[1].argop = OP_CREMOVE;
7394 7388 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7395 7389
7396 7390 /* getattr dir */
7397 7391 argop[2].argop = OP_GETATTR;
7398 7392 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7399 7393 argop[2].nfs_argop4_u.opgetattr.mi = mi;
7400 7394
7401 7395 doqueue = 1;
7402 7396 dinfo.di_time_call = gethrtime();
7403 7397 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7404 7398
7405 7399 PURGE_ATTRCACHE4(vp);
7406 7400
7407 7401 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7408 7402 if (e.error)
7409 7403 PURGE_ATTRCACHE4(dvp);
7410 7404
7411 7405 if (needrecov) {
7412 7406 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7413 7407 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7414 7408 if (!e.error)
7415 7409 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7416 7410 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7417 7411 needrecov);
7418 7412 goto recov_retry;
7419 7413 }
7420 7414 }
7421 7415
7422 7416 /*
7423 7417 * Matching nfs4_end_op() for start_op() above.
7424 7418 * There is a path in the code below which calls
7425 7419 * nfs4_purge_stale_fh(), which may generate otw calls through
7426 7420 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7427 7421 * here to avoid nfs4_start_op() deadlock.
7428 7422 */
7429 7423 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7430 7424
7431 7425 if (!e.error) {
7432 7426 resp = &res;
7433 7427
7434 7428 if (res.status) {
7435 7429 e.error = geterrno4(res.status);
7436 7430 PURGE_ATTRCACHE4(dvp);
7437 7431 nfs4_purge_stale_fh(e.error, dvp, cr);
7438 7432 } else {
7439 7433 resop = &res.array[1]; /* remove res */
7440 7434 rm_res = &resop->nfs_resop4_u.opremove;
7441 7435
7442 7436 dinfo.di_garp =
7443 7437 &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7444 7438 dinfo.di_cred = cr;
7445 7439
7446 7440 /* Update directory attr, readdir and dnlc caches */
7447 7441 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7448 7442 &dinfo);
7449 7443 }
7450 7444 }
7451 7445 nfs_rw_exit(&drp->r_rwlock);
7452 7446 if (resp)
7453 7447 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7454 7448
7455 7449 if (e.error == 0) {
7456 7450 vnode_t *tvp;
7457 7451 rnode4_t *trp;
7458 7452 trp = VTOR4(vp);
7459 7453 tvp = vp;
7460 7454 if (IS_SHADOW(vp, trp))
7461 7455 tvp = RTOV4(trp);
7462 7456 vnevent_remove(tvp, dvp, nm, ct);
7463 7457 }
7464 7458 VN_RELE(vp);
7465 7459 return (e.error);
7466 7460 }
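
The open-file test in nfs4_remove() above differs by vnode type: regular files consult the NFSv4 open-stream list, everything else falls back to the vnode hold count, and an open file is silly-renamed to .nfsXXXX rather than removed. A hedged sketch of just that predicate (struct and field names are invented for illustration):

	#include <stdbool.h>

	/* Invented stand-in for the few rnode/vnode fields tested. */
	struct open_check {
		bool	is_reg;		/* VREG? */
		int	open_streams;	/* open-stream count (VREG only) */
		int	v_count;	/* vnode hold count (other types) */
	};

	static bool
	must_silly_rename(const struct open_check *oc)
	{
		if (oc->is_reg)
			return (oc->open_streams > 0);
		return (oc->v_count > 1);	/* extra holds: in use */
	}
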
7467 7461
7468 7462 /*
7469 7463 * Link requires that the current fh be the target directory and the
7470 7464 * saved fh be the source fh. After the operation, the current fh is unchanged.
7471 7465 * Thus the compound op structure is:
7472 7466	 *	PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, GETATTR(dir),
7473 7467	 *	RESTOREFH, GETATTR(file)
7474 7468 */
7475 7469 /* ARGSUSED */
7476 7470 static int
7477 7471 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
7478 7472 caller_context_t *ct, int flags)
7479 7473 {
7480 7474 COMPOUND4args_clnt args;
7481 7475 COMPOUND4res_clnt res, *resp = NULL;
7482 7476 LINK4res *ln_res;
7483 7477 int argoplist_size = 7 * sizeof (nfs_argop4);
7484 7478 nfs_argop4 *argop;
7485 7479 nfs_resop4 *resop;
7486 7480 vnode_t *realvp, *nvp;
7487 7481 int doqueue;
7488 7482 mntinfo4_t *mi;
7489 7483 rnode4_t *tdrp;
7490 7484 bool_t needrecov = FALSE;
7491 7485 nfs4_recov_state_t recov_state;
7492 7486 hrtime_t t;
7493 7487 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7494 7488 dirattr_info_t dinfo;
7495 7489
7496 7490 ASSERT(*tnm != '\0');
7497 7491 ASSERT(tdvp->v_type == VDIR);
7498 7492 ASSERT(nfs4_consistent_type(tdvp));
7499 7493 ASSERT(nfs4_consistent_type(svp));
7500 7494
7501 7495 if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7502 7496 return (EPERM);
7503 7497 if (VOP_REALVP(svp, &realvp, ct) == 0) {
7504 7498 svp = realvp;
7505 7499 ASSERT(nfs4_consistent_type(svp));
7506 7500 }
7507 7501
7508 7502 tdrp = VTOR4(tdvp);
7509 7503 mi = VTOMI4(svp);
7510 7504
7511 7505 if (!(mi->mi_flags & MI4_LINK)) {
7512 7506 return (EOPNOTSUPP);
7513 7507 }
7514 7508 recov_state.rs_flags = 0;
7515 7509 recov_state.rs_num_retry_despite_err = 0;
7516 7510
7517 7511 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7518 7512 return (EINTR);
7519 7513
7520 7514 recov_retry:
7521 7515 argop = kmem_alloc(argoplist_size, KM_SLEEP);
7522 7516
7523 7517 args.ctag = TAG_LINK;
7524 7518
7525 7519 /*
7526 7520 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7527 7521 * restorefh; getattr(fl)
7528 7522 */
7529 7523 args.array_len = 7;
7530 7524 args.array = argop;
7531 7525
7532 7526 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7533 7527 if (e.error) {
7534 7528 kmem_free(argop, argoplist_size);
7535 7529 nfs_rw_exit(&tdrp->r_rwlock);
7536 7530 return (e.error);
7537 7531 }
7538 7532
7539 7533 /* 0. putfh file */
7540 7534 argop[0].argop = OP_CPUTFH;
7541 7535 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7542 7536
7543 7537 /* 1. save current fh to free up the space for the dir */
7544 7538 argop[1].argop = OP_SAVEFH;
7545 7539
7546 7540 /* 2. putfh targetdir */
7547 7541 argop[2].argop = OP_CPUTFH;
7548 7542 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7549 7543
7550 7544 /* 3. link: current_fh is targetdir, saved_fh is source */
7551 7545 argop[3].argop = OP_CLINK;
7552 7546 argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7553 7547
7554 7548 /* 4. Get attributes of dir */
7555 7549 argop[4].argop = OP_GETATTR;
7556 7550 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7557 7551 argop[4].nfs_argop4_u.opgetattr.mi = mi;
7558 7552
7559 7553 /* 5. If link was successful, restore current vp to file */
7560 7554 argop[5].argop = OP_RESTOREFH;
7561 7555
7562 7556 /* 6. Get attributes of linked object */
7563 7557 argop[6].argop = OP_GETATTR;
7564 7558 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7565 7559 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7566 7560
7567 7561 dnlc_remove(tdvp, tnm);
7568 7562
7569 7563 doqueue = 1;
7570 7564 t = gethrtime();
7571 7565
7572 7566 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7573 7567
7574 7568 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7575 7569 if (e.error != 0 && !needrecov) {
7576 7570 PURGE_ATTRCACHE4(tdvp);
7577 7571 PURGE_ATTRCACHE4(svp);
7578 7572 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7579 7573 goto out;
7580 7574 }
7581 7575
7582 7576 if (needrecov) {
7583 7577 bool_t abort;
7584 7578
7585 7579 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7586 7580 NULL, NULL, OP_LINK, NULL, NULL, NULL);
7587 7581 if (abort == FALSE) {
7588 7582 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7589 7583 needrecov);
7590 7584 kmem_free(argop, argoplist_size);
7591 7585 if (!e.error)
7592 7586 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7593 7587 goto recov_retry;
7594 7588 } else {
7595 7589 if (e.error != 0) {
7596 7590 PURGE_ATTRCACHE4(tdvp);
7597 7591 PURGE_ATTRCACHE4(svp);
7598 7592 nfs4_end_op(VTOMI4(svp), svp, tdvp,
7599 7593 &recov_state, needrecov);
7600 7594 goto out;
7601 7595 }
7602 7596 /* fall through for res.status case */
7603 7597 }
7604 7598 }
7605 7599
7606 7600 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7607 7601
7608 7602 resp = &res;
7609 7603 if (res.status) {
7610 7604 /* If link succeeded, then don't return error */
7611 7605 e.error = geterrno4(res.status);
7612 7606 if (res.array_len <= 4) {
7613 7607 /*
7614 7608 * Either Putfh, Savefh, Putfh dir, or Link failed
7615 7609 */
7616 7610 PURGE_ATTRCACHE4(svp);
7617 7611 PURGE_ATTRCACHE4(tdvp);
7618 7612 if (e.error == EOPNOTSUPP) {
7619 7613 mutex_enter(&mi->mi_lock);
7620 7614 mi->mi_flags &= ~MI4_LINK;
7621 7615 mutex_exit(&mi->mi_lock);
7622 7616 }
7623 7617 /* Remap EISDIR to EPERM for non-root user for SVVS */
7624 7618 /* XXX-LP */
7625 7619 if (e.error == EISDIR && crgetuid(cr) != 0)
7626 7620 e.error = EPERM;
7627 7621 goto out;
7628 7622 }
7629 7623 }
7630 7624
7631 7625 /* either no error or one of the postop getattr failed */
7632 7626
7633 7627 /*
7634 7628 * XXX - if LINK succeeded, but no attrs were returned for link
7635 7629 * file, purge its cache.
7636 7630 *
7637 7631 * XXX Perform a simplified version of wcc checking. Instead of
7638 7632	 * having another getattr to get pre-op, just purge the cache if
7639 7633 * any of the ops prior to and including the getattr failed.
7640 7634 * If the getattr succeeded then update the attrcache accordingly.
7641 7635 */
7642 7636
7643 7637 /*
7644 7638 * update cache with link file postattrs.
7645 7639 * Note: at this point resop points to link res.
7646 7640 */
7647 7641 resop = &res.array[3]; /* link res */
7648 7642 ln_res = &resop->nfs_resop4_u.oplink;
7649 7643 if (res.status == NFS4_OK)
7650 7644 e.error = nfs4_update_attrcache(res.status,
7651 7645 &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7652 7646 t, svp, cr);
7653 7647
7654 7648 /*
7655 7649 * Call makenfs4node to create the new shadow vp for tnm.
7656 7650 * We pass NULL attrs because we just cached attrs for
7657 7651 * the src object. All we're trying to accomplish is to
7658 7652	 * create the new shadow vnode.
7659 7653 */
7660 7654 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7661 7655 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));
7662 7656
7663 7657 /* Update target cache attribute, readdir and dnlc caches */
7664 7658 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7665 7659 dinfo.di_time_call = t;
7666 7660 dinfo.di_cred = cr;
7667 7661
7668 7662 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7669 7663 ASSERT(nfs4_consistent_type(tdvp));
7670 7664 ASSERT(nfs4_consistent_type(svp));
7671 7665 ASSERT(nfs4_consistent_type(nvp));
7672 7666 VN_RELE(nvp);
7673 7667
7674 7668 if (!e.error) {
7675 7669 vnode_t *tvp;
7676 7670 rnode4_t *trp;
7677 7671 /*
7678 7672 * Notify the source file of this link operation.
7679 7673 */
7680 7674 trp = VTOR4(svp);
7681 7675 tvp = svp;
7682 7676 if (IS_SHADOW(svp, trp))
7683 7677 tvp = RTOV4(trp);
7684 7678 vnevent_link(tvp, ct);
7685 7679 }
7686 7680 out:
7687 7681 kmem_free(argop, argoplist_size);
7688 7682 if (resp)
7689 7683 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7690 7684
7691 7685 nfs_rw_exit(&tdrp->r_rwlock);
7692 7686
7693 7687 return (e.error);
7694 7688 }
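
For reference, the seven-operation LINK compound assembled above, in order (a descriptive table only; the strings are not protocol data):

	static const char *link_compound_ops[] = {
		"PUTFH(file)",		/* 0: current fh = source file */
		"SAVEFH",		/* 1: stash the file fh */
		"PUTFH(targetdir)",	/* 2: current fh = target dir */
		"LINK(tnm)",		/* 3: current = dir, saved = file */
		"GETATTR(dir)",		/* 4: post-op dir attrs */
		"RESTOREFH",		/* 5: current fh back to the file */
		"GETATTR(file)",	/* 6: post-op file attrs */
	};
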
7695 7689
7696 7690 /* ARGSUSED */
7697 7691 static int
7698 7692 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7699 7693 caller_context_t *ct, int flags)
7700 7694 {
7701 7695 vnode_t *realvp;
7702 7696
7703 7697 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7704 7698 return (EPERM);
7705 7699 if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7706 7700 ndvp = realvp;
7707 7701
7708 7702 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7709 7703 }
7710 7704
7711 7705 /*
7712 7706 * nfs4rename does the real work of renaming in NFS Version 4.
7713 7707 *
7714 7708 * A file handle is considered volatile for renaming purposes if either
7715 7709	 * of the volatile bits is turned on. However, the compound may differ
7716 7710	 * based on the likelihood of the filehandle changing during rename.
7717 7711 */
7718 7712 static int
7719 7713 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7720 7714 caller_context_t *ct)
7721 7715 {
7722 7716 int error;
7723 7717 mntinfo4_t *mi;
7724 7718 vnode_t *nvp = NULL;
7725 7719 vnode_t *ovp = NULL;
7726 7720 char *tmpname = NULL;
7727 7721 rnode4_t *rp;
7728 7722 rnode4_t *odrp;
7729 7723 rnode4_t *ndrp;
7730 7724 int did_link = 0;
7731 7725 int do_link = 1;
7732 7726 nfsstat4 stat = NFS4_OK;
7733 7727
7734 7728 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7735 7729 ASSERT(nfs4_consistent_type(odvp));
7736 7730 ASSERT(nfs4_consistent_type(ndvp));
7737 7731
7738 7732 if (onm[0] == '.' && (onm[1] == '\0' ||
7739 7733 (onm[1] == '.' && onm[2] == '\0')))
7740 7734 return (EINVAL);
7741 7735
7742 7736 if (nnm[0] == '.' && (nnm[1] == '\0' ||
7743 7737 (nnm[1] == '.' && nnm[2] == '\0')))
7744 7738 return (EINVAL);
7745 7739
7746 7740 odrp = VTOR4(odvp);
7747 7741 ndrp = VTOR4(ndvp);
7748 7742 if ((intptr_t)odrp < (intptr_t)ndrp) {
7749 7743 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
7750 7744 return (EINTR);
7751 7745 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
7752 7746 nfs_rw_exit(&odrp->r_rwlock);
7753 7747 return (EINTR);
7754 7748 }
7755 7749 } else {
7756 7750 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
7757 7751 return (EINTR);
7758 7752 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
7759 7753 nfs_rw_exit(&ndrp->r_rwlock);
7760 7754 return (EINTR);
7761 7755 }
7762 7756 }
7763 7757
7764 7758 /*
7765 7759 * Lookup the target file. If it exists, it needs to be
7766 7760 * checked to see whether it is a mount point and whether
7767 7761 * it is active (open).
7768 7762 */
7769 7763 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
7770 7764 if (!error) {
7771 7765 int isactive;
7772 7766
7773 7767 ASSERT(nfs4_consistent_type(nvp));
7774 7768 /*
7775 7769 * If this file has been mounted on, then just
7776 7770 * return busy because renaming to it would remove
7777 7771 * the mounted file system from the name space.
7778 7772 */
7779 7773 if (vn_ismntpt(nvp)) {
7780 7774 VN_RELE(nvp);
7781 7775 nfs_rw_exit(&odrp->r_rwlock);
7782 7776 nfs_rw_exit(&ndrp->r_rwlock);
7783 7777 return (EBUSY);
7784 7778 }
7785 7779
7786 7780 /*
7787 7781 * First just remove the entry from the name cache, as it
7788 7782 * is most likely the only entry for this vp.
7789 7783 */
7790 7784 dnlc_remove(ndvp, nnm);
7791 7785
7792 7786 rp = VTOR4(nvp);
7793 7787
7794 7788 if (nvp->v_type != VREG) {
7795 7789 /*
7796 7790 * Purge the name cache of all references to this vnode
7797 7791 * so that we can check the reference count to infer
7798 7792 * whether it is active or not.
7799 7793 */
7800 7794 if (nvp->v_count > 1)
7801 7795 dnlc_purge_vp(nvp);
7802 7796
7803 7797 isactive = nvp->v_count > 1;
7804 7798 } else {
7805 7799 mutex_enter(&rp->r_os_lock);
7806 7800 isactive = list_head(&rp->r_open_streams) != NULL;
7807 7801 mutex_exit(&rp->r_os_lock);
7808 7802 }
7809 7803
7810 7804 /*
7811 7805 * If the vnode is active and is not a directory,
7812 7806 * arrange to rename it to a
7813 7807 * temporary file so that it will continue to be
7814 7808 * accessible. This implements the "unlink-open-file"
7815 7809 * semantics for the target of a rename operation.
7816 7810 * Before doing this though, make sure that the
7817 7811 * source and target files are not already the same.
7818 7812 */
7819 7813 if (isactive && nvp->v_type != VDIR) {
7820 7814 /*
7821 7815 * Lookup the source name.
7822 7816 */
7823 7817 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7824 7818
7825 7819 /*
7826 7820 * The source name *should* already exist.
7827 7821 */
7828 7822 if (error) {
7829 7823 VN_RELE(nvp);
7830 7824 nfs_rw_exit(&odrp->r_rwlock);
7831 7825 nfs_rw_exit(&ndrp->r_rwlock);
7832 7826 return (error);
7833 7827 }
7834 7828
7835 7829 ASSERT(nfs4_consistent_type(ovp));
7836 7830
7837 7831 /*
7838 7832 * Compare the two vnodes. If they are the same,
7839 7833 * just release all held vnodes and return success.
7840 7834 */
7841 7835 if (VN_CMP(ovp, nvp)) {
7842 7836 VN_RELE(ovp);
7843 7837 VN_RELE(nvp);
7844 7838 nfs_rw_exit(&odrp->r_rwlock);
7845 7839 nfs_rw_exit(&ndrp->r_rwlock);
7846 7840 return (0);
7847 7841 }
7848 7842
7849 7843 /*
7850 7844 * Can't mix and match directories and non-
7851 7845 * directories in rename operations. We already
7852 7846 * know that the target is not a directory. If
7853 7847 * the source is a directory, return an error.
7854 7848 */
7855 7849 if (ovp->v_type == VDIR) {
7856 7850 VN_RELE(ovp);
7857 7851 VN_RELE(nvp);
7858 7852 nfs_rw_exit(&odrp->r_rwlock);
7859 7853 nfs_rw_exit(&ndrp->r_rwlock);
7860 7854 return (ENOTDIR);
7861 7855 }
7862 7856 link_call:
7863 7857 /*
7864 7858 * The target file exists, is not the same as
7865 7859 * the source file, and is active. We first
7866 7860 * try to Link it to a temporary filename to
7867 7861 * avoid having the server removing the file
7868 7862			 * completely (which could look like data loss
7869 7863			 * from the user's POV in the event the Rename fails
7870 7864 * -- see bug 1165874).
7871 7865 */
7872 7866 /*
7873 7867 * The do_link and did_link booleans are
7874 7868 * introduced in the event we get NFS4ERR_FILE_OPEN
7875 7869			 * returned for the Rename. Some servers cannot
7876 7870			 * Rename over an Open file, so they return
7877 7871 * this error. The client needs to Remove the
7878 7872 * newly created Link and do two Renames, just
7879 7873 * as if the server didn't support LINK.
7880 7874 */
7881 7875 tmpname = newname();
7882 7876 error = 0;
7883 7877
7884 7878 if (do_link) {
7885 7879 error = nfs4_link(ndvp, nvp, tmpname, cr,
7886 7880 NULL, 0);
7887 7881 }
7888 7882 if (error == EOPNOTSUPP || !do_link) {
7889 7883 error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7890 7884 cr, NULL, 0);
7891 7885 did_link = 0;
7892 7886 } else {
7893 7887 did_link = 1;
7894 7888 }
7895 7889 if (error) {
7896 7890 kmem_free(tmpname, MAXNAMELEN);
7897 7891 VN_RELE(ovp);
7898 7892 VN_RELE(nvp);
7899 7893 nfs_rw_exit(&odrp->r_rwlock);
7900 7894 nfs_rw_exit(&ndrp->r_rwlock);
7901 7895 return (error);
7902 7896 }
7903 7897
7904 7898 mutex_enter(&rp->r_statelock);
7905 7899 if (rp->r_unldvp == NULL) {
7906 7900 VN_HOLD(ndvp);
7907 7901 rp->r_unldvp = ndvp;
7908 7902 if (rp->r_unlcred != NULL)
7909 7903 crfree(rp->r_unlcred);
7910 7904 crhold(cr);
7911 7905 rp->r_unlcred = cr;
7912 7906 rp->r_unlname = tmpname;
7913 7907 } else {
7914 7908 if (rp->r_unlname)
7915 7909 kmem_free(rp->r_unlname, MAXNAMELEN);
7916 7910 rp->r_unlname = tmpname;
7917 7911 }
7918 7912 mutex_exit(&rp->r_statelock);
7919 7913 }
7920 7914
7921 7915 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7922 7916
7923 7917 ASSERT(nfs4_consistent_type(nvp));
7924 7918 }
7925 7919
7926 7920 if (ovp == NULL) {
7927 7921 /*
7928 7922 * When renaming directories to be a subdirectory of a
7929 7923 * different parent, the dnlc entry for ".." will no
7930 7924 * longer be valid, so it must be removed.
7931 7925 *
7932 7926 * We do a lookup here to determine whether we are renaming
7933 7927 * a directory and we need to check if we are renaming
7934 7928 * an unlinked file. This might have already been done
7935 7929 * in previous code, so we check ovp == NULL to avoid
7936 7930 * doing it twice.
7937 7931 */
7938 7932 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7939 7933 /*
7940 7934 * The source name *should* already exist.
7941 7935 */
7942 7936 if (error) {
7943 7937 nfs_rw_exit(&odrp->r_rwlock);
7944 7938 nfs_rw_exit(&ndrp->r_rwlock);
7945 7939 if (nvp) {
7946 7940 VN_RELE(nvp);
7947 7941 }
7948 7942 return (error);
7949 7943 }
7950 7944 ASSERT(ovp != NULL);
7951 7945 ASSERT(nfs4_consistent_type(ovp));
7952 7946 }
7953 7947
7954 7948 /*
7955 7949 * Is the object being renamed a dir, and if so, is
7956 7950 * it being renamed to a child of itself? The underlying
7957 7951 * fs should ultimately return EINVAL for this case;
7958 7952 * however, buggy beta non-Solaris NFSv4 servers at
7959 7953 * interop testing events have allowed this behavior,
7960 7954 * and it caused our client to panic due to a recursive
7961 7955 * mutex_enter in fn_move.
7962 7956 *
7963 7957 * The tedious locking in fn_move could be changed to
7964 7958 * deal with this case, and the client could avoid the
7965 7959 * panic; however, the client would just confuse itself
7966 7960 * later and misbehave. A better way to handle the broken
7967 7961 * server is to detect this condition and return EINVAL
7968 7962	 * without ever sending the bogus rename to the server.
7969 7963 * We know the rename is invalid -- just fail it now.
7970 7964 */
7971 7965 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7972 7966 VN_RELE(ovp);
7973 7967 nfs_rw_exit(&odrp->r_rwlock);
7974 7968 nfs_rw_exit(&ndrp->r_rwlock);
7975 7969 if (nvp) {
7976 7970 VN_RELE(nvp);
7977 7971 }
7978 7972 return (EINVAL);
7979 7973 }
7980 7974
7981 7975 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7982 7976
7983 7977 /*
7984 7978 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7985 7979 * possible for the filehandle to change due to the rename.
7986 7980 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7987 7981 * the fh will not change because of the rename, but we still need
7988 7982 * to update its rnode entry with the new name for
7989 7983 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
7990 7984 * has no effect on these for now, but for future improvements,
7991 7985 * we might want to use it too to simplify handling of files
7992 7986 * that are open with that flag on. (XXX)
7993 7987 */
7994 7988 mi = VTOMI4(odvp);
7995 7989 if (NFS4_VOLATILE_FH(mi))
7996 7990 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
7997 7991 &stat);
7998 7992 else
7999 7993 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
8000 7994 &stat);
8001 7995
8002 7996 ASSERT(nfs4_consistent_type(odvp));
8003 7997 ASSERT(nfs4_consistent_type(ndvp));
8004 7998 ASSERT(nfs4_consistent_type(ovp));
8005 7999
8006 8000 if (stat == NFS4ERR_FILE_OPEN && did_link) {
8007 8001 do_link = 0;
8008 8002 /*
8009 8003 * Before the 'link_call' code, we did a nfs4_lookup
8010 8004 * that puts a VN_HOLD on nvp. After the nfs4_link
8011 8005 * call we call VN_RELE to match that hold. We need
8012 8006 * to place an additional VN_HOLD here since we will
8013 8007 * be hitting that VN_RELE again.
8014 8008 */
8015 8009 VN_HOLD(nvp);
8016 8010
8017 8011 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);
8018 8012
8019 8013 /* Undo the unlinked file naming stuff we just did */
8020 8014 mutex_enter(&rp->r_statelock);
8021 8015 if (rp->r_unldvp) {
8022 8016 VN_RELE(ndvp);
8023 8017 rp->r_unldvp = NULL;
8024 8018 if (rp->r_unlcred != NULL)
8025 8019 crfree(rp->r_unlcred);
8026 8020 rp->r_unlcred = NULL;
8027 8021			/* rp->r_unlname points to tmpname */
8028 8022 if (rp->r_unlname)
8029 8023 kmem_free(rp->r_unlname, MAXNAMELEN);
8030 8024 rp->r_unlname = NULL;
8031 8025 }
8032 8026 mutex_exit(&rp->r_statelock);
8033 8027
8034 8028 if (nvp) {
8035 8029 VN_RELE(nvp);
8036 8030 }
8037 8031 goto link_call;
8038 8032 }
8039 8033
8040 8034 if (error) {
8041 8035 VN_RELE(ovp);
8042 8036 nfs_rw_exit(&odrp->r_rwlock);
8043 8037 nfs_rw_exit(&ndrp->r_rwlock);
8044 8038 if (nvp) {
8045 8039 VN_RELE(nvp);
8046 8040 }
8047 8041 return (error);
8048 8042 }
8049 8043
8050 8044 /*
8051 8045 * when renaming directories to be a subdirectory of a
8052 8046 * different parent, the dnlc entry for ".." will no
8053 8047 * longer be valid, so it must be removed
8054 8048 */
8055 8049 rp = VTOR4(ovp);
8056 8050 if (ndvp != odvp) {
8057 8051 if (ovp->v_type == VDIR) {
8058 8052 dnlc_remove(ovp, "..");
8059 8053 if (rp->r_dir != NULL)
8060 8054 nfs4_purge_rddir_cache(ovp);
8061 8055 }
8062 8056 }
8063 8057
8064 8058 /*
8065 8059 * If we are renaming the unlinked file, update the
8066 8060 * r_unldvp and r_unlname as needed.
8067 8061 */
8068 8062 mutex_enter(&rp->r_statelock);
8069 8063 if (rp->r_unldvp != NULL) {
8070 8064 if (strcmp(rp->r_unlname, onm) == 0) {
8071 8065 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
8072 8066 rp->r_unlname[MAXNAMELEN - 1] = '\0';
8073 8067 if (ndvp != rp->r_unldvp) {
8074 8068 VN_RELE(rp->r_unldvp);
8075 8069 rp->r_unldvp = ndvp;
8076 8070 VN_HOLD(ndvp);
8077 8071 }
8078 8072 }
8079 8073 }
8080 8074 mutex_exit(&rp->r_statelock);
8081 8075
8082 8076 /*
8083 8077 * Notify the rename vnevents to source vnode, and to the target
8084 8078 * vnode if it already existed.
8085 8079 */
8086 8080 if (error == 0) {
8087 8081 vnode_t *tvp, *tovp;
8088 8082 rnode4_t *trp;
8089 8083
8090 8084 /*
8091 8085		 * Notify the vnode. Each link is represented by
8092 8086		 * a different vnode in NFSv4.
8093 8087 */
8094 8088 if (nvp) {
8095 8089 trp = VTOR4(nvp);
8096 8090 tvp = nvp;
8097 8091 if (IS_SHADOW(nvp, trp))
8098 8092 tvp = RTOV4(trp);
8099 8093 vnevent_rename_dest(tvp, ndvp, nnm, ct);
8100 8094 }
8101 8095
8102 8096 trp = VTOR4(ovp);
8103 8097 tovp = ovp;
8104 8098 if (IS_SHADOW(ovp, trp))
8105 8099 tovp = RTOV4(trp);
8106 8100
8107 8101 vnevent_rename_src(tovp, odvp, onm, ct);
8108 8102
8109 8103 trp = VTOR4(ndvp);
8110 8104 tvp = ndvp;
8111 8105
8112 8106 if (IS_SHADOW(ndvp, trp))
8113 8107 tvp = RTOV4(trp);
8114 8108
8115 8109 vnevent_rename_dest_dir(tvp, tovp, nnm, ct);
8116 8110 }
8117 8111
8118 8112 if (nvp) {
8119 8113 VN_RELE(nvp);
8120 8114 }
8121 8115 VN_RELE(ovp);
8122 8116
8123 8117 nfs_rw_exit(&odrp->r_rwlock);
8124 8118 nfs_rw_exit(&ndrp->r_rwlock);
8125 8119
8126 8120 return (error);
8127 8121 }
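
nfs4rename() above avoids an AB/BA deadlock on the two directory rwlocks by always acquiring the lower-addressed rnode's lock first. The same idiom in user space (pthreads stand in for nfs_rw_enter_sig(); a shared lock is taken only once):

	#include <pthread.h>
	#include <stdint.h>

	static void
	lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		if (a == b) {		/* same directory: single lock */
			(void) pthread_mutex_lock(a);
			return;
		}
		if ((uintptr_t)a < (uintptr_t)b) {
			(void) pthread_mutex_lock(a);
			(void) pthread_mutex_lock(b);
		} else {
			(void) pthread_mutex_lock(b);
			(void) pthread_mutex_lock(a);
		}
	}

Because every thread orders the pair the same way, two concurrent renames between the same two directories cannot each hold one lock while waiting on the other.
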
8128 8122
8129 8123 /*
8130 8124 * When the parent directory has changed, sv_dfh must be updated
8131 8125 */
8132 8126 static void
8133 8127 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8134 8128 {
8135 8129 svnode_t *sv = VTOSV(vp);
8136 8130 nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8137 8131 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8138 8132
8139 8133 sfh4_hold(new_dfh);
8140 8134 sv->sv_dfh = new_dfh;
8141 8135 sfh4_rele(&old_dfh);
8142 8136 }
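
Note the ordering in update_parentdir_sfh() above: the new filehandle is held before the old one is released, so the swap stays balanced even when old_dfh and new_dfh are the same object. The generic refcount-swap shape, with a made-up struct ref for illustration:

	/* Hypothetical refcounted object; hold/rele are illustrative. */
	struct ref { int cnt; };
	static void hold(struct ref *r) { r->cnt++; }
	static void rele(struct ref *r) { r->cnt--; }

	static void
	swap_ref(struct ref **slot, struct ref *new_ref)
	{
		struct ref *old_ref = *slot;

		hold(new_ref);		/* take the new hold first ... */
		*slot = new_ref;
		rele(old_ref);		/* ... so old == new stays safe */
	}
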
8143 8137
8144 8138 /*
8145 8139 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8146 8140 * when it is known that the filehandle is persistent through rename.
8147 8141 *
8148 8142 * Rename requires that the current fh be the target directory and the
8149 8143 * saved fh be the source directory. After the operation, the current fh
8150 8144 * is unchanged.
8151 8145 * The compound op structure for persistent fh rename is:
8152 8146	 *	PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME, GETATTR(tgtdir)
8153 8147	 * plus an appended PUTFH(sourcedir), GETATTR when the two directories
8154 8148	 * differ; the post-op getattrs are used to update the directory caches.
8155 8149 */
8156 8150 static int
8157 8151 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8158 8152 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8159 8153 {
8160 8154 COMPOUND4args_clnt args;
8161 8155 COMPOUND4res_clnt res, *resp = NULL;
8162 8156 nfs_argop4 *argop;
8163 8157 nfs_resop4 *resop;
8164 8158 int doqueue, argoplist_size;
8165 8159 mntinfo4_t *mi;
8166 8160 rnode4_t *odrp = VTOR4(odvp);
8167 8161 rnode4_t *ndrp = VTOR4(ndvp);
8168 8162 RENAME4res *rn_res;
8169 8163 bool_t needrecov;
8170 8164 nfs4_recov_state_t recov_state;
8171 8165 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8172 8166 dirattr_info_t dinfo, *dinfop;
8173 8167
8174 8168 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8175 8169
8176 8170 recov_state.rs_flags = 0;
8177 8171 recov_state.rs_num_retry_despite_err = 0;
8178 8172
8179 8173 /*
8180 8174 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8181 8175 *
8182 8176 * If source/target are different dirs, then append putfh(src); getattr
8183 8177 */
8184 8178 args.array_len = (odvp == ndvp) ? 5 : 7;
8185 8179 argoplist_size = args.array_len * sizeof (nfs_argop4);
8186 8180 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8187 8181
8188 8182 recov_retry:
8189 8183 *statp = NFS4_OK;
8190 8184
8191 8185 /* No need to Lookup the file, persistent fh */
8192 8186 args.ctag = TAG_RENAME;
8193 8187
8194 8188 mi = VTOMI4(odvp);
8195 8189 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8196 8190 if (e.error) {
8197 8191 kmem_free(argop, argoplist_size);
8198 8192 return (e.error);
8199 8193 }
8200 8194
8201 8195 /* 0: putfh source directory */
8202 8196 argop[0].argop = OP_CPUTFH;
8203 8197 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8204 8198
8205 8199 /* 1: Save source fh to free up current for target */
8206 8200 argop[1].argop = OP_SAVEFH;
8207 8201
8208 8202 /* 2: putfh targetdir */
8209 8203 argop[2].argop = OP_CPUTFH;
8210 8204 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8211 8205
8212 8206 /* 3: current_fh is targetdir, saved_fh is sourcedir */
8213 8207 argop[3].argop = OP_CRENAME;
8214 8208 argop[3].nfs_argop4_u.opcrename.coldname = onm;
8215 8209 argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8216 8210
8217 8211 /* 4: getattr (targetdir) */
8218 8212 argop[4].argop = OP_GETATTR;
8219 8213 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8220 8214 argop[4].nfs_argop4_u.opgetattr.mi = mi;
8221 8215
8222 8216 if (ndvp != odvp) {
8223 8217
8224 8218 /* 5: putfh (sourcedir) */
8225 8219 argop[5].argop = OP_CPUTFH;
8226 8220 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8227 8221
8228 8222 /* 6: getattr (sourcedir) */
8229 8223 argop[6].argop = OP_GETATTR;
8230 8224 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8231 8225 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8232 8226 }
8233 8227
8234 8228 dnlc_remove(odvp, onm);
8235 8229 dnlc_remove(ndvp, nnm);
8236 8230
8237 8231 doqueue = 1;
8238 8232 dinfo.di_time_call = gethrtime();
8239 8233 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8240 8234
8241 8235 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8242 8236 if (e.error) {
8243 8237 PURGE_ATTRCACHE4(odvp);
8244 8238 PURGE_ATTRCACHE4(ndvp);
8245 8239 } else {
8246 8240 *statp = res.status;
8247 8241 }
8248 8242
8249 8243 if (needrecov) {
8250 8244 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8251 8245 OP_RENAME, NULL, NULL, NULL) == FALSE) {
8252 8246 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8253 8247 if (!e.error)
8254 8248 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8255 8249 goto recov_retry;
8256 8250 }
8257 8251 }
8258 8252
8259 8253 if (!e.error) {
8260 8254 resp = &res;
8261 8255 /*
8262 8256 * Return an error if OP_RENAME (index 3) or any earlier op failed.
8263 8257 */
8264 8258 if (res.status != NFS4_OK && res.array_len <= 4) {
8265 8259 e.error = geterrno4(res.status);
8266 8260 PURGE_ATTRCACHE4(odvp);
8267 8261 PURGE_ATTRCACHE4(ndvp);
8268 8262 /*
8269 8263 * System V defines rename to return EEXIST, not
8270 8264 * ENOTEMPTY, if the target directory is not empty.
8271 8265 * Over the wire, the error is NFSERR_ENOTEMPTY
8272 8266 * which geterrno4 maps to ENOTEMPTY.
8273 8267 */
8274 8268 if (e.error == ENOTEMPTY)
8275 8269 e.error = EEXIST;
8276 8270 } else {
8277 8271
8278 8272 resop = &res.array[3]; /* rename res */
8279 8273 rn_res = &resop->nfs_resop4_u.oprename;
8280 8274
8281 8275 if (res.status == NFS4_OK) {
8282 8276 /*
8283 8277 * Update target attribute, readdir and dnlc
8284 8278 * caches.
8285 8279 */
8286 8280 dinfo.di_garp =
8287 8281 &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8288 8282 dinfo.di_cred = cr;
8289 8283 dinfop = &dinfo;
8290 8284 } else
8291 8285 dinfop = NULL;
8292 8286
8293 8287 nfs4_update_dircaches(&rn_res->target_cinfo,
8294 8288 ndvp, NULL, NULL, dinfop);
8295 8289
8296 8290 /*
8297 8291 * Update source attribute, readdir and dnlc caches
8298 8292 *
8299 8293 */
8300 8294 if (ndvp != odvp) {
8301 8295 update_parentdir_sfh(renvp, ndvp);
8302 8296
8303 8297 if (dinfop)
8304 8298 dinfo.di_garp =
8305 8299 &(res.array[6].nfs_resop4_u.
8306 8300 opgetattr.ga_res);
8307 8301
8308 8302 nfs4_update_dircaches(&rn_res->source_cinfo,
8309 8303 odvp, NULL, NULL, dinfop);
8310 8304 }
8311 8305
8312 8306 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8313 8307 nnm);
8314 8308 }
8315 8309 }
8316 8310
8317 8311 if (resp)
8318 8312 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8319 8313 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8320 8314 kmem_free(argop, argoplist_size);
8321 8315
8322 8316 return (e.error);
8323 8317 }
8324 8318
8325 8319 /*
8326 8320 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8327 8321 * it is possible for the filehandle to change due to the rename.
8328 8322 *
8329 8323 * The compound req in this case includes a post-rename lookup and getattr
8330 8324 * to ensure that we have the correct fh and attributes for the object.
8331 8325 *
8332 8326 * Rename requires that the current fh be the target directory and the
8333 8327 * saved fh be the source directory. After the operation, the current fh
8334 8328 * is unchanged.
8335 8329 *
8336 8330 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8337 8331 * update the filehandle for the renamed object. We also get the old
8338 8332 * filehandle for historical reasons; this should be taken out sometime.
8339 8333 * This results in a rather cumbersome compound...
8340 8334 *
8341 8335 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
8342 8336 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8343 8337 *
8344 8338 */
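/*
 * For reference, the argop[] array built below is laid out as:
 *	0: PUTFH(sourcedir)   1: SAVEFH            2: LOOKUP(src)
 *	3: GETFH(old)         4: PUTFH(targetdir)  5: RENAME
 *	6: GETATTR(targetdir) 7: LOOKUP(trgt)      8: GETFH(new)
 *	9: GETATTR(new)
 * and, only when the source and target directories differ:
 *	10: PUTFH(sourcedir)  11: GETATTR(sourcedir)
 */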
8345 8339 static int
8346 8340 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8347 8341 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8348 8342 {
8349 8343 COMPOUND4args_clnt args;
8350 8344 COMPOUND4res_clnt res, *resp = NULL;
8351 8345 int argoplist_size;
8352 8346 nfs_argop4 *argop;
8353 8347 nfs_resop4 *resop;
8354 8348 int doqueue;
8355 8349 mntinfo4_t *mi;
8356 8350 rnode4_t *odrp = VTOR4(odvp); /* old directory */
8357 8351 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */
8358 8352 rnode4_t *orp = VTOR4(ovp); /* object being renamed */
8359 8353 RENAME4res *rn_res;
8360 8354 GETFH4res *ngf_res;
8361 8355 bool_t needrecov;
8362 8356 nfs4_recov_state_t recov_state;
8363 8357 hrtime_t t;
8364 8358 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8365 8359 dirattr_info_t dinfo, *dinfop = &dinfo;
8366 8360
8367 8361 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8368 8362
8369 8363 recov_state.rs_flags = 0;
8370 8364 recov_state.rs_num_retry_despite_err = 0;
8371 8365
8372 8366 recov_retry:
8373 8367 *statp = NFS4_OK;
8374 8368
8375 8369 /*
8376 8370 * There is a window between the RPC and updating the path and
8377 8371 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery
8378 8372 * code, so that it doesn't try to use the old path during that
8379 8373 * window.
8380 8374 */
8381 8375 mutex_enter(&orp->r_statelock);
8382 8376 while (orp->r_flags & R4RECEXPFH) {
8383 8377 klwp_t *lwp = ttolwp(curthread);
8384 8378
8385 8379 if (lwp != NULL)
8386 8380 lwp->lwp_nostop++;
8387 8381 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8388 8382 mutex_exit(&orp->r_statelock);
8389 8383 if (lwp != NULL)
8390 8384 lwp->lwp_nostop--;
8391 8385 return (EINTR);
8392 8386 }
8393 8387 if (lwp != NULL)
8394 8388 lwp->lwp_nostop--;
8395 8389 }
8396 8390 orp->r_flags |= R4RECEXPFH;
8397 8391 mutex_exit(&orp->r_statelock);
8398 8392
8399 8393 mi = VTOMI4(odvp);
8400 8394
8401 8395 args.ctag = TAG_RENAME_VFH;
8402 8396 args.array_len = (odvp == ndvp) ? 10 : 12;
8403 8397 argoplist_size = args.array_len * sizeof (nfs_argop4);
8404 8398 argop = kmem_alloc(argoplist_size, KM_SLEEP);
8405 8399
8406 8400 /*
8407 8401 * Rename ops:
8408 8402 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
8409 8403 * PUTFH(targetdir), RENAME, GETATTR(targetdir),
8410 8404 * LOOKUP(trgt), GETFH(new), GETATTR
8411 8405 *
8412 8406 * if (odvp != ndvp)
8413 8407 *	add PUTFH(sourcedir), GETATTR(sourcedir)
8414 8408 */
8415 8409 args.array = argop;
8416 8410
8417 8411 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8418 8412 &recov_state, NULL);
8419 8413 if (e.error) {
8420 8414 kmem_free(argop, argoplist_size);
8421 8415 mutex_enter(&orp->r_statelock);
8422 8416 orp->r_flags &= ~R4RECEXPFH;
8423 8417 cv_broadcast(&orp->r_cv);
8424 8418 mutex_exit(&orp->r_statelock);
8425 8419 return (e.error);
8426 8420 }
8427 8421
8428 8422 /* 0: putfh source directory */
8429 8423 argop[0].argop = OP_CPUTFH;
8430 8424 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8431 8425
8432 8426 /* 1: Save source fh to free up current for target */
8433 8427 argop[1].argop = OP_SAVEFH;
8434 8428
8435 8429 /* 2: Lookup pre-rename fh of renamed object */
8436 8430 argop[2].argop = OP_CLOOKUP;
8437 8431 argop[2].nfs_argop4_u.opclookup.cname = onm;
8438 8432
8439 8433 /* 3: getfh fh of renamed object (before rename) */
8440 8434 argop[3].argop = OP_GETFH;
8441 8435
8442 8436 /* 4: putfh targetdir */
8443 8437 argop[4].argop = OP_CPUTFH;
8444 8438 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8445 8439
8446 8440 /* 5: current_fh is targetdir, saved_fh is sourcedir */
8447 8441 argop[5].argop = OP_CRENAME;
8448 8442 argop[5].nfs_argop4_u.opcrename.coldname = onm;
8449 8443 argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8450 8444
8451 8445 /* 6: getattr of target dir (post op attrs) */
8452 8446 argop[6].argop = OP_GETATTR;
8453 8447 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8454 8448 argop[6].nfs_argop4_u.opgetattr.mi = mi;
8455 8449
8456 8450 /* 7: Lookup post-rename fh of renamed object */
8457 8451 argop[7].argop = OP_CLOOKUP;
8458 8452 argop[7].nfs_argop4_u.opclookup.cname = nnm;
8459 8453
8460 8454 /* 8: getfh fh of renamed object (after rename) */
8461 8455 argop[8].argop = OP_GETFH;
8462 8456
8463 8457 /* 9: getattr of renamed object */
8464 8458 argop[9].argop = OP_GETATTR;
8465 8459 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8466 8460 argop[9].nfs_argop4_u.opgetattr.mi = mi;
8467 8461
8468 8462 /*
8469 8463 * If source/target dirs are different, then get new post-op
8470 8464 * attrs for source dir also.
8471 8465 */
8472 8466 if (ndvp != odvp) {
8473 8467 /* 10: putfh (sourcedir) */
8474 8468 argop[10].argop = OP_CPUTFH;
8475 8469 argop[10].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8476 8470
8477 8471 /* 11: getattr (sourcedir) */
8478 8472 argop[11].argop = OP_GETATTR;
8479 8473 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8480 8474 argop[11].nfs_argop4_u.opgetattr.mi = mi;
8481 8475 }
8482 8476
8483 8477 dnlc_remove(odvp, onm);
8484 8478 dnlc_remove(ndvp, nnm);
8485 8479
8486 8480 doqueue = 1;
8487 8481 t = gethrtime();
8488 8482 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8489 8483
8490 8484 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8491 8485 if (e.error) {
8492 8486 PURGE_ATTRCACHE4(odvp);
8493 8487 PURGE_ATTRCACHE4(ndvp);
8494 8488 if (!needrecov) {
8495 8489 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8496 8490 &recov_state, needrecov);
8497 8491 goto out;
8498 8492 }
8499 8493 } else {
8500 8494 *statp = res.status;
8501 8495 }
8502 8496
8503 8497 if (needrecov) {
8504 8498 bool_t abort;
8505 8499
8506 8500 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8507 8501 OP_RENAME, NULL, NULL, NULL);
8508 8502 if (abort == FALSE) {
8509 8503 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8510 8504 &recov_state, needrecov);
8511 8505 kmem_free(argop, argoplist_size);
8512 8506 if (!e.error)
8513 8507 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8514 8508 mutex_enter(&orp->r_statelock);
8515 8509 orp->r_flags &= ~R4RECEXPFH;
8516 8510 cv_broadcast(&orp->r_cv);
8517 8511 mutex_exit(&orp->r_statelock);
8518 8512 goto recov_retry;
8519 8513 } else {
8520 8514 if (e.error != 0) {
8521 8515 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8522 8516 &recov_state, needrecov);
8523 8517 goto out;
8524 8518 }
8525 8519 /* fall through for res.status case */
8526 8520 }
8527 8521 }
8528 8522
8529 8523 resp = &res;
8530 8524 /*
8531 8525 * If OP_RENAME (or any previous op) failed, then return an error;
8532 8526 * OP_RENAME is index 5, so array_len <= 6 indicates such a failure.
8533 8527 */
8534 8528 if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8535 8529 /*
8536 8530 * Error in an op other than last Getattr
8537 8531 */
8538 8532 e.error = geterrno4(res.status);
8539 8533 PURGE_ATTRCACHE4(odvp);
8540 8534 PURGE_ATTRCACHE4(ndvp);
8541 8535 /*
8542 8536 * System V defines rename to return EEXIST, not
8543 8537 * ENOTEMPTY, if the target directory is not empty.
8544 8538 * Over the wire, the error is NFSERR_ENOTEMPTY
8545 8539 * which geterrno4 maps to ENOTEMPTY.
8546 8540 */
8547 8541 if (e.error == ENOTEMPTY)
8548 8542 e.error = EEXIST;
8549 8543 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8550 8544 needrecov);
8551 8545 goto out;
8552 8546 }
8553 8547
8554 8548 /* rename results */
8555 8549 rn_res = &res.array[5].nfs_resop4_u.oprename;
8556 8550
8557 8551 if (res.status == NFS4_OK) {
8558 8552 /* Update target attribute, readdir and dnlc caches */
8559 8553 dinfo.di_garp =
8560 8554 &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8561 8555 dinfo.di_cred = cr;
8562 8556 dinfo.di_time_call = t;
8563 8557 } else
8564 8558 dinfop = NULL;
8565 8559
8566 8560 /* Update target attribute, readdir and dnlc caches */
8567 8561 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8568 8562
8569 8563 /* Update source cache attribute, readdir and dnlc caches */
8570 8564 if (ndvp != odvp) {
8571 8565 update_parentdir_sfh(ovp, ndvp);
8572 8566
8573 8567 /*
8574 8568 * If dinfop is non-NULL, then the compound succeeded, so
8575 8569 * set di_garp to the attrs for the source dir. dinfop is
8576 8570 * only set to NULL when the compound fails.
8577 8571 */
8578 8572 if (dinfop)
8579 8573 dinfo.di_garp =
8580 8574 &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8581 8575 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8582 8576 dinfop);
8583 8577 }
8584 8578
8585 8579 /*
8586 8580 * Update the rnode with the new component name and args,
8587 8581 * and if the file handle changed, also update it with the new fh.
8588 8582 * This is only necessary if the target object already has an
8589 8583 * rnode entry; there is no need to create one for it.
8590 8584 */
8591 8585 resop = &res.array[8]; /* getfh new res */
8592 8586 ngf_res = &resop->nfs_resop4_u.opgetfh;
8593 8587
8594 8588 /*
8595 8589 * Update the path and filehandle for the renamed object.
8596 8590 */
8597 8591 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8598 8592
8599 8593 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8600 8594
8601 8595 if (res.status == NFS4_OK) {
8602 8596 resop++; /* getattr res */
8603 8597 e.error = nfs4_update_attrcache(res.status,
8604 8598 &resop->nfs_resop4_u.opgetattr.ga_res,
8605 8599 t, ovp, cr);
8606 8600 }
8607 8601
8608 8602 out:
8609 8603 kmem_free(argop, argoplist_size);
8610 8604 if (resp)
8611 8605 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8612 8606 mutex_enter(&orp->r_statelock);
8613 8607 orp->r_flags &= ~R4RECEXPFH;
8614 8608 cv_broadcast(&orp->r_cv);
8615 8609 mutex_exit(&orp->r_statelock);
8616 8610
8617 8611 return (e.error);
8618 8612 }
8619 8613
8620 8614 /* ARGSUSED */
8621 8615 static int
8622 8616 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8623 8617 caller_context_t *ct, int flags, vsecattr_t *vsecp)
8624 8618 {
8625 8619 int error;
8626 8620 vnode_t *vp;
8627 8621
8628 8622 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8629 8623 return (EPERM);
8630 8624 /*
8631 8625 * As ".." has special meaning and rather than send a mkdir
8632 8626 * over the wire to just let the server freak out, we just
8633 8627 * short circuit it here and return EEXIST
8634 8628 */
8635 8629 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8636 8630 return (EEXIST);
8637 8631
8638 8632 /*
8639 8633 * Decision to get the right gid and setgid bit of the
8640 8634 * new directory is now made in call_nfs4_create_req.
8641 8635 */
8642 8636 va->va_mask |= AT_MODE;
8643 8637 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8644 8638 if (error)
8645 8639 return (error);
8646 8640
8647 8641 *vpp = vp;
8648 8642 return (0);
8649 8643 }
8650 8644
8651 8645
8652 8646 /*
8653 8647 * rmdir is using the same remove v4 op as does remove.
8654 8648 * Remove requires that the current fh be the target directory.
8655 8649 * After the operation, the current fh is unchanged.
8656 8650 * The compound op structure is:
8657 8651 * PUTFH(targetdir), REMOVE, GETATTR(targetdir)
8658 8652 */
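/*
 * As built below, that is:
 *	0: PUTFH(targetdir)   1: REMOVE   2: GETATTR(targetdir)
 */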
8659 8653 /*ARGSUSED4*/
8660 8654 static int
8661 8655 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
8662 8656 caller_context_t *ct, int flags)
8663 8657 {
8664 8658 int need_end_op = FALSE;
8665 8659 COMPOUND4args_clnt args;
8666 8660 COMPOUND4res_clnt res, *resp = NULL;
8667 8661 REMOVE4res *rm_res;
8668 8662 nfs_argop4 argop[3];
8669 8663 nfs_resop4 *resop;
8670 8664 vnode_t *vp;
8671 8665 int doqueue;
8672 8666 mntinfo4_t *mi;
8673 8667 rnode4_t *drp;
8674 8668 bool_t needrecov = FALSE;
8675 8669 nfs4_recov_state_t recov_state;
8676 8670 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8677 8671 dirattr_info_t dinfo, *dinfop;
8678 8672
8679 8673 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8680 8674 return (EPERM);
8681 8675 /*
8682 8676 * As ".." has special meaning and rather than send a rmdir
8683 8677 * over the wire to just let the server freak out, we just
8684 8678 * short circuit it here and return EEXIST
8685 8679 */
8686 8680 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8687 8681 return (EEXIST);
8688 8682
8689 8683 drp = VTOR4(dvp);
8690 8684 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
8691 8685 return (EINTR);
8692 8686
8693 8687 /*
8694 8688 * Attempt to prevent a rmdir(".") from succeeding.
8695 8689 */
8696 8690 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
8697 8691 if (e.error) {
8698 8692 nfs_rw_exit(&drp->r_rwlock);
8699 8693 return (e.error);
8700 8694 }
8701 8695 if (vp == cdir) {
8702 8696 VN_RELE(vp);
8703 8697 nfs_rw_exit(&drp->r_rwlock);
8704 8698 return (EINVAL);
8705 8699 }
8706 8700
8707 8701 /*
8708 8702 * Since nfsv4 remove op works on both files and directories,
8709 8703 * check that the removed object is indeed a directory.
8710 8704 */
8711 8705 if (vp->v_type != VDIR) {
8712 8706 VN_RELE(vp);
8713 8707 nfs_rw_exit(&drp->r_rwlock);
8714 8708 return (ENOTDIR);
8715 8709 }
8716 8710
8717 8711 /*
8718 8712 * First just remove the entry from the name cache, as it
8719 8713 * is most likely an entry for this vp.
8720 8714 */
8721 8715 dnlc_remove(dvp, nm);
8722 8716
8723 8717 /*
8724 8718 * If the vnode reference count is greater than one, then
8725 8719 * there may be additional references in the DNLC which will
8726 8720 * need to be purged. First, try removing the entry for
8727 8721 * the parent directory and see if that removes the additional
8728 8722 * reference(s). If that doesn't do it, then use dnlc_purge_vp
8729 8723 * to completely remove any references to the directory which
8730 8724 * might still exist in the DNLC.
8731 8725 */
8732 8726 if (vp->v_count > 1) {
8733 8727 dnlc_remove(vp, "..");
8734 8728 if (vp->v_count > 1)
8735 8729 dnlc_purge_vp(vp);
8736 8730 }
8737 8731
8738 8732 mi = VTOMI4(dvp);
8739 8733 recov_state.rs_flags = 0;
8740 8734 recov_state.rs_num_retry_despite_err = 0;
8741 8735
8742 8736 recov_retry:
8743 8737 args.ctag = TAG_RMDIR;
8744 8738
8745 8739 /*
8746 8740 * Rmdir ops: putfh dir; remove; getattr dir
8747 8741 */
8748 8742 args.array_len = 3;
8749 8743 args.array = argop;
8750 8744
8751 8745 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
8752 8746 if (e.error) {
8753 8747 nfs_rw_exit(&drp->r_rwlock);
8754 8748 return (e.error);
8755 8749 }
8756 8750 need_end_op = TRUE;
8757 8751
8758 8752 /* putfh directory */
8759 8753 argop[0].argop = OP_CPUTFH;
8760 8754 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
8761 8755
8762 8756 /* remove */
8763 8757 argop[1].argop = OP_CREMOVE;
8764 8758 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
8765 8759
8766 8760 /* getattr (postop attrs for dir that contained removed dir) */
8767 8761 argop[2].argop = OP_GETATTR;
8768 8762 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8769 8763 argop[2].nfs_argop4_u.opgetattr.mi = mi;
8770 8764
8771 8765 dinfo.di_time_call = gethrtime();
8772 8766 doqueue = 1;
8773 8767 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8774 8768
8775 8769 PURGE_ATTRCACHE4(vp);
8776 8770
8777 8771 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8778 8772 if (e.error) {
8779 8773 PURGE_ATTRCACHE4(dvp);
8780 8774 }
8781 8775
8782 8776 if (needrecov) {
8783 8777 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
8784 8778 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
8785 8779 if (!e.error)
8786 8780 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
8787 8781
8788 8782 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
8789 8783 needrecov);
8790 8784 need_end_op = FALSE;
8791 8785 goto recov_retry;
8792 8786 }
8793 8787 }
8794 8788
8795 8789 if (!e.error) {
8796 8790 resp = &res;
8797 8791
8798 8792 /*
8799 8793 * Only return error if first 2 ops (OP_REMOVE or earlier)
8800 8794 * failed.
8801 8795 */
8802 8796 if (res.status != NFS4_OK && res.array_len <= 2) {
8803 8797 e.error = geterrno4(res.status);
8804 8798 PURGE_ATTRCACHE4(dvp);
8805 8799 nfs4_end_op(VTOMI4(dvp), dvp, NULL,
8806 8800 &recov_state, needrecov);
8807 8801 need_end_op = FALSE;
8808 8802 nfs4_purge_stale_fh(e.error, dvp, cr);
8809 8803 /*
8810 8804 * System V defines rmdir to return EEXIST, not
8811 8805 * ENOTEMPTY, if the directory is not empty. Over
8812 8806 * the wire, the error is NFSERR_ENOTEMPTY which
8813 8807 * geterrno4 maps to ENOTEMPTY.
8814 8808 */
8815 8809 if (e.error == ENOTEMPTY)
8816 8810 e.error = EEXIST;
8817 8811 } else {
8818 8812 resop = &res.array[1]; /* remove res */
8819 8813 rm_res = &resop->nfs_resop4_u.opremove;
8820 8814
8821 8815 if (res.status == NFS4_OK) {
8822 8816 resop = &res.array[2]; /* dir attrs */
8823 8817 dinfo.di_garp =
8824 8818 &resop->nfs_resop4_u.opgetattr.ga_res;
8825 8819 dinfo.di_cred = cr;
8826 8820 dinfop = &dinfo;
8827 8821 } else
8828 8822 dinfop = NULL;
8829 8823
8830 8824 /* Update dir attribute, readdir and dnlc caches */
8831 8825 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
8832 8826 dinfop);
8833 8827
8834 8828 /* destroy rddir cache for dir that was removed */
8835 8829 if (VTOR4(vp)->r_dir != NULL)
8836 8830 nfs4_purge_rddir_cache(vp);
8837 8831 }
8838 8832 }
8839 8833
8840 8834 if (need_end_op)
8841 8835 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
8842 8836
8843 8837 nfs_rw_exit(&drp->r_rwlock);
8844 8838
8845 8839 if (resp)
8846 8840 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8847 8841
8848 8842 if (e.error == 0) {
8849 8843 vnode_t *tvp;
8850 8844 rnode4_t *trp;
8851 8845 trp = VTOR4(vp);
8852 8846 tvp = vp;
8853 8847 if (IS_SHADOW(vp, trp))
8854 8848 tvp = RTOV4(trp);
8855 8849 vnevent_rmdir(tvp, dvp, nm, ct);
8856 8850 }
8857 8851
8858 8852 VN_RELE(vp);
8859 8853
8860 8854 return (e.error);
8861 8855 }
8862 8856
8863 8857 /* ARGSUSED */
8864 8858 static int
8865 8859 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
8866 8860 caller_context_t *ct, int flags)
8867 8861 {
8868 8862 int error;
8869 8863 vnode_t *vp;
8870 8864 rnode4_t *rp;
8871 8865 char *contents;
8872 8866 mntinfo4_t *mi = VTOMI4(dvp);
8873 8867
8874 8868 if (nfs_zone() != mi->mi_zone)
8875 8869 return (EPERM);
8876 8870 if (!(mi->mi_flags & MI4_SYMLINK))
8877 8871 return (EOPNOTSUPP);
8878 8872
8879 8873 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8880 8874 if (error)
8881 8875 return (error);
8882 8876
8883 8877 ASSERT(nfs4_consistent_type(vp));
8884 8878 rp = VTOR4(vp);
8885 8879 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8886 8880
8887 8881 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8888 8882
8889 8883 if (contents != NULL) {
8890 8884 mutex_enter(&rp->r_statelock);
8891 8885 if (rp->r_symlink.contents == NULL) {
8892 8886 rp->r_symlink.len = strlen(tnm);
8893 8887 bcopy(tnm, contents, rp->r_symlink.len);
8894 8888 rp->r_symlink.contents = contents;
8895 8889 rp->r_symlink.size = MAXPATHLEN;
8896 8890 mutex_exit(&rp->r_statelock);
8897 8891 } else {
8898 8892 mutex_exit(&rp->r_statelock);
8899 8893 kmem_free((void *)contents, MAXPATHLEN);
8900 8894 }
8901 8895 }
8902 8896 }
8903 8897 VN_RELE(vp);
8904 8898
8905 8899 return (error);
8906 8900 }
8907 8901
8908 8902
8909 8903 /*
8910 8904 * Read directory entries.
8911 8905 * There are some weird things to look out for here. The uio_loffset
8912 8906 * field is either 0 or it is the offset returned from a previous
8913 8907 * readdir. It is an opaque value used by the server to find the
8914 8908 * correct directory block to read. The count field is the number
8915 8909 * of blocks to read on the server. This is advisory only; the server
8916 8910 * may return only one block's worth of entries. Entries may be compressed
8917 8911 * on the server.
8918 8912 */
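/*
 * A rough sketch of the resulting cookie flow (offsets 0, 1 and 2
 * are reserved by this client for the synthesized "." and ".."
 * entries; see nfs4readdir() below):
 *
 *	getdents() #1: uio_loffset == 0  -> OTW READDIR cookie 0
 *	getdents() #2: uio_loffset == d_off of the last dirent from
 *	               #1, an opaque server cookie passed back OTW
 *	... until a reply sets eof; r_direof then lets the final
 *	zero-byte getdents() be answered from the cache.
 */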
8919 8913 /* ARGSUSED */
8920 8914 static int
8921 8915 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
8922 8916 caller_context_t *ct, int flags)
8923 8917 {
8924 8918 int error;
8925 8919 uint_t count;
8926 8920 rnode4_t *rp;
8927 8921 rddir4_cache *rdc;
8928 8922 rddir4_cache *rrdc;
8929 8923
8930 8924 if (nfs_zone() != VTOMI4(vp)->mi_zone)
8931 8925 return (EIO);
8932 8926 rp = VTOR4(vp);
8933 8927
8934 8928 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
8935 8929
8936 8930 /*
8937 8931 * Make sure that the directory cache is valid.
8938 8932 */
8939 8933 if (rp->r_dir != NULL) {
8940 8934 if (nfs_disable_rddir_cache != 0) {
8941 8935 /*
8942 8936 * Setting nfs_disable_rddir_cache in /etc/system
8943 8937 * allows interoperability with servers that do not
8944 8938 * properly update the attributes of directories.
8945 8939 * Any cached information gets purged before an
8946 8940 * access is made to it.
8947 8941 */
8948 8942 nfs4_purge_rddir_cache(vp);
8949 8943 }
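/*
 * For illustration, the tunable would be set from /etc/system
 * with a line such as the following (assuming the variable
 * lives in the common "nfs" client module):
 *
 *	set nfs:nfs_disable_rddir_cache = 1
 */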
8950 8944
8951 8945 error = nfs4_validate_caches(vp, cr);
8952 8946 if (error)
8953 8947 return (error);
8954 8948 }
8955 8949
8956 8950 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);
8957 8951
8958 8952 /*
8959 8953 * Short circuit last readdir which always returns 0 bytes.
8960 8954 * This can be done after the directory has been read through
8961 8955 * completely at least once. This will set r_direof which
8962 8956 * can be used to find the value of the last cookie.
8963 8957 */
8964 8958 mutex_enter(&rp->r_statelock);
8965 8959 if (rp->r_direof != NULL &&
8966 8960 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
8967 8961 mutex_exit(&rp->r_statelock);
8968 8962 #ifdef DEBUG
8969 8963 nfs4_readdir_cache_shorts++;
8970 8964 #endif
8971 8965 if (eofp)
8972 8966 *eofp = 1;
8973 8967 return (0);
8974 8968 }
8975 8969
8976 8970 /*
8977 8971 * Look for a cache entry. Cache entries are identified
8978 8972 * by the NFS cookie value and the byte count requested.
8979 8973 */
8980 8974 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);
8981 8975
8982 8976 /*
8983 8977 * If rdc is NULL then the lookup resulted in an unrecoverable error.
8984 8978 */
8985 8979 if (rdc == NULL) {
8986 8980 mutex_exit(&rp->r_statelock);
8987 8981 return (EINTR);
8988 8982 }
8989 8983
8990 8984 /*
8991 8985 * Check to see if we need to fill this entry in.
8992 8986 */
8993 8987 if (rdc->flags & RDDIRREQ) {
8994 8988 rdc->flags &= ~RDDIRREQ;
8995 8989 rdc->flags |= RDDIR;
8996 8990 mutex_exit(&rp->r_statelock);
8997 8991
8998 8992 /*
8999 8993 * Do the readdir.
9000 8994 */
9001 8995 nfs4readdir(vp, rdc, cr);
9002 8996
9003 8997 /*
9004 8998 * Reacquire the lock, so that we can continue
9005 8999 */
9006 9000 mutex_enter(&rp->r_statelock);
9007 9001 /*
9008 9002 * The entry is now complete
9009 9003 */
9010 9004 rdc->flags &= ~RDDIR;
9011 9005 }
9012 9006
9013 9007 ASSERT(!(rdc->flags & RDDIR));
9014 9008
9015 9009 /*
9016 9010 * If an error occurred while attempting
9017 9011 * to fill the cache entry, mark the entry invalid and
9018 9012 * just return the error.
9019 9013 */
9020 9014 if (rdc->error) {
9021 9015 error = rdc->error;
9022 9016 rdc->flags |= RDDIRREQ;
9023 9017 rddir4_cache_rele(rp, rdc);
9024 9018 mutex_exit(&rp->r_statelock);
9025 9019 return (error);
9026 9020 }
9027 9021
9028 9022 /*
9029 9023 * The cache entry is complete and good,
9030 9024 * copyout the dirent structs to the calling
9031 9025 * thread.
9032 9026 */
9033 9027 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);
9034 9028
9035 9029 /*
9036 9030 * If no error occurred during the copyout,
9037 9031 * update the offset in the uio struct to
9038 9032 * contain the value of the next NFS 4 cookie
9039 9033 * and set the eof value appropriately.
9040 9034 */
9041 9035 if (!error) {
9042 9036 uiop->uio_loffset = rdc->nfs4_ncookie;
9043 9037 if (eofp)
9044 9038 *eofp = rdc->eof;
9045 9039 }
9046 9040
9047 9041 /*
9048 9042 * Decide whether to do readahead. Don't if we
9049 9043 * have already read to the end of directory.
9050 9044 */
9051 9045 if (rdc->eof) {
9052 9046 /*
9053 9047 * Make the entry the direof only if it is cached
9054 9048 */
9055 9049 if (rdc->flags & RDDIRCACHED)
9056 9050 rp->r_direof = rdc;
9057 9051 rddir4_cache_rele(rp, rdc);
9058 9052 mutex_exit(&rp->r_statelock);
9059 9053 return (error);
9060 9054 }
9061 9055
9062 9056 /* Determine if a readdir readahead should be done */
9063 9057 if (!(rp->r_flags & R4LOOKUP)) {
9064 9058 rddir4_cache_rele(rp, rdc);
9065 9059 mutex_exit(&rp->r_statelock);
9066 9060 return (error);
9067 9061 }
9068 9062
9069 9063 /*
9070 9064 * Now look for a readahead entry.
9071 9065 *
9072 9066 * Check to see whether we found an entry for the readahead.
9073 9067 * If so, we don't need to do anything further, so free the new
9074 9068 * entry if one was allocated. Otherwise, allocate a new entry, add
9075 9069 * it to the cache, and then initiate an asynchronous readdir
9076 9070 * operation to fill it.
9077 9071 */
9078 9072 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);
9079 9073
9080 9074 /*
9081 9075 * A readdir cache entry could not be obtained for the readahead. In
9082 9076 * this case we skip the readahead and return.
9083 9077 */
9084 9078 if (rrdc == NULL) {
9085 9079 rddir4_cache_rele(rp, rdc);
9086 9080 mutex_exit(&rp->r_statelock);
9087 9081 return (error);
9088 9082 }
9089 9083
9090 9084 /*
9091 9085 * Check to see if we need to fill this entry in.
9092 9086 */
9093 9087 if (rrdc->flags & RDDIRREQ) {
9094 9088 rrdc->flags &= ~RDDIRREQ;
9095 9089 rrdc->flags |= RDDIR;
9096 9090 rddir4_cache_rele(rp, rdc);
9097 9091 mutex_exit(&rp->r_statelock);
9098 9092 #ifdef DEBUG
9099 9093 nfs4_readdir_readahead++;
9100 9094 #endif
9101 9095 /*
9102 9096 * Do the readdir.
9103 9097 */
9104 9098 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
9105 9099 return (error);
9106 9100 }
9107 9101
9108 9102 rddir4_cache_rele(rp, rrdc);
9109 9103 rddir4_cache_rele(rp, rdc);
9110 9104 mutex_exit(&rp->r_statelock);
9111 9105 return (error);
9112 9106 }
9113 9107
9114 9108 static int
9115 9109 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9116 9110 {
9117 9111 int error;
9118 9112 rnode4_t *rp;
9119 9113
9120 9114 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9121 9115
9122 9116 rp = VTOR4(vp);
9123 9117
9124 9118 /*
9125 9119 * Obtain the readdir results for the caller.
9126 9120 */
9127 9121 nfs4readdir(vp, rdc, cr);
9128 9122
9129 9123 mutex_enter(&rp->r_statelock);
9130 9124 /*
9131 9125 * The entry is now complete
9132 9126 */
9133 9127 rdc->flags &= ~RDDIR;
9134 9128
9135 9129 error = rdc->error;
9136 9130 if (error)
9137 9131 rdc->flags |= RDDIRREQ;
9138 9132 rddir4_cache_rele(rp, rdc);
9139 9133 mutex_exit(&rp->r_statelock);
9140 9134
9141 9135 return (error);
9142 9136 }
9143 9137
9144 9138 /*
9145 9139 * Read directory entries.
9146 9140 * There are some weird things to look out for here. The uio_loffset
9147 9141 * field is either 0 or it is the offset returned from a previous
9148 9142 * readdir. It is an opaque value used by the server to find the
9149 9143 * correct directory block to read. The count field is the number
9150 9144 * of blocks to read on the server. This is advisory only; the server
9151 9145 * may return only one block's worth of entries. Entries may be compressed
9152 9146 * on the server.
9153 9147 *
9154 9148 * Generates the following compound request:
9155 9149 * 1. If readdir offset is zero and no dnlc entry for parent exists,
9156 9150 * must include a Lookupp as well. In this case, send:
9157 9151 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9158 9152 * 2. Otherwise just do: { Putfh <fh>; Readdir }
9159 9153 *
9160 9154 * Get complete attributes and filehandles for entries if this is the
9161 9155 * first read of the directory. Otherwise, just get fileid's.
9162 9156 */
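/*
 * In terms of the argop[] array built below, case 2 uses entries
 * 0-1 (num_ops == 2) and case 1 uses entries 0-4 (num_ops == 5):
 *	0: PUTFH(dir)   1: READDIR
 *	2: LOOKUPP      3: GETFH      4: GETATTR   (case 1 only)
 */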
9163 9157 static void
9164 9158 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9165 9159 {
9166 9160 COMPOUND4args_clnt args;
9167 9161 COMPOUND4res_clnt res;
9168 9162 READDIR4args *rargs;
9169 9163 READDIR4res_clnt *rd_res;
9170 9164 bitmap4 rd_bitsval;
9171 9165 nfs_argop4 argop[5];
9172 9166 nfs_resop4 *resop;
9173 9167 rnode4_t *rp = VTOR4(vp);
9174 9168 mntinfo4_t *mi = VTOMI4(vp);
9175 9169 int doqueue;
9176 9170 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */
9177 9171 vnode_t *dvp;
9178 9172 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
9179 9173 int num_ops, res_opcnt;
9180 9174 bool_t needrecov = FALSE;
9181 9175 nfs4_recov_state_t recov_state;
9182 9176 hrtime_t t;
9183 9177 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9184 9178
9185 9179 ASSERT(nfs_zone() == mi->mi_zone);
9186 9180 ASSERT(rdc->flags & RDDIR);
9187 9181 ASSERT(rdc->entries == NULL);
9188 9182
9189 9183 /*
9190 9184 * If rp were a stub, it should have triggered and caused
9191 9185 * a mount for us to get this far.
9192 9186 */
9193 9187 ASSERT(!RP_ISSTUB(rp));
9194 9188
9195 9189 num_ops = 2;
9196 9190 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
9197 9191 /*
9198 9192 * Since nfsv4 readdir may not return entries for "." and "..",
9199 9193 * the client must recreate them:
9200 9194 * To find the correct nodeid, do the following:
9201 9195 * For current node, get nodeid from dnlc.
9202 9196 * - if current node is rootvp, set pnodeid to nodeid.
9203 9197 * - else if parent is in the dnlc, get its nodeid from there.
9204 9198 * - else add LOOKUPP+GETATTR to compound.
9205 9199 */
9206 9200 nodeid = rp->r_attr.va_nodeid;
9207 9201 if (vp->v_flag & VROOT) {
9208 9202 pnodeid = nodeid; /* root of mount point */
9209 9203 } else {
9210 9204 dvp = dnlc_lookup(vp, "..");
9211 9205 if (dvp != NULL && dvp != DNLC_NO_VNODE) {
9212 9206 /* parent in dnlc cache - no need for otw */
9213 9207 pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
9214 9208 } else {
9215 9209 /*
9216 9210 * parent not in dnlc cache,
9217 9211 * do lookupp to get its id
9218 9212 */
9219 9213 num_ops = 5;
9220 9214 pnodeid = 0; /* set later by getattr parent */
9221 9215 }
9222 9216 if (dvp)
9223 9217 VN_RELE(dvp);
9224 9218 }
9225 9219 }
9226 9220 recov_state.rs_flags = 0;
9227 9221 recov_state.rs_num_retry_despite_err = 0;
9228 9222
9229 9223 /* Save the original mount point security flavor */
9230 9224 (void) save_mnt_secinfo(mi->mi_curr_serv);
9231 9225
9232 9226 recov_retry:
9233 9227 args.ctag = TAG_READDIR;
9234 9228
9235 9229 args.array = argop;
9236 9230 args.array_len = num_ops;
9237 9231
9238 9232 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9239 9233 &recov_state, NULL)) {
9240 9234 /*
9241 9235 * If readdir a node that is a stub for a crossed mount point,
9242 9236 * keep the original secinfo flavor for the current file
9243 9237 * system, not the crossed one.
9244 9238 */
9245 9239 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9246 9240 rdc->error = e.error;
9247 9241 return;
9248 9242 }
9249 9243
9250 9244 /*
9251 9245 * Determine which attrs to request for dirents. This code
9252 9246 * must be protected by nfs4_start/end_fop because of r_server
9253 9247 * (which will change during failover recovery).
9254 9248 *
9255 9249 */
9256 9250 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
9257 9251 /*
9258 9252 * Get all vattr attrs plus filehandle and rdattr_error
9259 9253 */
9260 9254 rd_bitsval = NFS4_VATTR_MASK |
9261 9255 FATTR4_RDATTR_ERROR_MASK |
9262 9256 FATTR4_FILEHANDLE_MASK;
9263 9257
9264 9258 if (rp->r_flags & R4READDIRWATTR) {
9265 9259 mutex_enter(&rp->r_statelock);
9266 9260 rp->r_flags &= ~R4READDIRWATTR;
9267 9261 mutex_exit(&rp->r_statelock);
9268 9262 }
9269 9263 } else {
9270 9264 servinfo4_t *svp = rp->r_server;
9271 9265
9272 9266 /*
9273 9267 * Already read directory. Use readdir with
9274 9268 * no attrs (except for mounted_on_fileid) for updates.
9275 9269 */
9276 9270 rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
9277 9271
9278 9272 /*
9279 9273 * request mounted on fileid if supported, else request
9280 9274 * fileid. maybe we should verify that fileid is supported
9281 9275 * and request something else if not.
9282 9276 */
9283 9277 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9284 9278 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9285 9279 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9286 9280 nfs_rw_exit(&svp->sv_lock);
9287 9281 }
9288 9282
9289 9283 /* putfh directory fh */
9290 9284 argop[0].argop = OP_CPUTFH;
9291 9285 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9292 9286
9293 9287 argop[1].argop = OP_READDIR;
9294 9288 rargs = &argop[1].nfs_argop4_u.opreaddir;
9295 9289 /*
9296 9290 * Cookies 1 and 2 are reserved for the client's "." and ".."
9297 9291 * entry offsets; cookie 0 is used over the wire to start reading
9298 9292 * at the beginning of the directory, excluding "." and "..".
9299 9293 */
9300 9294 if (rdc->nfs4_cookie == 0 ||
9301 9295 rdc->nfs4_cookie == 1 ||
9302 9296 rdc->nfs4_cookie == 2) {
9303 9297 rargs->cookie = (nfs_cookie4)0;
9304 9298 rargs->cookieverf = 0;
9305 9299 } else {
9306 9300 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9307 9301 mutex_enter(&rp->r_statelock);
9308 9302 rargs->cookieverf = rp->r_cookieverf4;
9309 9303 mutex_exit(&rp->r_statelock);
9310 9304 }
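/*
 * To summarize the mapping above: client cookies 0, 1 and 2
 * (reserved for "." and "..") all go over the wire as cookie 0
 * with a zero verifier; any other cookie is passed through
 * unchanged along with the verifier cached from the previous
 * READDIR reply.
 */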
9311 9305 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9312 9306 rargs->maxcount = mi->mi_tsize;
9313 9307 rargs->attr_request = rd_bitsval;
9314 9308 rargs->rdc = rdc;
9315 9309 rargs->dvp = vp;
9316 9310 rargs->mi = mi;
9317 9311 rargs->cr = cr;
9318 9312
9319 9313
9320 9314 /*
9321 9315 * If the count is less than the minimum required, we return
9322 9316 * no entries and fail with EINVAL.
9323 9317 */
9324 9318 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9325 9319 rdc->error = EINVAL;
9326 9320 goto out;
9327 9321 }
9328 9322
9329 9323 if (args.array_len == 5) {
9330 9324 /*
9331 9325 * Add lookupp and getattr for parent nodeid.
9332 9326 */
9333 9327 argop[2].argop = OP_LOOKUPP;
9334 9328
9335 9329 argop[3].argop = OP_GETFH;
9336 9330
9337 9331 /* getattr parent */
9338 9332 argop[4].argop = OP_GETATTR;
9339 9333 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9340 9334 argop[4].nfs_argop4_u.opgetattr.mi = mi;
9341 9335 }
9342 9336
9343 9337 doqueue = 1;
9344 9338
9345 9339 if (mi->mi_io_kstats) {
9346 9340 mutex_enter(&mi->mi_lock);
9347 9341 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9348 9342 mutex_exit(&mi->mi_lock);
9349 9343 }
9350 9344
9351 9345 /* capture the time of this call */
9352 9346 rargs->t = t = gethrtime();
9353 9347
9354 9348 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9355 9349
9356 9350 if (mi->mi_io_kstats) {
9357 9351 mutex_enter(&mi->mi_lock);
9358 9352 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9359 9353 mutex_exit(&mi->mi_lock);
9360 9354 }
9361 9355
9362 9356 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9363 9357
9364 9358 /*
9365 9359 * If RPC error occurred and it isn't an error that
9366 9360 * triggers recovery, then go ahead and fail now.
9367 9361 */
9368 9362 if (e.error != 0 && !needrecov) {
9369 9363 rdc->error = e.error;
9370 9364 goto out;
9371 9365 }
9372 9366
9373 9367 if (needrecov) {
9374 9368 bool_t abort;
9375 9369
9376 9370 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9377 9371 "nfs4readdir: initiating recovery.\n"));
9378 9372
9379 9373 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9380 9374 NULL, OP_READDIR, NULL, NULL, NULL);
9381 9375 if (abort == FALSE) {
9382 9376 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9383 9377 &recov_state, needrecov);
9384 9378 if (!e.error)
9385 9379 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9386 9380 if (rdc->entries != NULL) {
9387 9381 kmem_free(rdc->entries, rdc->entlen);
9388 9382 rdc->entries = NULL;
9389 9383 }
9390 9384 goto recov_retry;
9391 9385 }
9392 9386
9393 9387 if (e.error != 0) {
9394 9388 rdc->error = e.error;
9395 9389 goto out;
9396 9390 }
9397 9391
9398 9392 /* fall through for res.status case */
9399 9393 }
9400 9394
9401 9395 res_opcnt = res.array_len;
9402 9396
9403 9397 /*
9404 9398 * If compound failed first 2 ops (PUTFH+READDIR), then return
9405 9399 * failure here. Subsequent ops are for filling out dot-dot
9406 9400 * dirent, and if they fail, we still want to give the caller
9407 9401 * the dirents returned by (the successful) READDIR op, so we need
9408 9402 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9409 9403 *
9410 9404 * One example where PUTFH+READDIR ops would succeed but
9411 9405 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9412 9406 * but lacks x. In this case, a POSIX server's VOP_READDIR
9413 9407 * would succeed; however, VOP_LOOKUP(..) would fail since no
9414 9408 * x perm. We need to come up with a non-vendor-specific way
9415 9409 * for a POSIX server to return d_ino from dotdot's dirent if
9416 9410 * client only requests mounted_on_fileid, and just say the
9417 9411 * LOOKUPP succeeded and fill out the GETATTR. However, if
9418 9412 * client requested any mandatory attrs, server would be required
9419 9413 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9420 9414 * for dotdot.
9421 9415 */
9422 9416
9423 9417 if (res.status) {
9424 9418 if (res_opcnt <= 2) {
9425 9419 e.error = geterrno4(res.status);
9426 9420 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9427 9421 &recov_state, needrecov);
9428 9422 nfs4_purge_stale_fh(e.error, vp, cr);
9429 9423 rdc->error = e.error;
9430 9424 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9431 9425 if (rdc->entries != NULL) {
9432 9426 kmem_free(rdc->entries, rdc->entlen);
9433 9427 rdc->entries = NULL;
9434 9428 }
9435 9429 /*
9436 9430 * If readdir a node that is a stub for a
9437 9431 * crossed mount point, keep the original
9438 9432 * secinfo flavor for the current file system,
9439 9433 * not the crossed one.
9440 9434 */
9441 9435 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9442 9436 return;
9443 9437 }
9444 9438 }
9445 9439
9446 9440 resop = &res.array[1]; /* readdir res */
9447 9441 rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9448 9442
9449 9443 mutex_enter(&rp->r_statelock);
9450 9444 rp->r_cookieverf4 = rd_res->cookieverf;
9451 9445 mutex_exit(&rp->r_statelock);
9452 9446
9453 9447 /*
9454 9448 * For "." and ".." entries
9455 9449 * e.g.
9456 9450 * seek(cookie=0) -> "." entry with d_off = 1
9457 9451 * seek(cookie=1) -> ".." entry with d_off = 2
9458 9452 */
9459 9453 if (cookie == (nfs_cookie4) 0) {
9460 9454 if (rd_res->dotp)
9461 9455 rd_res->dotp->d_ino = nodeid;
9462 9456 if (rd_res->dotdotp)
9463 9457 rd_res->dotdotp->d_ino = pnodeid;
9464 9458 }
9465 9459 if (cookie == (nfs_cookie4) 1) {
9466 9460 if (rd_res->dotdotp)
9467 9461 rd_res->dotdotp->d_ino = pnodeid;
9468 9462 }
9469 9463
9470 9464
9471 9465 /* LOOKUPP+GETATTR attempted */
9472 9466 if (args.array_len == 5 && rd_res->dotdotp) {
9473 9467 if (res.status == NFS4_OK && res_opcnt == 5) {
9474 9468 nfs_fh4 *fhp;
9475 9469 nfs4_sharedfh_t *sfhp;
9476 9470 vnode_t *pvp;
9477 9471 nfs4_ga_res_t *garp;
9478 9472
9479 9473 resop++; /* lookupp */
9480 9474 resop++; /* getfh */
9481 9475 fhp = &resop->nfs_resop4_u.opgetfh.object;
9482 9476
9483 9477 resop++; /* getattr of parent */
9484 9478
9485 9479 /*
9486 9480 * First, take care of finishing the
9487 9481 * readdir results.
9488 9482 */
9489 9483 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9490 9484 /*
9491 9485 * The d_ino of .. must be the inode number
9492 9486 * of the mounted filesystem.
9493 9487 */
9494 9488 if (garp->n4g_va.va_mask & AT_NODEID)
9495 9489 rd_res->dotdotp->d_ino =
9496 9490 garp->n4g_va.va_nodeid;
9497 9491
9498 9492
9499 9493 /*
9500 9494 * Next, create the ".." dnlc entry
9501 9495 */
9502 9496 sfhp = sfh4_get(fhp, mi);
9503 9497 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9504 9498 dnlc_update(vp, "..", pvp);
9505 9499 VN_RELE(pvp);
9506 9500 }
9507 9501 sfh4_rele(&sfhp);
9508 9502 }
9509 9503 }
9510 9504
9511 9505 if (mi->mi_io_kstats) {
9512 9506 mutex_enter(&mi->mi_lock);
9513 9507 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9514 9508 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9515 9509 mutex_exit(&mi->mi_lock);
9516 9510 }
9517 9511
9518 9512 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9519 9513
9520 9514 out:
9521 9515 /*
9522 9516 * If readdir a node that is a stub for a crossed mount point,
9523 9517 * keep the original secinfo flavor for the current file system,
9524 9518 * not the crossed one.
9525 9519 */
9526 9520 (void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9527 9521
9528 9522 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9529 9523 }
9530 9524
9531 9525
9532 9526 static int
9533 9527 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
9534 9528 {
9535 9529 rnode4_t *rp = VTOR4(bp->b_vp);
9536 9530 int count;
9537 9531 int error;
9538 9532 cred_t *cred_otw = NULL;
9539 9533 offset_t offset;
9540 9534 nfs4_open_stream_t *osp = NULL;
9541 9535 bool_t first_time = TRUE; /* first time getting otw cred */
9542 9536 bool_t last_time = FALSE; /* last time getting otw cred */
9543 9537
9544 9538 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);
9545 9539
9546 9540 DTRACE_IO1(start, struct buf *, bp);
9547 9541 offset = ldbtob(bp->b_lblkno);
9548 9542
9549 9543 if (bp->b_flags & B_READ) {
9550 9544 read_again:
9551 9545 /*
9552 9546 * Releases the osp, if it is provided.
9553 9547 * Puts a hold on the cred_otw and the new osp (if found).
9554 9548 */
9555 9549 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9556 9550 &first_time, &last_time);
9557 9551 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
9558 9552 offset, bp->b_bcount, &bp->b_resid, cred_otw,
9559 9553 readahead, NULL);
9560 9554 crfree(cred_otw);
9561 9555 if (!error) {
9562 9556 if (bp->b_resid) {
9563 9557 /*
9564 9558 * Didn't get it all because we hit EOF,
9565 9559 * zero all the memory beyond the EOF.
9566 9560 */
9568 9562 bzero(bp->b_un.b_addr +
9569 9563 bp->b_bcount - bp->b_resid, bp->b_resid);
9570 9564 }
9571 9565 mutex_enter(&rp->r_statelock);
9572 9566 if (bp->b_resid == bp->b_bcount &&
9573 9567 offset >= rp->r_size) {
9574 9568 /*
9575 9569 * We didn't read anything at all as we are
9576 9570 * past EOF. Return an error indicator back
9577 9571 * but don't destroy the pages (yet).
9578 9572 */
9579 9573 error = NFS_EOF;
9580 9574 }
9581 9575 mutex_exit(&rp->r_statelock);
9582 9576 } else if (error == EACCES && last_time == FALSE) {
9583 9577 goto read_again;
9584 9578 }
9585 9579 } else {
9586 9580 if (!(rp->r_flags & R4STALE)) {
9587 9581 write_again:
9588 9582 /*
9589 9583 * Releases the osp, if it is provided.
9590 9584 * Puts a hold on the cred_otw and the new
9591 9585 * osp (if found).
9592 9586 */
9593 9587 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9594 9588 &first_time, &last_time);
9595 9589 mutex_enter(&rp->r_statelock);
9596 9590 count = MIN(bp->b_bcount, rp->r_size - offset);
9597 9591 mutex_exit(&rp->r_statelock);
9598 9592 if (count < 0)
9599 9593 cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
9600 9594 #ifdef DEBUG
9601 9595 if (count == 0) {
9602 9596 zoneid_t zoneid = getzoneid();
9603 9597
9604 9598 zcmn_err(zoneid, CE_WARN,
9605 9599 "nfs4_bio: zero length write at %lld",
9606 9600 offset);
9607 9601 zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
9608 9602 "b_bcount=%ld, file size=%lld",
9609 9603 rp->r_flags, (long)bp->b_bcount,
9610 9604 rp->r_size);
9611 9605 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
9612 9606 if (nfs4_bio_do_stop)
9613 9607 debug_enter("nfs4_bio");
9614 9608 }
9615 9609 #endif
9616 9610 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
9617 9611 count, cred_otw, stab_comm);
9618 9612 if (error == EACCES && last_time == FALSE) {
9619 9613 crfree(cred_otw);
9620 9614 goto write_again;
9621 9615 }
9622 9616 bp->b_error = error;
9623 9617 if (error && error != EINTR &&
9624 9618 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
9625 9619 /*
9626 9620 * Don't print EDQUOT errors on the console.
9627 9621 * Don't print asynchronous EACCES errors.
9628 9622 * Don't print EFBIG errors.
9629 9623 * Print all other write errors.
9630 9624 */
9631 9625 if (error != EDQUOT && error != EFBIG &&
9632 9626 (error != EACCES ||
9633 9627 !(bp->b_flags & B_ASYNC)))
9634 9628 nfs4_write_error(bp->b_vp,
9635 9629 error, cred_otw);
9636 9630 /*
9637 9631 * Update r_error and r_flags as appropriate.
9638 9632 * If the error was ESTALE, then mark the
9639 9633 * rnode as not being writeable and save
9640 9634 * the error status. Otherwise, save any
9641 9635 * errors which occur from asynchronous
9642 9636 * page invalidations. Any errors occurring
9643 9637 * from other operations should be saved
9644 9638 * by the caller.
9645 9639 */
9646 9640 mutex_enter(&rp->r_statelock);
9647 9641 if (error == ESTALE) {
9648 9642 rp->r_flags |= R4STALE;
9649 9643 if (!rp->r_error)
9650 9644 rp->r_error = error;
9651 9645 } else if (!rp->r_error &&
9652 9646 (bp->b_flags &
9653 9647 (B_INVAL|B_FORCE|B_ASYNC)) ==
9654 9648 (B_INVAL|B_FORCE|B_ASYNC)) {
9655 9649 rp->r_error = error;
9656 9650 }
9657 9651 mutex_exit(&rp->r_statelock);
9658 9652 }
9659 9653 crfree(cred_otw);
9660 9654 } else {
9661 9655 error = rp->r_error;
9662 9656 /*
9663 9657 * A close may have cleared r_error; if so,
9664 9658 * propagate the ESTALE error return properly.
9665 9659 */
9666 9660 if (error == 0)
9667 9661 error = ESTALE;
9668 9662 }
9669 9663 }
9670 9664
9671 9665 if (error != 0 && error != NFS_EOF)
9672 9666 bp->b_flags |= B_ERROR;
9673 9667
9674 9668 if (osp)
9675 9669 open_stream_rele(osp, rp);
9676 9670
9677 9671 DTRACE_IO1(done, struct buf *, bp);
9678 9672
9679 9673 return (error);
9680 9674 }
9681 9675
9682 9676 /* ARGSUSED */
9683 9677 int
9684 9678 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
9685 9679 {
9686 9680 return (EREMOTE);
9687 9681 }
9688 9682
9689 9683 /* ARGSUSED2 */
9690 9684 int
9691 9685 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9692 9686 {
9693 9687 rnode4_t *rp = VTOR4(vp);
9694 9688
9695 9689 if (!write_lock) {
9696 9690 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9697 9691 return (V_WRITELOCK_FALSE);
9698 9692 }
9699 9693
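/*
 * In the direct I/O case a writer may be able to get by with
 * the shared (reader) lock: when nothing is mmap'd and no
 * pages are cached, writes bypass the page cache and need not
 * be serialized against one another. The check is made under
 * the reader lock; if it fails, the reader lock is dropped
 * and the writer lock is taken below.
 */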
9700 9694 if ((rp->r_flags & R4DIRECTIO) ||
9701 9695 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9702 9696 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9703 9697 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9704 9698 return (V_WRITELOCK_FALSE);
9705 9699 nfs_rw_exit(&rp->r_rwlock);
9706 9700 }
9707 9701
9708 9702 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9709 9703 return (V_WRITELOCK_TRUE);
9710 9704 }
9711 9705
9712 9706 /* ARGSUSED */
9713 9707 void
9714 9708 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9715 9709 {
9716 9710 rnode4_t *rp = VTOR4(vp);
9717 9711
9718 9712 nfs_rw_exit(&rp->r_rwlock);
9719 9713 }
9720 9714
9721 9715 /* ARGSUSED */
9722 9716 static int
9723 9717 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9724 9718 {
9725 9719 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9726 9720 return (EIO);
9727 9721
9728 9722 /*
9729 9723 * Because we stuff the readdir cookie into the offset field,
9730 9724 * someone may attempt to do an lseek with the cookie, which
9731 9725 * we want to succeed.
9732 9726 */
9733 9727 if (vp->v_type == VDIR)
9734 9728 return (0);
9735 9729 if (*noffp < 0)
9736 9730 return (EINVAL);
9737 9731 return (0);
9738 9732 }
9739 9733
9740 9734
9741 9735 /*
9742 9736 * Return all the pages from [off..off+len) in file
9743 9737 */
9744 9738 /* ARGSUSED */
9745 9739 static int
9746 9740 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9747 9741 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9748 9742 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
9749 9743 {
9750 9744 rnode4_t *rp;
9751 9745 int error;
9752 9746 mntinfo4_t *mi;
9753 9747
9754 9748 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9755 9749 return (EIO);
9756 9750 rp = VTOR4(vp);
9757 9751 if (IS_SHADOW(vp, rp))
9758 9752 vp = RTOV4(rp);
9759 9753
9760 9754 if (vp->v_flag & VNOMAP)
9761 9755 return (ENOSYS);
9762 9756
9763 9757 if (protp != NULL)
9764 9758 *protp = PROT_ALL;
9765 9759
9766 9760 /*
9767 9761 * Now validate that the caches are up to date.
9768 9762 */
9769 9763 if (error = nfs4_validate_caches(vp, cr))
9770 9764 return (error);
9771 9765
9772 9766 mi = VTOMI4(vp);
9773 9767 retry:
9774 9768 mutex_enter(&rp->r_statelock);
9775 9769
9776 9770 /*
9777 9771 * Don't create dirty pages faster than they
9778 9772 * can be cleaned so that the system doesn't
9779 9773 * get imbalanced. If the async queue is
9780 9774 * maxed out, then wait for it to drain before
9781 9775 * creating more dirty pages. Also, wait for
9782 9776 * any threads doing pagewalks in the vop_getattr
9783 9777 * entry points so that they don't block for
9784 9778 * long periods.
9785 9779 */
9786 9780 if (rw == S_CREATE) {
9787 9781 while ((mi->mi_max_threads != 0 &&
9788 9782 rp->r_awcount > 2 * mi->mi_max_threads) ||
9789 9783 rp->r_gcount > 0)
9790 9784 cv_wait(&rp->r_cv, &rp->r_statelock);
9791 9785 }
9792 9786
9793 9787 /*
9794 9788 * If we are getting called as a side effect of an nfs_write()
9795 9789 * operation the local file size might not be extended yet.
9796 9790 * In this case we want to be able to return pages of zeroes.
9797 9791 */
9798 9792 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9799 9793 NFS4_DEBUG(nfs4_pageio_debug,
9800 9794 (CE_NOTE, "getpage beyond EOF: off=%lld, "
9801 9795 "len=%llu, size=%llu, attrsize =%llu", off,
9802 9796 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9803 9797 mutex_exit(&rp->r_statelock);
9804 9798 return (EFAULT); /* beyond EOF */
9805 9799 }
9806 9800
9807 9801 mutex_exit(&rp->r_statelock);
9808 9802
9809 9803 error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9810 9804 pl, plsz, seg, addr, rw, cr);
9811 9805 NFS4_DEBUG(nfs4_pageio_debug && error,
9812 9806 (CE_NOTE, "getpages error %d; off=%lld, len=%lld",
9813 9807 error, off, (u_longlong_t)len));
9814 9808
9815 9809 switch (error) {
9816 9810 case NFS_EOF:
9817 9811 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9818 9812 goto retry;
9819 9813 case ESTALE:
9820 9814 nfs4_purge_stale_fh(error, vp, cr);
9821 9815 }
9822 9816
9823 9817 return (error);
9824 9818 }
9825 9819
9826 9820 /*
9827 9821 * Called from pvn_getpages to get a particular page.
9828 9822 */
9829 9823 /* ARGSUSED */
9830 9824 static int
9831 9825 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9832 9826 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9833 9827 enum seg_rw rw, cred_t *cr)
9834 9828 {
9835 9829 rnode4_t *rp;
9836 9830 uint_t bsize;
9837 9831 struct buf *bp;
9838 9832 page_t *pp;
9839 9833 u_offset_t lbn;
9840 9834 u_offset_t io_off;
9841 9835 u_offset_t blkoff;
9842 9836 u_offset_t rablkoff;
9843 9837 size_t io_len;
9844 9838 uint_t blksize;
9845 9839 int error;
9846 9840 int readahead;
9847 9841 int readahead_issued = 0;
9848 9842 int ra_window; /* readahead window */
9849 9843 page_t *pagefound;
9850 9844 page_t *savepp;
9851 9845
9852 9846 if (nfs_zone() != VTOMI4(vp)->mi_zone)
9853 9847 return (EIO);
9854 9848
9855 9849 rp = VTOR4(vp);
9856 9850 ASSERT(!IS_SHADOW(vp, rp));
9857 9851 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9858 9852
9859 9853 reread:
9860 9854 bp = NULL;
9861 9855 pp = NULL;
9862 9856 pagefound = NULL;
9863 9857
9864 9858 if (pl != NULL)
9865 9859 pl[0] = NULL;
9866 9860
9867 9861 error = 0;
9868 9862 lbn = off / bsize;
9869 9863 blkoff = lbn * bsize;
9870 9864
9871 9865 /*
9872 9866 * Queueing up the readahead before doing the synchronous read
9873 9867 * results in a significant increase in read throughput because
9874 9868 * of the increased parallelism between the async threads and
9875 9869 * the process context.
9876 9870 */
9877 9871 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9878 9872 rw != S_CREATE &&
9879 9873 !(vp->v_flag & VNOCACHE)) {
9880 9874 mutex_enter(&rp->r_statelock);
9881 9875
9882 9876 /*
9883 9877 * Calculate the number of readaheads to do.
9884 9878 * a) No readaheads at offset = 0.
9885 9879 * b) Do maximum(nfs4_nra) readaheads when the readahead
9886 9880 * window is closed.
9887 9881 * c) Do between 1 and (nfs4_nra - 1) readaheads, depending
9888 9882 * upon how far open or closed the readahead window is.
9889 9883 * d) No readaheads if rp->r_nextr is not within the scope
9890 9884 * of the readahead window (random i/o).
9891 9885 */
9892 9886
9893 9887 if (off == 0)
9894 9888 readahead = 0;
9895 9889 else if (blkoff == rp->r_nextr)
9896 9890 readahead = nfs4_nra;
9897 9891 else if (rp->r_nextr > blkoff &&
9898 9892 ((ra_window = (rp->r_nextr - blkoff) / bsize)
9899 9893 <= (nfs4_nra - 1)))
9900 9894 readahead = nfs4_nra - ra_window;
9901 9895 else
9902 9896 readahead = 0;
9903 9897
9904 9898 rablkoff = rp->r_nextr;
9905 9899 while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9906 9900 mutex_exit(&rp->r_statelock);
9907 9901 if (nfs4_async_readahead(vp, rablkoff + bsize,
9908 9902 addr + (rablkoff + bsize - off),
9909 9903 seg, cr, nfs4_readahead) < 0) {
9910 9904 mutex_enter(&rp->r_statelock);
9911 9905 break;
9912 9906 }
9913 9907 readahead--;
9914 9908 rablkoff += bsize;
9915 9909 /*
9916 9910 * Indicate that we did a readahead so
9917 9911 * readahead offset is not updated
9918 9912 * by the synchronous read below.
9919 9913 */
9920 9914 readahead_issued = 1;
9921 9915 mutex_enter(&rp->r_statelock);
9922 9916 /*
9923 9917 * set readahead offset to
9924 9918 * offset of last async readahead
9925 9919 * request.
9926 9920 */
9927 9921 rp->r_nextr = rablkoff;
9928 9922 }
9929 9923 mutex_exit(&rp->r_statelock);
9930 9924 }
9931 9925
9932 9926 again:
9933 9927 if ((pagefound = page_exists(vp, off)) == NULL) {
9934 9928 if (pl == NULL) {
9935 9929 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9936 9930 nfs4_readahead);
9937 9931 } else if (rw == S_CREATE) {
9938 9932 /*
9939 9933 * Block for this page is not allocated, or the offset
9940 9934 * is beyond the current allocation size, or we're
9941 9935 * allocating a swap slot and the page was not found,
9942 9936 * so allocate it and return a zero page.
9943 9937 */
9944 9938 if ((pp = page_create_va(vp, off,
9945 9939 PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9946 9940 cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9947 9941 io_len = PAGESIZE;
9948 9942 mutex_enter(&rp->r_statelock);
9949 9943 rp->r_nextr = off + PAGESIZE;
9950 9944 mutex_exit(&rp->r_statelock);
9951 9945 } else {
9952 9946 /*
9953 9947 * Need to go to server to get a block
9954 9948 */
9955 9949 mutex_enter(&rp->r_statelock);
9956 9950 if (blkoff < rp->r_size &&
9957 9951 blkoff + bsize > rp->r_size) {
9958 9952 /*
9959 9953 	 * If less than a block is left in
9960 9954 	 * the file, read less than a block.
9961 9955 */
9962 9956 if (rp->r_size <= off) {
9963 9957 /*
9964 9958 * Trying to access beyond EOF,
9965 9959 * set up to get at least one page.
9966 9960 */
9967 9961 blksize = off + PAGESIZE - blkoff;
9968 9962 } else
9969 9963 blksize = rp->r_size - blkoff;
9970 9964 } else if ((off == 0) ||
9971 9965 (off != rp->r_nextr && !readahead_issued)) {
9972 9966 blksize = PAGESIZE;
9973 9967 blkoff = off; /* block = page here */
9974 9968 } else
9975 9969 blksize = bsize;
9976 9970 mutex_exit(&rp->r_statelock);
9977 9971
9978 9972 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9979 9973 &io_len, blkoff, blksize, 0);
9980 9974
9981 9975 /*
9982 9976 * Some other thread has entered the page,
9983 9977 * so just use it.
9984 9978 */
9985 9979 if (pp == NULL)
9986 9980 goto again;
9987 9981
9988 9982 /*
9989 9983 * Now round the request size up to page boundaries.
9990 9984 * This ensures that the entire page will be
9991 9985 * initialized to zeroes if EOF is encountered.
9992 9986 */
9993 9987 io_len = ptob(btopr(io_len));
9994 9988
9995 9989 bp = pageio_setup(pp, io_len, vp, B_READ);
9996 9990 ASSERT(bp != NULL);
9997 9991
9998 9992 /*
9999 9993 * pageio_setup should have set b_addr to 0. This
10000 9994 * is correct since we want to do I/O on a page
10001 9995 * boundary. bp_mapin will use this addr to calculate
10002 9996 * an offset, and then set b_addr to the kernel virtual
10003 9997 * address it allocated for us.
10004 9998 */
10005 9999 ASSERT(bp->b_un.b_addr == 0);
10006 10000
10007 10001 bp->b_edev = 0;
10008 10002 bp->b_dev = 0;
10009 10003 bp->b_lblkno = lbtodb(io_off);
10010 10004 bp->b_file = vp;
10011 10005 bp->b_offset = (offset_t)off;
10012 10006 bp_mapin(bp);
10013 10007
10014 10008 /*
10015 10009 * If doing a write beyond what we believe is EOF,
10016 10010 * don't bother trying to read the pages from the
10017 10011 	 * server; we'll just zero the pages here. We
10018 10012 * don't check that the rw flag is S_WRITE here
10019 10013 * because some implementations may attempt a
10020 10014 * read access to the buffer before copying data.
10021 10015 */
10022 10016 mutex_enter(&rp->r_statelock);
10023 10017 if (io_off >= rp->r_size && seg == segkmap) {
10024 10018 mutex_exit(&rp->r_statelock);
10025 10019 bzero(bp->b_un.b_addr, io_len);
10026 10020 } else {
10027 10021 mutex_exit(&rp->r_statelock);
10028 10022 error = nfs4_bio(bp, NULL, cr, FALSE);
10029 10023 }
10030 10024
10031 10025 /*
10032 10026 * Unmap the buffer before freeing it.
10033 10027 */
10034 10028 bp_mapout(bp);
10035 10029 pageio_done(bp);
10036 10030
10037 10031 savepp = pp;
10038 10032 do {
10039 10033 pp->p_fsdata = C_NOCOMMIT;
10040 10034 } while ((pp = pp->p_next) != savepp);
10041 10035
10042 10036 if (error == NFS_EOF) {
10043 10037 /*
10044 10038 	 * If doing a write system call, just return
10045 10039 	 * zeroed pages; otherwise the user tried to get
10046 10040 	 * pages beyond EOF, so return an error. We don't check
10047 10041 * that the rw flag is S_WRITE here because
10048 10042 * some implementations may attempt a read
10049 10043 * access to the buffer before copying data.
10050 10044 */
10051 10045 if (seg == segkmap)
10052 10046 error = 0;
10053 10047 else
10054 10048 error = EFAULT;
10055 10049 }
10056 10050
10057 10051 if (!readahead_issued && !error) {
10058 10052 mutex_enter(&rp->r_statelock);
10059 10053 rp->r_nextr = io_off + io_len;
10060 10054 mutex_exit(&rp->r_statelock);
10061 10055 }
10062 10056 }
10063 10057 }
10064 10058
10065 10059 out:
10066 10060 if (pl == NULL)
10067 10061 return (error);
10068 10062
10069 10063 if (error) {
10070 10064 if (pp != NULL)
10071 10065 pvn_read_done(pp, B_ERROR);
10072 10066 return (error);
10073 10067 }
10074 10068
10075 10069 if (pagefound) {
10076 10070 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10077 10071
10078 10072 /*
10079 10073 	 * Page exists in the cache; acquire the appropriate lock.
10080 10074 * If this fails, start all over again.
10081 10075 */
10082 10076 if ((pp = page_lookup(vp, off, se)) == NULL) {
10083 10077 #ifdef DEBUG
10084 10078 nfs4_lostpage++;
10085 10079 #endif
10086 10080 goto reread;
10087 10081 }
10088 10082 pl[0] = pp;
10089 10083 pl[1] = NULL;
10090 10084 return (0);
10091 10085 }
10092 10086
10093 10087 if (pp != NULL)
10094 10088 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10095 10089
10096 10090 return (error);
10097 10091 }
10098 10092
10099 10093 static void
10100 10094 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
10101 10095 cred_t *cr)
10102 10096 {
10103 10097 int error;
10104 10098 page_t *pp;
10105 10099 u_offset_t io_off;
10106 10100 size_t io_len;
10107 10101 struct buf *bp;
10108 10102 uint_t bsize, blksize;
10109 10103 rnode4_t *rp = VTOR4(vp);
10110 10104 page_t *savepp;
10111 10105
10112 10106 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10113 10107
10114 10108 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10115 10109
10116 10110 mutex_enter(&rp->r_statelock);
10117 10111 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
10118 10112 /*
10119 10113 	 * If less than a block is left in the file, read
10120 10114 	 * less than a block.
10121 10115 */
10122 10116 blksize = rp->r_size - blkoff;
10123 10117 } else
10124 10118 blksize = bsize;
10125 10119 mutex_exit(&rp->r_statelock);
10126 10120
10127 10121 pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
10128 10122 &io_off, &io_len, blkoff, blksize, 1);
10129 10123 /*
10130 10124 	 * The isra flag passed to the kluster function is 1, so we may have
10131 10125 	 * gotten a return value of NULL for a variety of reasons (# of free
10132 10126 	 * pages < minfree, someone entered the page on the vnode, etc.). In all
10133 10127 * cases, we want to punt on the readahead.
10134 10128 */
10135 10129 if (pp == NULL)
10136 10130 return;
10137 10131
10138 10132 /*
10139 10133 * Now round the request size up to page boundaries.
10140 10134 * This ensures that the entire page will be
10141 10135 * initialized to zeroes if EOF is encountered.
10142 10136 */
10143 10137 io_len = ptob(btopr(io_len));
10144 10138
10145 10139 bp = pageio_setup(pp, io_len, vp, B_READ);
10146 10140 ASSERT(bp != NULL);
10147 10141
10148 10142 /*
10149 10143 * pageio_setup should have set b_addr to 0. This is correct since
10150 10144 * we want to do I/O on a page boundary. bp_mapin() will use this addr
10151 10145 * to calculate an offset, and then set b_addr to the kernel virtual
10152 10146 * address it allocated for us.
10153 10147 */
10154 10148 ASSERT(bp->b_un.b_addr == 0);
10155 10149
10156 10150 bp->b_edev = 0;
10157 10151 bp->b_dev = 0;
10158 10152 bp->b_lblkno = lbtodb(io_off);
10159 10153 bp->b_file = vp;
10160 10154 bp->b_offset = (offset_t)blkoff;
10161 10155 bp_mapin(bp);
10162 10156
10163 10157 /*
10164 10158 * If doing a write beyond what we believe is EOF, don't bother trying
10165 10159 	 * to read the pages from the server; we'll just zero the pages here.
10166 10160 * We don't check that the rw flag is S_WRITE here because some
10167 10161 * implementations may attempt a read access to the buffer before
10168 10162 * copying data.
10169 10163 */
10170 10164 mutex_enter(&rp->r_statelock);
10171 10165 if (io_off >= rp->r_size && seg == segkmap) {
10172 10166 mutex_exit(&rp->r_statelock);
10173 10167 bzero(bp->b_un.b_addr, io_len);
10174 10168 error = 0;
10175 10169 } else {
10176 10170 mutex_exit(&rp->r_statelock);
10177 10171 error = nfs4_bio(bp, NULL, cr, TRUE);
10178 10172 if (error == NFS_EOF)
10179 10173 error = 0;
10180 10174 }
10181 10175
10182 10176 /*
10183 10177 * Unmap the buffer before freeing it.
10184 10178 */
10185 10179 bp_mapout(bp);
10186 10180 pageio_done(bp);
10187 10181
10188 10182 savepp = pp;
10189 10183 do {
10190 10184 pp->p_fsdata = C_NOCOMMIT;
10191 10185 } while ((pp = pp->p_next) != savepp);
10192 10186
10193 10187 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
10194 10188
10195 10189 /*
10196 10190 	 * In case of error, set the readahead offset to the
10197 10191 	 * lowest offset, since pvn_read_done() calls VN_DISPOSE
10198 10192 	 * to destroy the pages.
10199 10193 */
10200 10194 if (error && rp->r_nextr > io_off) {
10201 10195 mutex_enter(&rp->r_statelock);
10202 10196 if (rp->r_nextr > io_off)
10203 10197 rp->r_nextr = io_off;
10204 10198 mutex_exit(&rp->r_statelock);
10205 10199 }
10206 10200 }
10207 10201
10208 10202 /*
10209 10203 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10210 10204 * If len == 0, do from off to EOF.
10211 10205 *
10212 10206 * The normal cases should be len == 0 && off == 0 (entire vp list) or
10213 10207 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10214 10208 * (from pageout).
10215 10209 */
10216 10210 /* ARGSUSED */
10217 10211 static int
10218 10212 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10219 10213 caller_context_t *ct)
10220 10214 {
10221 10215 int error;
10222 10216 rnode4_t *rp;
10223 10217
10224 10218 ASSERT(cr != NULL);
10225 10219
10226 10220 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10227 10221 return (EIO);
10228 10222
10229 10223 rp = VTOR4(vp);
10230 10224 if (IS_SHADOW(vp, rp))
10231 10225 vp = RTOV4(rp);
10232 10226
10233 10227 /*
10234 10228 * XXX - Why should this check be made here?
10235 10229 */
10236 10230 if (vp->v_flag & VNOMAP)
10237 10231 return (ENOSYS);
10238 10232
10239 10233 if (len == 0 && !(flags & B_INVAL) &&
10240 10234 (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10241 10235 return (0);
10242 10236
10243 10237 mutex_enter(&rp->r_statelock);
10244 10238 rp->r_count++;
10245 10239 mutex_exit(&rp->r_statelock);
10246 10240 error = nfs4_putpages(vp, off, len, flags, cr);
10247 10241 mutex_enter(&rp->r_statelock);
10248 10242 rp->r_count--;
10249 10243 cv_broadcast(&rp->r_cv);
10250 10244 mutex_exit(&rp->r_statelock);
10251 10245
10252 10246 return (error);
10253 10247 }
10254 10248
10255 10249 /*
10256 10250 * Write out a single page, possibly klustering adjacent dirty pages.
10257 10251 */
10258 10252 int
10259 10253 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
10260 10254 int flags, cred_t *cr)
10261 10255 {
10262 10256 u_offset_t io_off;
10263 10257 u_offset_t lbn_off;
10264 10258 u_offset_t lbn;
10265 10259 size_t io_len;
10266 10260 uint_t bsize;
10267 10261 int error;
10268 10262 rnode4_t *rp;
10269 10263
10270 10264 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
10271 10265 ASSERT(pp != NULL);
10272 10266 ASSERT(cr != NULL);
10273 10267 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);
10274 10268
10275 10269 rp = VTOR4(vp);
10276 10270 ASSERT(rp->r_count > 0);
10277 10271 ASSERT(!IS_SHADOW(vp, rp));
10278 10272
10279 10273 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10280 10274 lbn = pp->p_offset / bsize;
10281 10275 lbn_off = lbn * bsize;
10282 10276
10283 10277 /*
10284 10278 * Find a kluster that fits in one block, or in
10285 10279 * one page if pages are bigger than blocks. If
10286 10280 * there is less file space allocated than a whole
10287 10281 * page, we'll shorten the i/o request below.
10288 10282 */
10289 10283 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
10290 10284 roundup(bsize, PAGESIZE), flags);
10291 10285
10292 10286 /*
10293 10287 * pvn_write_kluster shouldn't have returned a page with offset
10294 10288 * behind the original page we were given. Verify that.
10295 10289 */
10296 10290 ASSERT((pp->p_offset / bsize) >= lbn);
10297 10291
10298 10292 /*
10299 10293 * Now pp will have the list of kept dirty pages marked for
10300 10294 * write back. It will also handle invalidation and freeing
10301 10295 * of pages that are not dirty. Check for page length rounding
10302 10296 * problems.
10303 10297 */
10304 10298 if (io_off + io_len > lbn_off + bsize) {
10305 10299 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10306 10300 io_len = lbn_off + bsize - io_off;
10307 10301 }
10308 10302 /*
10309 10303 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10310 10304 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10311 10305 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10312 10306 * progress and the r_size has not been made consistent with the
10313 10307 * new size of the file. When the uiomove() completes the r_size is
10314 10308 * updated and the R4MODINPROGRESS flag is cleared.
10315 10309 *
10316 10310 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10317 10311 * consistent value of r_size. Without this handshaking, it is
10318 10312 * possible that nfs4_bio() picks up the old value of r_size
10319 10313 * before the uiomove() in writerp4() completes. This will result
10320 10314 * in the write through nfs4_bio() being dropped.
10321 10315 *
10322 10316 * More precisely, there is a window between the time the uiomove()
10323 10317 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10324 10318 * operation intervenes in this window, the page will be picked up,
10325 10319 * because it is dirty (it will be unlocked, unless it was
10326 10320 * pagecreate'd). When the page is picked up as dirty, the dirty
10327 10321 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10328 10322 * checked. This will still be the old size. Therefore the page will
10329 10323 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10330 10324 * the page will be found to be clean and the write will be dropped.
10331 10325 */
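	/*
	 * Editorial sketch of the race described above; the timeline is
	 * illustrative and condensed from the comment, not taken from
	 * the original source:
	 *
	 *	writerp4() thread		putpage thread
	 *	-----------------		--------------
	 *	set R4MODINPROGRESS
	 *	uiomove() dirties pages
	 *					pvn_getdirty() clears the
	 *					dirty bit; nfs4_bio() sees
	 *					the stale r_size and drops
	 *					the write
	 *	update r_size
	 *	clear R4MODINPROGRESS
	 *
	 * The check below closes this window by re-dirtying the pages
	 * and marking the rnode R4DIRTY so the write restarts later.
	 */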
10332 10326 if (rp->r_flags & R4MODINPROGRESS) {
10333 10327 mutex_enter(&rp->r_statelock);
10334 10328 if ((rp->r_flags & R4MODINPROGRESS) &&
10335 10329 rp->r_modaddr + MAXBSIZE > io_off &&
10336 10330 rp->r_modaddr < io_off + io_len) {
10337 10331 page_t *plist;
10338 10332 /*
10339 10333 * A write is in progress for this region of the file.
10340 10334 * If we did not detect R4MODINPROGRESS here then this
10341 10335 	 * path through nfs4_putapage() would eventually go to
10342 10336 * nfs4_bio() and may not write out all of the data
10343 10337 * in the pages. We end up losing data. So we decide
10344 10338 * to set the modified bit on each page in the page
10345 10339 * list and mark the rnode with R4DIRTY. This write
10346 10340 * will be restarted at some later time.
10347 10341 */
10348 10342 plist = pp;
10349 10343 while (plist != NULL) {
10350 10344 pp = plist;
10351 10345 page_sub(&plist, pp);
10352 10346 hat_setmod(pp);
10353 10347 page_io_unlock(pp);
10354 10348 page_unlock(pp);
10355 10349 }
10356 10350 rp->r_flags |= R4DIRTY;
10357 10351 mutex_exit(&rp->r_statelock);
10358 10352 if (offp)
10359 10353 *offp = io_off;
10360 10354 if (lenp)
10361 10355 *lenp = io_len;
10362 10356 return (0);
10363 10357 }
10364 10358 mutex_exit(&rp->r_statelock);
10365 10359 }
10366 10360
10367 10361 if (flags & B_ASYNC) {
10368 10362 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
10369 10363 nfs4_sync_putapage);
10370 10364 } else
10371 10365 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);
10372 10366
10373 10367 if (offp)
10374 10368 *offp = io_off;
10375 10369 if (lenp)
10376 10370 *lenp = io_len;
10377 10371 return (error);
10378 10372 }
10379 10373
10380 10374 static int
10381 10375 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
10382 10376 int flags, cred_t *cr)
10383 10377 {
10384 10378 int error;
10385 10379 rnode4_t *rp;
10386 10380
10387 10381 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10388 10382
10389 10383 flags |= B_WRITE;
10390 10384
10391 10385 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
10392 10386
10393 10387 rp = VTOR4(vp);
10394 10388
10395 10389 if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
10396 10390 error == EACCES) &&
10397 10391 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
10398 10392 if (!(rp->r_flags & R4OUTOFSPACE)) {
10399 10393 mutex_enter(&rp->r_statelock);
10400 10394 rp->r_flags |= R4OUTOFSPACE;
10401 10395 mutex_exit(&rp->r_statelock);
10402 10396 }
10403 10397 flags |= B_ERROR;
10404 10398 pvn_write_done(pp, flags);
10405 10399 /*
10406 10400 * If this was not an async thread, then try again to
10407 10401 * write out the pages, but this time, also destroy
10408 10402 * them whether or not the write is successful. This
10409 10403 * will prevent memory from filling up with these
10410 10404 * pages and destroying them is the only alternative
10411 10405 * if they can't be written out.
10412 10406 *
10413 10407 * Don't do this if this is an async thread because
10414 10408 * when the pages are unlocked in pvn_write_done,
10415 10409 * some other thread could have come along, locked
10416 10410 * them, and queued for an async thread. It would be
10417 10411 * possible for all of the async threads to be tied
10418 10412 * up waiting to lock the pages again and they would
10419 10413 * all already be locked and waiting for an async
10420 10414 * thread to handle them. Deadlock.
10421 10415 */
10422 10416 if (!(flags & B_ASYNC)) {
10423 10417 error = nfs4_putpage(vp, io_off, io_len,
10424 10418 B_INVAL | B_FORCE, cr, NULL);
10425 10419 }
10426 10420 } else {
10427 10421 if (error)
10428 10422 flags |= B_ERROR;
10429 10423 else if (rp->r_flags & R4OUTOFSPACE) {
10430 10424 mutex_enter(&rp->r_statelock);
10431 10425 rp->r_flags &= ~R4OUTOFSPACE;
10432 10426 mutex_exit(&rp->r_statelock);
10433 10427 }
10434 10428 pvn_write_done(pp, flags);
10435 10429 if (freemem < desfree)
10436 10430 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
10437 10431 NFS4_WRITE_NOWAIT);
10438 10432 }
10439 10433
10440 10434 return (error);
10441 10435 }
10442 10436
10443 10437 #ifdef DEBUG
10444 10438 int nfs4_force_open_before_mmap = 0;
10445 10439 #endif
10446 10440
10447 10441 /* ARGSUSED */
10448 10442 static int
10449 10443 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10450 10444 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10451 10445 caller_context_t *ct)
10452 10446 {
10453 10447 struct segvn_crargs vn_a;
10454 10448 int error = 0;
10455 10449 rnode4_t *rp = VTOR4(vp);
10456 10450 mntinfo4_t *mi = VTOMI4(vp);
10457 10451
10458 10452 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10459 10453 return (EIO);
10460 10454
10461 10455 if (vp->v_flag & VNOMAP)
10462 10456 return (ENOSYS);
10463 10457
10464 10458 if (off < 0 || (off + len) < 0)
10465 10459 return (ENXIO);
10466 10460
10467 10461 if (vp->v_type != VREG)
10468 10462 return (ENODEV);
10469 10463
10470 10464 /*
10471 10465 	 * If the file is delegated to the client, don't do anything.
10472 10466 * If the file is not delegated, then validate the data cache.
10473 10467 */
10474 10468 mutex_enter(&rp->r_statev4_lock);
10475 10469 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10476 10470 mutex_exit(&rp->r_statev4_lock);
10477 10471 error = nfs4_validate_caches(vp, cr);
10478 10472 if (error)
10479 10473 return (error);
10480 10474 } else {
10481 10475 mutex_exit(&rp->r_statev4_lock);
10482 10476 }
10483 10477
10484 10478 /*
10485 10479 * Check to see if the vnode is currently marked as not cachable.
10486 10480 * This means portions of the file are locked (through VOP_FRLOCK).
10487 10481 * In this case the map request must be refused. We use
10488 10482 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10489 10483 *
10490 10484 * Atomically increment r_inmap after acquiring r_rwlock. The
10491 10485 * idea here is to acquire r_rwlock to block read/write and
10492 10486 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10493 10487 	 * that we are in nfs4_map(). With r_rwlock now acquired in the
10494 10488 	 * proper order, we prevent the deadlock that would otherwise
10495 10489 	 * have occurred had nfs4_addmap() acquired it out of order.
10496 10490 *
10497 10491 * Since we are not protecting r_inmap by any lock, we do not
10498 10492 * hold any lock when we decrement it. We atomically decrement
10499 10493 * r_inmap after we release r_lkserlock.
10500 10494 */
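	/*
	 * Editorial sketch of the ordering just described, condensed
	 * from the code below (no new locking is being suggested):
	 *
	 *	nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, ...)
	 *	atomic_inc_uint(&rp->r_inmap)	-- read/write backs off
	 *	nfs_rw_exit(&rp->r_rwlock)
	 *	nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, ...)
	 *	... mapping work, during which nfs4_addmap() runs ...
	 *	nfs_rw_exit(&rp->r_lkserlock)
	 *	atomic_dec_uint(&rp->r_inmap)
	 */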
10501 10495
10502 10496 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10503 10497 return (EINTR);
10504 10498 atomic_inc_uint(&rp->r_inmap);
10505 10499 nfs_rw_exit(&rp->r_rwlock);
10506 10500
10507 10501 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10508 10502 atomic_dec_uint(&rp->r_inmap);
10509 10503 return (EINTR);
10510 10504 }
10511 10505
10512 10506 if (vp->v_flag & VNOCACHE) {
10513 10507 error = EAGAIN;
10514 10508 goto done;
10515 10509 }
10516 10510
10517 10511 /*
10518 10512 * Don't allow concurrent locks and mapping if mandatory locking is
10519 10513 * enabled.
10520 10514 */
10521 10515 if (flk_has_remote_locks(vp)) {
10522 10516 struct vattr va;
10523 10517 va.va_mask = AT_MODE;
10524 10518 error = nfs4getattr(vp, &va, cr);
10525 10519 if (error != 0)
10526 10520 goto done;
10527 10521 if (MANDLOCK(vp, va.va_mode)) {
10528 10522 error = EAGAIN;
10529 10523 goto done;
10530 10524 }
10531 10525 }
10532 10526
10533 10527 /*
10534 10528 * It is possible that the rnode has a lost lock request that we
10535 10529 * are still trying to recover, and that the request conflicts with
10536 10530 * this map request.
10537 10531 *
10538 10532 * An alternative approach would be for nfs4_safemap() to consider
10539 10533 * queued lock requests when deciding whether to set or clear
10540 10534 * VNOCACHE. This would require the frlock code path to call
10541 10535 	 * nfs4_safemap() after enqueuing a lost request.
10542 10536 */
10543 10537 if (nfs4_map_lost_lock_conflict(vp)) {
10544 10538 error = EAGAIN;
10545 10539 goto done;
10546 10540 }
10547 10541
10548 10542 as_rangelock(as);
10549 10543 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10550 10544 if (error != 0) {
10551 10545 as_rangeunlock(as);
10552 10546 goto done;
10553 10547 }
10554 10548
10555 10549 if (vp->v_type == VREG) {
10556 10550 /*
10557 10551 * We need to retrieve the open stream
10558 10552 */
10559 10553 nfs4_open_stream_t *osp = NULL;
10560 10554 nfs4_open_owner_t *oop = NULL;
10561 10555
10562 10556 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10563 10557 if (oop != NULL) {
10564 10558 /* returns with 'os_sync_lock' held */
10565 10559 osp = find_open_stream(oop, rp);
10566 10560 open_owner_rele(oop);
10567 10561 }
10568 10562 if (osp == NULL) {
10569 10563 #ifdef DEBUG
10570 10564 if (nfs4_force_open_before_mmap) {
10571 10565 error = EIO;
10572 10566 goto done;
10573 10567 }
10574 10568 #endif
10575 10569 /* returns with 'os_sync_lock' held */
10576 10570 error = open_and_get_osp(vp, cr, &osp);
10577 10571 if (osp == NULL) {
10578 10572 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10579 10573 "nfs4_map: we tried to OPEN the file "
10580 10574 "but again no osp, so fail with EIO"));
10581 10575 goto done;
10582 10576 }
10583 10577 }
10584 10578
10585 10579 if (osp->os_failed_reopen) {
10586 10580 mutex_exit(&osp->os_sync_lock);
10587 10581 open_stream_rele(osp, rp);
10588 10582 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10589 10583 "nfs4_map: os_failed_reopen set on "
10590 10584 "osp %p, cr %p, rp %s", (void *)osp,
10591 10585 (void *)cr, rnode4info(rp)));
10592 10586 error = EIO;
10593 10587 goto done;
10594 10588 }
10595 10589 mutex_exit(&osp->os_sync_lock);
10596 10590 open_stream_rele(osp, rp);
10597 10591 }
10598 10592
10599 10593 vn_a.vp = vp;
10600 10594 vn_a.offset = off;
10601 10595 vn_a.type = (flags & MAP_TYPE);
10602 10596 vn_a.prot = (uchar_t)prot;
10603 10597 vn_a.maxprot = (uchar_t)maxprot;
10604 10598 vn_a.flags = (flags & ~MAP_TYPE);
10605 10599 vn_a.cred = cr;
10606 10600 vn_a.amp = NULL;
10607 10601 vn_a.szc = 0;
10608 10602 vn_a.lgrp_mem_policy_flags = 0;
10609 10603
10610 10604 error = as_map(as, *addrp, len, segvn_create, &vn_a);
10611 10605 as_rangeunlock(as);
10612 10606
10613 10607 done:
10614 10608 nfs_rw_exit(&rp->r_lkserlock);
10615 10609 atomic_dec_uint(&rp->r_inmap);
10616 10610 return (error);
10617 10611 }
10618 10612
10619 10613 /*
10620 10614 * We're most likely dealing with a kernel module that likes to READ
10621 10615 	 * and mmap without OPENing the file (i.e., lookup/read/mmap), so let's
10622 10616 * officially OPEN the file to create the necessary client state
10623 10617 * for bookkeeping of os_mmap_read/write counts.
10624 10618 *
10625 10619 * Since VOP_MAP only passes in a pointer to the vnode rather than
10626 10620 * a double pointer, we can't handle the case where nfs4open_otw()
10627 10621 * returns a different vnode than the one passed into VOP_MAP (since
10628 10622 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case,
10629 10623 * we return NULL and let nfs4_map() fail. Note: the only case where
10630 10624 * this should happen is if the file got removed and replaced with the
10631 10625 * same name on the server (in addition to the fact that we're trying
10632 10626 	 * to VOP_MAP without VOP_OPENing the file in the first place).
10633 10627 */
10634 10628 static int
10635 10629 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
10636 10630 {
10637 10631 rnode4_t *rp, *drp;
10638 10632 vnode_t *dvp, *open_vp;
10639 10633 char file_name[MAXNAMELEN];
10640 10634 int just_created;
10641 10635 nfs4_open_stream_t *osp;
10642 10636 nfs4_open_owner_t *oop;
10643 10637 int error;
10644 10638
10645 10639 *ospp = NULL;
10646 10640 open_vp = map_vp;
10647 10641
10648 10642 rp = VTOR4(open_vp);
10649 10643 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
10650 10644 return (error);
10651 10645 drp = VTOR4(dvp);
10652 10646
10653 10647 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
10654 10648 VN_RELE(dvp);
10655 10649 return (EINTR);
10656 10650 }
10657 10651
10658 10652 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
10659 10653 nfs_rw_exit(&drp->r_rwlock);
10660 10654 VN_RELE(dvp);
10661 10655 return (error);
10662 10656 }
10663 10657
10664 10658 mutex_enter(&rp->r_statev4_lock);
10665 10659 if (rp->created_v4) {
10666 10660 rp->created_v4 = 0;
10667 10661 mutex_exit(&rp->r_statev4_lock);
10668 10662
10669 10663 dnlc_update(dvp, file_name, open_vp);
10670 10664 /* This is needed so we don't bump the open ref count */
10671 10665 just_created = 1;
10672 10666 } else {
10673 10667 mutex_exit(&rp->r_statev4_lock);
10674 10668 just_created = 0;
10675 10669 }
10676 10670
10677 10671 VN_HOLD(map_vp);
10678 10672
10679 10673 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
10680 10674 just_created);
10681 10675 if (error) {
10682 10676 nfs_rw_exit(&drp->r_rwlock);
10683 10677 VN_RELE(dvp);
10684 10678 VN_RELE(map_vp);
10685 10679 return (error);
10686 10680 }
10687 10681
10688 10682 nfs_rw_exit(&drp->r_rwlock);
10689 10683 VN_RELE(dvp);
10690 10684
10691 10685 /*
10692 10686 * If nfs4open_otw() returned a different vnode then "undo"
10693 10687 * the open and return failure to the caller.
10694 10688 */
10695 10689 if (!VN_CMP(open_vp, map_vp)) {
10696 10690 nfs4_error_t e;
10697 10691
10698 10692 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10699 10693 "open returned a different vnode"));
10700 10694 /*
10701 10695 * If there's an error, ignore it,
10702 10696 * and let VOP_INACTIVE handle it.
10703 10697 */
10704 10698 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10705 10699 CLOSE_NORM, 0, 0, 0);
10706 10700 VN_RELE(map_vp);
10707 10701 return (EIO);
10708 10702 }
10709 10703
10710 10704 VN_RELE(map_vp);
10711 10705
10712 10706 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
10713 10707 if (!oop) {
10714 10708 nfs4_error_t e;
10715 10709
10716 10710 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10717 10711 "no open owner"));
10718 10712 /*
10719 10713 * If there's an error, ignore it,
10720 10714 * and let VOP_INACTIVE handle it.
10721 10715 */
10722 10716 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10723 10717 CLOSE_NORM, 0, 0, 0);
10724 10718 return (EIO);
10725 10719 }
10726 10720 osp = find_open_stream(oop, rp);
10727 10721 open_owner_rele(oop);
10728 10722 *ospp = osp;
10729 10723 return (0);
10730 10724 }
10731 10725
10732 10726 /*
10733 10727 	 * Please be aware that when this function is called, the address space's
10734 10728 	 * a_lock is held for writing. Do not put over-the-wire calls here.
10735 10729 */
10736 10730 /* ARGSUSED */
10737 10731 static int
10738 10732 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10739 10733 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10740 10734 caller_context_t *ct)
10741 10735 {
10742 10736 rnode4_t *rp;
10743 10737 int error = 0;
10744 10738 mntinfo4_t *mi;
10745 10739
10746 10740 mi = VTOMI4(vp);
10747 10741 rp = VTOR4(vp);
10748 10742
10749 10743 if (nfs_zone() != mi->mi_zone)
10750 10744 return (EIO);
10751 10745 if (vp->v_flag & VNOMAP)
10752 10746 return (ENOSYS);
10753 10747
10754 10748 /*
10755 10749 * Don't need to update the open stream first, since this
10756 10750 * mmap can't add any additional share access that isn't
10757 10751 * already contained in the open stream (for the case where we
10758 10752 	 * open, mmap, and only update rp->r_mapcnt; then the server reboots
10759 10753 	 * and the reopen doesn't take the os_mmap_read[write] counts into account).
10760 10754 */
10761 10755 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
10762 10756
10763 10757 if (vp->v_type == VREG) {
10764 10758 /*
10765 10759 * We need to retrieve the open stream and update the counts.
10766 10760 * If there is no open stream here, something is wrong.
10767 10761 */
10768 10762 nfs4_open_stream_t *osp = NULL;
10769 10763 nfs4_open_owner_t *oop = NULL;
10770 10764
10771 10765 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10772 10766 if (oop != NULL) {
10773 10767 /* returns with 'os_sync_lock' held */
10774 10768 osp = find_open_stream(oop, rp);
10775 10769 open_owner_rele(oop);
10776 10770 }
10777 10771 if (osp == NULL) {
10778 10772 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10779 10773 	    "nfs4_addmap: we should have an osp "
10780 10774 "but we don't, so fail with EIO"));
10781 10775 error = EIO;
10782 10776 goto out;
10783 10777 }
10784 10778
10785 10779 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
10786 10780 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));
10787 10781
10788 10782 /*
10789 10783 * Update the map count in the open stream.
10790 10784 * This is necessary in the case where we
10791 10785 	 * open/mmap/close, then the server reboots, and we
10792 10786 * attempt to reopen. If the mmap doesn't add share
10793 10787 * access then we send an invalid reopen with
10794 10788 * access = NONE.
10795 10789 *
10796 10790 	 * We need to specifically check each PROT_* so that a mmap
10797 10791 	 * call of (PROT_WRITE | PROT_EXEC) ensures us both read and
10798 10792 	 * write access. A simple comparison of prot to ~PROT_WRITE
10799 10793 	 * to determine read access is insufficient since prot can
10800 10794 	 * have PROT_USER, etc. OR'ed into it.
10801 10795 */
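		/*
		 * Editorial example, derived from the checks below: a
		 * MAP_SHARED mapping with maxprot == (PROT_WRITE |
		 * PROT_EXEC) bumps os_mmap_write via the PROT_WRITE
		 * check and os_mmap_read via the PROT_EXEC check, so a
		 * later reopen requests both read and write share
		 * access.
		 */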
10802 10796
10803 10797 /*
10804 10798 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
10805 10799 */
10806 10800 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
10807 10801 osp->os_mmap_write += btopr(len);
10808 10802 if (maxprot & PROT_READ)
10809 10803 osp->os_mmap_read += btopr(len);
10810 10804 if (maxprot & PROT_EXEC)
10811 10805 osp->os_mmap_read += btopr(len);
10812 10806 /*
10813 10807 * Ensure that os_mmap_read gets incremented, even if
10814 10808 * maxprot were to look like PROT_NONE.
10815 10809 */
10816 10810 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
10817 10811 !(maxprot & PROT_EXEC))
10818 10812 osp->os_mmap_read += btopr(len);
10819 10813 osp->os_mapcnt += btopr(len);
10820 10814 mutex_exit(&osp->os_sync_lock);
10821 10815 open_stream_rele(osp, rp);
10822 10816 }
10823 10817
10824 10818 out:
10825 10819 /*
10826 10820 * If we got an error, then undo our
10827 10821 * incrementing of 'r_mapcnt'.
10828 10822 */
10829 10823
10830 10824 if (error) {
10831 10825 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
10832 10826 ASSERT(rp->r_mapcnt >= 0);
10833 10827 }
10834 10828 return (error);
10835 10829 }
10836 10830
10837 10831 /* ARGSUSED */
10838 10832 static int
10839 10833 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10840 10834 {
10841 10835
10842 10836 return (VTOR4(vp1) == VTOR4(vp2));
10843 10837 }
10844 10838
10845 10839 /* ARGSUSED */
10846 10840 static int
10847 10841 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10848 10842 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
10849 10843 caller_context_t *ct)
10850 10844 {
10851 10845 int rc;
10852 10846 u_offset_t start, end;
10853 10847 rnode4_t *rp;
10854 10848 int error = 0, intr = INTR4(vp);
10855 10849 nfs4_error_t e;
10856 10850
10857 10851 if (nfs_zone() != VTOMI4(vp)->mi_zone)
10858 10852 return (EIO);
10859 10853
10860 10854 /* check for valid cmd parameter */
10861 10855 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
10862 10856 return (EINVAL);
10863 10857
10864 10858 /* Verify l_type. */
10865 10859 switch (bfp->l_type) {
10866 10860 case F_RDLCK:
10867 10861 if (cmd != F_GETLK && !(flag & FREAD))
10868 10862 return (EBADF);
10869 10863 break;
10870 10864 case F_WRLCK:
10871 10865 if (cmd != F_GETLK && !(flag & FWRITE))
10872 10866 return (EBADF);
10873 10867 break;
10874 10868 case F_UNLCK:
10875 10869 intr = 0;
10876 10870 break;
10877 10871
10878 10872 default:
10879 10873 return (EINVAL);
10880 10874 }
10881 10875
10882 10876 /* check the validity of the lock range */
10883 10877 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
10884 10878 return (rc);
10885 10879 if (rc = flk_check_lock_data(start, end, MAXEND))
10886 10880 return (rc);
10887 10881
10888 10882 /*
10889 10883 * If the filesystem is mounted using local locking, pass the
10890 10884 * request off to the local locking code.
10891 10885 */
10892 10886 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
10893 10887 if (cmd == F_SETLK || cmd == F_SETLKW) {
10894 10888 /*
10895 10889 * For complete safety, we should be holding
10896 10890 * r_lkserlock. However, we can't call
10897 10891 * nfs4_safelock and then fs_frlock while
10898 10892 * holding r_lkserlock, so just invoke
10899 10893 * nfs4_safelock and expect that this will
10900 10894 * catch enough of the cases.
10901 10895 */
10902 10896 if (!nfs4_safelock(vp, bfp, cr))
10903 10897 return (EAGAIN);
10904 10898 }
10905 10899 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
10906 10900 }
10907 10901
10908 10902 rp = VTOR4(vp);
10909 10903
10910 10904 /*
10911 10905 * Check whether the given lock request can proceed, given the
10912 10906 * current file mappings.
10913 10907 */
10914 10908 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
10915 10909 return (EINTR);
10916 10910 if (cmd == F_SETLK || cmd == F_SETLKW) {
10917 10911 if (!nfs4_safelock(vp, bfp, cr)) {
10918 10912 rc = EAGAIN;
10919 10913 goto done;
10920 10914 }
10921 10915 }
10922 10916
10923 10917 /*
10924 10918 * Flush the cache after waiting for async I/O to finish. For new
10925 10919 * locks, this is so that the process gets the latest bits from the
10926 10920 * server. For unlocks, this is so that other clients see the
10927 10921 * latest bits once the file has been unlocked. If currently dirty
10928 10922 * pages can't be flushed, then don't allow a lock to be set. But
10929 10923 * allow unlocks to succeed, to avoid having orphan locks on the
10930 10924 * server.
10931 10925 */
10932 10926 if (cmd != F_GETLK) {
10933 10927 mutex_enter(&rp->r_statelock);
10934 10928 while (rp->r_count > 0) {
10935 10929 if (intr) {
10936 10930 klwp_t *lwp = ttolwp(curthread);
10937 10931
10938 10932 if (lwp != NULL)
10939 10933 lwp->lwp_nostop++;
10940 10934 if (cv_wait_sig(&rp->r_cv,
10941 10935 &rp->r_statelock) == 0) {
10942 10936 if (lwp != NULL)
10943 10937 lwp->lwp_nostop--;
10944 10938 rc = EINTR;
10945 10939 break;
10946 10940 }
10947 10941 if (lwp != NULL)
10948 10942 lwp->lwp_nostop--;
10949 10943 } else {
10950 10944 cv_wait(&rp->r_cv, &rp->r_statelock);
10951 10945 }
10952 10946 }
10953 10947 mutex_exit(&rp->r_statelock);
10954 10948 if (rc != 0)
10955 10949 goto done;
10956 10950 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
10957 10951 if (error) {
10958 10952 if (error == ENOSPC || error == EDQUOT) {
10959 10953 mutex_enter(&rp->r_statelock);
10960 10954 if (!rp->r_error)
10961 10955 rp->r_error = error;
10962 10956 mutex_exit(&rp->r_statelock);
10963 10957 }
10964 10958 if (bfp->l_type != F_UNLCK) {
10965 10959 rc = ENOLCK;
10966 10960 goto done;
10967 10961 }
10968 10962 }
10969 10963 }
10970 10964
10971 10965 /*
10972 10966 * Call the lock manager to do the real work of contacting
10973 10967 * the server and obtaining the lock.
10974 10968 */
10975 10969 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
10976 10970 cr, &e, NULL, NULL);
10977 10971 rc = e.error;
10978 10972
10979 10973 if (rc == 0)
10980 10974 nfs4_lockcompletion(vp, cmd);
10981 10975
10982 10976 done:
10983 10977 nfs_rw_exit(&rp->r_lkserlock);
10984 10978
10985 10979 return (rc);
10986 10980 }
10987 10981
10988 10982 /*
10989 10983 * Free storage space associated with the specified vnode. The portion
10990 10984 * to be freed is specified by bfp->l_start and bfp->l_len (already
10991 10985 * normalized to a "whence" of 0).
10992 10986 *
10993 10987 * This is an experimental facility whose continued existence is not
10994 10988 * guaranteed. Currently, we only support the special case
10995 10989 * of l_len == 0, meaning free to end of file.
10996 10990 */
10997 10991 /* ARGSUSED */
10998 10992 static int
10999 10993 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
11000 10994 offset_t offset, cred_t *cr, caller_context_t *ct)
11001 10995 {
11002 10996 int error;
11003 10997
11004 10998 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11005 10999 return (EIO);
11006 11000 ASSERT(vp->v_type == VREG);
11007 11001 if (cmd != F_FREESP)
11008 11002 return (EINVAL);
11009 11003
11010 11004 error = convoff(vp, bfp, 0, offset);
11011 11005 if (!error) {
11012 11006 ASSERT(bfp->l_start >= 0);
11013 11007 if (bfp->l_len == 0) {
11014 11008 struct vattr va;
11015 11009
11016 11010 va.va_mask = AT_SIZE;
11017 11011 va.va_size = bfp->l_start;
11018 11012 error = nfs4setattr(vp, &va, 0, cr, NULL);
11019 11013
11020 11014 if (error == 0) {
11021 11015 if (bfp->l_start == 0) {
11022 11016 vnevent_truncate(vp, ct);
11023 11017 } else {
11024 11018 vnevent_resize(vp, ct);
11025 11019 }
11026 11020 }
11027 11021 } else
11028 11022 error = EINVAL;
11029 11023 }
11030 11024
11031 11025 return (error);
11032 11026 }
11033 11027
11034 11028 /* ARGSUSED */
11035 11029 int
11036 11030 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11037 11031 {
11038 11032 rnode4_t *rp;
11039 11033 rp = VTOR4(vp);
11040 11034
11041 11035 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11042 11036 vp = RTOV4(rp);
11043 11037 }
11044 11038 *vpp = vp;
11045 11039 return (0);
11046 11040 }
11047 11041
11048 11042 /*
11049 11043 * Setup and add an address space callback to do the work of the delmap call.
11050 11044 * The callback will (and must be) deleted in the actual callback function.
11051 11045 *
11052 11046 * This is done in order to take care of the problem that we have with holding
11053 11047 * the address space's a_lock for a long period of time (e.g. if the NFS server
11054 11048 * is down). Callbacks will be executed in the address space code while the
11055 11049 * a_lock is not held. Holding the address space's a_lock causes things such
11056 11050 * as ps and fork to hang because they are trying to acquire this lock as well.
11057 11051 */
11058 11052 /* ARGSUSED */
11059 11053 static int
11060 11054 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
11061 11055 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
11062 11056 caller_context_t *ct)
11063 11057 {
11064 11058 int caller_found;
11065 11059 int error;
11066 11060 rnode4_t *rp;
11067 11061 nfs4_delmap_args_t *dmapp;
11068 11062 nfs4_delmapcall_t *delmap_call;
11069 11063
11070 11064 if (vp->v_flag & VNOMAP)
11071 11065 return (ENOSYS);
11072 11066
11073 11067 /*
11074 11068 * A process may not change zones if it has NFS pages mmap'ed
11075 11069 * in, so we can't legitimately get here from the wrong zone.
11076 11070 */
11077 11071 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11078 11072
11079 11073 rp = VTOR4(vp);
11080 11074
11081 11075 /*
11082 11076 * The way that the address space of this process deletes its mapping
11083 11077 * of this file is via the following call chains:
11084 11078 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11085 11079 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11086 11080 *
11087 11081 * With the use of address space callbacks we are allowed to drop the
11088 11082 * address space lock, a_lock, while executing the NFS operations that
11089 11083 * need to go over the wire. Returning EAGAIN to the caller of this
11090 11084 * function is what drives the execution of the callback that we add
11091 11085 * below. The callback will be executed by the address space code
11092 11086 * after dropping the a_lock. When the callback is finished, since
11093 11087 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
11094 11088 * is called again on the same segment to finish the rest of the work
11095 11089 * that needs to happen during unmapping.
11096 11090 *
11097 11091 * This action of calling back into the segment driver causes
11098 11092 * nfs4_delmap() to get called again, but since the callback was
11099 11093 * already executed at this point, it already did the work and there
11100 11094 * is nothing left for us to do.
11101 11095 *
11102 11096 * To Summarize:
11103 11097 * - The first time nfs4_delmap is called by the current thread is when
11104 11098 * we add the caller associated with this delmap to the delmap caller
11105 11099 * list, add the callback, and return EAGAIN.
11106 11100 * - The second time in this call chain when nfs4_delmap is called we
11107 11101 * will find this caller in the delmap caller list and realize there
11108 11102 * is no more work to do thus removing this caller from the list and
11109 11103 * returning the error that was set in the callback execution.
11110 11104 */
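	/*
	 * Editorial sketch of the two-pass flow summarized above,
	 * condensed from this function and nfs4_delmap_callback():
	 *
	 *	segvn_unmap() -> nfs4_delmap()		(pass 1)
	 *		caller not in r_indelmap: register
	 *		nfs4_delmap_callback(), return EAGAIN
	 *	the as layer drops a_lock and runs the callback, which
	 *		does the over-the-wire work and records the
	 *		result in delmap_call->error
	 *	segvn_unmap() -> nfs4_delmap()		(pass 2)
	 *		caller found in r_indelmap: return the recorded
	 *		error (EAGAIN from the callback maps to 0)
	 */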
11111 11105 caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
11112 11106 if (caller_found) {
11113 11107 /*
11114 11108 * 'error' is from the actual delmap operations. To avoid
11115 11109 * hangs, we need to handle the return of EAGAIN differently
11116 11110 * since this is what drives the callback execution.
11117 11111 * In this case, we don't want to return EAGAIN and do the
11118 11112 * callback execution because there are none to execute.
11119 11113 */
11120 11114 if (error == EAGAIN)
11121 11115 return (0);
11122 11116 else
11123 11117 return (error);
11124 11118 }
11125 11119
11126 11120 /* current caller was not in the list */
11127 11121 delmap_call = nfs4_init_delmapcall();
11128 11122
11129 11123 mutex_enter(&rp->r_statelock);
11130 11124 list_insert_tail(&rp->r_indelmap, delmap_call);
11131 11125 mutex_exit(&rp->r_statelock);
11132 11126
11133 11127 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);
11134 11128
11135 11129 dmapp->vp = vp;
11136 11130 dmapp->off = off;
11137 11131 dmapp->addr = addr;
11138 11132 dmapp->len = len;
11139 11133 dmapp->prot = prot;
11140 11134 dmapp->maxprot = maxprot;
11141 11135 dmapp->flags = flags;
11142 11136 dmapp->cr = cr;
11143 11137 dmapp->caller = delmap_call;
11144 11138
11145 11139 error = as_add_callback(as, nfs4_delmap_callback, dmapp,
11146 11140 AS_UNMAP_EVENT, addr, len, KM_SLEEP);
11147 11141
11148 11142 return (error ? error : EAGAIN);
11149 11143 }
11150 11144
11151 11145 static nfs4_delmapcall_t *
11152 11146 nfs4_init_delmapcall()
11153 11147 {
11154 11148 nfs4_delmapcall_t *delmap_call;
11155 11149
11156 11150 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11157 11151 delmap_call->call_id = curthread;
11158 11152 delmap_call->error = 0;
11159 11153
11160 11154 return (delmap_call);
11161 11155 }
11162 11156
11163 11157 static void
11164 11158 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
11165 11159 {
11166 11160 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
11167 11161 }
11168 11162
11169 11163 /*
11170 11164 * Searches for the current delmap caller (based on curthread) in the list of
11171 11165 * callers. If it is found, we remove it and free the delmap caller.
11172 11166 * Returns:
11173 11167 * 0 if the caller wasn't found
11174 11168 	 * 1 if the caller was found, removed, and freed. *errp will be set
11175 11169 	 * to the result of the delmap.
11176 11170 */
11177 11171 static int
11178 11172 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
11179 11173 {
11180 11174 nfs4_delmapcall_t *delmap_call;
11181 11175
11182 11176 /*
11183 11177 * If the list doesn't exist yet, we create it and return
11184 11178 * that the caller wasn't found. No list = no callers.
11185 11179 */
11186 11180 mutex_enter(&rp->r_statelock);
11187 11181 if (!(rp->r_flags & R4DELMAPLIST)) {
11188 11182 /* The list does not exist */
11189 11183 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
11190 11184 offsetof(nfs4_delmapcall_t, call_node));
11191 11185 rp->r_flags |= R4DELMAPLIST;
11192 11186 mutex_exit(&rp->r_statelock);
11193 11187 return (0);
11194 11188 } else {
11195 11189 /* The list exists so search it */
11196 11190 for (delmap_call = list_head(&rp->r_indelmap);
11197 11191 delmap_call != NULL;
11198 11192 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
11199 11193 if (delmap_call->call_id == curthread) {
11200 11194 /* current caller is in the list */
11201 11195 *errp = delmap_call->error;
11202 11196 list_remove(&rp->r_indelmap, delmap_call);
11203 11197 mutex_exit(&rp->r_statelock);
11204 11198 nfs4_free_delmapcall(delmap_call);
11205 11199 return (1);
11206 11200 }
11207 11201 }
11208 11202 }
11209 11203 mutex_exit(&rp->r_statelock);
11210 11204 return (0);
11211 11205 }
11212 11206
11213 11207 /*
11214 11208 * Remove some pages from an mmap'd vnode. Just update the
11215 11209 * count of pages. If doing close-to-open, then flush and
11216 11210 * commit all of the pages associated with this file.
11217 11211 * Otherwise, start an asynchronous page flush to write out
11218 11212 * any dirty pages. This will also associate a credential
11219 11213 * with the rnode which can be used to write the pages.
11220 11214 */
11221 11215 /* ARGSUSED */
11222 11216 static void
11223 11217 nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
11224 11218 {
11225 11219 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11226 11220 rnode4_t *rp;
11227 11221 mntinfo4_t *mi;
11228 11222 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg;
11229 11223
11230 11224 rp = VTOR4(dmapp->vp);
11231 11225 mi = VTOMI4(dmapp->vp);
11232 11226
11233 11227 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
11234 11228 ASSERT(rp->r_mapcnt >= 0);
11235 11229
11236 11230 /*
11237 11231 * Initiate a page flush and potential commit if there are
11238 11232 * pages, the file system was not mounted readonly, the segment
11239 11233 * was mapped shared, and the pages themselves were writeable.
11240 11234 */
11241 11235 if (nfs4_has_pages(dmapp->vp) &&
11242 11236 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
11243 11237 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
11244 11238 mutex_enter(&rp->r_statelock);
11245 11239 rp->r_flags |= R4DIRTY;
11246 11240 mutex_exit(&rp->r_statelock);
11247 11241 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
11248 11242 dmapp->len, dmapp->cr);
11249 11243 if (!e.error) {
11250 11244 mutex_enter(&rp->r_statelock);
11251 11245 e.error = rp->r_error;
11252 11246 rp->r_error = 0;
11253 11247 mutex_exit(&rp->r_statelock);
11254 11248 }
11255 11249 } else
11256 11250 e.error = 0;
11257 11251
11258 11252 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
11259 11253 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
11260 11254 B_INVAL, dmapp->cr, NULL);
11261 11255
11262 11256 if (e.error) {
11263 11257 e.stat = puterrno4(e.error);
11264 11258 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11265 11259 OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
11266 11260 dmapp->caller->error = e.error;
11267 11261 }
11268 11262
11269 11263 /* Check to see if we need to close the file */
11270 11264
11271 11265 if (dmapp->vp->v_type == VREG) {
11272 11266 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
11273 11267 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);
11274 11268
11275 11269 if (e.error != 0 || e.stat != NFS4_OK) {
11276 11270 /*
11277 11271 * Since it is possible that e.error == 0 and
11278 11272 * e.stat != NFS4_OK (and vice versa),
11279 11273 * we do the proper checking in order to get both
11280 11274 * e.error and e.stat reporting the correct info.
11281 11275 */
11282 11276 if (e.stat == NFS4_OK)
11283 11277 e.stat = puterrno4(e.error);
11284 11278 if (e.error == 0)
11285 11279 e.error = geterrno4(e.stat);
11286 11280
11287 11281 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11288 11282 OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
11289 11283 dmapp->caller->error = e.error;
11290 11284 }
11291 11285 }
11292 11286
11293 11287 (void) as_delete_callback(as, arg);
11294 11288 kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
11295 11289 }
11296 11290
11297 11291
11298 11292 static uint_t
11299 11293 fattr4_maxfilesize_to_bits(uint64_t ll)
11300 11294 {
11301 11295 uint_t l = 1;
11302 11296
11303 11297 if (ll == 0) {
11304 11298 return (0);
11305 11299 }
11306 11300
11307 11301 if (ll & 0xffffffff00000000) {
11308 11302 l += 32; ll >>= 32;
11309 11303 }
11310 11304 if (ll & 0xffff0000) {
11311 11305 l += 16; ll >>= 16;
11312 11306 }
11313 11307 if (ll & 0xff00) {
11314 11308 l += 8; ll >>= 8;
11315 11309 }
11316 11310 if (ll & 0xf0) {
11317 11311 l += 4; ll >>= 4;
11318 11312 }
11319 11313 if (ll & 0xc) {
11320 11314 l += 2; ll >>= 2;
11321 11315 }
11322 11316 if (ll & 0x2) {
11323 11317 l += 1;
11324 11318 }
11325 11319 return (l);
11326 11320 }
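/*
 * Editorial worked example for fattr4_maxfilesize_to_bits() above: for
 * a server advertising maxfilesize = 0x7fffffffffffffff (2^63 - 1),
 * the 32-, 16-, 8-, 4- and 2-bit steps all fire, giving
 * l = 1 + 32 + 16 + 8 + 4 + 2 = 63, so _PC_FILESIZEBITS reports 63.
 * In general the function returns the 1-based index of the most
 * significant set bit, i.e. floor(log2(ll)) + 1.
 */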
11327 11321
11328 11322 static int
11329 11323 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11330 11324 {
11331 11325 vnode_t *avp = NULL;
11332 11326 int error;
11333 11327
11334 11328 if ((error = nfs4lookup_xattr(vp, "", &avp,
11335 11329 LOOKUP_XATTR, cr)) == 0)
11336 11330 error = do_xattr_exists_check(avp, valp, cr);
11337 11331 if (avp)
11338 11332 VN_RELE(avp);
11339 11333
11340 11334 return (error);
11341 11335 }
11342 11336
11343 11337 /* ARGSUSED */
11344 11338 int
11345 11339 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
11346 11340 caller_context_t *ct)
11347 11341 {
11348 11342 int error;
11349 11343 hrtime_t t;
11350 11344 rnode4_t *rp;
11351 11345 nfs4_ga_res_t gar;
11352 11346 nfs4_ga_ext_res_t ger;
11353 11347
11354 11348 gar.n4g_ext_res = &ger;
11355 11349
11356 11350 if (nfs_zone() != VTOMI4(vp)->mi_zone)
11357 11351 return (EIO);
11358 11352 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
11359 11353 *valp = MAXPATHLEN;
11360 11354 return (0);
11361 11355 }
11362 11356 if (cmd == _PC_ACL_ENABLED) {
11363 11357 *valp = _ACL_ACE_ENABLED;
11364 11358 return (0);
11365 11359 }
11366 11360
11367 11361 rp = VTOR4(vp);
11368 11362 if (cmd == _PC_XATTR_EXISTS) {
11369 11363 /*
11370 11364 * The existence of the xattr directory is not sufficient
11371 11365 	 * for determining whether generic user attributes exist.
11372 11366 * The attribute directory could only be a transient directory
11373 11367 	 * used for Solaris sysattr support. Do a small readdir
11374 11368 	 * to verify whether the only entries are sysattrs.
11375 11369 *
11376 11370 	 * pc4_xattr_valid can only be trusted when r_xattr_dir
11377 11371 * is NULL. Once the xadir vp exists, we can create xattrs,
11378 11372 * and we don't have any way to update the "base" object's
11379 11373 * pc4_xattr_exists from the xattr or xadir. Maybe FEM
11380 11374 * could help out.
11381 11375 */
11382 11376 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
11383 11377 rp->r_xattr_dir == NULL) {
11384 11378 return (nfs4_have_xattrs(vp, valp, cr));
11385 11379 }
11386 11380 } else { /* OLD CODE */
11387 11381 if (ATTRCACHE4_VALID(vp)) {
11388 11382 mutex_enter(&rp->r_statelock);
11389 11383 if (rp->r_pathconf.pc4_cache_valid) {
11390 11384 error = 0;
11391 11385 switch (cmd) {
11392 11386 case _PC_FILESIZEBITS:
11393 11387 *valp =
11394 11388 rp->r_pathconf.pc4_filesizebits;
11395 11389 break;
11396 11390 case _PC_LINK_MAX:
11397 11391 *valp =
11398 11392 rp->r_pathconf.pc4_link_max;
11399 11393 break;
11400 11394 case _PC_NAME_MAX:
11401 11395 *valp =
11402 11396 rp->r_pathconf.pc4_name_max;
11403 11397 break;
11404 11398 case _PC_CHOWN_RESTRICTED:
11405 11399 *valp =
11406 11400 rp->r_pathconf.pc4_chown_restricted;
11407 11401 break;
11408 11402 case _PC_NO_TRUNC:
11409 11403 *valp =
11410 11404 rp->r_pathconf.pc4_no_trunc;
11411 11405 break;
11412 11406 default:
11413 11407 error = EINVAL;
11414 11408 break;
11415 11409 }
11416 11410 mutex_exit(&rp->r_statelock);
11417 11411 #ifdef DEBUG
11418 11412 nfs4_pathconf_cache_hits++;
11419 11413 #endif
11420 11414 return (error);
11421 11415 }
11422 11416 mutex_exit(&rp->r_statelock);
11423 11417 }
11424 11418 }
11425 11419 #ifdef DEBUG
11426 11420 nfs4_pathconf_cache_misses++;
11427 11421 #endif
11428 11422
11429 11423 t = gethrtime();
11430 11424
11431 11425 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);
11432 11426
11433 11427 if (error) {
11434 11428 mutex_enter(&rp->r_statelock);
11435 11429 rp->r_pathconf.pc4_cache_valid = FALSE;
11436 11430 rp->r_pathconf.pc4_xattr_valid = FALSE;
11437 11431 mutex_exit(&rp->r_statelock);
11438 11432 return (error);
11439 11433 }
11440 11434
11441 11435 /* interpret the max filesize */
11442 11436 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
11443 11437 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);
11444 11438
11445 11439 /* Store the attributes we just received */
11446 11440 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);
11447 11441
11448 11442 switch (cmd) {
11449 11443 case _PC_FILESIZEBITS:
11450 11444 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
11451 11445 break;
11452 11446 case _PC_LINK_MAX:
11453 11447 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
11454 11448 break;
11455 11449 case _PC_NAME_MAX:
11456 11450 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
11457 11451 break;
11458 11452 case _PC_CHOWN_RESTRICTED:
11459 11453 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
11460 11454 break;
11461 11455 case _PC_NO_TRUNC:
11462 11456 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
11463 11457 break;
11464 11458 case _PC_XATTR_EXISTS:
11465 11459 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
11466 11460 if (error = nfs4_have_xattrs(vp, valp, cr))
11467 11461 return (error);
11468 11462 }
11469 11463 break;
11470 11464 default:
11471 11465 return (EINVAL);
11472 11466 }
11473 11467
11474 11468 return (0);
11475 11469 }
11476 11470
11477 11471 /*
11478 11472 	 * Called by an async thread to do synchronous pageio. Do the i/o, wait
11479 11473 	 * for it to complete, and clean up the page list when done.
11480 11474 */
11481 11475 static int
11482 11476 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11483 11477 int flags, cred_t *cr)
11484 11478 {
11485 11479 int error;
11486 11480
11487 11481 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11488 11482
11489 11483 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11490 11484 if (flags & B_READ)
11491 11485 pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11492 11486 else
11493 11487 pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11494 11488 return (error);
11495 11489 }
11496 11490
11497 11491 /* ARGSUSED */
11498 11492 static int
11499 11493 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11500 11494 int flags, cred_t *cr, caller_context_t *ct)
11501 11495 {
11502 11496 int error;
11503 11497 rnode4_t *rp;
11504 11498
11505 11499 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11506 11500 return (EIO);
11507 11501
11508 11502 if (pp == NULL)
11509 11503 return (EINVAL);
11510 11504
11511 11505 rp = VTOR4(vp);
11512 11506 mutex_enter(&rp->r_statelock);
11513 11507 rp->r_count++;
11514 11508 mutex_exit(&rp->r_statelock);
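	/*
	 * A note on r_count (an assumption drawn from the code below):
	 * it is held across the i/o so that threads which wait for
	 * outstanding operations to drain (e.g. nfs4_commit_vp()
	 * waiting on r_cv) can tell this pageio is still in progress.
	 */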
11515 11509
11516 11510 if (flags & B_ASYNC) {
11517 11511 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11518 11512 nfs4_sync_pageio);
11519 11513 } else
11520 11514 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11521 11515 mutex_enter(&rp->r_statelock);
11522 11516 rp->r_count--;
11523 11517 cv_broadcast(&rp->r_cv);
11524 11518 mutex_exit(&rp->r_statelock);
11525 11519 return (error);
11526 11520 }
11527 11521
11528 11522 /* ARGSUSED */
11529 11523 static void
11530 11524 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
11531 11525 caller_context_t *ct)
11532 11526 {
11533 11527 int error;
11534 11528 rnode4_t *rp;
11535 11529 page_t *plist;
11536 11530 page_t *pptr;
11537 11531 offset3 offset;
11538 11532 count3 len;
11539 11533 k_sigset_t smask;
11540 11534
11541 11535 /*
11542 11536 * We should get called with fl equal to either B_FREE or
11543 11537 * B_INVAL. Any other value is illegal.
11544 11538 *
11545 11539 * The page that we are either supposed to free or destroy
11546 11540 * should be exclusive locked and its io lock should not
11547 11541 * be held.
11548 11542 */
11549 11543 ASSERT(fl == B_FREE || fl == B_INVAL);
11550 11544 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
11551 11545
11552 11546 rp = VTOR4(vp);
11553 11547
11554 11548 /*
11555 11549 * If the page doesn't need to be committed or we shouldn't
11556 11550 * even bother attempting to commit it, then just make sure
11557 11551 * that the p_fsdata byte is clear and then either free or
11558 11552 * destroy the page as appropriate.
11559 11553 */
11560 11554 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
11561 11555 pp->p_fsdata = C_NOCOMMIT;
11562 11556 if (fl == B_FREE)
11563 11557 page_free(pp, dn);
11564 11558 else
11565 11559 page_destroy(pp, dn);
11566 11560 return;
11567 11561 }
11568 11562
11569 11563 /*
11570 11564 * If there is a page invalidation operation going on, then
11571 11565 * if this is one of the pages being destroyed, then just
11572 11566 * clear the p_fsdata byte and then either free or destroy
11573 11567 * the page as appropriate.
11574 11568 */
11575 11569 mutex_enter(&rp->r_statelock);
11576 11570 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
11577 11571 mutex_exit(&rp->r_statelock);
11578 11572 pp->p_fsdata = C_NOCOMMIT;
11579 11573 if (fl == B_FREE)
11580 11574 page_free(pp, dn);
11581 11575 else
11582 11576 page_destroy(pp, dn);
11583 11577 return;
11584 11578 }
11585 11579
11586 11580 /*
11587 11581 * If we are freeing this page and someone else is already
11588 11582 * waiting to do a commit, then just unlock the page and
11589 11583  * return. That other thread will take care of committing
11590 11584 * this page. The page can be freed sometime after the
11591 11585 * commit has finished. Otherwise, if the page is marked
11592 11586 * as delay commit, then we may be getting called from
11593 11587 * pvn_write_done, one page at a time. This could result
11594 11588 * in one commit per page, so we end up doing lots of small
11595 11589 * commits instead of fewer larger commits. This is bad,
11596 11590  * we want to do as few commits as possible.
11597 11591 */
11598 11592 if (fl == B_FREE) {
11599 11593 if (rp->r_flags & R4COMMITWAIT) {
11600 11594 page_unlock(pp);
11601 11595 mutex_exit(&rp->r_statelock);
11602 11596 return;
11603 11597 }
11604 11598 if (pp->p_fsdata == C_DELAYCOMMIT) {
11605 11599 pp->p_fsdata = C_COMMIT;
11606 11600 page_unlock(pp);
11607 11601 mutex_exit(&rp->r_statelock);
11608 11602 return;
11609 11603 }
11610 11604 }
11611 11605
11612 11606 /*
11613 11607 * Check to see if there is a signal which would prevent an
11614 11608 * attempt to commit the pages from being successful. If so,
11615 11609 * then don't bother with all of the work to gather pages and
11616 11610 * generate the unsuccessful RPC. Just return from here and
11617 11611 * let the page be committed at some later time.
11618 11612 */
11619 11613 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
11620 11614 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
11621 11615 sigunintr(&smask);
11622 11616 page_unlock(pp);
11623 11617 mutex_exit(&rp->r_statelock);
11624 11618 return;
11625 11619 }
11626 11620 sigunintr(&smask);
11627 11621
11628 11622 /*
11629 11623 * We are starting to need to commit pages, so let's try
11630 11624 * to commit as many as possible at once to reduce the
11631 11625 * overhead.
11632 11626 *
11633 11627 * Set the `commit inprogress' state bit. We must
11634 11628 * first wait until any current one finishes. Then
11635 11629 * we initialize the c_pages list with this page.
11636 11630 */
11637 11631 while (rp->r_flags & R4COMMIT) {
11638 11632 rp->r_flags |= R4COMMITWAIT;
11639 11633 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
11640 11634 rp->r_flags &= ~R4COMMITWAIT;
11641 11635 }
11642 11636 rp->r_flags |= R4COMMIT;
11643 11637 mutex_exit(&rp->r_statelock);
11644 11638 ASSERT(rp->r_commit.c_pages == NULL);
11645 11639 rp->r_commit.c_pages = pp;
11646 11640 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11647 11641 rp->r_commit.c_commlen = PAGESIZE;
11648 11642
11649 11643 /*
11650 11644 * Gather together all other pages which can be committed.
11651 11645 * They will all be chained off r_commit.c_pages.
11652 11646 */
11653 11647 nfs4_get_commit(vp);
11654 11648
11655 11649 /*
11656 11650 * Clear the `commit inprogress' status and disconnect
11657 11651 * the list of pages to be committed from the rnode.
11658 11652 * At this same time, we also save the starting offset
11659 11653 * and length of data to be committed on the server.
11660 11654 */
11661 11655 plist = rp->r_commit.c_pages;
11662 11656 rp->r_commit.c_pages = NULL;
11663 11657 offset = rp->r_commit.c_commbase;
11664 11658 len = rp->r_commit.c_commlen;
11665 11659 mutex_enter(&rp->r_statelock);
11666 11660 rp->r_flags &= ~R4COMMIT;
11667 11661 cv_broadcast(&rp->r_commit.c_cv);
11668 11662 mutex_exit(&rp->r_statelock);
11669 11663
11670 11664 if (curproc == proc_pageout || curproc == proc_fsflush ||
11671 11665 nfs_zone() != VTOMI4(vp)->mi_zone) {
11672 11666 nfs4_async_commit(vp, plist, offset, len,
11673 11667 cr, do_nfs4_async_commit);
11674 11668 return;
11675 11669 }
11676 11670
11677 11671 /*
11678 11672 * Actually generate the COMMIT op over the wire operation.
11679 11673 */
11680 11674 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);
11681 11675
11682 11676 /*
11683 11677 * If we got an error during the commit, just unlock all
11684 11678 * of the pages. The pages will get retransmitted to the
11685 11679 * server during a putpage operation.
11686 11680 */
11687 11681 if (error) {
11688 11682 while (plist != NULL) {
11689 11683 pptr = plist;
11690 11684 page_sub(&plist, pptr);
11691 11685 page_unlock(pptr);
11692 11686 }
11693 11687 return;
11694 11688 }
11695 11689
11696 11690 /*
11697 11691 * We've tried as hard as we can to commit the data to stable
11698 11692 * storage on the server. We just unlock the rest of the pages
11699 11693 * and clear the commit required state. They will be put
11700 11694  * onto the tail of the cachelist if they are no longer
11701 11695 * mapped.
11702 11696 */
11703 11697 while (plist != pp) {
11704 11698 pptr = plist;
11705 11699 page_sub(&plist, pptr);
11706 11700 pptr->p_fsdata = C_NOCOMMIT;
11707 11701 page_unlock(pptr);
11708 11702 }
11709 11703
11710 11704 /*
11711 11705 * It is possible that nfs4_commit didn't return error but
11712 11706 * some other thread has modified the page we are going
11713 11707 * to free/destroy.
11714 11708 * In this case we need to rewrite the page. Do an explicit check
11715 11709 * before attempting to free/destroy the page. If modified, needs to
11716 11710 * be rewritten so unlock the page and return.
11717 11711 */
11718 11712 if (hat_ismod(pp)) {
11719 11713 pp->p_fsdata = C_NOCOMMIT;
11720 11714 page_unlock(pp);
11721 11715 return;
11722 11716 }
11723 11717
11724 11718 /*
11725 11719 * Now, as appropriate, either free or destroy the page
11726 11720 * that we were called with.
11727 11721 */
11728 11722 pp->p_fsdata = C_NOCOMMIT;
11729 11723 if (fl == B_FREE)
11730 11724 page_free(pp, dn);
11731 11725 else
11732 11726 page_destroy(pp, dn);
11733 11727 }
11734 11728
11735 11729 /*
11736 11730 * Commit requires that the current fh be the file written to.
11737 11731 * The compound op structure is:
11738 11732 * PUTFH(file), COMMIT
11739 11733 */
11740 11734 static int
11741 11735 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
11742 11736 {
11743 11737 COMPOUND4args_clnt args;
11744 11738 COMPOUND4res_clnt res;
11745 11739 COMMIT4res *cm_res;
11746 11740 nfs_argop4 argop[2];
11747 11741 nfs_resop4 *resop;
11748 11742 int doqueue;
11749 11743 mntinfo4_t *mi;
11750 11744 rnode4_t *rp;
11751 11745 cred_t *cred_otw = NULL;
11752 11746 bool_t needrecov = FALSE;
11753 11747 nfs4_recov_state_t recov_state;
11754 11748 nfs4_open_stream_t *osp = NULL;
11755 11749 bool_t first_time = TRUE; /* first time getting OTW cred */
11756 11750 bool_t last_time = FALSE; /* last time getting OTW cred */
11757 11751 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11758 11752
11759 11753 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11760 11754
11761 11755 rp = VTOR4(vp);
11762 11756
11763 11757 mi = VTOMI4(vp);
11764 11758 recov_state.rs_flags = 0;
11765 11759 recov_state.rs_num_retry_despite_err = 0;
11766 11760 get_commit_cred:
11767 11761 /*
11768 11762 * Releases the osp, if a valid open stream is provided.
11769 11763 * Puts a hold on the cred_otw and the new osp (if found).
11770 11764 */
11771 11765 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
11772 11766 &first_time, &last_time);
11773 11767 args.ctag = TAG_COMMIT;
11774 11768 recov_retry:
11775 11769 /*
11776 11770 * Commit ops: putfh file; commit
11777 11771 */
11778 11772 args.array_len = 2;
11779 11773 args.array = argop;
11780 11774
11781 11775 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11782 11776 &recov_state, NULL);
11783 11777 if (e.error) {
11784 11778 crfree(cred_otw);
11785 11779 if (osp != NULL)
11786 11780 open_stream_rele(osp, rp);
11787 11781 return (e.error);
11788 11782 }
11789 11783
11790 11784 	/* putfh file */
11791 11785 argop[0].argop = OP_CPUTFH;
11792 11786 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
11793 11787
11794 11788 /* commit */
11795 11789 argop[1].argop = OP_COMMIT;
11796 11790 argop[1].nfs_argop4_u.opcommit.offset = offset;
11797 11791 argop[1].nfs_argop4_u.opcommit.count = count;
11798 11792
11799 11793 doqueue = 1;
11800 11794 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);
11801 11795
11802 11796 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
11803 11797 if (!needrecov && e.error) {
11804 11798 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
11805 11799 needrecov);
11806 11800 crfree(cred_otw);
11807 11801 if (e.error == EACCES && last_time == FALSE)
11808 11802 goto get_commit_cred;
11809 11803 if (osp != NULL)
11810 11804 open_stream_rele(osp, rp);
11811 11805 return (e.error);
11812 11806 }
11813 11807
11814 11808 if (needrecov) {
11815 11809 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
11816 11810 NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
11817 11811 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11818 11812 &recov_state, needrecov);
11819 11813 if (!e.error)
11820 11814 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11821 11815 goto recov_retry;
11822 11816 }
11823 11817 if (e.error) {
11824 11818 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11825 11819 &recov_state, needrecov);
11826 11820 crfree(cred_otw);
11827 11821 if (osp != NULL)
11828 11822 open_stream_rele(osp, rp);
11829 11823 return (e.error);
11830 11824 }
11831 11825 /* fall through for res.status case */
11832 11826 }
11833 11827
11834 11828 if (res.status) {
11835 11829 e.error = geterrno4(res.status);
11836 11830 if (e.error == EACCES && last_time == FALSE) {
11837 11831 crfree(cred_otw);
11838 11832 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11839 11833 &recov_state, needrecov);
11840 11834 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11841 11835 goto get_commit_cred;
11842 11836 }
11843 11837 /*
11844 11838 * Can't do a nfs4_purge_stale_fh here because this
11845 11839 * can cause a deadlock. nfs4_commit can
11846 11840 * be called from nfs4_dispose which can be called
11847 11841 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh
11848 11842 * can call back to pvn_vplist_dirty.
11849 11843 */
11850 11844 if (e.error == ESTALE) {
11851 11845 mutex_enter(&rp->r_statelock);
11852 11846 rp->r_flags |= R4STALE;
11853 11847 if (!rp->r_error)
11854 11848 rp->r_error = e.error;
11855 11849 mutex_exit(&rp->r_statelock);
11856 11850 PURGE_ATTRCACHE4(vp);
11857 11851 } else {
11858 11852 mutex_enter(&rp->r_statelock);
11859 11853 if (!rp->r_error)
11860 11854 rp->r_error = e.error;
11861 11855 mutex_exit(&rp->r_statelock);
11862 11856 }
11863 11857 } else {
11864 11858 ASSERT(rp->r_flags & R4HAVEVERF);
11865 11859 resop = &res.array[1]; /* commit res */
11866 11860 cm_res = &resop->nfs_resop4_u.opcommit;
11867 11861 mutex_enter(&rp->r_statelock);
11868 11862 if (cm_res->writeverf == rp->r_writeverf) {
11869 11863 mutex_exit(&rp->r_statelock);
11870 11864 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11871 11865 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11872 11866 &recov_state, needrecov);
11873 11867 crfree(cred_otw);
11874 11868 if (osp != NULL)
11875 11869 open_stream_rele(osp, rp);
11876 11870 return (0);
11877 11871 }
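		/*
		 * The verifier changed, so the server may have lost the
		 * unstable data (e.g. across a reboot).  Redirty the
		 * pages with nfs4_set_mod(), remember the new verifier,
		 * and return NFS_VERF_MISMATCH so the caller rewrites
		 * and re-commits.
		 */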
11878 11872 nfs4_set_mod(vp);
11879 11873 rp->r_writeverf = cm_res->writeverf;
11880 11874 mutex_exit(&rp->r_statelock);
11881 11875 e.error = NFS_VERF_MISMATCH;
11882 11876 }
11883 11877
11884 11878 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11885 11879 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
11886 11880 crfree(cred_otw);
11887 11881 if (osp != NULL)
11888 11882 open_stream_rele(osp, rp);
11889 11883
11890 11884 return (e.error);
11891 11885 }
11892 11886
11893 11887 static void
11894 11888 nfs4_set_mod(vnode_t *vp)
11895 11889 {
11896 11890 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11897 11891
11898 11892 /* make sure we're looking at the master vnode, not a shadow */
11899 11893 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
11900 11894 }
11901 11895
11902 11896 /*
11903 11897 * This function is used to gather a page list of the pages which
11904 11898 * can be committed on the server.
11905 11899 *
11906 11900 * The calling thread must have set R4COMMIT. This bit is used to
11907 11901 * serialize access to the commit structure in the rnode. As long
11908 11902 * as the thread has set R4COMMIT, then it can manipulate the commit
11909 11903 * structure without requiring any other locks.
11910 11904 *
11911 11905 * When this function is called from nfs4_dispose() the page passed
11912 11906 * into nfs4_dispose() will be SE_EXCL locked, and so this function
11913 11907 * will skip it. This is not a problem since we initially add the
11914 11908 * page to the r_commit page list.
11915 11909 *
11916 11910 */
11917 11911 static void
11918 11912 nfs4_get_commit(vnode_t *vp)
11919 11913 {
11920 11914 rnode4_t *rp;
11921 11915 page_t *pp;
11922 11916 kmutex_t *vphm;
11923 11917
11924 11918 rp = VTOR4(vp);
11925 11919
11926 11920 ASSERT(rp->r_flags & R4COMMIT);
11927 11921
11928 11922 /* make sure we're looking at the master vnode, not a shadow */
11929 11923
11930 11924 if (IS_SHADOW(vp, rp))
11931 11925 vp = RTOV4(rp);
11932 11926
11933 11927 vphm = page_vnode_mutex(vp);
11934 11928 mutex_enter(vphm);
11935 11929
11936 11930 /*
11937 11931 * If there are no pages associated with this vnode, then
11938 11932 * just return.
11939 11933 */
11940 11934 if ((pp = vp->v_pages) == NULL) {
11941 11935 mutex_exit(vphm);
11942 11936 return;
11943 11937 }
11944 11938
11945 11939 /*
11946 11940 * Step through all of the pages associated with this vnode
11947 11941 * looking for pages which need to be committed.
11948 11942 */
11949 11943 do {
11950 11944 /* Skip marker pages. */
11951 11945 if (pp->p_hash == PVN_VPLIST_HASH_TAG)
11952 11946 continue;
11953 11947
11954 11948 /*
11955 11949 * First short-cut everything (without the page_lock)
11956 11950 * and see if this page does not need to be committed
11957 11951  * or is modified; if so, we just skip it.
11958 11952 */
11959 11953 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
11960 11954 continue;
11961 11955
11962 11956 /*
11963 11957 * Attempt to lock the page. If we can't, then
11964 11958 * someone else is messing with it or we have been
11965 11959 * called from nfs4_dispose and this is the page that
11966 11960  * nfs4_dispose was called with. Either way, just skip it.
11967 11961 */
11968 11962 if (!page_trylock(pp, SE_EXCL))
11969 11963 continue;
11970 11964
11971 11965 /*
11972 11966  * Let's check again now that we have the page lock.
11973 11967 */
11974 11968 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11975 11969 page_unlock(pp);
11976 11970 continue;
11977 11971 }
11978 11972
11979 11973 /* this had better not be a free page */
11980 11974 ASSERT(PP_ISFREE(pp) == 0);
11981 11975
11982 11976 /*
11983 11977 * The page needs to be committed and we locked it.
11984 11978 * Update the base and length parameters and add it
11985 11979 * to r_pages.
11986 11980 */
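		/*
		 * The resulting range is the union of the current range
		 * and this page: a page below c_commbase extends the
		 * range downward, a page at or past the current end
		 * extends it upward, and a page already inside the
		 * range leaves it unchanged.
		 */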
11987 11981 if (rp->r_commit.c_pages == NULL) {
11988 11982 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11989 11983 rp->r_commit.c_commlen = PAGESIZE;
11990 11984 } else if (pp->p_offset < rp->r_commit.c_commbase) {
11991 11985 rp->r_commit.c_commlen = rp->r_commit.c_commbase -
11992 11986 (offset3)pp->p_offset + rp->r_commit.c_commlen;
11993 11987 rp->r_commit.c_commbase = (offset3)pp->p_offset;
11994 11988 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
11995 11989 <= pp->p_offset) {
11996 11990 rp->r_commit.c_commlen = (offset3)pp->p_offset -
11997 11991 rp->r_commit.c_commbase + PAGESIZE;
11998 11992 }
11999 11993 page_add(&rp->r_commit.c_pages, pp);
12000 11994 } while ((pp = pp->p_vpnext) != vp->v_pages);
12001 11995
12002 11996 mutex_exit(vphm);
12003 11997 }
12004 11998
12005 11999 /*
12006 12000 * This routine is used to gather together a page list of the pages
12007 12001 * which are to be committed on the server. This routine must not
12008 12002 * be called if the calling thread holds any locked pages.
12009 12003 *
12010 12004 * The calling thread must have set R4COMMIT. This bit is used to
12011 12005 * serialize access to the commit structure in the rnode. As long
12012 12006 * as the thread has set R4COMMIT, then it can manipulate the commit
12013 12007 * structure without requiring any other locks.
12014 12008 */
12015 12009 static void
12016 12010 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
12017 12011 {
12018 12012
12019 12013 rnode4_t *rp;
12020 12014 page_t *pp;
12021 12015 u_offset_t end;
12022 12016 u_offset_t off;
12023 12017 ASSERT(len != 0);
12024 12018 rp = VTOR4(vp);
12025 12019 ASSERT(rp->r_flags & R4COMMIT);
12026 12020
12027 12021 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12028 12022
12029 12023 /* make sure we're looking at the master vnode, not a shadow */
12030 12024
12031 12025 if (IS_SHADOW(vp, rp))
12032 12026 vp = RTOV4(rp);
12033 12027
12034 12028 /*
12035 12029 * If there are no pages associated with this vnode, then
12036 12030 * just return.
12037 12031 */
12038 12032 if ((pp = vp->v_pages) == NULL)
12039 12033 return;
12040 12034 /*
12041 12035 * Calculate the ending offset.
12042 12036 */
12043 12037 end = soff + len;
12044 12038 for (off = soff; off < end; off += PAGESIZE) {
12045 12039 /*
12046 12040 * Lookup each page by vp, offset.
12047 12041 */
12048 12042 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
12049 12043 continue;
12050 12044 /*
12051 12045 * If this page does not need to be committed or is
12052 12046 * modified, then just skip it.
12053 12047 */
12054 12048 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
12055 12049 page_unlock(pp);
12056 12050 continue;
12057 12051 }
12058 12052
12059 12053 ASSERT(PP_ISFREE(pp) == 0);
12060 12054 /*
12061 12055 * The page needs to be committed and we locked it.
12062 12056 * Update the base and length parameters and add it
12063 12057 * to r_pages.
12064 12058 */
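		/*
		 * The loop visits offsets in increasing order, so the
		 * first page found sets c_commbase and every later page
		 * only needs to extend the range upward.
		 */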
12065 12059 if (rp->r_commit.c_pages == NULL) {
12066 12060 rp->r_commit.c_commbase = (offset3)pp->p_offset;
12067 12061 rp->r_commit.c_commlen = PAGESIZE;
12068 12062 } else {
12069 12063 rp->r_commit.c_commlen = (offset3)pp->p_offset -
12070 12064 rp->r_commit.c_commbase + PAGESIZE;
12071 12065 }
12072 12066 page_add(&rp->r_commit.c_pages, pp);
12073 12067 }
12074 12068 }
12075 12069
12076 12070 /*
12077 12071 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12078 12072 * Flushes and commits data to the server.
12079 12073 */
12080 12074 static int
12081 12075 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
12082 12076 {
12083 12077 int error;
12084 12078 verifier4 write_verf;
12085 12079 rnode4_t *rp = VTOR4(vp);
12086 12080
12087 12081 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12088 12082
12089 12083 /*
12090 12084 * Flush the data portion of the file and then commit any
12091 12085 * portions which need to be committed. This may need to
12092 12086 * be done twice if the server has changed state since
12093 12087 * data was last written. The data will need to be
12094 12088 * rewritten to the server and then a new commit done.
12095 12089 *
12096 12090 * In fact, this may need to be done several times if the
12097 12091 * server is having problems and crashing while we are
12098 12092 * attempting to do this.
12099 12093 */
12100 12094
12101 12095 top:
12102 12096 /*
12103 12097 * Do a flush based on the poff and plen arguments. This
12104 12098 * will synchronously write out any modified pages in the
12105 12099 * range specified by (poff, plen). This starts all of the
12106 12100 * i/o operations which will be waited for in the next
12107 12101 * call to nfs4_putpage
12108 12102 */
12109 12103
12110 12104 mutex_enter(&rp->r_statelock);
12111 12105 write_verf = rp->r_writeverf;
12112 12106 mutex_exit(&rp->r_statelock);
12113 12107
12114 12108 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
12115 12109 if (error == EAGAIN)
12116 12110 error = 0;
12117 12111
12118 12112 /*
12119 12113 * Do a flush based on the poff and plen arguments. This
12120 12114 * will synchronously write out any modified pages in the
12121 12115 * range specified by (poff, plen) and wait until all of
12122 12116 * the asynchronous i/o's in that range are done as well.
12123 12117 */
12124 12118 if (!error)
12125 12119 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);
12126 12120
12127 12121 if (error)
12128 12122 return (error);
12129 12123
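	/*
	 * If the server's write verifier changed while the flush was in
	 * progress, the server may have discarded the unstable data
	 * (e.g. it rebooted), so the pages must be written again: start
	 * over from the top.
	 */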
12130 12124 mutex_enter(&rp->r_statelock);
12131 12125 if (rp->r_writeverf != write_verf) {
12132 12126 mutex_exit(&rp->r_statelock);
12133 12127 goto top;
12134 12128 }
12135 12129 mutex_exit(&rp->r_statelock);
12136 12130
12137 12131 /*
12138 12132 * Now commit any pages which might need to be committed.
12139 12133 * If the error, NFS_VERF_MISMATCH, is returned, then
12140 12134 * start over with the flush operation.
12141 12135 */
12142 12136 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);
12143 12137
12144 12138 if (error == NFS_VERF_MISMATCH)
12145 12139 goto top;
12146 12140
12147 12141 return (error);
12148 12142 }
12149 12143
12150 12144 /*
12151 12145 * nfs4_commit_vp() will wait for other pending commits and
12152 12146  * will either commit the whole file or a range; plen dictates
12153 12147  * which: a value of zero indicates the whole file. Called from
12154 12148  * nfs4_putpage_commit() or nfs4_sync_putapage().
12155 12149 */
12156 12150 static int
12157 12151 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
12158 12152 cred_t *cr, int wait_on_writes)
12159 12153 {
12160 12154 rnode4_t *rp;
12161 12155 page_t *plist;
12162 12156 offset3 offset;
12163 12157 count3 len;
12164 12158
12165 12159 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12166 12160
12167 12161 rp = VTOR4(vp);
12168 12162
12169 12163 /*
12170 12164  * Before we gather committable pages, make
12171 12165  * sure there are no outstanding async writes.
12172 12166 */
12173 12167 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
12174 12168 mutex_enter(&rp->r_statelock);
12175 12169 while (rp->r_count > 0) {
12176 12170 cv_wait(&rp->r_cv, &rp->r_statelock);
12177 12171 }
12178 12172 mutex_exit(&rp->r_statelock);
12179 12173 }
12180 12174
12181 12175 /*
12182 12176 * Set the `commit inprogress' state bit. We must
12183 12177 * first wait until any current one finishes.
12184 12178 */
12185 12179 mutex_enter(&rp->r_statelock);
12186 12180 while (rp->r_flags & R4COMMIT) {
12187 12181 rp->r_flags |= R4COMMITWAIT;
12188 12182 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
12189 12183 rp->r_flags &= ~R4COMMITWAIT;
12190 12184 }
12191 12185 rp->r_flags |= R4COMMIT;
12192 12186 mutex_exit(&rp->r_statelock);
12193 12187
12194 12188 /*
12195 12189 * Gather all of the pages which need to be
12196 12190 * committed.
12197 12191 */
12198 12192 if (plen == 0)
12199 12193 nfs4_get_commit(vp);
12200 12194 else
12201 12195 nfs4_get_commit_range(vp, poff, plen);
12202 12196
12203 12197 /*
12204 12198 * Clear the `commit inprogress' bit and disconnect the
12205 12199 * page list which was gathered by nfs4_get_commit.
12206 12200 */
12207 12201 plist = rp->r_commit.c_pages;
12208 12202 rp->r_commit.c_pages = NULL;
12209 12203 offset = rp->r_commit.c_commbase;
12210 12204 len = rp->r_commit.c_commlen;
12211 12205 mutex_enter(&rp->r_statelock);
12212 12206 rp->r_flags &= ~R4COMMIT;
12213 12207 cv_broadcast(&rp->r_commit.c_cv);
12214 12208 mutex_exit(&rp->r_statelock);
12215 12209
12216 12210 /*
12217 12211 * If any pages need to be committed, commit them and
12218 12212 * then unlock them so that they can be freed some
12219 12213 * time later.
12220 12214 */
12221 12215 if (plist == NULL)
12222 12216 return (0);
12223 12217
12224 12218 /*
12225 12219 * No error occurred during the flush portion
12226 12220 * of this operation, so now attempt to commit
12227 12221 * the data to stable storage on the server.
12228 12222 *
12229 12223 * This will unlock all of the pages on the list.
12230 12224 */
12231 12225 return (nfs4_sync_commit(vp, plist, offset, len, cr));
12232 12226 }
12233 12227
12234 12228 static int
12235 12229 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12236 12230 cred_t *cr)
12237 12231 {
12238 12232 int error;
12239 12233 page_t *pp;
12240 12234
12241 12235 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12242 12236
12243 12237 error = nfs4_commit(vp, (offset4)offset, (count4)count, cr);
12244 12238
12245 12239 /*
12246 12240 * If we got an error, then just unlock all of the pages
12247 12241 * on the list.
12248 12242 */
12249 12243 if (error) {
12250 12244 while (plist != NULL) {
12251 12245 pp = plist;
12252 12246 page_sub(&plist, pp);
12253 12247 page_unlock(pp);
12254 12248 }
12255 12249 return (error);
12256 12250 }
12257 12251 /*
12258 12252 * We've tried as hard as we can to commit the data to stable
12259 12253 * storage on the server. We just unlock the pages and clear
12260 12254 * the commit required state. They will get freed later.
12261 12255 */
12262 12256 while (plist != NULL) {
12263 12257 pp = plist;
12264 12258 page_sub(&plist, pp);
12265 12259 pp->p_fsdata = C_NOCOMMIT;
12266 12260 page_unlock(pp);
12267 12261 }
12268 12262
12269 12263 return (error);
12270 12264 }
12271 12265
12272 12266 static void
12273 12267 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12274 12268 cred_t *cr)
12275 12269 {
12276 12270
12277 12271 (void) nfs4_sync_commit(vp, plist, offset, count, cr);
12278 12272 }
12279 12273
12280 12274 /*ARGSUSED*/
12281 12275 static int
12282 12276 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12283 12277 caller_context_t *ct)
12284 12278 {
12285 12279 int error = 0;
12286 12280 mntinfo4_t *mi;
12287 12281 vattr_t va;
12288 12282 vsecattr_t nfsace4_vsap;
12289 12283
12290 12284 mi = VTOMI4(vp);
12291 12285 if (nfs_zone() != mi->mi_zone)
12292 12286 return (EIO);
12293 12287 if (mi->mi_flags & MI4_ACL) {
12294 12288 /* if we have a delegation, return it */
12295 12289 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
12296 12290 (void) nfs4delegreturn(VTOR4(vp),
12297 12291 NFS4_DR_REOPEN|NFS4_DR_PUSH);
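		/*
		 * The delegation is returned before the ACL is set;
		 * NFS4_DR_PUSH pushes any dirty pages to the server
		 * first and NFS4_DR_REOPEN reestablishes the open
		 * streams afterwards.
		 */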
12298 12292
12299 12293 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
12300 12294 NFS4_ACL_SET);
12301 12295 if (error) /* EINVAL */
12302 12296 return (error);
12303 12297
12304 12298 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
12305 12299 /*
12306 12300 * These are aclent_t type entries.
12307 12301 */
12308 12302 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
12309 12303 vp->v_type == VDIR, FALSE);
12310 12304 if (error)
12311 12305 return (error);
12312 12306 } else {
12313 12307 /*
12314 12308 * These are ace_t type entries.
12315 12309 */
12316 12310 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12317 12311 FALSE);
12318 12312 if (error)
12319 12313 return (error);
12320 12314 }
12321 12315 bzero(&va, sizeof (va));
12322 12316 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12323 12317 vs_ace4_destroy(&nfsace4_vsap);
12324 12318 return (error);
12325 12319 }
12326 12320 return (ENOSYS);
12327 12321 }
12328 12322
12329 12323 /* ARGSUSED */
12330 12324 int
12331 12325 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12332 12326 caller_context_t *ct)
12333 12327 {
12334 12328 int error;
12335 12329 mntinfo4_t *mi;
12336 12330 nfs4_ga_res_t gar;
12337 12331 rnode4_t *rp = VTOR4(vp);
12338 12332
12339 12333 mi = VTOMI4(vp);
12340 12334 if (nfs_zone() != mi->mi_zone)
12341 12335 return (EIO);
12342 12336
12343 12337 bzero(&gar, sizeof (gar));
12344 12338 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;
12345 12339
12346 12340 /*
12347 12341 * vsecattr->vsa_mask holds the original acl request mask.
12348 12342 * This is needed when determining what to return.
12349 12343 * (See: nfs4_create_getsecattr_return())
12350 12344 */
12351 12345 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
12352 12346 if (error) /* EINVAL */
12353 12347 return (error);
12354 12348
12355 12349 /*
12356 12350 * If this is a referral stub, don't try to go OTW for an ACL
12357 12351 */
12358 12352 if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
12359 12353 return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
12360 12354
12361 12355 if (mi->mi_flags & MI4_ACL) {
12362 12356 /*
12363 12357 * Check if the data is cached and the cache is valid. If it
12364 12358 * is we don't go over the wire.
12365 12359 */
12366 12360 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
12367 12361 mutex_enter(&rp->r_statelock);
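			/*
			 * r_secattr was sampled above without the lock
			 * as a cheap first check; re-verify it now that
			 * r_statelock is held.
			 */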
12368 12362 if (rp->r_secattr != NULL) {
12369 12363 error = nfs4_create_getsecattr_return(
12370 12364 rp->r_secattr, vsecattr, rp->r_attr.va_uid,
12371 12365 rp->r_attr.va_gid,
12372 12366 vp->v_type == VDIR);
12373 12367 if (!error) { /* error == 0 - Success! */
12374 12368 mutex_exit(&rp->r_statelock);
12375 12369 return (error);
12376 12370 }
12377 12371 }
12378 12372 mutex_exit(&rp->r_statelock);
12379 12373 }
12380 12374
12381 12375 /*
12382 12376 * The getattr otw call will always get both the acl, in
12383 12377 * the form of a list of nfsace4's, and the number of acl
12384 12378  * entries, independent of the value of gar.n4g_va.va_mask.
12385 12379 */
12386 12380 error = nfs4_getattr_otw(vp, &gar, cr, 1);
12387 12381 if (error) {
12388 12382 vs_ace4_destroy(&gar.n4g_vsa);
12389 12383 if (error == ENOTSUP || error == EOPNOTSUPP)
12390 12384 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12391 12385 return (error);
12392 12386 }
12393 12387
12394 12388 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
12395 12389 /*
12396 12390 * No error was returned, but according to the response
12397 12391 * bitmap, neither was an acl.
12398 12392 */
12399 12393 vs_ace4_destroy(&gar.n4g_vsa);
12400 12394 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12401 12395 return (error);
12402 12396 }
12403 12397
12404 12398 /*
12405 12399 * Update the cache with the ACL.
12406 12400 */
12407 12401 nfs4_acl_fill_cache(rp, &gar.n4g_vsa);
12408 12402
12409 12403 error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
12410 12404 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
12411 12405 vp->v_type == VDIR);
12412 12406 vs_ace4_destroy(&gar.n4g_vsa);
12413 12407 if ((error) && (vsecattr->vsa_mask &
12414 12408 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
12415 12409 (error != EACCES)) {
12416 12410 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12417 12411 }
12418 12412 return (error);
12419 12413 }
12420 12414 error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12421 12415 return (error);
12422 12416 }
12423 12417
12424 12418 /*
12425 12419 * The function returns:
12426 12420 * - 0 (zero) if the passed in "acl_mask" is a valid request.
12427 12421 * - EINVAL if the passed in "acl_mask" is an invalid request.
12428 12422 *
12429 12423 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12430 12424 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12431 12425 *
12432 12426 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12433 12427 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12434 12428 * - We have a count field set without the corresponding acl field set. (e.g. -
12435 12429 * VSA_ACECNT is set, but VSA_ACE is not)
12436 12430 */
12437 12431 static int
12438 12432 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12439 12433 {
12440 12434 /* Shortcut the masks that are always valid. */
12441 12435 if (acl_mask == (VSA_ACE | VSA_ACECNT))
12442 12436 return (0);
12443 12437 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12444 12438 return (0);
12445 12439
12446 12440 if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12447 12441 /*
12448 12442 * We can't have any VSA_ACL type stuff in the mask now.
12449 12443 */
12450 12444 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12451 12445 VSA_DFACLCNT))
12452 12446 return (EINVAL);
12453 12447
12454 12448 if (op == NFS4_ACL_SET) {
12455 12449 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12456 12450 return (EINVAL);
12457 12451 }
12458 12452 }
12459 12453
12460 12454 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12461 12455 /*
12462 12456 * We can't have any VSA_ACE type stuff in the mask now.
12463 12457 */
12464 12458 if (acl_mask & (VSA_ACE | VSA_ACECNT))
12465 12459 return (EINVAL);
12466 12460
12467 12461 if (op == NFS4_ACL_SET) {
12468 12462 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12469 12463 return (EINVAL);
12470 12464
12471 12465 if ((acl_mask & VSA_DFACLCNT) &&
12472 12466 !(acl_mask & VSA_DFACL))
12473 12467 return (EINVAL);
12474 12468 }
12475 12469 }
12476 12470 return (0);
12477 12471 }
12478 12472
12479 12473 /*
12480 12474 * The theory behind creating the correct getsecattr return is simply this:
12481 12475 * "Don't return anything that the caller is not expecting to have to free."
12482 12476 */
12483 12477 static int
12484 12478 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12485 12479 uid_t uid, gid_t gid, int isdir)
12486 12480 {
12487 12481 int error = 0;
12488 12482 /* Save the mask since the translators modify it. */
12489 12483 uint_t orig_mask = vsap->vsa_mask;
12490 12484
12491 12485 if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12492 12486 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12493 12487
12494 12488 if (error)
12495 12489 return (error);
12496 12490
12497 12491 /*
12498 12492 * If the caller only asked for the ace count (VSA_ACECNT)
12499 12493 * don't give them the full acl (VSA_ACE), free it.
12500 12494 */
12501 12495 if (!(orig_mask & VSA_ACE)) {
12502 12496 if (vsap->vsa_aclentp != NULL) {
12503 12497 kmem_free(vsap->vsa_aclentp,
12504 12498 vsap->vsa_aclcnt * sizeof (ace_t));
12505 12499 vsap->vsa_aclentp = NULL;
12506 12500 }
12507 12501 }
12508 12502 vsap->vsa_mask = orig_mask;
12509 12503
12510 12504 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12511 12505 VSA_DFACLCNT)) {
12512 12506 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12513 12507 isdir, FALSE);
12514 12508
12515 12509 if (error)
12516 12510 return (error);
12517 12511
12518 12512 /*
12519 12513 * If the caller only asked for the acl count (VSA_ACLCNT)
12520 12514 * and/or the default acl count (VSA_DFACLCNT) don't give them
12521 12515 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12522 12516 */
12523 12517 if (!(orig_mask & VSA_ACL)) {
12524 12518 if (vsap->vsa_aclentp != NULL) {
12525 12519 kmem_free(vsap->vsa_aclentp,
12526 12520 vsap->vsa_aclcnt * sizeof (aclent_t));
12527 12521 vsap->vsa_aclentp = NULL;
12528 12522 }
12529 12523 }
12530 12524
12531 12525 if (!(orig_mask & VSA_DFACL)) {
12532 12526 if (vsap->vsa_dfaclentp != NULL) {
12533 12527 kmem_free(vsap->vsa_dfaclentp,
12534 12528 vsap->vsa_dfaclcnt * sizeof (aclent_t));
12535 12529 vsap->vsa_dfaclentp = NULL;
12536 12530 }
12537 12531 }
12538 12532 vsap->vsa_mask = orig_mask;
12539 12533 }
12540 12534 return (0);
12541 12535 }
12542 12536
12543 12537 /* ARGSUSED */
12544 12538 int
12545 12539 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
12546 12540 caller_context_t *ct)
12547 12541 {
12548 12542 int error;
12549 12543
12550 12544 if (nfs_zone() != VTOMI4(vp)->mi_zone)
12551 12545 return (EIO);
12552 12546 /*
12553 12547 * check for valid cmd parameter
12554 12548 */
12555 12549 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
12556 12550 return (EINVAL);
12557 12551
12558 12552 /*
12559 12553 * Check access permissions
12560 12554 */
12561 12555 if ((cmd & F_SHARE) &&
12562 12556 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
12563 12557 (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
12564 12558 return (EBADF);
12565 12559
12566 12560 /*
12567 12561 * If the filesystem is mounted using local locking, pass the
12568 12562 * request off to the local share code.
12569 12563 */
12570 12564 if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
12571 12565 return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
12572 12566
12573 12567 switch (cmd) {
12574 12568 case F_SHARE:
12575 12569 case F_UNSHARE:
12576 12570 /*
12577 12571 * This will be properly implemented later,
12578 12572  * see RFE: 4823948.
12579 12573 */
12580 12574 error = EAGAIN;
12581 12575 break;
12582 12576
12583 12577 case F_HASREMOTELOCKS:
12584 12578 /*
12585 12579 * NFS client can't store remote locks itself
12586 12580 */
12587 12581 shr->s_access = 0;
12588 12582 error = 0;
12589 12583 break;
12590 12584
12591 12585 default:
12592 12586 error = EINVAL;
12593 12587 break;
12594 12588 }
12595 12589
12596 12590 return (error);
12597 12591 }
12598 12592
12599 12593 /*
12600 12594 * Common code called by directory ops to update the attrcache
12601 12595 */
12602 12596 static int
12603 12597 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12604 12598 hrtime_t t, vnode_t *vp, cred_t *cr)
12605 12599 {
12606 12600 int error = 0;
12607 12601
12608 12602 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12609 12603
12610 12604 if (status != NFS4_OK) {
12611 12605 /* getattr not done or failed */
12612 12606 PURGE_ATTRCACHE4(vp);
12613 12607 return (error);
12614 12608 }
12615 12609
12616 12610 if (garp) {
12617 12611 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12618 12612 } else {
12619 12613 PURGE_ATTRCACHE4(vp);
12620 12614 }
12621 12615 return (error);
12622 12616 }
12623 12617
12624 12618 /*
12625 12619 * Update directory caches for directory modification ops (link, rename, etc.)
12626 12620 * When dinfo is NULL, manage dircaches in the old way.
12627 12621 */
12628 12622 static void
12629 12623 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
12630 12624 dirattr_info_t *dinfo)
12631 12625 {
12632 12626 rnode4_t *drp = VTOR4(dvp);
12633 12627
12634 12628 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
12635 12629
12636 12630 /* Purge rddir cache for dir since it changed */
12637 12631 if (drp->r_dir != NULL)
12638 12632 nfs4_purge_rddir_cache(dvp);
12639 12633
12640 12634 /*
12641 12635 * If caller provided dinfo, then use it to manage dir caches.
12642 12636 */
12643 12637 if (dinfo != NULL) {
12644 12638 if (vp != NULL) {
12645 12639 mutex_enter(&VTOR4(vp)->r_statev4_lock);
12646 12640 if (!VTOR4(vp)->created_v4) {
12647 12641 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12648 12642 dnlc_update(dvp, nm, vp);
12649 12643 } else {
12650 12644 /*
12651 12645 * XXX don't update if the created_v4 flag is
12652 12646 * set
12653 12647 */
12654 12648 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12655 12649 NFS4_DEBUG(nfs4_client_state_debug,
12656 12650 (CE_NOTE, "nfs4_update_dircaches: "
12657 12651 "don't update dnlc: created_v4 flag"));
12658 12652 }
12659 12653 }
12660 12654
12661 12655 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
12662 12656 dinfo->di_cred, FALSE, cinfo);
12663 12657
12664 12658 return;
12665 12659 }
12666 12660
12667 12661 /*
12668 12662  * Caller didn't provide dinfo, so check change_info4 to update DNLC.
12669 12663 * Since caller modified dir but didn't receive post-dirmod-op dir
12670 12664 * attrs, the dir's attrs must be purged.
12671 12665 *
12672 12666 * XXX this check and dnlc update/purge should really be atomic,
12673 12667 * XXX but can't use rnode statelock because it'll deadlock in
12674 12668 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
12675 12669 * XXX does occur.
12676 12670 *
12677 12671 * XXX We also may want to check that atomic is true in the
12678 12672 * XXX change_info struct. If it is not, the change_info may
12679 12673  * XXX reflect changes by more than one client, which means that
12680 12674 * XXX our cache may not be valid.
12681 12675 */
12682 12676 PURGE_ATTRCACHE4(dvp);
12683 12677 if (drp->r_change == cinfo->before) {
12684 12678 /* no changes took place in the directory prior to our link */
12685 12679 if (vp != NULL) {
12686 12680 mutex_enter(&VTOR4(vp)->r_statev4_lock);
12687 12681 if (!VTOR4(vp)->created_v4) {
12688 12682 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12689 12683 dnlc_update(dvp, nm, vp);
12690 12684 } else {
12691 12685 /*
12692 12686  * XXX don't update if the created_v4 flag
12693 12687 * is set
12694 12688 */
12695 12689 mutex_exit(&VTOR4(vp)->r_statev4_lock);
12696 12690 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
12697 12691 "nfs4_update_dircaches: don't"
12698 12692 " update dnlc: created_v4 flag"));
12699 12693 }
12700 12694 }
12701 12695 } else {
12702 12696 /* Another client modified directory - purge its dnlc cache */
12703 12697 dnlc_purge_vp(dvp);
12704 12698 }
12705 12699 }
12706 12700
12707 12701 /*
12708 12702 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12709 12703 * file.
12710 12704 *
12711 12705 * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12712 12706 * file (ie: client recovery) and otherwise set to FALSE.
12713 12707 *
12714 12708 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12715 12709 * initiated) calling functions.
12716 12710 *
12717 12711  * 'resend' is set to TRUE if this is an OPEN_CONFIRM issued as a result
12718 12712 * of resending a 'lost' open request.
12719 12713 *
12720 12714 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12721 12715 * server that hands out BAD_SEQID on open confirm.
12722 12716 *
12723 12717 * Errors are returned via the nfs4_error_t parameter.
12724 12718 */
12725 12719 void
12726 12720 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
12727 12721 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
12728 12722 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
12729 12723 {
12730 12724 COMPOUND4args_clnt args;
12731 12725 COMPOUND4res_clnt res;
12732 12726 nfs_argop4 argop[2];
12733 12727 nfs_resop4 *resop;
12734 12728 int doqueue = 1;
12735 12729 mntinfo4_t *mi;
12736 12730 OPEN_CONFIRM4args *open_confirm_args;
12737 12731 int needrecov;
12738 12732
12739 12733 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12740 12734 #if DEBUG
12741 12735 mutex_enter(&oop->oo_lock);
12742 12736 ASSERT(oop->oo_seqid_inuse);
12743 12737 mutex_exit(&oop->oo_lock);
12744 12738 #endif
12745 12739
12746 12740 recov_retry_confirm:
12747 12741 nfs4_error_zinit(ep);
12748 12742 *retry_open = FALSE;
12749 12743
12750 12744 if (resend)
12751 12745 args.ctag = TAG_OPEN_CONFIRM_LOST;
12752 12746 else
12753 12747 args.ctag = TAG_OPEN_CONFIRM;
12754 12748
12755 12749 args.array_len = 2;
12756 12750 args.array = argop;
12757 12751
12758 12752 /* putfh target fh */
12759 12753 argop[0].argop = OP_CPUTFH;
12760 12754 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
12761 12755
12762 12756 argop[1].argop = OP_OPEN_CONFIRM;
12763 12757 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;
12764 12758
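	/*
	 * OPEN_CONFIRM consumes an open-owner sequence id, so advance
	 * the caller's seqid before building the request.
	 */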
12765 12759 (*seqid) += 1;
12766 12760 open_confirm_args->seqid = *seqid;
12767 12761 open_confirm_args->open_stateid = *stateid;
12768 12762
12769 12763 mi = VTOMI4(vp);
12770 12764
12771 12765 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
12772 12766
12773 12767 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
12774 12768 nfs4_set_open_seqid((*seqid), oop, args.ctag);
12775 12769 }
12776 12770
12777 12771 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
12778 12772 if (!needrecov && ep->error)
12779 12773 return;
12780 12774
12781 12775 if (needrecov) {
12782 12776 bool_t abort = FALSE;
12783 12777
12784 12778 if (reopening_file == FALSE) {
12785 12779 nfs4_bseqid_entry_t *bsep = NULL;
12786 12780
12787 12781 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
12788 12782 bsep = nfs4_create_bseqid_entry(oop, NULL,
12789 12783 vp, 0, args.ctag,
12790 12784 open_confirm_args->seqid);
12791 12785
12792 12786 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
12793 12787 NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
12794 12788 if (bsep) {
12795 12789 kmem_free(bsep, sizeof (*bsep));
12796 12790 if (num_bseqid_retryp &&
12797 12791 --(*num_bseqid_retryp) == 0)
12798 12792 abort = TRUE;
12799 12793 }
12800 12794 }
12801 12795 if ((ep->error == ETIMEDOUT ||
12802 12796 res.status == NFS4ERR_RESOURCE) &&
12803 12797 abort == FALSE && resend == FALSE) {
12804 12798 if (!ep->error)
12805 12799 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12806 12800
12807 12801 delay(SEC_TO_TICK(confirm_retry_sec));
12808 12802 goto recov_retry_confirm;
12809 12803 }
12810 12804 /* State may have changed so retry the entire OPEN op */
12811 12805 if (abort == FALSE)
12812 12806 *retry_open = TRUE;
12813 12807 else
12814 12808 *retry_open = FALSE;
12815 12809 if (!ep->error)
12816 12810 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12817 12811 return;
12818 12812 }
12819 12813
12820 12814 if (res.status) {
12821 12815 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12822 12816 return;
12823 12817 }
12824 12818
12825 12819 resop = &res.array[1]; /* open confirm res */
12826 12820 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
12827 12821 stateid, sizeof (*stateid));
12828 12822
12829 12823 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12830 12824 }
12831 12825
12832 12826 /*
12833 12827 * Return the credentials associated with a client state object. The
12834 12828 * caller is responsible for freeing the credentials.
12835 12829 */
12836 12830
12837 12831 static cred_t *
12838 12832 state_to_cred(nfs4_open_stream_t *osp)
12839 12833 {
12840 12834 cred_t *cr;
12841 12835
12842 12836 /*
12843 12837 * It's ok to not lock the open stream and open owner to get
12844 12838 * the oo_cred since this is only written once (upon creation)
12845 12839 * and will not change.
12846 12840 */
12847 12841 cr = osp->os_open_owner->oo_cred;
12848 12842 crhold(cr);
12849 12843
12850 12844 return (cr);
12851 12845 }
12852 12846
12853 12847 /*
12854 12848 * nfs4_find_sysid
12855 12849 *
12856 12850 * Find the sysid for the knetconfig associated with the given mi.
12857 12851 */
12858 12852 static struct lm_sysid *
12859 12853 nfs4_find_sysid(mntinfo4_t *mi)
12860 12854 {
12861 12855 ASSERT(nfs_zone() == mi->mi_zone);
12862 12856
12863 12857 /*
12864 12858 * Switch from RDMA knconf to original mount knconf
12865 12859 */
12866 12860 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
12867 12861 mi->mi_curr_serv->sv_hostname, NULL));
12868 12862 }
12869 12863
12870 12864 #ifdef DEBUG
12871 12865 /*
12872 12866 * Return a string version of the call type for easy reading.
12873 12867 */
12874 12868 static char *
12875 12869 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
12876 12870 {
12877 12871 switch (ctype) {
12878 12872 case NFS4_LCK_CTYPE_NORM:
12879 12873 return ("NORMAL");
12880 12874 case NFS4_LCK_CTYPE_RECLAIM:
12881 12875 return ("RECLAIM");
12882 12876 case NFS4_LCK_CTYPE_RESEND:
12883 12877 return ("RESEND");
12884 12878 case NFS4_LCK_CTYPE_REINSTATE:
12885 12879 return ("REINSTATE");
12886 12880 default:
12887 12881 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
12888 12882 "type %d", ctype);
12889 12883 return ("");
12890 12884 }
12891 12885 }
12892 12886 #endif
12893 12887
12894 12888 /*
12895 12889 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12896 12890 * Unlock requests don't have an over-the-wire locktype, so we just return
12897 12891 * something non-threatening.
12898 12892 */
12899 12893
12900 12894 static nfs_lock_type4
12901 12895 flk_to_locktype(int cmd, int l_type)
12902 12896 {
12903 12897 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12904 12898
12905 12899 switch (l_type) {
12906 12900 case F_UNLCK:
12907 12901 return (READ_LT);
12908 12902 case F_RDLCK:
12909 12903 if (cmd == F_SETLK)
12910 12904 return (READ_LT);
12911 12905 else
12912 12906 return (READW_LT);
12913 12907 case F_WRLCK:
12914 12908 if (cmd == F_SETLK)
12915 12909 return (WRITE_LT);
12916 12910 else
12917 12911 return (WRITEW_LT);
12918 12912 }
12919 12913 panic("flk_to_locktype");
12920 12914 /*NOTREACHED*/
12921 12915 }
12922 12916
12923 12917 /*
12924 12918 * Do some preliminary checks for nfs4frlock.
12925 12919 */
12926 12920 static int
12927 12921 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12928 12922 u_offset_t offset)
12929 12923 {
12930 12924 int error = 0;
12931 12925
12932 12926 /*
12933 12927 * If we are setting a lock, check that the file is opened
12934 12928 * with the correct mode.
12935 12929 */
12936 12930 if (cmd == F_SETLK || cmd == F_SETLKW) {
12937 12931 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12938 12932 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12939 12933 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12940 12934 "nfs4frlock_validate_args: file was opened with "
12941 12935 "incorrect mode"));
12942 12936 return (EBADF);
12943 12937 }
12944 12938 }
12945 12939
12946 12940 /* Convert the offset. It may need to be restored before returning. */
12947 12941 if (error = convoff(vp, flk, 0, offset)) {
12948 12942 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12949 12943 "nfs4frlock_validate_args: convoff => error= %d\n",
12950 12944 error));
12951 12945 return (error);
12952 12946 }
12953 12947
12954 12948 return (error);
12955 12949 }
12956 12950
12957 12951 /*
12958 12952 * Set the flock64's lm_sysid for nfs4frlock.
12959 12953 */
12960 12954 static int
12961 12955 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12962 12956 {
12963 12957 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12964 12958
12965 12959 /* Find the lm_sysid */
12966 12960 *lspp = nfs4_find_sysid(VTOMI4(vp));
12967 12961
12968 12962 if (*lspp == NULL) {
12969 12963 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12970 12964 "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12971 12965 return (ENOLCK);
12972 12966 }
12973 12967
12974 12968 flk->l_sysid = lm_sysidt(*lspp);
12975 12969
12976 12970 return (0);
12977 12971 }
12978 12972
12979 12973 /*
12980 12974 * Do the remaining preliminary setup for nfs4frlock.
12981 12975 */
12982 12976 static void
12983 12977 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12984 12978 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12985 12979 cred_t **cred_otw)
12986 12980 {
12987 12981 /*
12988 12982 * set tick_delay to the base delay time.
12989 12983 * (NFS4_BASE_WAIT_TIME is in secs)
12990 12984 */
12991 12985
12992 12986 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12993 12987
12994 12988 /*
12995 12989 * If lock is relative to EOF, we need the newest length of the
12996 12990 * file. Therefore invalidate the ATTR_CACHE.
12997 12991 */
12998 12992
12999 12993 *whencep = flk->l_whence;
13000 12994
13001 12995 if (*whencep == 2) /* SEEK_END */
13002 12996 PURGE_ATTRCACHE4(vp);
13003 12997
13004 12998 recov_statep->rs_flags = 0;
13005 12999 recov_statep->rs_num_retry_despite_err = 0;
13006 13000 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
13007 13001 }
13008 13002
13009 13003 /*
13010 13004 * Initialize and allocate the data structures necessary for
13011 13005 * the nfs4frlock call.
13012 13006 * Allocates argsp's op array.
13013 13007 */
13014 13008 static void
13015 13009 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
13016 13010 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
13017 13011 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
13018 13012 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
13019 13013 {
13020 13014 int argoplist_size;
13021 13015 int num_ops = 2;
13022 13016
13023 13017 *retry = FALSE;
13024 13018 *did_start_fop = FALSE;
13025 13019 *skip_get_err = FALSE;
13026 13020 lost_rqstp->lr_op = 0;
13027 13021 argoplist_size = num_ops * sizeof (nfs_argop4);
13028 13022 /* fill array with zero */
13029 13023 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13030 13024
13031 13025 *argspp = argsp;
13032 13026 *respp = NULL;
13033 13027
13034 13028 argsp->array_len = num_ops;
13035 13029 argsp->array = *argopp;
13036 13030
13037 13031 /* initialize in case of error; will get real value down below */
13038 13032 argsp->ctag = TAG_NONE;
13039 13033
13040 13034 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13041 13035 *op_hintp = OH_LOCKU;
13042 13036 else
13043 13037 *op_hintp = OH_OTHER;
13044 13038 }
13045 13039
13046 13040 /*
13047 13041  * Call nfs4_start_fop() for nfs4frlock, if necessary. Assign
13048 13042 * the proper nfs4_server_t for this instance of nfs4frlock.
13049 13043 * Returns 0 (success) or an errno value.
13050 13044 */
13051 13045 static int
13052 13046 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13053 13047 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13054 13048 bool_t *did_start_fop, bool_t *startrecovp)
13055 13049 {
13056 13050 int error = 0;
13057 13051 rnode4_t *rp;
13058 13052
13059 13053 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13060 13054
13061 13055 if (ctype == NFS4_LCK_CTYPE_NORM) {
13062 13056 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13063 13057 recov_statep, startrecovp);
13064 13058 if (error)
13065 13059 return (error);
13066 13060 *did_start_fop = TRUE;
13067 13061 } else {
13068 13062 *did_start_fop = FALSE;
13069 13063 *startrecovp = FALSE;
13070 13064 }
13071 13065
13072 13066 if (!error) {
13073 13067 rp = VTOR4(vp);
13074 13068
13075 13069 /* If the file failed recovery, just quit. */
13076 13070 mutex_enter(&rp->r_statelock);
13077 13071 if (rp->r_flags & R4RECOVERR) {
13078 13072 error = EIO;
13079 13073 }
13080 13074 mutex_exit(&rp->r_statelock);
13081 13075 }
13082 13076
13083 13077 return (error);
13084 13078 }
13085 13079
13086 13080 /*
13087 13081 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A
13088 13082 * resend nfs4frlock call is initiated by the recovery framework.
13089 13083 * Acquires the lop and oop seqid synchronization.
13090 13084 */
13091 13085 static void
13092 13086 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
13093 13087 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
13094 13088 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13095 13089 LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
13096 13090 {
13097 13091 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
13098 13092 int error;
13099 13093
13100 13094 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
13101 13095 (CE_NOTE,
13102 13096 "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
13103 13097 ASSERT(resend_rqstp != NULL);
13104 13098 ASSERT(resend_rqstp->lr_op == OP_LOCK ||
13105 13099 resend_rqstp->lr_op == OP_LOCKU);
13106 13100
13107 13101 *oopp = resend_rqstp->lr_oop;
13108 13102 if (resend_rqstp->lr_oop) {
13109 13103 open_owner_hold(resend_rqstp->lr_oop);
13110 13104 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
13111 13105 ASSERT(error == 0); /* recov thread always succeeds */
13112 13106 }
13113 13107
13114 13108 /* Must resend this lost lock/locku request. */
13115 13109 ASSERT(resend_rqstp->lr_lop != NULL);
13116 13110 *lopp = resend_rqstp->lr_lop;
13117 13111 lock_owner_hold(resend_rqstp->lr_lop);
13118 13112 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
13119 13113 ASSERT(error == 0); /* recov thread always succeeds */
13120 13114
13121 13115 *ospp = resend_rqstp->lr_osp;
13122 13116 if (*ospp)
13123 13117 open_stream_hold(resend_rqstp->lr_osp);
13124 13118
13125 13119 if (resend_rqstp->lr_op == OP_LOCK) {
13126 13120 LOCK4args *lock_args;
13127 13121
13128 13122 argop->argop = OP_LOCK;
13129 13123 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
13130 13124 lock_args->locktype = resend_rqstp->lr_locktype;
13131 13125 lock_args->reclaim =
13132 13126 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
13133 13127 lock_args->offset = resend_rqstp->lr_flk->l_start;
13134 13128 lock_args->length = resend_rqstp->lr_flk->l_len;
13135 13129 if (lock_args->length == 0)
13136 13130 lock_args->length = ~lock_args->length;
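		/*
		 * A POSIX l_len of 0 means "lock to EOF"; NFSv4 encodes
		 * that as a length of all ones (RFC 7530), hence the
		 * complement of zero above.
		 */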
13137 13131 nfs4_setup_lock_args(*lopp, *oopp, *ospp,
13138 13132 mi2clientid(mi), &lock_args->locker);
13139 13133
13140 13134 switch (resend_rqstp->lr_ctype) {
13141 13135 case NFS4_LCK_CTYPE_RESEND:
13142 13136 argsp->ctag = TAG_LOCK_RESEND;
13143 13137 break;
13144 13138 case NFS4_LCK_CTYPE_REINSTATE:
13145 13139 argsp->ctag = TAG_LOCK_REINSTATE;
13146 13140 break;
13147 13141 case NFS4_LCK_CTYPE_RECLAIM:
13148 13142 argsp->ctag = TAG_LOCK_RECLAIM;
13149 13143 break;
13150 13144 default:
13151 13145 argsp->ctag = TAG_LOCK_UNKNOWN;
13152 13146 break;
13153 13147 }
13154 13148 } else {
13155 13149 LOCKU4args *locku_args;
13156 13150 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;
13157 13151
13158 13152 argop->argop = OP_LOCKU;
13159 13153 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
13160 13154 locku_args->locktype = READ_LT;
13161 13155 locku_args->seqid = lop->lock_seqid + 1;
13162 13156 mutex_enter(&lop->lo_lock);
13163 13157 locku_args->lock_stateid = lop->lock_stateid;
13164 13158 mutex_exit(&lop->lo_lock);
13165 13159 locku_args->offset = resend_rqstp->lr_flk->l_start;
13166 13160 locku_args->length = resend_rqstp->lr_flk->l_len;
13167 13161 if (locku_args->length == 0)
13168 13162 locku_args->length = ~locku_args->length;
13169 13163
13170 13164 switch (resend_rqstp->lr_ctype) {
13171 13165 case NFS4_LCK_CTYPE_RESEND:
13172 13166 argsp->ctag = TAG_LOCKU_RESEND;
13173 13167 break;
13174 13168 case NFS4_LCK_CTYPE_REINSTATE:
13175 13169 argsp->ctag = TAG_LOCKU_REINSTATE;
13176 13170 break;
13177 13171 default:
13178 13172 argsp->ctag = TAG_LOCK_UNKNOWN;
13179 13173 break;
13180 13174 }
13181 13175 }
13182 13176 }
13183 13177
13184 13178 /*
13185 13179 * Setup the LOCKT4 arguments.
13186 13180 */
13187 13181 static void
13188 13182 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13189 13183 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13190 13184 rnode4_t *rp)
13191 13185 {
13192 13186 LOCKT4args *lockt_args;
13193 13187
13194 13188 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13195 13189 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13196 13190 argop->argop = OP_LOCKT;
13197 13191 argsp->ctag = TAG_LOCKT;
13198 13192 lockt_args = &argop->nfs_argop4_u.oplockt;
13199 13193
13200 13194 /*
13201 13195 * The locktype will be READ_LT unless it's
13202 13196 * a write lock. We do this because the Solaris
13203 13197 * system call allows the combination of
13204 13198		 * F_UNLCK and F_GETLK*, in which case the
13205 13199 * unlock is mapped to a read.
13206 13200 */
13207 13201 if (flk->l_type == F_WRLCK)
13208 13202 lockt_args->locktype = WRITE_LT;
13209 13203 else
13210 13204 lockt_args->locktype = READ_LT;
13211 13205
13212 13206 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13213 13207 /* set the lock owner4 args */
13214 13208 nfs4_setlockowner_args(&lockt_args->owner, rp,
13215 13209 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13216 13210 flk->l_pid);
13217 13211 lockt_args->offset = flk->l_start;
13218 13212 lockt_args->length = flk->l_len;
13219 13213 if (flk->l_len == 0)
13220 13214 lockt_args->length = ~lockt_args->length;
13221 13215
13222 13216 *lockt_argsp = lockt_args;
13223 13217 }
13224 13218
13225 13219 /*
13226 13220 * If the client is holding a delegation, and the open stream to be used
13227 13221 * with this lock request is a delegation open stream, then re-open the stream.
13228 13222 * Sets the nfs4_error_t to all zeros unless the open stream has already
13229 13223 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY
13230 13224 * means the caller should retry (like a recovery retry).
13231 13225 */
13232 13226 static void
13233 13227 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
13234 13228 {
13235 13229 open_delegation_type4 dt;
13236 13230 bool_t reopen_needed, force;
13237 13231 nfs4_open_stream_t *osp;
13238 13232 open_claim_type4 oclaim;
13239 13233 rnode4_t *rp = VTOR4(vp);
13240 13234 mntinfo4_t *mi = VTOMI4(vp);
13241 13235
13242 13236 ASSERT(nfs_zone() == mi->mi_zone);
13243 13237
13244 13238 nfs4_error_zinit(ep);
13245 13239
13246 13240 mutex_enter(&rp->r_statev4_lock);
13247 13241 dt = rp->r_deleg_type;
13248 13242 mutex_exit(&rp->r_statev4_lock);
13249 13243
13250 13244 if (dt != OPEN_DELEGATE_NONE) {
13251 13245 nfs4_open_owner_t *oop;
13252 13246
13253 13247 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
13254 13248 if (!oop) {
13255 13249 ep->stat = NFS4ERR_IO;
13256 13250 return;
13257 13251 }
13258 13252 /* returns with 'os_sync_lock' held */
13259 13253 osp = find_open_stream(oop, rp);
13260 13254 if (!osp) {
13261 13255 open_owner_rele(oop);
13262 13256 ep->stat = NFS4ERR_IO;
13263 13257 return;
13264 13258 }
13265 13259
13266 13260 if (osp->os_failed_reopen) {
13267 13261 NFS4_DEBUG((nfs4_open_stream_debug ||
13268 13262 nfs4_client_lock_debug), (CE_NOTE,
13269 13263 "nfs4frlock_check_deleg: os_failed_reopen set "
13270 13264 "for osp %p, cr %p, rp %s", (void *)osp,
13271 13265 (void *)cr, rnode4info(rp)));
13272 13266 mutex_exit(&osp->os_sync_lock);
13273 13267 open_stream_rele(osp, rp);
13274 13268 open_owner_rele(oop);
13275 13269 ep->stat = NFS4ERR_IO;
13276 13270 return;
13277 13271 }
13278 13272
13279 13273 /*
13280 13274 * Determine whether a reopen is needed. If this
13281 13275 * is a delegation open stream, then send the open
13282 13276 * to the server to give visibility to the open owner.
13283 13277 * Even if it isn't a delegation open stream, we need
13284 13278 * to check if the previous open CLAIM_DELEGATE_CUR
13285 13279 * was sufficient.
13286 13280 */
13287 13281
13288 13282 reopen_needed = osp->os_delegation ||
13289 13283 ((lt == F_RDLCK &&
13290 13284 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
13291 13285 (lt == F_WRLCK &&
13292 13286 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));
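		/*
		 * That is: reopen if this is a delegation open stream, or
		 * if the earlier CLAIM_DELEGATE_CUR open didn't grant the
		 * access mode (read vs. write) this lock type requires.
		 */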
13293 13287
13294 13288 mutex_exit(&osp->os_sync_lock);
13295 13289 open_owner_rele(oop);
13296 13290
13297 13291 if (reopen_needed) {
13298 13292 /*
13299 13293 * Always use CLAIM_PREVIOUS after server reboot.
13300 13294 * The server will reject CLAIM_DELEGATE_CUR if
13301 13295 * it is used during the grace period.
13302 13296 */
13303 13297 mutex_enter(&mi->mi_lock);
13304 13298 if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
13305 13299 oclaim = CLAIM_PREVIOUS;
13306 13300 force = TRUE;
13307 13301 } else {
13308 13302 oclaim = CLAIM_DELEGATE_CUR;
13309 13303 force = FALSE;
13310 13304 }
13311 13305 mutex_exit(&mi->mi_lock);
13312 13306
13313 13307 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
13314 13308 if (ep->error == EAGAIN) {
13315 13309 nfs4_error_zinit(ep);
13316 13310 ep->stat = NFS4ERR_DELAY;
13317 13311 }
13318 13312 }
13319 13313 open_stream_rele(osp, rp);
13320 13314 osp = NULL;
13321 13315 }
13322 13316 }
13323 13317
13324 13318 /*
13325 13319 * Setup the LOCKU4 arguments.
13326 13320 * Returns errors via the nfs4_error_t.
13327 13321 * NFS4_OK no problems. *go_otwp is TRUE if call should go
13328 13322 * over-the-wire. The caller must release the
13329 13323 * reference on *lopp.
13330 13324 * NFS4ERR_DELAY caller should retry (like recovery retry)
13331 13325 * (other) unrecoverable error.
13332 13326 */
13333 13327 static void
13334 13328 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13335 13329 LOCKU4args **locku_argsp, flock64_t *flk,
13336 13330 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
13337 13331 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
13338 13332 bool_t *skip_get_err, bool_t *go_otwp)
13339 13333 {
13340 13334 nfs4_lock_owner_t *lop = NULL;
13341 13335 LOCKU4args *locku_args;
13342 13336 pid_t pid;
13343 13337 bool_t is_spec = FALSE;
13344 13338 rnode4_t *rp = VTOR4(vp);
13345 13339
13346 13340 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13347 13341 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13348 13342
13349 13343 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
13350 13344 if (ep->error || ep->stat)
13351 13345 return;
13352 13346
13353 13347 argop->argop = OP_LOCKU;
13354 13348 if (ctype == NFS4_LCK_CTYPE_REINSTATE)
13355 13349 argsp->ctag = TAG_LOCKU_REINSTATE;
13356 13350 else
13357 13351 argsp->ctag = TAG_LOCKU;
13358 13352 locku_args = &argop->nfs_argop4_u.oplocku;
13359 13353 *locku_argsp = locku_args;
13360 13354
13361 13355		/* locktype must be set to some legal value, though LOCKU ignores it */
13362 13356 locku_args->locktype = READ_LT;
13363 13357
13364 13358 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13365 13359 flk->l_pid;
13366 13360
13367 13361 /*
13368 13362 * Get the lock owner stateid. If no lock owner
13369 13363 * exists, return success.
13370 13364 */
13371 13365 lop = find_lock_owner(rp, pid, LOWN_ANY);
13372 13366 *lopp = lop;
13373 13367 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
13374 13368 is_spec = TRUE;
13375 13369 if (!lop || is_spec) {
13376 13370 /*
13377 13371 * No lock owner so no locks to unlock.
13378 13372 * Return success. If there was a failed
13379 13373 * reclaim earlier, the lock might still be
13380 13374 * registered with the local locking code,
13381 13375 * so notify it of the unlock.
13382 13376 *
13383 13377 * If the lockowner is using a special stateid,
13384 13378 * then the original lock request (that created
13385 13379 * this lockowner) was never successful, so we
13386 13380 * have no lock to undo OTW.
13387 13381 */
13388 13382 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13389 13383 "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
13390 13384 "(%ld) so return success", (long)pid));
13391 13385
13392 13386 if (ctype == NFS4_LCK_CTYPE_NORM)
13393 13387 flk->l_pid = curproc->p_pid;
13394 13388 nfs4_register_lock_locally(vp, flk, flag, offset);
13395 13389 /*
13396 13390 * Release our hold and NULL out so final_cleanup
13397 13391 * doesn't try to end a lock seqid sync we
13398 13392 * never started.
13399 13393 */
13400 13394 if (is_spec) {
13401 13395 lock_owner_rele(lop);
13402 13396 *lopp = NULL;
13403 13397 }
13404 13398 *skip_get_err = TRUE;
13405 13399 *go_otwp = FALSE;
13406 13400 return;
13407 13401 }
13408 13402
13409 13403 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
13410 13404 if (ep->error == EAGAIN) {
13411 13405 lock_owner_rele(lop);
13412 13406 *lopp = NULL;
13413 13407 return;
13414 13408 }
13415 13409
13416 13410 mutex_enter(&lop->lo_lock);
13417 13411 locku_args->lock_stateid = lop->lock_stateid;
13418 13412 mutex_exit(&lop->lo_lock);
13419 13413 locku_args->seqid = lop->lock_seqid + 1;
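	/*
	 * Lock-owner seqids advance by exactly one per accepted request,
	 * so we send lock_seqid + 1 here; nfs4frlock_bump_seqid()
	 * commits the new value once the server has consumed this seqid.
	 */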
13420 13414
13421 13415 /* leave the ref count on lop, rele after RPC call */
13422 13416
13423 13417 locku_args->offset = flk->l_start;
13424 13418 locku_args->length = flk->l_len;
13425 13419 if (flk->l_len == 0)
13426 13420 locku_args->length = ~locku_args->length;
13427 13421
13428 13422 *go_otwp = TRUE;
13429 13423 }
13430 13424
13431 13425 /*
13432 13426 * Setup the LOCK4 arguments.
13433 13427 *
13434 13428 * Returns errors via the nfs4_error_t.
13435 13429 * NFS4_OK no problems
13436 13430 * NFS4ERR_DELAY caller should retry (like recovery retry)
13437 13431 * (other) unrecoverable error
13438 13432 */
13439 13433 static void
13440 13434 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
13441 13435 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13442 13436 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
13443 13437 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
13444 13438 {
13445 13439 LOCK4args *lock_args;
13446 13440 nfs4_open_owner_t *oop = NULL;
13447 13441 nfs4_open_stream_t *osp = NULL;
13448 13442 nfs4_lock_owner_t *lop = NULL;
13449 13443 pid_t pid;
13450 13444 rnode4_t *rp = VTOR4(vp);
13451 13445
13452 13446 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13453 13447
13454 13448 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
13455 13449 if (ep->error || ep->stat != NFS4_OK)
13456 13450 return;
13457 13451
13458 13452 argop->argop = OP_LOCK;
13459 13453 if (ctype == NFS4_LCK_CTYPE_NORM)
13460 13454 argsp->ctag = TAG_LOCK;
13461 13455 else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
13462 13456 argsp->ctag = TAG_RELOCK;
13463 13457 else
13464 13458 argsp->ctag = TAG_LOCK_REINSTATE;
13465 13459 lock_args = &argop->nfs_argop4_u.oplock;
13466 13460 lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
13467 13461 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
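	/*
	 * reclaim == TRUE marks this LOCK as re-establishing a lock held
	 * before a server reboot, which is only valid during the
	 * server's grace period.
	 */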
13468 13462 /*
13469 13463 * Get the lock owner. If no lock owner exists,
13470 13464 * create a 'temporary' one and grab the open seqid
13471 13465 * synchronization (which puts a hold on the open
13472 13466 * owner and open stream).
13473 13467 * This also grabs the lock seqid synchronization.
13474 13468 */
13475 13469 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
13476 13470 ep->stat =
13477 13471 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);
13478 13472
13479 13473 if (ep->stat != NFS4_OK)
13480 13474 goto out;
13481 13475
13482 13476 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
13483 13477 &lock_args->locker);
13484 13478
13485 13479 lock_args->offset = flk->l_start;
13486 13480 lock_args->length = flk->l_len;
13487 13481 if (flk->l_len == 0)
13488 13482 lock_args->length = ~lock_args->length;
13489 13483 *lock_argsp = lock_args;
13490 13484 out:
13491 13485 *oopp = oop;
13492 13486 *ospp = osp;
13493 13487 *lopp = lop;
13494 13488 }
13495 13489
13496 13490 /*
13497 13491 * After we get the reply from the server, record the proper information
13498 13492 * for possible resend lock requests.
13499 13493 */
13500 13494 static void
13501 13495 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
13502 13496 nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
13503 13497 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
13504 13498 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
13505 13499 {
13506 13500 bool_t unlock = (flk->l_type == F_UNLCK);
13507 13501
13508 13502 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13509 13503 ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
13510 13504 ctype == NFS4_LCK_CTYPE_REINSTATE);
13511 13505
13512 13506 if (error != 0 && !unlock) {
13513 13507 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13514 13508 nfs4_client_lock_debug), (CE_NOTE,
13515 13509 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
13516 13510		    "for lop %p", (void *)lop));
13517 13511 ASSERT(lop != NULL);
13518 13512 mutex_enter(&lop->lo_lock);
13519 13513 lop->lo_pending_rqsts = 1;
13520 13514 mutex_exit(&lop->lo_lock);
13521 13515 }
13522 13516
13523 13517 lost_rqstp->lr_putfirst = FALSE;
13524 13518 lost_rqstp->lr_op = 0;
13525 13519
13526 13520 /*
13527 13521 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
13528 13522 * recovery purposes so that the lock request that was sent
13529 13523 * can be saved and re-issued later. Ditto for EIO from a forced
13530 13524 * unmount. This is done to have the client's local locking state
13531 13525 * match the v4 server's state; that is, the request was
13532 13526 * potentially received and accepted by the server but the client
13533 13527 * thinks it was not.
13534 13528 */
13535 13529 if (error == ETIMEDOUT || error == EINTR ||
13536 13530 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
13537 13531 NFS4_DEBUG((nfs4_lost_rqst_debug ||
13538 13532 nfs4_client_lock_debug), (CE_NOTE,
13539 13533 "nfs4frlock_save_lost_rqst: got a lost %s lock for "
13540 13534 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
13541 13535 (void *)lop, (void *)oop, (void *)osp));
13542 13536 if (unlock)
13543 13537 lost_rqstp->lr_op = OP_LOCKU;
13544 13538 else {
13545 13539 lost_rqstp->lr_op = OP_LOCK;
13546 13540 lost_rqstp->lr_locktype = locktype;
13547 13541 }
13548 13542 /*
13549 13543 * Objects are held and rele'd via the recovery code.
13550 13544 * See nfs4_save_lost_rqst.
13551 13545 */
13552 13546 lost_rqstp->lr_vp = vp;
13553 13547 lost_rqstp->lr_dvp = NULL;
13554 13548 lost_rqstp->lr_oop = oop;
13555 13549 lost_rqstp->lr_osp = osp;
13556 13550 lost_rqstp->lr_lop = lop;
13557 13551 lost_rqstp->lr_cr = cr;
13558 13552 switch (ctype) {
13559 13553 case NFS4_LCK_CTYPE_NORM:
13560 13554 flk->l_pid = ttoproc(curthread)->p_pid;
13561 13555 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
13562 13556 break;
13563 13557 case NFS4_LCK_CTYPE_REINSTATE:
13564 13558 lost_rqstp->lr_putfirst = TRUE;
13565 13559 lost_rqstp->lr_ctype = ctype;
13566 13560 break;
13567 13561 default:
13568 13562 break;
13569 13563 }
13570 13564 lost_rqstp->lr_flk = flk;
13571 13565 }
13572 13566 }
13573 13567
13574 13568 /*
13575 13569 * Update lop's seqid. Also update the seqid stored in a resend request,
13576 13570 * if any. (Some recovery errors increment the seqid, and we may have to
13577 13571 * send the resend request again.)
13578 13572 */
13579 13573
13580 13574 static void
13581 13575 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13582 13576 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13583 13577 {
13584 13578 if (lock_args) {
13585 13579 if (lock_args->locker.new_lock_owner == TRUE)
13586 13580 nfs4_get_and_set_next_open_seqid(oop, tag_type);
13587 13581 else {
13588 13582 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13589 13583 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13590 13584 }
13591 13585 } else if (locku_args) {
13592 13586 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13593 13587			nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13594 13588 }
13595 13589 }
13596 13590
13597 13591 /*
13598 13592 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13599 13593 * COMPOUND4 args/res for calls that need to retry.
13600 13594 * Switches the *cred_otwp to base_cr.
13601 13595 */
13602 13596 static void
13603 13597 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
13604 13598 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
13605 13599 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
13606 13600 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
13607 13601 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
13608 13602 {
13609 13603 nfs4_open_owner_t *oop = *oopp;
13610 13604 nfs4_open_stream_t *osp = *ospp;
13611 13605 nfs4_lock_owner_t *lop = *lopp;
13612 13606 nfs_argop4 *argop = (*argspp)->array;
13613 13607
13614 13608 if (*did_start_fop) {
13615 13609 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13616 13610 needrecov);
13617 13611 *did_start_fop = FALSE;
13618 13612 }
13619 13613 ASSERT((*argspp)->array_len == 2);
13620 13614 if (argop[1].argop == OP_LOCK)
13621 13615 nfs4args_lock_free(&argop[1]);
13622 13616 else if (argop[1].argop == OP_LOCKT)
13623 13617 nfs4args_lockt_free(&argop[1]);
13624 13618 kmem_free(argop, 2 * sizeof (nfs_argop4));
13625 13619 if (!error)
13626 13620 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13627 13621 *argspp = NULL;
13628 13622 *respp = NULL;
13629 13623
13630 13624 if (lop) {
13631 13625 nfs4_end_lock_seqid_sync(lop);
13632 13626 lock_owner_rele(lop);
13633 13627 *lopp = NULL;
13634 13628 }
13635 13629
13636 13630 /* need to free up the reference on osp for lock args */
13637 13631 if (osp != NULL) {
13638 13632 open_stream_rele(osp, VTOR4(vp));
13639 13633 *ospp = NULL;
13640 13634 }
13641 13635
13642 13636 /* need to free up the reference on oop for lock args */
13643 13637 if (oop != NULL) {
13644 13638 nfs4_end_open_seqid_sync(oop);
13645 13639 open_owner_rele(oop);
13646 13640 *oopp = NULL;
13647 13641 }
13648 13642
13649 13643 crfree(*cred_otwp);
13650 13644 *cred_otwp = base_cr;
13651 13645 crhold(*cred_otwp);
13652 13646 }
13653 13647
13654 13648 /*
13655 13649 * Function to process the client's recovery for nfs4frlock.
13656 13650 * Returns TRUE if we should retry the lock request; FALSE otherwise.
13657 13651 *
13658 13652 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13659 13653 * COMPOUND4 args/res for calls that need to retry.
13660 13654 *
13661 13655 * Note: the rp's r_lkserlock is *not* dropped during this path.
13662 13656 */
13663 13657 static bool_t
13664 13658 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
13665 13659 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13666 13660 LOCK4args *lock_args, LOCKU4args *locku_args,
13667 13661 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13668 13662 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
13669 13663 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
13670 13664 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
13671 13665 {
13672 13666 nfs4_open_owner_t *oop = *oopp;
13673 13667 nfs4_open_stream_t *osp = *ospp;
13674 13668 nfs4_lock_owner_t *lop = *lopp;
13675 13669
13676 13670 bool_t abort, retry;
13677 13671
13678 13672 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13679 13673 ASSERT((*argspp) != NULL);
13680 13674 ASSERT((*respp) != NULL);
13681 13675 if (lock_args || locku_args)
13682 13676 ASSERT(lop != NULL);
13683 13677
13684 13678 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
13685 13679 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));
13686 13680
13687 13681 retry = TRUE;
13688 13682 abort = FALSE;
13689 13683 if (needrecov) {
13690 13684 nfs4_bseqid_entry_t *bsep = NULL;
13691 13685 nfs_opnum4 op;
13692 13686
13693 13687 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;
13694 13688
13695 13689 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
13696 13690 seqid4 seqid;
13697 13691
13698 13692 if (lock_args) {
13699 13693 if (lock_args->locker.new_lock_owner == TRUE)
13700 13694 seqid = lock_args->locker.locker4_u.
13701 13695 open_owner.open_seqid;
13702 13696 else
13703 13697 seqid = lock_args->locker.locker4_u.
13704 13698 lock_owner.lock_seqid;
13705 13699 } else if (locku_args) {
13706 13700 seqid = locku_args->seqid;
13707 13701 } else {
13708 13702 seqid = 0;
13709 13703 }
13710 13704
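			/*
			 * Record which owner and seqid the server rejected
			 * so the recovery thread can resynchronize them.
			 */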
13711 13705 bsep = nfs4_create_bseqid_entry(oop, lop, vp,
13712 13706 flk->l_pid, (*argspp)->ctag, seqid);
13713 13707 }
13714 13708
13715 13709 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
13716 13710 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
13717 13711 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
13718 13712 NULL, op, bsep, NULL, NULL);
13719 13713
13720 13714 if (bsep)
13721 13715 kmem_free(bsep, sizeof (*bsep));
13722 13716 }
13723 13717
13724 13718 /*
13725 13719	 * Return that we do not want to retry the request in three cases:
13726 13720	 * 1. We received EINTR or are bailing out because of a forced
13727 13721	 *    unmount; we came into this code path just for the sake of
13728 13722	 *    initiating recovery and now need to return the error.
13729 13723 * 2. If we have aborted recovery.
13730 13724 * 3. We received NFS4ERR_BAD_SEQID.
13731 13725 */
13732 13726 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
13733 13727 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
13734 13728 retry = FALSE;
13735 13729
13736 13730 if (*did_start_fop == TRUE) {
13737 13731 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13738 13732 needrecov);
13739 13733 *did_start_fop = FALSE;
13740 13734 }
13741 13735
13742 13736 if (retry == TRUE) {
13743 13737 nfs_argop4 *argop;
13744 13738
13745 13739 argop = (*argspp)->array;
13746 13740 ASSERT((*argspp)->array_len == 2);
13747 13741
13748 13742 if (argop[1].argop == OP_LOCK)
13749 13743 nfs4args_lock_free(&argop[1]);
13750 13744 else if (argop[1].argop == OP_LOCKT)
13751 13745 nfs4args_lockt_free(&argop[1]);
13752 13746 kmem_free(argop, 2 * sizeof (nfs_argop4));
13753 13747 if (!ep->error)
13754 13748 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13755 13749 *respp = NULL;
13756 13750 *argspp = NULL;
13757 13751 }
13758 13752
13759 13753 if (lop != NULL) {
13760 13754 nfs4_end_lock_seqid_sync(lop);
13761 13755 lock_owner_rele(lop);
13762 13756 }
13763 13757
13764 13758 *lopp = NULL;
13765 13759
13766 13760 /* need to free up the reference on osp for lock args */
13767 13761 if (osp != NULL) {
13768 13762 open_stream_rele(osp, rp);
13769 13763 *ospp = NULL;
13770 13764 }
13771 13765
13772 13766 /* need to free up the reference on oop for lock args */
13773 13767 if (oop != NULL) {
13774 13768 nfs4_end_open_seqid_sync(oop);
13775 13769 open_owner_rele(oop);
13776 13770 *oopp = NULL;
13777 13771 }
13778 13772
13779 13773 return (retry);
13780 13774 }
13781 13775
13782 13776 /*
13783 13777 * Handles the successful reply from the server for nfs4frlock.
13784 13778 */
13785 13779 static void
13786 13780 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13787 13781 vnode_t *vp, int flag, u_offset_t offset,
13788 13782 nfs4_lost_rqst_t *resend_rqstp)
13789 13783 {
13790 13784 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13791 13785 if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13792 13786 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13793 13787 if (ctype == NFS4_LCK_CTYPE_NORM) {
13794 13788 flk->l_pid = ttoproc(curthread)->p_pid;
13795 13789 /*
13796 13790 * We do not register lost locks locally in
13797 13791 * the 'resend' case since the user/application
13798 13792 * doesn't think we have the lock.
13799 13793 */
13800 13794 ASSERT(!resend_rqstp);
13801 13795 nfs4_register_lock_locally(vp, flk, flag, offset);
13802 13796 }
13803 13797 }
13804 13798 }
13805 13799
13806 13800 /*
13807 13801 * Handle the DENIED reply from the server for nfs4frlock.
13808 13802 * Returns TRUE if we should retry the request; FALSE otherwise.
13809 13803 *
13810 13804 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13811 13805 * COMPOUND4 args/res for calls that need to retry. Can also
13812 13806 * drop and regrab the r_lkserlock.
13813 13807 */
13814 13808 static bool_t
13815 13809 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13816 13810 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13817 13811 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13818 13812 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13819 13813 nfs4_recov_state_t *recov_statep, int needrecov,
13820 13814 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13821 13815 clock_t *tick_delayp, short *whencep, int *errorp,
13822 13816 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13823 13817 bool_t *skip_get_err)
13824 13818 {
13825 13819 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13826 13820
13827 13821 if (lock_args) {
13828 13822 nfs4_open_owner_t *oop = *oopp;
13829 13823 nfs4_open_stream_t *osp = *ospp;
13830 13824 nfs4_lock_owner_t *lop = *lopp;
13831 13825 int intr;
13832 13826
13833 13827 /*
13834 13828		 * A blocking lock needs to sleep and then retry the request.
13835 13829 *
13836 13830 * Do not block and wait for 'resend' or 'reinstate'
13837 13831 * lock requests, just return the error.
13838 13832 *
13839 13833 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13840 13834 */
13841 13835 if (cmd == F_SETLKW) {
13842 13836 rnode4_t *rp = VTOR4(vp);
13843 13837 nfs_argop4 *argop = (*argspp)->array;
13844 13838
13845 13839 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13846 13840
13847 13841 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13848 13842 recov_statep, needrecov);
13849 13843 *did_start_fop = FALSE;
13850 13844 ASSERT((*argspp)->array_len == 2);
13851 13845 if (argop[1].argop == OP_LOCK)
13852 13846 nfs4args_lock_free(&argop[1]);
13853 13847 else if (argop[1].argop == OP_LOCKT)
13854 13848 nfs4args_lockt_free(&argop[1]);
13855 13849 kmem_free(argop, 2 * sizeof (nfs_argop4));
13856 13850 if (*respp)
13857 13851 xdr_free(xdr_COMPOUND4res_clnt,
13858 13852 (caddr_t)*respp);
13859 13853 *argspp = NULL;
13860 13854 *respp = NULL;
13861 13855 nfs4_end_lock_seqid_sync(lop);
13862 13856 lock_owner_rele(lop);
13863 13857 *lopp = NULL;
13864 13858 if (osp != NULL) {
13865 13859 open_stream_rele(osp, rp);
13866 13860 *ospp = NULL;
13867 13861 }
13868 13862 if (oop != NULL) {
13869 13863 nfs4_end_open_seqid_sync(oop);
13870 13864 open_owner_rele(oop);
13871 13865 *oopp = NULL;
13872 13866 }
13873 13867
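			/*
			 * Drop r_lkserlock while we sleep so other lock and
			 * recovery traffic on this rnode can make progress.
			 */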
13874 13868 nfs_rw_exit(&rp->r_lkserlock);
13875 13869
13876 13870 intr = nfs4_block_and_wait(tick_delayp, rp);
13877 13871
13878 13872 if (intr) {
13879 13873 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13880 13874 RW_WRITER, FALSE);
13881 13875 *errorp = EINTR;
13882 13876 return (FALSE);
13883 13877 }
13884 13878
13885 13879 (void) nfs_rw_enter_sig(&rp->r_lkserlock,
13886 13880 RW_WRITER, FALSE);
13887 13881
13888 13882 /*
13889 13883 * Make sure we are still safe to lock with
13890 13884			 * Make sure we are still safe to lock with
13891 13885			 * regard to mmapping.
13892 13886 if (!nfs4_safelock(vp, flk, cr)) {
13893 13887 *errorp = EAGAIN;
13894 13888 return (FALSE);
13895 13889 }
13896 13890
13897 13891 return (TRUE);
13898 13892 }
13899 13893 if (ctype == NFS4_LCK_CTYPE_NORM)
13900 13894 *errorp = EAGAIN;
13901 13895 *skip_get_err = TRUE;
13902 13896 flk->l_whence = 0;
13903 13897 *whencep = 0;
13904 13898 return (FALSE);
13905 13899 } else if (lockt_args) {
13906 13900 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13907 13901 "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13908 13902
13909 13903 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13910 13904 flk, lockt_args);
13911 13905
13912 13906 /* according to NLM code */
13913 13907 *errorp = 0;
13914 13908 *whencep = 0;
13915 13909 *skip_get_err = TRUE;
13916 13910 return (FALSE);
13917 13911 }
13918 13912 return (FALSE);
13919 13913 }
13920 13914
13921 13915 /*
13922 13916 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13923 13917 */
13924 13918 static void
13925 13919 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13926 13920 {
13927 13921 switch (resp->status) {
13928 13922 case NFS4ERR_ACCESS:
13929 13923 case NFS4ERR_ADMIN_REVOKED:
13930 13924 case NFS4ERR_BADHANDLE:
13931 13925 case NFS4ERR_BAD_RANGE:
13932 13926 case NFS4ERR_BAD_SEQID:
13933 13927 case NFS4ERR_BAD_STATEID:
13934 13928 case NFS4ERR_BADXDR:
13935 13929 case NFS4ERR_DEADLOCK:
13936 13930 case NFS4ERR_DELAY:
13937 13931 case NFS4ERR_EXPIRED:
13938 13932 case NFS4ERR_FHEXPIRED:
13939 13933 case NFS4ERR_GRACE:
13940 13934 case NFS4ERR_INVAL:
13941 13935 case NFS4ERR_ISDIR:
13942 13936 case NFS4ERR_LEASE_MOVED:
13943 13937 case NFS4ERR_LOCK_NOTSUPP:
13944 13938 case NFS4ERR_LOCK_RANGE:
13945 13939 case NFS4ERR_MOVED:
13946 13940 case NFS4ERR_NOFILEHANDLE:
13947 13941 case NFS4ERR_NO_GRACE:
13948 13942 case NFS4ERR_OLD_STATEID:
13949 13943 case NFS4ERR_OPENMODE:
13950 13944 case NFS4ERR_RECLAIM_BAD:
13951 13945 case NFS4ERR_RECLAIM_CONFLICT:
13952 13946 case NFS4ERR_RESOURCE:
13953 13947 case NFS4ERR_SERVERFAULT:
13954 13948 case NFS4ERR_STALE:
13955 13949 case NFS4ERR_STALE_CLIENTID:
13956 13950 case NFS4ERR_STALE_STATEID:
13957 13951 return;
13958 13952 default:
13959 13953 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13960 13954 "nfs4frlock_results_default: got unrecognizable "
13961 13955 "res.status %d", resp->status));
13962 13956 *errorp = NFS4ERR_INVAL;
13963 13957 }
13964 13958 }
13965 13959
13966 13960 /*
13967 13961 * The lock request was successful, so update the client's state.
13968 13962 */
13969 13963 static void
13970 13964 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13971 13965 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13972 13966 vnode_t *vp, flock64_t *flk, cred_t *cr,
13973 13967 nfs4_lost_rqst_t *resend_rqstp)
13974 13968 {
13975 13969 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13976 13970
13977 13971 if (lock_args) {
13978 13972 LOCK4res *lock_res;
13979 13973
13980 13974 lock_res = &resop->nfs_resop4_u.oplock;
13981 13975 /* update the stateid with server's response */
13982 13976
13983 13977 if (lock_args->locker.new_lock_owner == TRUE) {
13984 13978 mutex_enter(&lop->lo_lock);
13985 13979 lop->lo_just_created = NFS4_PERM_CREATED;
13986 13980 mutex_exit(&lop->lo_lock);
13987 13981 }
13988 13982
13989 13983 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13990 13984
13991 13985 /*
13992 13986 * If the lock was the result of a resending a lost
13993 13987 * request, we've synched up the stateid and seqid
13994 13988 * with the server, but now the server might be out of sync
13995 13989 * with what the application thinks it has for locks.
13996 13990 * Clean that up here. It's unclear whether we should do
13997 13991 * this even if the filesystem has been forcibly unmounted.
13998 13992 * For most servers, it's probably wasted effort, but
13999 13993 * RFC 7530 lets servers require that unlocks exactly match
14000 13994 * the locks that are held.
14001 13995 */
14002 13996 if (resend_rqstp != NULL &&
14003 13997 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
14004 13998 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
14005 13999 } else {
14006 14000 flk->l_whence = 0;
14007 14001 }
14008 14002 } else if (locku_args) {
14009 14003 LOCKU4res *locku_res;
14010 14004
14011 14005 locku_res = &resop->nfs_resop4_u.oplocku;
14012 14006
14013 14007 /* Update the stateid with the server's response */
14014 14008 nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
14015 14009 } else if (lockt_args) {
14016 14010 /* Switch the lock type to express success, see fcntl */
14017 14011 flk->l_type = F_UNLCK;
14018 14012 flk->l_whence = 0;
14019 14013 }
14020 14014 }
14021 14015
14022 14016 /*
14023 14017 * Do final cleanup before exiting nfs4frlock.
14024 14018 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14025 14019 * COMPOUND4 args/res for calls that haven't already.
14026 14020 */
14027 14021 static void
14028 14022 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14029 14023 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14030 14024 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14031 14025 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14032 14026 short whence, u_offset_t offset, struct lm_sysid *ls,
14033 14027 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14034 14028 bool_t did_start_fop, bool_t skip_get_err,
14035 14029 cred_t *cred_otw, cred_t *cred)
14036 14030 {
14037 14031 mntinfo4_t *mi = VTOMI4(vp);
14038 14032 rnode4_t *rp = VTOR4(vp);
14039 14033 int error = *errorp;
14040 14034 nfs_argop4 *argop;
14041 14035 int do_flush_pages = 0;
14042 14036
14043 14037 ASSERT(nfs_zone() == mi->mi_zone);
14044 14038 /*
14045 14039 * The client recovery code wants the raw status information,
14046 14040 * so don't map the NFS status code to an errno value for
14047 14041 * non-normal call types.
14048 14042 */
14049 14043 if (ctype == NFS4_LCK_CTYPE_NORM) {
14050 14044 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14051 14045 *errorp = geterrno4(resp->status);
14052 14046 if (did_start_fop == TRUE)
14053 14047 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14054 14048 needrecov);
14055 14049
14056 14050 /*
14057 14051 * We've established a new lock on the server, so invalidate
14058 14052 * the pages associated with the vnode to get the most up to
14059 14053 * date pages from the server after acquiring the lock. We
14060 14054 * want to be sure that the read operation gets the newest data.
14061 14055 * N.B.
14062 14056 * We used to do this in nfs4frlock_results_ok but that doesn't
14063 14057 * work since VOP_PUTPAGE can call nfs4_commit which calls
14064 14058 * nfs4_start_fop. We flush the pages below after calling
14065 14059	 * nfs4_end_fop above.
14066 14060 * The flush of the page cache must be done after
14067 14061 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14068 14062 */
14069 14063 if (!error && resp && resp->status == NFS4_OK)
14070 14064 do_flush_pages = 1;
14071 14065 }
14072 14066 if (argsp) {
14073 14067 ASSERT(argsp->array_len == 2);
14074 14068 argop = argsp->array;
14075 14069 if (argop[1].argop == OP_LOCK)
14076 14070 nfs4args_lock_free(&argop[1]);
14077 14071 else if (argop[1].argop == OP_LOCKT)
14078 14072 nfs4args_lockt_free(&argop[1]);
14079 14073 kmem_free(argop, 2 * sizeof (nfs_argop4));
14080 14074 if (resp)
14081 14075 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14082 14076 }
14083 14077
14084 14078 /* free the reference on the lock owner */
14085 14079 if (lop != NULL) {
14086 14080 nfs4_end_lock_seqid_sync(lop);
14087 14081 lock_owner_rele(lop);
14088 14082 }
14089 14083
14090 14084 /* need to free up the reference on osp for lock args */
14091 14085 if (osp != NULL)
14092 14086 open_stream_rele(osp, rp);
14093 14087
14094 14088 /* need to free up the reference on oop for lock args */
14095 14089 if (oop != NULL) {
14096 14090 nfs4_end_open_seqid_sync(oop);
14097 14091 open_owner_rele(oop);
14098 14092 }
14099 14093
14100 14094 if (do_flush_pages)
14101 14095 nfs4_flush_pages(vp, cred);
14102 14096
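	/* Convert flk back to the caller's original whence/offset frame. */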
14103 14097 (void) convoff(vp, flk, whence, offset);
14104 14098
14105 14099 lm_rel_sysid(ls);
14106 14100
14107 14101 /*
14108 14102 * Record debug information in the event we get EINVAL.
14109 14103 */
14110 14104 mutex_enter(&mi->mi_lock);
14111 14105 if (*errorp == EINVAL && (lock_args || locku_args) &&
14112 14106 (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14113 14107 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14114 14108 zcmn_err(getzoneid(), CE_NOTE,
14115 14109 "%s operation failed with "
14116 14110 "EINVAL probably since the server, %s,"
14117 14111 " doesn't support POSIX style locking",
14118 14112 lock_args ? "LOCK" : "LOCKU",
14119 14113 mi->mi_curr_serv->sv_hostname);
14120 14114 mi->mi_flags |= MI4_LOCK_DEBUG;
14121 14115 }
14122 14116 }
14123 14117 mutex_exit(&mi->mi_lock);
14124 14118
14125 14119 if (cred_otw)
14126 14120 crfree(cred_otw);
14127 14121 }
14128 14122
14129 14123 /*
14130 14124 * This calls the server and the local locking code.
14131 14125 *
14132 14126	 * Client locks are registered locally by ORing the sysid with
14133 14127 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
14134 14128 * We need to distinguish between the two to avoid collision in case one
14135 14129 * machine is used as both client and server.
14136 14130 *
14137 14131	 * Blocking lock requests will retry forever until the lock is
14138 14132	 * acquired.
14139 14133 *
14140 14134 * The ctype is defined as follows:
14141 14135 * NFS4_LCK_CTYPE_NORM: normal lock request.
14142 14136 *
14143 14137 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client
14144 14138 * recovery, get the pid from flk instead of curproc, and don't reregister
14145 14139 * the lock locally.
14146 14140 *
14147 14141 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
14148 14142 * that we will use the information passed in via resend_rqstp to setup the
14149 14143 * lock/locku request. This resend is the exact same request as the 'lost
14150 14144 * lock', and is initiated by the recovery framework. A successful resend
14151 14145 * request can initiate one or more reinstate requests.
14152 14146 *
14153 14147 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
14154 14148 * does not trigger additional reinstate requests. This lock call type is
14155 14149	 * does not trigger additional reinstate requests. This call type is
14156 14150	 * used to set the v4 server's locking state back to match the
14157 14151	 * client's local locking state in the event of a received 'lost lock'.
14158 14152 * Errors are returned via the nfs4_error_t parameter.
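 *
 * (Illustrative only: the normal VOP_FRLOCK path is expected to call this
 * roughly as
 *	nfs4_error_t e;
 *	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
 *	    cr, &e, NULL, NULL);
 * and then inspect e.error / e.stat.)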
14159 14153 */
14160 14154 void
14161 14155 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
14162 14156 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
14163 14157 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
14164 14158 {
14165 14159 COMPOUND4args_clnt args, *argsp = NULL;
14166 14160 COMPOUND4res_clnt res, *resp = NULL;
14167 14161 nfs_argop4 *argop;
14168 14162 nfs_resop4 *resop;
14169 14163 rnode4_t *rp;
14170 14164 int doqueue = 1;
14171 14165 clock_t tick_delay; /* delay in clock ticks */
14172 14166 struct lm_sysid *ls;
14173 14167 LOCK4args *lock_args = NULL;
14174 14168 LOCKU4args *locku_args = NULL;
14175 14169 LOCKT4args *lockt_args = NULL;
14176 14170 nfs4_open_owner_t *oop = NULL;
14177 14171 nfs4_open_stream_t *osp = NULL;
14178 14172 nfs4_lock_owner_t *lop = NULL;
14179 14173 bool_t needrecov = FALSE;
14180 14174 nfs4_recov_state_t recov_state;
14181 14175 short whence;
14182 14176 nfs4_op_hint_t op_hint;
14183 14177 nfs4_lost_rqst_t lost_rqst;
14184 14178 bool_t retry = FALSE;
14185 14179 bool_t did_start_fop = FALSE;
14186 14180 bool_t skip_get_err = FALSE;
14187 14181 cred_t *cred_otw = NULL;
14188 14182 bool_t recovonly; /* just queue request */
14189 14183 int frc_no_reclaim = 0;
14190 14184 #ifdef DEBUG
14191 14185 char *name;
14192 14186 #endif
14193 14187
14194 14188 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14195 14189
14196 14190 #ifdef DEBUG
14197 14191 name = fn_name(VTOSV(vp)->sv_name);
14198 14192 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
14199 14193 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
14200 14194 "length %"PRIu64", pid %d, sysid %d, call type %s, "
14201 14195 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
14202 14196 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
14203 14197 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
14204 14198 resend_rqstp ? "TRUE" : "FALSE"));
14205 14199 kmem_free(name, MAXNAMELEN);
14206 14200 #endif
14207 14201
14208 14202 nfs4_error_zinit(ep);
14209 14203 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
14210 14204 if (ep->error)
14211 14205 return;
14212 14206 ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
14213 14207 if (ep->error)
14214 14208 return;
14215 14209 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
14216 14210 vp, cr, &cred_otw);
14217 14211
14218 14212 recov_retry:
14219 14213 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
14220 14214 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
14221 14215 rp = VTOR4(vp);
14222 14216
14223 14217 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
14224 14218 &did_start_fop, &recovonly);
14225 14219
14226 14220 if (ep->error)
14227 14221 goto out;
14228 14222
14229 14223 if (recovonly) {
14230 14224 /*
14231 14225 * Leave the request for the recovery system to deal with.
14232 14226 */
14233 14227 ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
14234 14228 ASSERT(cmd != F_GETLK);
14235 14229 ASSERT(flk->l_type == F_UNLCK);
14236 14230
14237 14231 nfs4_error_init(ep, EINTR);
14238 14232 needrecov = TRUE;
14239 14233 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14240 14234 if (lop != NULL) {
14241 14235 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
14242 14236 NULL, NULL, lop, flk, &lost_rqst, cr, vp);
14243 14237 (void) nfs4_start_recovery(ep,
14244 14238 VTOMI4(vp), vp, NULL, NULL,
14245 14239 (lost_rqst.lr_op == OP_LOCK ||
14246 14240 lost_rqst.lr_op == OP_LOCKU) ?
14247 14241 &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
14248 14242 lock_owner_rele(lop);
14249 14243 lop = NULL;
14250 14244 }
14251 14245 flk->l_pid = curproc->p_pid;
14252 14246 nfs4_register_lock_locally(vp, flk, flag, offset);
14253 14247 goto out;
14254 14248 }
14255 14249
14256 14250		/* putfh the target file's fh */
14257 14251 argop[0].argop = OP_CPUTFH;
14258 14252 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
14259 14253
14260 14254 /*
14261 14255 * Set up the over-the-wire arguments and get references to the
14262 14256 * open owner, etc.
14263 14257 */
14264 14258
14265 14259 if (ctype == NFS4_LCK_CTYPE_RESEND ||
14266 14260 ctype == NFS4_LCK_CTYPE_REINSTATE) {
14267 14261 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
14268 14262 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
14269 14263 } else {
14270 14264 bool_t go_otw = TRUE;
14271 14265
14272 14266 ASSERT(resend_rqstp == NULL);
14273 14267
14274 14268 switch (cmd) {
14275 14269 case F_GETLK:
14276 14270 nfs4frlock_setup_lockt_args(ctype, &argop[1],
14277 14271 &lockt_args, argsp, flk, rp);
14278 14272 break;
14279 14273 case F_SETLKW:
14280 14274 case F_SETLK:
14281 14275 if (flk->l_type == F_UNLCK)
14282 14276 nfs4frlock_setup_locku_args(ctype,
14283 14277 &argop[1], &locku_args, flk,
14284 14278 &lop, ep, argsp,
14285 14279 vp, flag, offset, cr,
14286 14280 &skip_get_err, &go_otw);
14287 14281 else
14288 14282 nfs4frlock_setup_lock_args(ctype,
14289 14283 &lock_args, &oop, &osp, &lop, &argop[1],
14290 14284 argsp, flk, cmd, vp, cr, ep);
14291 14285
14292 14286 if (ep->error)
14293 14287 goto out;
14294 14288
14295 14289 switch (ep->stat) {
14296 14290 case NFS4_OK:
14297 14291 break;
14298 14292 case NFS4ERR_DELAY:
14299 14293 /* recov thread never gets this error */
14300 14294 ASSERT(resend_rqstp == NULL);
14301 14295 ASSERT(did_start_fop);
14302 14296
14303 14297 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
14304 14298 &recov_state, TRUE);
14305 14299 did_start_fop = FALSE;
14306 14300 if (argop[1].argop == OP_LOCK)
14307 14301 nfs4args_lock_free(&argop[1]);
14308 14302 else if (argop[1].argop == OP_LOCKT)
14309 14303 nfs4args_lockt_free(&argop[1]);
14310 14304 kmem_free(argop, 2 * sizeof (nfs_argop4));
14311 14305 argsp = NULL;
14312 14306 goto recov_retry;
14313 14307 default:
14314 14308 ep->error = EIO;
14315 14309 goto out;
14316 14310 }
14317 14311 break;
14318 14312 default:
14319 14313 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14320 14314 "nfs4_frlock: invalid cmd %d", cmd));
14321 14315 ep->error = EINVAL;
14322 14316 goto out;
14323 14317 }
14324 14318
14325 14319 if (!go_otw)
14326 14320 goto out;
14327 14321 }
14328 14322
14329 14323		/* XXX should we use the local reclock as a cache? */
14330 14324 /*
14331 14325 * Unregister the lock with the local locking code before
14332 14326 * contacting the server. This avoids a potential race where
14333 14327 * another process gets notified that it has been granted a lock
14334 14328 * before we can unregister ourselves locally.
14335 14329 */
14336 14330 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
14337 14331 if (ctype == NFS4_LCK_CTYPE_NORM)
14338 14332 flk->l_pid = ttoproc(curthread)->p_pid;
14339 14333 nfs4_register_lock_locally(vp, flk, flag, offset);
14340 14334 }
14341 14335
14342 14336 /*
14343 14337 * Send the server the lock request. Continually loop with a delay
14344 14338 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
14345 14339 */
14346 14340 resp = &res;
14347 14341
14348 14342 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
14349 14343 (CE_NOTE,
14350 14344 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
14351 14345 rnode4info(rp)));
14352 14346
14353 14347 if (lock_args && frc_no_reclaim) {
14354 14348 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
14355 14349 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14356 14350 "nfs4frlock: frc_no_reclaim: clearing reclaim"));
14357 14351 lock_args->reclaim = FALSE;
14358 14352 if (did_reclaimp)
14359 14353 *did_reclaimp = 0;
14360 14354 }
14361 14355
14362 14356 /*
14363 14357 * Do the OTW call.
14364 14358 */
14365 14359 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);
14366 14360
14367 14361 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14368 14362 "nfs4frlock: error %d, status %d", ep->error, resp->status));
14369 14363
14370 14364 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
14371 14365 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14372 14366 "nfs4frlock: needrecov %d", needrecov));
14373 14367
14374 14368 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
14375 14369 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
14376 14370 args.ctag);
14377 14371
14378 14372 /*
14379 14373 * Check if one of these mutually exclusive error cases has
14380 14374 * happened:
14381 14375 * need to swap credentials due to access error
14382 14376 * recovery is needed
14383 14377 * different error (only known case is missing Kerberos ticket)
14384 14378 */
14385 14379
14386 14380 if ((ep->error == EACCES ||
14387 14381 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
14388 14382 cred_otw != cr) {
14389 14383 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
14390 14384 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
14391 14385 cr, &cred_otw);
14392 14386 goto recov_retry;
14393 14387 }
14394 14388
14395 14389 if (needrecov) {
14396 14390 /*
14397 14391 * LOCKT requests don't need to recover from lost
14398 14392 * requests since they don't create/modify state.
14399 14393 */
14400 14394 if ((ep->error == EINTR ||
14401 14395 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
14402 14396 lockt_args)
14403 14397 goto out;
14404 14398 /*
14405 14399 * Do not attempt recovery for requests initiated by
14406 14400 * the recovery framework. Let the framework redrive them.
14407 14401 */
14408 14402 if (ctype != NFS4_LCK_CTYPE_NORM)
14409 14403 goto out;
14410 14404 else {
14411 14405 ASSERT(resend_rqstp == NULL);
14412 14406 }
14413 14407
14414 14408 nfs4frlock_save_lost_rqst(ctype, ep->error,
14415 14409 flk_to_locktype(cmd, flk->l_type),
14416 14410 oop, osp, lop, flk, &lost_rqst, cred_otw, vp);
14417 14411
14418 14412 retry = nfs4frlock_recovery(needrecov, ep, &argsp,
14419 14413 &resp, lock_args, locku_args, &oop, &osp, &lop,
14420 14414 rp, vp, &recov_state, op_hint, &did_start_fop,
14421 14415 cmd != F_GETLK ? &lost_rqst : NULL, flk);
14422 14416
14423 14417 if (retry) {
14424 14418 ASSERT(oop == NULL);
14425 14419 ASSERT(osp == NULL);
14426 14420 ASSERT(lop == NULL);
14427 14421 goto recov_retry;
14428 14422 }
14429 14423 goto out;
14430 14424 }
14431 14425
14432 14426 /*
14433 14427	 * Bail out if we have reached this point with ep->error set. Can
14434 14428	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
14435 14429	 * This happens if the Kerberos ticket has expired or has been
14436 14430 * destroyed.
14437 14431 */
14438 14432 if (ep->error != 0)
14439 14433 goto out;
14440 14434
14441 14435 /*
14442 14436 * Process the reply.
14443 14437 */
14444 14438 switch (resp->status) {
14445 14439 case NFS4_OK:
14446 14440 resop = &resp->array[1];
14447 14441 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
14448 14442 resend_rqstp);
14449 14443 /*
14450 14444 * Have a successful lock operation, now update state.
14451 14445 */
14452 14446 nfs4frlock_update_state(lock_args, locku_args, lockt_args,
14453 14447 resop, lop, vp, flk, cr, resend_rqstp);
14454 14448 break;
14455 14449
14456 14450 case NFS4ERR_DENIED:
14457 14451 resop = &resp->array[1];
14458 14452 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
14459 14453 &oop, &osp, &lop, cmd, vp, flk, op_hint,
14460 14454 &recov_state, needrecov, &argsp, &resp,
14461 14455 &tick_delay, &whence, &ep->error, resop, cr,
14462 14456 &did_start_fop, &skip_get_err);
14463 14457
14464 14458 if (retry) {
14465 14459 ASSERT(oop == NULL);
14466 14460 ASSERT(osp == NULL);
14467 14461 ASSERT(lop == NULL);
14468 14462 goto recov_retry;
14469 14463 }
14470 14464 break;
14471 14465 /*
14472 14466	 * If the server won't let us reclaim, fall back to trying to lock
14473 14467 * the file from scratch. Code elsewhere will check the changeinfo
14474 14468 * to ensure the file hasn't been changed.
14475 14469 */
14476 14470 case NFS4ERR_NO_GRACE:
14477 14471 if (lock_args && lock_args->reclaim == TRUE) {
14478 14472 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
14479 14473 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14480 14474 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
14481 14475 frc_no_reclaim = 1;
14482 14476 /* clean up before retrying */
14483 14477 needrecov = 0;
14484 14478 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
14485 14479 lock_args, locku_args, &oop, &osp, &lop, rp, vp,
14486 14480 &recov_state, op_hint, &did_start_fop, NULL, flk);
14487 14481 goto recov_retry;
14488 14482 }
14489 14483 /* FALLTHROUGH */
14490 14484
14491 14485 default:
14492 14486 nfs4frlock_results_default(resp, &ep->error);
14493 14487 break;
14494 14488 }
14495 14489 out:
14496 14490 /*
14497 14491 * Process and cleanup from error. Make interrupted unlock
14498 14492 * requests look successful, since they will be handled by the
14499 14493 * client recovery code.
14500 14494 */
14501 14495 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
14502 14496 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
14503 14497 lock_args, locku_args, did_start_fop,
14504 14498 skip_get_err, cred_otw, cr);
14505 14499
14506 14500 if (ep->error == EINTR && flk->l_type == F_UNLCK &&
14507 14501 (cmd == F_SETLK || cmd == F_SETLKW))
14508 14502 ep->error = 0;
14509 14503 }
14510 14504
14511 14505 /*
14512 14506 * nfs4_safelock:
14513 14507 *
14514 14508 * Return non-zero if the given lock request can be handled without
14515 14509 * violating the constraints on concurrent mapping and locking.
14516 14510 */
14517 14511
14518 14512 static int
14519 14513 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14520 14514 {
14521 14515 rnode4_t *rp = VTOR4(vp);
14522 14516 struct vattr va;
14523 14517 int error;
14524 14518
14525 14519 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14526 14520 ASSERT(rp->r_mapcnt >= 0);
14527 14521 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14528 14522 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14529 14523 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14530 14524 bfp->l_start, bfp->l_len, rp->r_mapcnt));
14531 14525
14532 14526 if (rp->r_mapcnt == 0)
14533 14527 return (1); /* always safe if not mapped */
14534 14528
14535 14529 /*
14536 14530 * If the file is already mapped and there are locks, then they
14537 14531 * should be all safe locks. So adding or removing a lock is safe
14538 14532 * as long as the new request is safe (i.e., whole-file, meaning
14539 14533 * length and starting offset are both zero).
14540 14534 */
14541 14535
14542 14536 if (bfp->l_start != 0 || bfp->l_len != 0) {
14543 14537 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14544 14538 "cannot lock a memory mapped file unless locking the "
14545 14539 "entire file: start %"PRIx64", len %"PRIx64,
14546 14540 bfp->l_start, bfp->l_len));
14547 14541 return (0);
14548 14542 }
14549 14543
14550 14544 /* mandatory locking and mapping don't mix */
14551 14545 va.va_mask = AT_MODE;
14552 14546 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14553 14547 if (error != 0) {
14554 14548 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14555 14549 "getattr error %d", error));
14556 14550 return (0); /* treat errors conservatively */
14557 14551 }
14558 14552 if (MANDLOCK(vp, va.va_mode)) {
14559 14553 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14560 14554 "cannot mandatory lock and mmap a file"));
14561 14555 return (0);
14562 14556 }
14563 14557
14564 14558 return (1);
14565 14559 }
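/*
 * Illustrative sketch (editor's example, not from this change): for
 * nfs4_safelock() to consider a lock on a mapped file safe, the
 * request must cover the whole file, i.e. l_start and l_len both 0:
 *
 *	struct flock64 bf;
 *
 *	bf.l_type = F_WRLCK;
 *	bf.l_whence = 0;		(aka SEEK_SET)
 *	bf.l_start = 0;
 *	bf.l_len = 0;
 *	if (!nfs4_safelock(vp, &bf, cr))
 *		return (EAGAIN);	(mapped file + partial range)
 */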
14566 14560
14567 14561
14568 14562 /*
14569 14563 * Register the lock locally within Solaris.
14570 14564 * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14571 14565 * recording locks locally.
14572 14566 *
14573 14567 * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14574 14568 * are registered locally.
14575 14569 */
14576 14570 void
14577 14571 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
14578 14572 u_offset_t offset)
14579 14573 {
14580 14574 int oldsysid;
14581 14575 int error;
14582 14576 #ifdef DEBUG
14583 14577 char *name;
14584 14578 #endif
14585 14579
14586 14580 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14587 14581
14588 14582 #ifdef DEBUG
14589 14583 name = fn_name(VTOSV(vp)->sv_name);
14590 14584 NFS4_DEBUG(nfs4_client_lock_debug,
14591 14585 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
14592 14586 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
14593 14587 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
14594 14588 flk->l_sysid));
14595 14589 kmem_free(name, MAXNAMELEN);
14596 14590 #endif
14597 14591
14598 14592 /* register the lock with local locking */
14599 14593 oldsysid = flk->l_sysid;
14600 14594 flk->l_sysid |= LM_SYSID_CLIENT;
14601 14595 error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
14602 14596 #ifdef DEBUG
14603 14597 if (error != 0) {
14604 14598 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14605 14599 "nfs4_register_lock_locally: could not register with"
14606 14600 " local locking"));
14607 14601 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14608 14602 "error %d, vp 0x%p, pid %d, sysid 0x%x",
14609 14603 error, (void *)vp, flk->l_pid, flk->l_sysid));
14610 14604 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14611 14605 "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
14612 14606 flk->l_type, flk->l_start, flk->l_len));
14613 14607 (void) reclock(vp, flk, 0, flag, offset, NULL);
14614 14608 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14615 14609 "blocked by pid %d sysid 0x%x type %d "
14616 14610 "off 0x%" PRIx64 " len 0x%" PRIx64,
14617 14611 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
14618 14612 flk->l_len));
14619 14613 }
14620 14614 #endif
14621 14615 flk->l_sysid = oldsysid;
14622 14616 }
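/*
 * Illustrative sketch (editor's example, hypothetical caller): after a
 * successful over-the-wire LOCK, the client records the same range
 * locally so local lock queries can see it. The function handles the
 * LM_SYSID_CLIENT tagging itself and restores l_sysid before returning:
 *
 *	flk->l_pid = curproc->p_pid;
 *	nfs4_register_lock_locally(vp, flk, flag, offset);
 *
 * On return flk->l_sysid is unchanged; the lock was recorded under
 * (l_sysid | LM_SYSID_CLIENT) in the local lock manager.
 */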
14623 14617
14624 14618 /*
14625 14619 * nfs4_lockrelease:
14626 14620 *
14627 14621 * Release any locks on the given vnode that are held by the current
14628 14622 * process. Also removes the lock owner (if one exists) from the rnode's
14629 14623 * list.
14630 14624 */
14631 14625 static int
14632 14626 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
14633 14627 {
14634 14628 flock64_t ld;
14635 14629 int ret, error;
14636 14630 rnode4_t *rp;
14637 14631 nfs4_lock_owner_t *lop;
14638 14632 nfs4_recov_state_t recov_state;
14639 14633 mntinfo4_t *mi;
14640 14634 bool_t possible_orphan = FALSE;
14641 14635 bool_t recovonly;
14642 14636
14643 14637 ASSERT((uintptr_t)vp > KERNELBASE);
14644 14638 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14645 14639
14646 14640 rp = VTOR4(vp);
14647 14641 mi = VTOMI4(vp);
14648 14642
14649 14643 /*
14650 14644 * If we have not locked anything then we can
14651 14645 * just return since we have no work to do.
14652 14646 */
14653 14647 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
14654 14648 return (0);
14655 14649 }
14656 14650
14657 14651 /*
14658 14652 * Note that another thread may kick off recovery at any
14659 14653 * time, in which case the lock_owner we have stashed in
14660 14654 * lop may become invalid, so we must NOT cache it
14661 14655 * locally!
14662 14656 */
14663 14657 recov_state.rs_flags = 0;
14664 14658 recov_state.rs_num_retry_despite_err = 0;
14665 14659 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14666 14660 &recovonly);
14667 14661 if (error) {
14668 14662 mutex_enter(&rp->r_statelock);
14669 14663 rp->r_flags |= R4LODANGLERS;
14670 14664 mutex_exit(&rp->r_statelock);
14671 14665 return (error);
14672 14666 }
14673 14667
14674 14668 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14675 14669
14676 14670 /*
14677 14671 * Check if the lock owner might have a lock (request was sent but
14678 14672 * no response was received). Also check if there are any remote
14679 14673 * locks on the file. (In theory we shouldn't have to make this
14680 14674 * second check if there's no lock owner, but for now we'll be
14681 14675 * conservative and do it anyway.) If either condition is true,
14682 14676 * send an unlock for the entire file to the server.
14683 14677 *
14684 14678 * Note that no explicit synchronization is needed here. At worst,
14685 14679 * flk_has_remote_locks() will return a false positive, in which case
14686 14680 * the unlock call wastes time but doesn't harm correctness.
14687 14681 */
14688 14682
14689 14683 if (lop) {
14690 14684 mutex_enter(&lop->lo_lock);
14691 14685 possible_orphan = lop->lo_pending_rqsts;
14692 14686 mutex_exit(&lop->lo_lock);
14693 14687 lock_owner_rele(lop);
14694 14688 }
14695 14689
14696 14690 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14697 14691
14698 14692 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14699 14693 "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
14700 14694 "lop %p.", possible_orphan, flk_has_remote_locks(vp),
14701 14695 (void *)lop));
14702 14696
14703 14697 if (possible_orphan || flk_has_remote_locks(vp)) {
14704 14698 ld.l_type = F_UNLCK; /* set to unlock entire file */
14705 14699 ld.l_whence = 0; /* unlock from start of file */
14706 14700 ld.l_start = 0;
14707 14701 ld.l_len = 0; /* do entire file */
14708 14702
14709 14703 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
14710 14704 cr, NULL);
14711 14705
14712 14706 if (ret != 0) {
14713 14707 /*
14714 14708 * If VOP_FRLOCK fails, make sure we unregister
14715 14709 * local locks before we continue.
14716 14710 */
14717 14711 ld.l_pid = ttoproc(curthread)->p_pid;
14718 14712 nfs4_register_lock_locally(vp, &ld, flag, offset);
14719 14713 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14720 14714 "nfs4_lockrelease: lock release error on vp"
14721 14715 " %p: error %d.\n", (void *)vp, ret));
14722 14716 }
14723 14717 }
14724 14718
14725 14719 recov_state.rs_flags = 0;
14726 14720 recov_state.rs_num_retry_despite_err = 0;
14727 14721 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14728 14722 &recovonly);
14729 14723 if (error) {
14730 14724 mutex_enter(&rp->r_statelock);
14731 14725 rp->r_flags |= R4LODANGLERS;
14732 14726 mutex_exit(&rp->r_statelock);
14733 14727 return (error);
14734 14728 }
14735 14729
14736 14730 /*
14737 14731 * Retrieve the lock-owner again (recovery may have
14738 14732 * replaced it with a new one) and remove it from the
14739 14733 * rnode's list.
14740 14734 */
14741 14735 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14742 14736
14743 14737 if (lop) {
14744 14738 nfs4_rnode_remove_lock_owner(rp, lop);
14745 14739 lock_owner_rele(lop);
14746 14740 }
14747 14741
14748 14742 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14749 14743 return (0);
14750 14744 }
14751 14745
14752 14746 /*
14753 14747 * Wait for 'tick_delay' clock ticks.
14754 14748 * Implement exponential backoff until we hit the lease_time of this nfs4_server.
14755 14749 * NOTE: lock_lease_time is in seconds.
14756 14750 *
14757 14751 * XXX For future improvements, should implement a waiting queue scheme.
14758 14752 */
14759 14753 static int
14760 14754 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14761 14755 {
14762 14756 long milliseconds_delay;
14763 14757 time_t lock_lease_time;
14764 14758
14765 14759 /* wait tick_delay clock ticks or until interrupted by a signal */
14766 14760 if (delay_sig(*tick_delay)) {
14767 14761 return (EINTR);
14768 14762 }
14769 14763 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14770 14764 "reissue the lock request: blocked for %ld clock ticks: %ld "
14771 14765 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14772 14766
14773 14767 /* get the lease time */
14774 14768 lock_lease_time = r2lease_time(rp);
14775 14769
14776 14770 /* drv_hztousec converts ticks to microseconds */
14777 14771 milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14778 14772 if (milliseconds_delay < lock_lease_time * 1000) {
14779 14773 *tick_delay = 2 * *tick_delay;
14780 14774 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14781 14775 *tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14782 14776 }
14783 14777 return (0);
14784 14778 }
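/*
 * Worked example of the backoff above (initial delay assumed for
 * illustration): with a starting tick_delay of ~0.5 s and a 90 s
 * lease, successive waits are roughly 0.5 s, 1 s, 2 s, 4 s, ...,
 * doubling on each call until drv_hztousec(*tick_delay) would exceed
 * 90 * 1000 * 1000 microseconds, at which point tick_delay is clamped
 * to drv_usectohz(90 * 1000 * 1000); the wait never exceeds one lease
 * period.
 */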
14785 14779
14786 14780
14787 14781 void
14788 14782 nfs4_vnops_init(void)
14789 14783 {
14790 14784 }
14791 14785
14792 14786 void
14793 14787 nfs4_vnops_fini(void)
14794 14788 {
14795 14789 }
14796 14790
14797 14791 /*
14798 14792 * Return a reference to the directory (parent) vnode for a given vnode,
14799 14793 * using the saved pathname information and the directory file handle. The
14800 14794 * caller is responsible for disposing of the reference.
14801 14795 * Returns zero or an errno value.
14802 14796 *
14803 14797 * Caller should set need_start_op to FALSE if it is the recovery
14804 14798 * thread, or if a start_fop has already been done. Otherwise, TRUE.
14805 14799 */
14806 14800 int
14807 14801 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
14808 14802 {
14809 14803 svnode_t *svnp;
14810 14804 vnode_t *dvp = NULL;
14811 14805 servinfo4_t *svp;
14812 14806 nfs4_fname_t *mfname;
14813 14807 int error;
14814 14808
14815 14809 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14816 14810
14817 14811 if (vp->v_flag & VROOT) {
14818 14812 nfs4_sharedfh_t *sfh;
14819 14813 nfs_fh4 fh;
14820 14814 mntinfo4_t *mi;
14821 14815
14822 14816 ASSERT(vp->v_type == VREG);
14823 14817
14824 14818 mi = VTOMI4(vp);
14825 14819 svp = mi->mi_curr_serv;
14826 14820 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14827 14821 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
14828 14822 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
14829 14823 sfh = sfh4_get(&fh, VTOMI4(vp));
14830 14824 nfs_rw_exit(&svp->sv_lock);
14831 14825 mfname = mi->mi_fname;
14832 14826 fn_hold(mfname);
14833 14827 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
14834 14828 sfh4_rele(&sfh);
14835 14829
14836 14830 if (dvp->v_type == VNON)
14837 14831 dvp->v_type = VDIR;
14838 14832 *dvpp = dvp;
14839 14833 return (0);
14840 14834 }
14841 14835
14842 14836 svnp = VTOSV(vp);
14843 14837
14844 14838 if (svnp == NULL) {
14845 14839 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14846 14840 "shadow node is NULL"));
14847 14841 return (EINVAL);
14848 14842 }
14849 14843
14850 14844 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
14851 14845 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14852 14846 "shadow node name or dfh val == NULL"));
14853 14847 return (EINVAL);
14854 14848 }
14855 14849
14856 14850 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
14857 14851 (int)need_start_op);
14858 14852 if (error != 0) {
14859 14853 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14860 14854 "nfs4_make_dotdot returned %d", error));
14861 14855 return (error);
14862 14856 }
14863 14857 if (!dvp) {
14864 14858 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14865 14859 "nfs4_make_dotdot returned a NULL dvp"));
14866 14860 return (EIO);
14867 14861 }
14868 14862 if (dvp->v_type == VNON)
14869 14863 dvp->v_type = VDIR;
14870 14864 ASSERT(dvp->v_type == VDIR);
14871 14865 if (VTOR4(vp)->r_flags & R4ISXATTR) {
14872 14866 mutex_enter(&dvp->v_lock);
14873 14867 dvp->v_flag |= V_XATTRDIR;
14874 14868 mutex_exit(&dvp->v_lock);
14875 14869 }
14876 14870 *dvpp = dvp;
14877 14871 return (0);
14878 14872 }
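/*
 * Illustrative sketch (editor's example, hypothetical caller): vtodv()
 * returns a held reference, so the caller must VN_RELE() the parent
 * when done:
 *
 *	vnode_t *dvp;
 *	int error;
 *
 *	error = vtodv(vp, &dvp, cr, TRUE);	(TRUE: not recovery)
 *	if (error == 0) {
 *		... use dvp, a held VDIR vnode ...
 *		VN_RELE(dvp);
 *	}
 */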
14879 14873
14880 14874 /*
14881 14875 * Copy the (final) component name of vp to fnamep. maxlen is the maximum
14882 14876 * length that fnamep can accept, including the trailing null.
14883 14877 * Returns 0 if okay, returns an errno value if there was a problem.
14884 14878 */
14885 14879
14886 14880 int
14887 14881 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14888 14882 {
14889 14883 char *fn;
14890 14884 int err = 0;
14891 14885 servinfo4_t *svp;
14892 14886 svnode_t *shvp;
14893 14887
14894 14888 /*
14895 14889 * If the file being opened has VROOT set, then this is
14896 14890 * a "file" mount. sv_name will not be interesting, so
14897 14891 * go back to the servinfo4 to get the original mount
14898 14892 * path and strip off all but the final edge. Otherwise
14899 14893 * just return the name from the shadow vnode.
14900 14894 */
14901 14895
14902 14896 if (vp->v_flag & VROOT) {
14903 14897
14904 14898 svp = VTOMI4(vp)->mi_curr_serv;
14905 14899 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14906 14900
14907 14901 fn = strrchr(svp->sv_path, '/');
14908 14902 if (fn == NULL)
14909 14903 err = EINVAL;
14910 14904 else
14911 14905 fn++;
14912 14906 } else {
14913 14907 shvp = VTOSV(vp);
14914 14908 fn = fn_name(shvp->sv_name);
14915 14909 }
14916 14910
14917 14911 if (err == 0)
14918 14912 if (strlen(fn) < maxlen)
14919 14913 (void) strcpy(fnamep, fn);
14920 14914 else
14921 14915 err = ENAMETOOLONG;
14922 14916
14923 14917 if (vp->v_flag & VROOT)
14924 14918 nfs_rw_exit(&svp->sv_lock);
14925 14919 else
14926 14920 kmem_free(fn, MAXNAMELEN);
14927 14921
14928 14922 return (err);
14929 14923 }
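/*
 * Illustrative sketch (editor's example): the destination buffer must
 * have room for the trailing null, or ENAMETOOLONG is returned:
 *
 *	char name[MAXNAMELEN];
 *
 *	if (vtoname(vp, name, sizeof (name)) == 0) {
 *		... name[] now holds the final path component of vp ...
 *	}
 */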
14930 14924
14931 14925 /*
14932 14926 * Bookkeeping for a close that doesn't need to go over the wire.
14933 14927 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14934 14928 * it is left at 1.
14935 14929 */
14936 14930 void
14937 14931 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
14938 14932 {
14939 14933 rnode4_t *rp;
14940 14934 mntinfo4_t *mi;
14941 14935
14942 14936 mi = VTOMI4(vp);
14943 14937 rp = VTOR4(vp);
14944 14938
14945 14939 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
14946 14940 "rp=%p osp=%p", (void *)rp, (void *)osp));
14947 14941 ASSERT(nfs_zone() == mi->mi_zone);
14948 14942 ASSERT(mutex_owned(&osp->os_sync_lock));
14949 14943 ASSERT(*have_lockp);
14950 14944
14951 14945 if (!osp->os_valid ||
14952 14946 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
14953 14947 return;
14954 14948 }
14955 14949
14956 14950 /*
14957 14951 * This removes the reference obtained at OPEN; i.e.,
14958 14952 * when the open stream structure was created.
14959 14953 *
14960 14954 * We don't have to worry about calling 'open_stream_rele'
14961 14955 * since we are currently holding a reference to this
14962 14956 * open stream, which means the count cannot go to 0 with
14963 14957 * this decrement.
14964 14958 */
14965 14959 ASSERT(osp->os_ref_count >= 2);
14966 14960 osp->os_ref_count--;
14967 14961 osp->os_valid = 0;
14968 14962 mutex_exit(&osp->os_sync_lock);
14969 14963 *have_lockp = 0;
14970 14964
14971 14965 nfs4_dec_state_ref_count(mi);
14972 14966 }
14973 14967
14974 14968 /*
14975 14969 * Close all remaining open streams on the rnode. These open streams
14976 14970 * could be here because:
14977 14971 * - The close attempted at either close or delmap failed
14978 14972 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14979 14973 * - Someone did mknod on a regular file but never opened it
14980 14974 */
14981 14975 int
14982 14976 nfs4close_all(vnode_t *vp, cred_t *cr)
14983 14977 {
14984 14978 nfs4_open_stream_t *osp;
14985 14979 int error;
14986 14980 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
14987 14981 rnode4_t *rp;
14988 14982
14989 14983 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14990 14984
14991 14985 error = 0;
14992 14986 rp = VTOR4(vp);
14993 14987
14994 14988 /*
14995 14989 * At this point, all we know is that the last time
14996 14990 * someone called vn_rele, the count was 1. Since then,
14997 14991 * the vnode could have been re-activated. We want to
14998 14992 * loop through the open streams and close each one, but
14999 14993 * we have to be careful since once we release the rnode
15000 14994 * hash bucket lock, someone else is free to come in and
15001 14995 * re-activate the rnode and add new open streams. The
15002 14996 * strategy is take the rnode hash bucket lock, verify that
15003 14997 * the count is still 1, grab the open stream off the
15004 14998 * head of the list and mark it invalid, then release the
15005 14999 * rnode hash bucket lock and proceed with that open stream.
15006 15000 * This is ok because nfs4close_one() will acquire the proper
15007 15001 * open/create to close/destroy synchronization for open
15008 15002 * streams, and will ensure that if someone has reopened
15009 15003 * the open stream after we've dropped the hash bucket lock
15010 15004 * then we'll just simply return without destroying the
15011 15005 * open stream.
15012 15006 * Repeat until the list is empty.
15013 15007 */
15014 15008
15015 15009 for (;;) {
15016 15010
15017 15011 /* make sure vnode hasn't been reactivated */
15018 15012 rw_enter(&rp->r_hashq->r_lock, RW_READER);
15019 15013 mutex_enter(&vp->v_lock);
15020 15014 if (vp->v_count > 1) {
15021 15015 mutex_exit(&vp->v_lock);
15022 15016 rw_exit(&rp->r_hashq->r_lock);
15023 15017 break;
15024 15018 }
15025 15019 /*
15026 15020 * Grabbing r_os_lock before releasing v_lock prevents
15027 15021 * a window where the rnode/open stream could get
15028 15022 * reactivated (and os_force_close set to 0) before we
15029 15023 * had a chance to set os_force_close to 1.
15030 15024 */
15031 15025 mutex_enter(&rp->r_os_lock);
15032 15026 mutex_exit(&vp->v_lock);
15033 15027
15034 15028 osp = list_head(&rp->r_open_streams);
15035 15029 if (!osp) {
15036 15030 /* nothing left to CLOSE OTW, so return */
15037 15031 mutex_exit(&rp->r_os_lock);
15038 15032 rw_exit(&rp->r_hashq->r_lock);
15039 15033 break;
15040 15034 }
15041 15035
15042 15036 mutex_enter(&rp->r_statev4_lock);
15043 15037 /* the file can't still be mem mapped */
15044 15038 ASSERT(rp->r_mapcnt == 0);
15045 15039 if (rp->created_v4)
15046 15040 rp->created_v4 = 0;
15047 15041 mutex_exit(&rp->r_statev4_lock);
15048 15042
15049 15043 /*
15050 15044 * Grab a ref on this open stream; nfs4close_one
15051 15045 * will mark it as invalid
15052 15046 */
15053 15047 mutex_enter(&osp->os_sync_lock);
15054 15048 osp->os_ref_count++;
15055 15049 osp->os_force_close = 1;
15056 15050 mutex_exit(&osp->os_sync_lock);
15057 15051 mutex_exit(&rp->r_os_lock);
15058 15052 rw_exit(&rp->r_hashq->r_lock);
15059 15053
15060 15054 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);
15061 15055
15062 15056 /* Update error if it isn't already non-zero */
15063 15057 if (error == 0) {
15064 15058 if (e.error)
15065 15059 error = e.error;
15066 15060 else if (e.stat)
15067 15061 error = geterrno4(e.stat);
15068 15062 }
15069 15063
15070 15064 #ifdef DEBUG
15071 15065 nfs4close_all_cnt++;
15072 15066 #endif
15073 15067 /* Release the ref on osp acquired above. */
15074 15068 open_stream_rele(osp, rp);
15075 15069
15076 15070 /* Proceed to the next open stream, if any */
15077 15071 }
15078 15072 return (error);
15079 15073 }
15080 15074
15081 15075 /*
15082 15076 * nfs4close_one - close one open stream for a file if needed.
15083 15077 *
15084 15078 * "close_type" indicates which close path this is:
15085 15079 * CLOSE_NORM: close initiated via VOP_CLOSE.
15086 15080 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
15087 15081 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces
15088 15082 * the close and release of client state for this open stream
15089 15083 * (unless someone else has the open stream open).
15090 15084 * CLOSE_RESEND: indicates the request is a replay of an earlier request
15091 15085 * (e.g., due to abort because of a signal).
15092 15086 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
15093 15087 *
15094 15088 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
15095 15089 * recovery. Instead, the caller is expected to deal with retries.
15096 15090 *
15097 15091 * The caller can either pass in the osp ('provided_osp') or not.
15098 15092 *
15099 15093 * 'access_bits' represents the access we are closing/downgrading.
15100 15094 *
15101 15095 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the
15102 15096 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
15103 15097 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
15104 15098 *
15105 15099 * Errors are returned via the nfs4_error_t.
15106 15100 */
15107 15101 void
15108 15102 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
15109 15103 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
15110 15104 nfs4_close_type_t close_type, size_t len, uint_t maxprot,
15111 15105 uint_t mmap_flags)
15112 15106 {
15113 15107 nfs4_open_owner_t *oop;
15114 15108 nfs4_open_stream_t *osp = NULL;
15115 15109 int retry = 0;
15116 15110 int num_retries = NFS4_NUM_RECOV_RETRIES;
15117 15111 rnode4_t *rp;
15118 15112 mntinfo4_t *mi;
15119 15113 nfs4_recov_state_t recov_state;
15120 15114 cred_t *cred_otw = NULL;
15121 15115 bool_t recovonly = FALSE;
15122 15116 int isrecov;
15123 15117 int force_close;
15124 15118 int close_failed = 0;
15125 15119 int did_dec_count = 0;
15126 15120 int did_start_op = 0;
15127 15121 int did_force_recovlock = 0;
15128 15122 int did_start_seqid_sync = 0;
15129 15123 int have_sync_lock = 0;
15130 15124
15131 15125 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15132 15126
15133 15127 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
15134 15128 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
15135 15129 (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
15136 15130 len, maxprot, mmap_flags, access_bits));
15137 15131
15138 15132 nfs4_error_zinit(ep);
15139 15133 rp = VTOR4(vp);
15140 15134 mi = VTOMI4(vp);
15141 15135 isrecov = (close_type == CLOSE_RESEND ||
15142 15136 close_type == CLOSE_AFTER_RESEND);
15143 15137
15144 15138 /*
15145 15139 * First, get the open owner.
15146 15140 */
15147 15141 if (!provided_osp) {
15148 15142 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
15149 15143 } else {
15150 15144 oop = provided_osp->os_open_owner;
15151 15145 ASSERT(oop != NULL);
15152 15146 open_owner_hold(oop);
15153 15147 }
15154 15148
15155 15149 if (!oop) {
15156 15150 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15157 15151 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
15158 15152 "close type %d", (void *)rp, (void *)mi, (void *)cr,
15159 15153 (void *)provided_osp, close_type));
15160 15154 ep->error = EIO;
15161 15155 goto out;
15162 15156 }
15163 15157
15164 15158 cred_otw = nfs4_get_otw_cred(cr, mi, oop);
15165 15159 recov_retry:
15166 15160 osp = NULL;
15167 15161 close_failed = 0;
15168 15162 force_close = (close_type == CLOSE_FORCE);
15169 15163 retry = 0;
15170 15164 did_start_op = 0;
15171 15165 did_force_recovlock = 0;
15172 15166 did_start_seqid_sync = 0;
15173 15167 have_sync_lock = 0;
15174 15168 recovonly = FALSE;
15175 15169 recov_state.rs_flags = 0;
15176 15170 recov_state.rs_num_retry_despite_err = 0;
15177 15171
15178 15172 /*
15179 15173 * Second, synchronize with recovery.
15180 15174 */
15181 15175 if (!isrecov) {
15182 15176 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
15183 15177 &recov_state, &recovonly);
15184 15178 if (!ep->error) {
15185 15179 did_start_op = 1;
15186 15180 } else {
15187 15181 close_failed = 1;
15188 15182 /*
15189 15183 * If we couldn't get start_fop, but have to
15190 15184 * cleanup state, then at least acquire the
15191 15185 * mi_recovlock so we can synchronize with
15192 15186 * recovery.
15193 15187 */
15194 15188 if (close_type == CLOSE_FORCE) {
15195 15189 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
15196 15190 RW_READER, FALSE);
15197 15191 did_force_recovlock = 1;
15198 15192 } else
15199 15193 goto out;
15200 15194 }
15201 15195 }
15202 15196
15203 15197 /*
15204 15198 * We cannot attempt to get the open seqid sync if nfs4_start_fop
15205 15199 * set 'recovonly' to TRUE since most likely this is due to
15206 15200 * recovery being active (MI4_RECOV_ACTIV). If recovery is active,
15207 15201 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
15208 15202 * to retry, causing us to loop until recovery finishes. Plus we
15209 15203 * don't need protection over the open seqid since we're not going
15210 15204 * OTW, hence don't need to use the seqid.
15211 15205 */
15212 15206 if (recovonly == FALSE) {
15213 15207 /* need to grab the open owner sync before 'os_sync_lock' */
15214 15208 ep->error = nfs4_start_open_seqid_sync(oop, mi);
15215 15209 if (ep->error == EAGAIN) {
15216 15210 ASSERT(!isrecov);
15217 15211 if (did_start_op)
15218 15212 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15219 15213 &recov_state, TRUE);
15220 15214 if (did_force_recovlock)
15221 15215 nfs_rw_exit(&mi->mi_recovlock);
15222 15216 goto recov_retry;
15223 15217 }
15224 15218 did_start_seqid_sync = 1;
15225 15219 }
15226 15220
15227 15221 /*
15228 15222 * Third, get an open stream and acquire 'os_sync_lock' to
15229 15223 * synchronize the opening/creating of an open stream with the
15230 15224 * closing/destroying of an open stream.
15231 15225 */
15232 15226 if (!provided_osp) {
15233 15227 /* returns with 'os_sync_lock' held */
15234 15228 osp = find_open_stream(oop, rp);
15235 15229 if (!osp) {
15236 15230 ep->error = EIO;
15237 15231 goto out;
15238 15232 }
15239 15233 } else {
15240 15234 osp = provided_osp;
15241 15235 open_stream_hold(osp);
15242 15236 mutex_enter(&osp->os_sync_lock);
15243 15237 }
15244 15238 have_sync_lock = 1;
15245 15239
15246 15240 ASSERT(oop == osp->os_open_owner);
15247 15241
15248 15242 /*
15249 15243 * Fourth, do any special pre-OTW CLOSE processing
15250 15244 * based on the specific close type.
15251 15245 */
15252 15246 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
15253 15247 !did_dec_count) {
15254 15248 ASSERT(osp->os_open_ref_count > 0);
15255 15249 osp->os_open_ref_count--;
15256 15250 did_dec_count = 1;
15257 15251 if (osp->os_open_ref_count == 0)
15258 15252 osp->os_final_close = 1;
15259 15253 }
15260 15254
15261 15255 if (close_type == CLOSE_FORCE) {
15262 15256 /* see if somebody reopened the open stream. */
15263 15257 if (!osp->os_force_close) {
15264 15258 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15265 15259 "nfs4close_one: skip CLOSE_FORCE as osp %p "
15266 15260 "was reopened, vp %p", (void *)osp, (void *)vp));
15267 15261 ep->error = 0;
15268 15262 ep->stat = NFS4_OK;
15269 15263 goto out;
15270 15264 }
15271 15265
15272 15266 if (!osp->os_final_close && !did_dec_count) {
15273 15267 osp->os_open_ref_count--;
15274 15268 did_dec_count = 1;
15275 15269 }
15276 15270
15277 15271 /*
15278 15272 * We can't depend on os_open_ref_count being 0 due to the
15279 15273 * way executables are opened (VN_RELE to match a VOP_OPEN).
15280 15274 */
15281 15275 #ifdef NOTYET
15282 15276 ASSERT(osp->os_open_ref_count == 0);
15283 15277 #endif
15284 15278 if (osp->os_open_ref_count != 0) {
15285 15279 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
15286 15280 "nfs4close_one: should panic here on an "
15287 15281 "ASSERT(osp->os_open_ref_count == 0). Ignoring "
15288 15282 "since this is probably the exec problem."));
15289 15283
15290 15284 osp->os_open_ref_count = 0;
15291 15285 }
15292 15286
15293 15287 /*
15294 15288 * There is the possibility that nfs4close_one()
15295 15289 * for close_type == CLOSE_DELMAP couldn't find the
15296 15290 * open stream, thus couldn't decrement its os_mapcnt;
15297 15291 * therefore we can't use this ASSERT yet.
15298 15292 */
15299 15293 #ifdef NOTYET
15300 15294 ASSERT(osp->os_mapcnt == 0);
15301 15295 #endif
15302 15296 osp->os_mapcnt = 0;
15303 15297 }
15304 15298
15305 15299 if (close_type == CLOSE_DELMAP && !did_dec_count) {
15306 15300 ASSERT(osp->os_mapcnt >= btopr(len));
15307 15301
15308 15302 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
15309 15303 osp->os_mmap_write -= btopr(len);
15310 15304 if (maxprot & PROT_READ)
15311 15305 osp->os_mmap_read -= btopr(len);
15312 15306 if (maxprot & PROT_EXEC)
15313 15307 osp->os_mmap_read -= btopr(len);
15314 15308 /* mirror the PROT_NONE check in nfs4_addmap() */
15315 15309 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
15316 15310 !(maxprot & PROT_EXEC))
15317 15311 osp->os_mmap_read -= btopr(len);
15318 15312 osp->os_mapcnt -= btopr(len);
15319 15313 did_dec_count = 1;
15320 15314 }
15321 15315
15322 15316 if (recovonly) {
15323 15317 nfs4_lost_rqst_t lost_rqst;
15324 15318
15325 15319 /* request should not already be in recovery queue */
15326 15320 ASSERT(lrp == NULL);
15327 15321 nfs4_error_init(ep, EINTR);
15328 15322 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
15329 15323 osp, cred_otw, vp);
15330 15324 mutex_exit(&osp->os_sync_lock);
15331 15325 have_sync_lock = 0;
15332 15326 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15333 15327 lost_rqst.lr_op == OP_CLOSE ?
15334 15328 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
15335 15329 close_failed = 1;
15336 15330 force_close = 0;
15337 15331 goto close_cleanup;
15338 15332 }
15339 15333
15340 15334 /*
15341 15335 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
15342 15336 * we stopped operating on the open owner's <old oo_name, old seqid>
15343 15337 * space, which means we stopped operating on the open stream
15344 15338 * too. So don't go OTW (as the seqid is likely bad, and the
15345 15339 * stateid could be stale, potentially triggering a false
15346 15340 * setclientid), and just clean up the client's internal state.
15347 15341 */
15348 15342 if (osp->os_orig_oo_name != oop->oo_name) {
15349 15343 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
15350 15344 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
15351 15345 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
15352 15346 "oo_name %" PRIx64")",
15353 15347 (void *)osp, (void *)oop, osp->os_orig_oo_name,
15354 15348 oop->oo_name));
15355 15349 close_failed = 1;
15356 15350 }
15357 15351
15358 15352 /* If the file failed recovery, just quit. */
15359 15353 mutex_enter(&rp->r_statelock);
15360 15354 if (rp->r_flags & R4RECOVERR) {
15361 15355 close_failed = 1;
15362 15356 }
15363 15357 mutex_exit(&rp->r_statelock);
15364 15358
15365 15359 /*
15366 15360 * If the force close path failed to obtain start_fop
15367 15361 * then skip the OTW close and just remove the state.
15368 15362 */
15369 15363 if (close_failed)
15370 15364 goto close_cleanup;
15371 15365
15372 15366 /*
15373 15367 * Fifth, check to see if there are still mapped pages or other
15374 15368 * opens using this open stream. If there are then we can't
15375 15369 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
15376 15370 */
15377 15371 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
15378 15372 nfs4_lost_rqst_t new_lost_rqst;
15379 15373 bool_t needrecov = FALSE;
15380 15374 cred_t *odg_cred_otw = NULL;
15381 15375 seqid4 open_dg_seqid = 0;
15382 15376
15383 15377 if (osp->os_delegation) {
15384 15378 /*
15385 15379 * If this open stream was never OPENed OTW then we
15386 15380 * surely can't DOWNGRADE it (especially since the
15387 15381 * osp->open_stateid is really a delegation stateid
15388 15382 * when os_delegation is 1).
15389 15383 */
15390 15384 if (access_bits & FREAD)
15391 15385 osp->os_share_acc_read--;
15392 15386 if (access_bits & FWRITE)
15393 15387 osp->os_share_acc_write--;
15394 15388 osp->os_share_deny_none--;
15395 15389 nfs4_error_zinit(ep);
15396 15390 goto out;
15397 15391 }
15398 15392 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
15399 15393 lrp, ep, &odg_cred_otw, &open_dg_seqid);
15400 15394 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
15401 15395 if (needrecov && !isrecov) {
15402 15396 bool_t abort;
15403 15397 nfs4_bseqid_entry_t *bsep = NULL;
15404 15398
15405 15399 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
15406 15400 bsep = nfs4_create_bseqid_entry(oop, NULL,
15407 15401 vp, 0,
15408 15402 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
15409 15403 open_dg_seqid);
15410 15404
15411 15405 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
15412 15406 oop, osp, odg_cred_otw, vp, access_bits, 0);
15413 15407 mutex_exit(&osp->os_sync_lock);
15414 15408 have_sync_lock = 0;
15415 15409 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
15416 15410 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
15417 15411 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
15418 15412 bsep, NULL, NULL);
15419 15413 if (odg_cred_otw)
15420 15414 crfree(odg_cred_otw);
15421 15415 if (bsep)
15422 15416 kmem_free(bsep, sizeof (*bsep));
15423 15417
15424 15418 if (abort == TRUE)
15425 15419 goto out;
15426 15420
15427 15421 if (did_start_seqid_sync) {
15428 15422 nfs4_end_open_seqid_sync(oop);
15429 15423 did_start_seqid_sync = 0;
15430 15424 }
15431 15425 open_stream_rele(osp, rp);
15432 15426
15433 15427 if (did_start_op)
15434 15428 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15435 15429 &recov_state, FALSE);
15436 15430 if (did_force_recovlock)
15437 15431 nfs_rw_exit(&mi->mi_recovlock);
15438 15432
15439 15433 goto recov_retry;
15440 15434 } else {
15441 15435 if (odg_cred_otw)
15442 15436 crfree(odg_cred_otw);
15443 15437 }
15444 15438 goto out;
15445 15439 }
15446 15440
15447 15441 /*
15448 15442 * If this open stream was created as the result of an open
15449 15443 * while holding a delegation, then just release it; no need
15450 15444 * to do an OTW close. Otherwise do a "normal" OTW close.
15451 15445 */
15452 15446 if (osp->os_delegation) {
15453 15447 nfs4close_notw(vp, osp, &have_sync_lock);
15454 15448 nfs4_error_zinit(ep);
15455 15449 goto out;
15456 15450 }
15457 15451
15458 15452 /*
15459 15453 * If this stream is not valid, we're done.
15460 15454 */
15461 15455 if (!osp->os_valid) {
15462 15456 nfs4_error_zinit(ep);
15463 15457 goto out;
15464 15458 }
15465 15459
15466 15460 /*
15467 15461 * Last open or mmap ref has vanished, need to do an OTW close.
15468 15462 * First check to see if a close is still necessary.
15469 15463 */
15470 15464 if (osp->os_failed_reopen) {
15471 15465 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15472 15466 "don't close OTW osp %p since reopen failed.",
15473 15467 (void *)osp));
15474 15468 /*
15475 15469 * Reopen of the open stream failed, hence the
15476 15470 * stateid of the open stream is invalid/stale, and
15477 15471 * sending this OTW would incorrectly cause another
15478 15472 * round of recovery. In this case, we need to set
15479 15473 * the 'os_valid' bit to 0 so another thread doesn't
15480 15474 * come in and re-open this open stream before
15481 15475 * this "closing" thread cleans up state (decrementing
15482 15476 * the nfs4_server_t's state_ref_count and decrementing
15483 15477 * the os_ref_count).
15484 15478 */
15485 15479 osp->os_valid = 0;
15486 15480 /*
15487 15481 * This removes the reference obtained at OPEN; ie,
15488 15482 * when the open stream structure was created.
15489 15483 *
15490 15484 * We don't have to worry about calling 'open_stream_rele'
15491 15485 * since we are currently holding a reference to this
15492 15486 * open stream, which means the count cannot go to 0 with
15493 15487 * this decrement.
15494 15488 */
15495 15489 ASSERT(osp->os_ref_count >= 2);
15496 15490 osp->os_ref_count--;
15497 15491 nfs4_error_zinit(ep);
15498 15492 close_failed = 0;
15499 15493 goto close_cleanup;
15500 15494 }
15501 15495
15502 15496 ASSERT(osp->os_ref_count > 1);
15503 15497
15504 15498 /*
15505 15499 * Sixth, try the CLOSE OTW.
15506 15500 */
15507 15501 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
15508 15502 close_type, ep, &have_sync_lock);
15509 15503
15510 15504 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
15511 15505 /*
15512 15506 * Let the recovery thread be responsible for
15513 15507 * removing the state for CLOSE.
15514 15508 */
15515 15509 close_failed = 1;
15516 15510 force_close = 0;
15517 15511 retry = 0;
15518 15512 }
15519 15513
15520 15514 /* See if we need to retry with a different cred */
15521 15515 if ((ep->error == EACCES ||
15522 15516 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
15523 15517 cred_otw != cr) {
15524 15518 crfree(cred_otw);
15525 15519 cred_otw = cr;
15526 15520 crhold(cred_otw);
15527 15521 retry = 1;
15528 15522 }
15529 15523
15530 15524 if (ep->error || ep->stat)
15531 15525 close_failed = 1;
15532 15526
15533 15527 if (retry && !isrecov && num_retries-- > 0) {
15534 15528 if (have_sync_lock) {
15535 15529 mutex_exit(&osp->os_sync_lock);
15536 15530 have_sync_lock = 0;
15537 15531 }
15538 15532 if (did_start_seqid_sync) {
15539 15533 nfs4_end_open_seqid_sync(oop);
15540 15534 did_start_seqid_sync = 0;
15541 15535 }
15542 15536 open_stream_rele(osp, rp);
15543 15537
15544 15538 if (did_start_op)
15545 15539 nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
15546 15540 &recov_state, FALSE);
15547 15541 if (did_force_recovlock)
15548 15542 nfs_rw_exit(&mi->mi_recovlock);
15549 15543 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
15550 15544 "nfs4close_one: need to retry the close "
15551 15545 "operation"));
15552 15546 goto recov_retry;
15553 15547 }
15554 15548 close_cleanup:
15555 15549 /*
15556 15550 * Seventh and lastly, process our results.
15557 15551 */
15558 15552 if (close_failed && force_close) {
15559 15553 /*
15560 15554 * It's ok to drop and regrab the 'os_sync_lock' since
15561 15555 * nfs4close_notw() will recheck to make sure the
15562 15556 * "close"/removal of state should happen.
15563 15557 */
15564 15558 if (!have_sync_lock) {
15565 15559 mutex_enter(&osp->os_sync_lock);
15566 15560 have_sync_lock = 1;
15567 15561 }
15568 15562 /*
15569 15563 * This is last call, remove the ref on the open
15570 15564 * stream created by open and clean everything up.
15571 15565 */
15572 15566 osp->os_pending_close = 0;
15573 15567 nfs4close_notw(vp, osp, &have_sync_lock);
15574 15568 nfs4_error_zinit(ep);
15575 15569 }
15576 15570
15577 15571 if (!close_failed) {
15578 15572 if (have_sync_lock) {
15579 15573 osp->os_pending_close = 0;
15580 15574 mutex_exit(&osp->os_sync_lock);
15581 15575 have_sync_lock = 0;
15582 15576 } else {
15583 15577 mutex_enter(&osp->os_sync_lock);
15584 15578 osp->os_pending_close = 0;
15585 15579 mutex_exit(&osp->os_sync_lock);
15586 15580 }
15587 15581 if (did_start_op && recov_state.rs_sp != NULL) {
15588 15582 mutex_enter(&recov_state.rs_sp->s_lock);
15589 15583 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
15590 15584 mutex_exit(&recov_state.rs_sp->s_lock);
15591 15585 } else {
15592 15586 nfs4_dec_state_ref_count(mi);
15593 15587 }
15594 15588 nfs4_error_zinit(ep);
15595 15589 }
15596 15590
15597 15591 out:
15598 15592 if (have_sync_lock)
15599 15593 mutex_exit(&osp->os_sync_lock);
15600 15594 if (did_start_op)
15601 15595 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
15602 15596 recovonly ? TRUE : FALSE);
15603 15597 if (did_force_recovlock)
15604 15598 nfs_rw_exit(&mi->mi_recovlock);
15605 15599 if (cred_otw)
15606 15600 crfree(cred_otw);
15607 15601 if (osp)
15608 15602 open_stream_rele(osp, rp);
15609 15603 if (oop) {
15610 15604 if (did_start_seqid_sync)
15611 15605 nfs4_end_open_seqid_sync(oop);
15612 15606 open_owner_rele(oop);
15613 15607 }
15614 15608 }
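/*
 * Illustrative sketch (editor's example, values assumed): a normal
 * VOP_CLOSE-initiated close of a read-only open could be issued as:
 *
 *	nfs4_error_t e;
 *
 *	nfs4close_one(vp, NULL, cr, FREAD, NULL, &e, CLOSE_NORM,
 *	    0, 0, 0);
 *	if (e.error != 0 || e.stat != NFS4_OK)
 *		... close failed; e carries errno vs. NFSv4 status ...
 *
 * Passing provided_osp == NULL makes the function look up the open
 * stream itself via the open owner found for 'cr'.
 */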
15615 15609
15616 15610 /*
15617 15611 * Convert information returned by the server in the LOCK4denied
15618 15612 * structure to the form required by fcntl.
15619 15613 */
15620 15614 static void
15621 15615 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15622 15616 {
15623 15617 nfs4_lo_name_t *lo;
15624 15618
15625 15619 #ifdef DEBUG
15626 15620 if (denied_to_flk_debug) {
15627 15621 lockt_denied_debug = lockt_denied;
15628 15622 debug_enter("lockt_denied");
15629 15623 }
15630 15624 #endif
15631 15625
15632 15626 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15633 15627 flk->l_whence = 0; /* aka SEEK_SET */
15634 15628 flk->l_start = lockt_denied->offset;
15635 15629 flk->l_len = lockt_denied->length;
15636 15630
15637 15631 /*
15638 15632 * If the blocking clientid matches our client id, then we can
15639 15633 * interpret the lockowner (since we built it). If not, then
15640 15634 * fabricate a sysid and pid. Note that the l_sysid field
15641 15635 * in *flk already has the local sysid.
15642 15636 */
15643 15637
15644 15638 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15645 15639
15646 15640 if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15647 15641 lo = (nfs4_lo_name_t *)
15648 15642 lockt_denied->owner.owner_val;
15649 15643
15650 15644 flk->l_pid = lo->ln_pid;
15651 15645 } else {
15652 15646 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15653 15647 "denied_to_flk: bad lock owner length\n"));
15654 15648
15655 15649 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15656 15650 }
15657 15651 } else {
15658 15652 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15659 15653 "denied_to_flk: foreign clientid\n"));
15660 15654
15661 15655 /*
15662 15656 * Construct a new sysid which should be different from
15663 15657 * sysids of other systems.
15664 15658 */
15665 15659
15666 15660 flk->l_sysid++;
15667 15661 flk->l_pid = lo_to_pid(&lockt_denied->owner);
15668 15662 }
15669 15663 }
15670 15664
15671 15665 static pid_t
15672 15666 lo_to_pid(lock_owner4 *lop)
15673 15667 {
15674 15668 pid_t pid = 0;
15675 15669 uchar_t *cp;
15676 15670 int i;
15677 15671
15678 15672 cp = (uchar_t *)&lop->clientid;
15679 15673
15680 15674 for (i = 0; i < sizeof (lop->clientid); i++)
15681 15675 pid += (pid_t)*cp++;
15682 15676
15683 15677 cp = (uchar_t *)lop->owner_val;
15684 15678
15685 15679 for (i = 0; i < lop->owner_len; i++)
15686 15680 pid += (pid_t)*cp++;
15687 15681
15688 15682 return (pid);
15689 15683 }
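/*
 * Worked example (editor's illustration, values assumed): for a
 * foreign clientid whose eight bytes are {0, 0, 0, 0, 0, 0, 0x01,
 * 0x02} and a 4-byte owner {0x01, 0x02, 0x03, 0x04}, lo_to_pid()
 * returns (0x01 + 0x02) + (0x01 + 0x02 + 0x03 + 0x04) = 13. The
 * fabricated pid is stable for a given owner but not guaranteed
 * unique; it only needs to identify the blocking owner well enough
 * for F_GETLK reporting.
 */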
15690 15684
15691 15685 /*
15692 15686 * Given a lock pointer, returns the end of that lock.
15693 15687 * "end" is the last locked offset that "l_len" covers from
15694 15688 * the start of the lock.
15695 15689 */
15696 15690 static off64_t
15697 15691 lock_to_end(flock64_t *lock)
15698 15692 {
15699 15693 off64_t lock_end;
15700 15694
15701 15695 if (lock->l_len == 0)
15702 15696 lock_end = (off64_t)MAXEND;
15703 15697 else
15704 15698 lock_end = lock->l_start + lock->l_len - 1;
15705 15699
15706 15700 return (lock_end);
15707 15701 }
15708 15702
15709 15703 /*
15710 15704 * Given the start and end of a lock, return the length "l_len" for that lock.
15711 15705 */
15712 15706 static off64_t
15713 15707 end_to_len(off64_t start, off64_t end)
15714 15708 {
15715 15709 off64_t lock_len;
15716 15710
15717 15711 ASSERT(end >= start);
15718 15712 if (end == MAXEND)
15719 15713 lock_len = 0;
15720 15714 else
15721 15715 lock_len = end - start + 1;
15722 15716
15723 15717 return (lock_len);
15724 15718 }
15725 15719
15726 15720 /*
15727 15721 * Given the end of a lock, return it unchanged if it is the last
15728 15722 * possible locked offset (MAXEND); otherwise add one to produce a
15729 15723 * valid start offset for the following range.
15730 15724 */
15731 15725 static off64_t
15732 15726 start_check(off64_t x)
15733 15727 {
15734 15728 if (x == MAXEND)
15735 15729 return (x);
15736 15730 else
15737 15731 return (x + 1);
15738 15732 }
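/*
 * Worked example for the three helpers above (offsets assumed for
 * illustration): a lock with l_start = 0x10 and l_len = 0x10 has
 * lock_to_end() == 0x1f (last locked offset), end_to_len(0x10, 0x1f)
 * == 0x10 (recovering the original length), and start_check(0x1f) ==
 * 0x20 (first offset past the lock). A to-EOF lock (l_len == 0) has
 * end MAXEND; end_to_len(start, MAXEND) and start_check(MAXEND)
 * preserve that encoding, returning 0 and MAXEND respectively.
 */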
15739 15733
15740 15734 /*
15741 15735 * See if these two locks overlap, and if so return 1;
15742 15736 * otherwise, return 0.
15743 15737 */
15744 15738 static int
15745 15739 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15746 15740 {
15747 15741 off64_t llfp_end, curfp_end;
15748 15742
15749 15743 llfp_end = lock_to_end(llfp);
15750 15744 curfp_end = lock_to_end(curfp);
15751 15745
15752 15746 if (((llfp_end >= curfp->l_start) &&
15753 15747 (llfp->l_start <= curfp->l_start)) ||
15754 15748 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15755 15749 return (1);
15756 15750 return (0);
15757 15751 }
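/*
 * Worked example (editor's illustration, ranges assumed): locks
 * [0x00-0x0f] and [0x08-0x17] intersect, so locks_intersect() returns
 * 1; [0x00-0x0f] and [0x10-0x17] do not, since the first lock's last
 * offset (0x0f) lies below the second's l_start, so it returns 0.
 */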
15758 15752
15759 15753 /*
15760 15754 * Determine what the intersecting lock region is, and add that to the
15761 15755 * 'nl_llpp' locklist in increasing order (by l_start).
15762 15756 */
15763 15757 static void
15764 15758 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
15765 15759 locklist_t **nl_llpp, vnode_t *vp)
15766 15760 {
15767 15761 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
15768 15762 off64_t lost_flp_end, local_flp_end, len, start;
15769 15763
15770 15764 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));
15771 15765
15772 15766 if (!locks_intersect(lost_flp, local_flp))
15773 15767 return;
15774 15768
15775 15769 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15776 15770 "locks intersect"));
15777 15771
15778 15772 lost_flp_end = lock_to_end(lost_flp);
15779 15773 local_flp_end = lock_to_end(local_flp);
15780 15774
15781 15775 /* Find the starting point of the intersecting region */
15782 15776 if (local_flp->l_start > lost_flp->l_start)
15783 15777 start = local_flp->l_start;
15784 15778 else
15785 15779 start = lost_flp->l_start;
15786 15780
15787 15781 /* Find the length of the intersecting region */
15788 15782 if (lost_flp_end < local_flp_end)
15789 15783 len = end_to_len(start, lost_flp_end);
15790 15784 else
15791 15785 len = end_to_len(start, local_flp_end);
15792 15786
15793 15787 /*
15794 15788 * Prepare the flock structure for the intersection found and insert
15795 15789 * it into the new list in increasing l_start order. This list contains
15796 15790 * intersections of locks registered by the client with the local host
15797 15791 * and the lost lock.
15798 15792 * The lock type of this lock is the same as that of the local_flp.
15799 15793 */
15800 15794 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
15801 15795 intersect_llp->ll_flock.l_start = start;
15802 15796 intersect_llp->ll_flock.l_len = len;
15803 15797 intersect_llp->ll_flock.l_type = local_flp->l_type;
15804 15798 intersect_llp->ll_flock.l_pid = local_flp->l_pid;
15805 15799 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
15806 15800 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */
15807 15801 intersect_llp->ll_vp = vp;
15808 15802
15809 15803 tmp_fllp = *nl_llpp;
15810 15804 cur_fllp = NULL;
15811 15805 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
15812 15806 intersect_llp->ll_flock.l_start) {
15813 15807 cur_fllp = tmp_fllp;
15814 15808 tmp_fllp = tmp_fllp->ll_next;
15815 15809 }
15816 15810 if (cur_fllp == NULL) {
15817 15811 /* first on the list */
15818 15812 intersect_llp->ll_next = *nl_llpp;
15819 15813 *nl_llpp = intersect_llp;
15820 15814 } else {
15821 15815 intersect_llp->ll_next = cur_fllp->ll_next;
15822 15816 cur_fllp->ll_next = intersect_llp;
15823 15817 }
15824 15818
15825 15819 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15826 15820 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
15827 15821 intersect_llp->ll_flock.l_start,
15828 15822 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
15829 15823 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
15830 15824 }
15831 15825
15832 15826 /*
15833 15827 * Our local locking current state is potentially different than
15834 15828 * what the NFSv4 server thinks we have due to a lost lock that was
15835 15829 * resent and then received. We need to reset our "NFSv4" locking
15836 15830 * state to match the current local locking state for this pid, since
15837 15831 * that is the state the user/application actually observes.
15838 15832 *
15839 15833 * We cannot afford to drop the open/lock seqid sync since then we can
15840 15834 * get confused about what the current local locking state "is" versus
15841 15835 * "was".
15842 15836 *
15843 15837 * If we are unable to fix up the locks, we send SIGLOST to the affected
15844 15838 * process. This is not done if the filesystem has been forcibly
15845 15839 * unmounted, in case the process has already exited and a new process
15846 15840 * exists with the same pid.
15847 15841 */
15848 15842 static void
15849 15843 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
15850 15844 nfs4_lock_owner_t *lop)
15851 15845 {
15852 15846 locklist_t *locks, *llp, *ri_llp, *tmp_llp;
15853 15847 mntinfo4_t *mi = VTOMI4(vp);
15854 15848 const int cmd = F_SETLK;
15855 15849 off64_t cur_start, llp_ll_flock_end, lost_flp_end;
15856 15850 flock64_t ul_fl;
15857 15851
15858 15852 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15859 15853 "nfs4_reinstitute_local_lock_state"));
15860 15854
15861 15855 /*
15862 15856 * Find active locks for this vp from the local locking code.
15863 15857 * Scan through this list and find out the locks that intersect with
15864 15858 * the lost lock. Once we find the lock that intersects, add the
15865 15859 * intersection area as a new lock to a new list "ri_llp". The lock
15866 15860 * type of the intersection region lock added to ri_llp is the same
15867 15861 * as that found in the active lock list, "list". The intersecting
15868 15862 * region locks are added to ri_llp in increasing l_start order.
15869 15863 */
15870 15864 ASSERT(nfs_zone() == mi->mi_zone);
15871 15865
15872 15866 locks = flk_active_locks_for_vp(vp);
15873 15867 ri_llp = NULL;
15874 15868
15875 15869 for (llp = locks; llp != NULL; llp = llp->ll_next) {
15876 15870 ASSERT(llp->ll_vp == vp);
15877 15871 /*
15878 15872 * Pick locks that belong to this pid/lockowner
15879 15873 */
15880 15874 if (llp->ll_flock.l_pid != lost_flp->l_pid)
15881 15875 continue;
15882 15876
15883 15877 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
15884 15878 }
15885 15879
15886 15880 /*
15887 15881 * Now we have the list of intersections with the lost lock. These are
15888 15882 * the locks that were/are active before the server replied to the
15889 15883 * last/lost lock. Issue these locks to the server here. Playing these
15890 15884 * locks to the server will re-establish our current local locking state
15891 15885 * with the v4 server.
15892 15886 * If we get an error, send SIGLOST to the application for that lock.
15893 15887 */
15894 15888
15895 15889 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15896 15890 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15897 15891 "nfs4_reinstitute_local_lock_state: need to issue "
15898 15892 "flock: [%"PRIx64" - %"PRIx64"] : %s",
15899 15893 llp->ll_flock.l_start,
15900 15894 llp->ll_flock.l_start + llp->ll_flock.l_len,
15901 15895 llp->ll_flock.l_type == F_RDLCK ? "READ" :
15902 15896 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
15903 15897 /*
15904 15898 * No need to relock what we already have
15905 15899 */
15906 15900 if (llp->ll_flock.l_type == lost_flp->l_type)
15907 15901 continue;
15908 15902
15909 15903 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
15910 15904 }
15911 15905
15912 15906 /*
15913 15907 * Now, keeping the start of the lost lock as our reference, parse the
15914 15908 * newly created ri_llp locklist to find the ranges that we have locked
15915 15909 * with the v4 server but not in the current local locking. We need
15916 15910 * to unlock these ranges.
15917 15911 * These can also be described as the ranges where the lost lock
15918 15912 * does not overlap with the locks in ri_llp but which the server
15919 15913 * has considered locked since it replied to the lost lock.
15920 15914 */
15921 15915 cur_start = lost_flp->l_start;
15922 15916 lost_flp_end = lock_to_end(lost_flp);
15923 15917
15924 15918 ul_fl.l_type = F_UNLCK;
15925 15919 ul_fl.l_whence = 0; /* aka SEEK_SET */
15926 15920 ul_fl.l_sysid = lost_flp->l_sysid;
15927 15921 ul_fl.l_pid = lost_flp->l_pid;
15928 15922
15929 15923 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15930 15924 llp_ll_flock_end = lock_to_end(&llp->ll_flock);
15931 15925
15932 15926 if (llp->ll_flock.l_start <= cur_start) {
15933 15927 cur_start = start_check(llp_ll_flock_end);
15934 15928 continue;
15935 15929 }
15936 15930 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15937 15931 "nfs4_reinstitute_local_lock_state: "
15938 15932 "UNLOCK [%"PRIx64" - %"PRIx64"]",
15939 15933 cur_start, llp->ll_flock.l_start));
15940 15934
15941 15935 ul_fl.l_start = cur_start;
15942 15936 ul_fl.l_len = end_to_len(cur_start,
15943 15937 (llp->ll_flock.l_start - 1));
15944 15938
15945 15939 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15946 15940 cur_start = start_check(llp_ll_flock_end);
15947 15941 }
15948 15942
15949 15943 /*
15950 15944 * In the case where the lost lock ends after all intersecting locks,
15951 15945 * unlock the last part of the lost lock range.
15952 15946 */
15953 15947 if (cur_start != start_check(lost_flp_end)) {
15954 15948 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15955 15949 "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
15956 15950 "lost lock region [%"PRIx64" - %"PRIx64"]",
15957 15951 cur_start, lost_flp->l_start + lost_flp->l_len));
15958 15952
15959 15953 ul_fl.l_start = cur_start;
15960 15954 /*
15961 15955 * Is it a to-EOF lock? If so, unlock to the end.
15962 15956 */
15963 15957 if (lost_flp->l_len == 0)
15964 15958 ul_fl.l_len = 0;
15965 15959 else
15966 15960 ul_fl.l_len = start_check(lost_flp_end) - cur_start;
15967 15961
15968 15962 push_reinstate(vp, cmd, &ul_fl, cr, lop);
15969 15963 }
15970 15964
15971 15965 if (locks != NULL)
15972 15966 flk_free_locklist(locks);
15973 15967
15974 15968 /* Free up our newly created locklist */
15975 15969 for (llp = ri_llp; llp != NULL; ) {
15976 15970 tmp_llp = llp->ll_next;
15977 15971 kmem_free(llp, sizeof (locklist_t));
15978 15972 llp = tmp_llp;
15979 15973 }
15980 15974
15981 15975 /*
15982 15976 * Now return to the original caller, nfs4frlock(),
15983 15977 * which will naturally drop our seqid syncs.
15984 15978 */
15985 15979 }
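/*
 * Worked example (editor's illustration, ranges assumed): suppose the
 * lost lock covers [0x00-0x63] and the local lock manager shows this
 * pid holding [0x0a-0x13] and [0x28-0x31]. The code above reissues
 * those two intersections to the server (when their type differs from
 * the lost lock's), then unlocks the uncovered gaps [0x00-0x09],
 * [0x14-0x27], and [0x32-0x63], leaving the server's view of this
 * pid's locks identical to the local one.
 */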
15986 15980
15987 15981 /*
15988 15982 * Create a lost state record for the given lock reinstantiation request
15989 15983 * and push it onto the lost state queue.
15990 15984 */
15991 15985 static void
15992 15986 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
15993 15987 nfs4_lock_owner_t *lop)
15994 15988 {
15995 15989 nfs4_lost_rqst_t req;
15996 15990 nfs_lock_type4 locktype;
15997 15991 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };
15998 15992
15999 15993 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
16000 15994
16001 15995 locktype = flk_to_locktype(cmd, flk->l_type);
16002 15996 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
16003 15997 NULL, NULL, lop, flk, &req, cr, vp);
16004 15998 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
16005 15999 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
16006 16000 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
16007 16001 NULL, NULL, NULL);
16008 16002 }