Print this page
Pass the zone's root vnode to untraverse() so that climbing out of a mountpoint stops at the zone's root rather than the global root.
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 30 * All rights reserved.
31 31 */
32 32
33 33 /*
34 34 * Copyright 2018 Nexenta Systems, Inc.
35 35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 36 */
37 37
38 38 #include <sys/param.h>
39 39 #include <sys/types.h>
40 40 #include <sys/systm.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/buf.h>
43 43 #include <sys/vfs.h>
44 44 #include <sys/vnode.h>
45 45 #include <sys/uio.h>
46 46 #include <sys/stat.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/sysmacros.h>
49 49 #include <sys/statvfs.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/kstat.h>
52 52 #include <sys/dirent.h>
53 53 #include <sys/cmn_err.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/vtrace.h>
56 56 #include <sys/mode.h>
57 57 #include <sys/acl.h>
58 58 #include <sys/nbmlock.h>
59 59 #include <sys/policy.h>
60 60 #include <sys/sdt.h>
61 61
62 62 #include <rpc/types.h>
63 63 #include <rpc/auth.h>
64 64 #include <rpc/svc.h>
65 65
66 66 #include <nfs/nfs.h>
67 67 #include <nfs/export.h>
68 68 #include <nfs/nfs_cmd.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/as.h>
72 72 #include <vm/seg.h>
73 73 #include <vm/seg_map.h>
74 74 #include <vm/seg_kmem.h>
75 75
76 76 #include <sys/strsubr.h>
77 77
78 78 struct rfs_async_write_list;
79 79
/*
 * Zone globals of NFSv2 server
 */
typedef struct nfs_srv {
	/* Protects async_write_head (see the clustered-write code). */
	kmutex_t async_write_lock;
	/* Head of the list of pending clustered async write requests. */
	struct rfs_async_write_list *async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int write_async;
} nfs_srv_t;
92 92
93 93 /*
94 94 * These are the interface routines for the server side of the
95 95 * Network File System. See the NFS version 2 protocol specification
96 96 * for a description of this interface.
97 97 */
98 98
99 99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 101 cred_t *);
102 102
103 103
104 104 /*
105 105 * Some "over the wire" UNIX file types. These are encoded
106 106 * into the mode. This needs to be fixed in the next rev.
107 107 */
108 108 #define IFMT 0170000 /* type of file */
109 109 #define IFCHR 0020000 /* character special */
110 110 #define IFBLK 0060000 /* block special */
111 111 #define IFSOCK 0140000 /* socket */
112 112
113 113 u_longlong_t nfs2_srv_caller_id;
114 114
115 115 static nfs_srv_t *
116 116 nfs_get_srv(void)
117 117 {
118 118 nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
119 119 nfs_srv_t *srv = ng->nfs_srv;
120 120 ASSERT(srv != NULL);
121 121 return (srv);
122 122 }
123 123
/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the file handle into a held vnode. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	/* Drop the hold acquired by nfs_fhtovp(). */
	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
164 164 void *
165 165 rfs_getattr_getfh(fhandle_t *fhp)
166 166 {
167 167 return (fhp);
168 168 }
169 169
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on a read-only export. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing. If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so. To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The region affected by the size change is
			 * [min(old, new), |old - new|); check it for
			 * a conflicting non-blocking mandatory lock.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE shortcut; other
		 * callers leave AT_SIZE set so VOP_SETATTR below does
		 * its normal access checking.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
357 357 void *
358 358 rfs_setattr_getfh(struct nfssaargs *args)
359 359 {
360 360 return (&args->saa_fh);
361 361 }
362 362
/* Change and release @exip and @vpp only in success */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Hold a private reference for the traversal attempt. */
	VN_HOLD(vp);

	/* Cross into the filesystem mounted on this vnode. */
	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	/* Get the file ID of the submount's root to look up its export. */
	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
408 408
|
↓ open down ↓ |
408 lines elided |
↑ open up ↑ |
/*
 * Given mounted "dvp" and "exi", go upper mountpoint
 * with dvp/exi correction
 * Return 0 in success
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;
	vnode_t *zone_rootvp;

	/*
	 * The zone's root vnode (from the export's nfs_export state) is
	 * handed to untraverse() below, presumably so the upward walk
	 * stops at the zone root rather than the global root.
	 */
	zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
	ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));

	VN_HOLD(dvp);
	/* Step from the mounted filesystem's root up to the covering vnode. */
	dvp = untraverse(dvp, zone_rootvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		/* The upper filesystem is not exported: fail the climb. */
		VN_RELE(dvp);
		return (-1);
	}

	/* The new export must belong to the same zone as the old one. */
	ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/* Hold the export; it may be swapped by the crossmnt paths below. */
	exi_hold(exi);
	ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Convert the name to the server's character set if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/* mclookup returns its own export reference in exi */
		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* Free the converted name if a copy was allocated. */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
600 601 void *
601 602 rfs_lookup_getfh(struct nfsdiropargs *da)
602 603 {
603 604 return (da->da_fhandle);
604 605 }
605 606
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert the link text to the client's character set if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	/* If a converted copy was made, it replaces the original buffer. */
	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
736 737 void *
737 738 rfs_readlink_getfh(fhandle_t *fhp)
738 739 {
739 740 return (fhp);
740 741 }
741 742 /*
742 743 * Free data allocated by rfs_readlink
743 744 */
744 745 void
745 746 rfs_rlfree(struct nfsrdlnres *rl)
746 747 {
747 748 if (rl->rl_data != NULL)
748 749 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
749 750 }
750 751
751 752 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
752 753
/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;	/* reply data buffer; freed later by rfs_rdfree */
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	/* Reads are only supported on regular files. */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission. The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Refuse objects subject to mandatory locking. */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* A read starting at or past EOF returns success with no data. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA case: read directly into the client's write chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Amount actually read is the request minus what was left over. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
1007 1008
1008 1009 /*
1009 1010 * Free data allocated by rfs_read
1010 1011 */
1011 1012 void
1012 1013 rfs_rdfree(struct nfsrdresult *rr)
1013 1014 {
1014 1015 mblk_t *mp;
1015 1016
1016 1017 if (rr->rr_status == NFS_OK) {
1017 1018 mp = rr->rr_mp;
1018 1019 if (mp != NULL)
1019 1020 freeb(mp);
1020 1021 }
1021 1022 }
1022 1023
1023 1024 void *
1024 1025 rfs_read_getfh(struct nfsreadargs *ra)
1025 1026 {
1026 1027 return (&ra->ra_fhandle);
1027 1028 }
1028 1029
1029 1030 #define MAX_IOVECS 12
1030 1031
1031 1032 #ifdef DEBUG
1032 1033 static int rfs_write_sync_hits = 0;
1033 1034 static int rfs_write_sync_misses = 0;
1034 1035 #endif
1035 1036
1036 1037 /*
1037 1038 * Write data to file.
1038 1039 * Returns attributes of a file after writing some data to it.
1039 1040 *
1040 1041 * Any changes made here, especially in error handling might have
1041 1042 * to also be done in rfs_write (which clusters write requests).
1042 1043 */
1043 1044 /* ARGSUSED */
1044 1045 void
1045 1046 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1046 1047 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1047 1048 {
1048 1049 int error;
1049 1050 vnode_t *vp;
1050 1051 rlim64_t rlimit;
1051 1052 struct vattr va;
1052 1053 struct uio uio;
1053 1054 struct iovec iov[MAX_IOVECS];
1054 1055 mblk_t *m;
1055 1056 struct iovec *iovp;
1056 1057 int iovcnt;
1057 1058 cred_t *savecred;
1058 1059 int in_crit = 0;
1059 1060 caller_context_t ct;
1060 1061
1061 1062 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1062 1063 if (vp == NULL) {
1063 1064 ns->ns_status = NFSERR_STALE;
1064 1065 return;
1065 1066 }
1066 1067
1067 1068 if (rdonly(ro, vp)) {
1068 1069 VN_RELE(vp);
1069 1070 ns->ns_status = NFSERR_ROFS;
1070 1071 return;
1071 1072 }
1072 1073
1073 1074 if (vp->v_type != VREG) {
1074 1075 VN_RELE(vp);
1075 1076 ns->ns_status = NFSERR_ISDIR;
1076 1077 return;
1077 1078 }
1078 1079
1079 1080 ct.cc_sysid = 0;
1080 1081 ct.cc_pid = 0;
1081 1082 ct.cc_caller_id = nfs2_srv_caller_id;
1082 1083 ct.cc_flags = CC_DONTBLOCK;
1083 1084
1084 1085 va.va_mask = AT_UID|AT_MODE;
1085 1086
1086 1087 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1087 1088
1088 1089 if (error) {
1089 1090 VN_RELE(vp);
1090 1091 ns->ns_status = puterrno(error);
1091 1092
1092 1093 return;
1093 1094 }
1094 1095
1095 1096 if (crgetuid(cr) != va.va_uid) {
1096 1097 /*
1097 1098 * This is a kludge to allow writes of files created
1098 1099 * with read only permission. The owner of the file
1099 1100 * is always allowed to write it.
1100 1101 */
1101 1102 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1102 1103
1103 1104 if (error) {
1104 1105 VN_RELE(vp);
1105 1106 ns->ns_status = puterrno(error);
1106 1107 return;
1107 1108 }
1108 1109 }
1109 1110
1110 1111 /*
1111 1112 * Can't access a mandatory lock file. This might cause
1112 1113 * the NFS service thread to block forever waiting for a
1113 1114 * lock to be released that will never be released.
1114 1115 */
1115 1116 if (MANDLOCK(vp, va.va_mode)) {
1116 1117 VN_RELE(vp);
1117 1118 ns->ns_status = NFSERR_ACCES;
1118 1119 return;
1119 1120 }
1120 1121
1121 1122 /*
1122 1123 * We have to enter the critical region before calling VOP_RWLOCK
1123 1124 * to avoid a deadlock with ufs.
1124 1125 */
1125 1126 if (nbl_need_check(vp)) {
1126 1127 nbl_start_crit(vp, RW_READER);
1127 1128 in_crit = 1;
1128 1129 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1129 1130 wa->wa_count, 0, NULL)) {
1130 1131 error = EACCES;
1131 1132 goto out;
1132 1133 }
1133 1134 }
1134 1135
1135 1136 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1136 1137
1137 1138 /* check if a monitor detected a delegation conflict */
1138 1139 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1139 1140 goto out;
1140 1141 }
1141 1142
1142 1143 if (wa->wa_data || wa->wa_rlist) {
1143 1144 /* Do the RDMA thing if necessary */
1144 1145 if (wa->wa_rlist) {
1145 1146 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1146 1147 iov[0].iov_len = wa->wa_count;
1147 1148 } else {
1148 1149 iov[0].iov_base = wa->wa_data;
1149 1150 iov[0].iov_len = wa->wa_count;
1150 1151 }
1151 1152 uio.uio_iov = iov;
1152 1153 uio.uio_iovcnt = 1;
1153 1154 uio.uio_segflg = UIO_SYSSPACE;
1154 1155 uio.uio_extflg = UIO_COPY_DEFAULT;
1155 1156 uio.uio_loffset = (offset_t)wa->wa_offset;
1156 1157 uio.uio_resid = wa->wa_count;
1157 1158 /*
1158 1159 * The limit is checked on the client. We
1159 1160 * should allow any size writes here.
1160 1161 */
1161 1162 uio.uio_llimit = curproc->p_fsz_ctl;
1162 1163 rlimit = uio.uio_llimit - wa->wa_offset;
1163 1164 if (rlimit < (rlim64_t)uio.uio_resid)
1164 1165 uio.uio_resid = (uint_t)rlimit;
1165 1166
1166 1167 /*
1167 1168 * for now we assume no append mode
1168 1169 */
1169 1170 /*
1170 1171 * We're changing creds because VM may fault and we need
1171 1172 * the cred of the current thread to be used if quota
1172 1173 * checking is enabled.
1173 1174 */
1174 1175 savecred = curthread->t_cred;
1175 1176 curthread->t_cred = cr;
1176 1177 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1177 1178 curthread->t_cred = savecred;
1178 1179 } else {
1179 1180
1180 1181 iovcnt = 0;
1181 1182 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1182 1183 iovcnt++;
1183 1184 if (iovcnt <= MAX_IOVECS) {
1184 1185 #ifdef DEBUG
1185 1186 rfs_write_sync_hits++;
1186 1187 #endif
1187 1188 iovp = iov;
1188 1189 } else {
1189 1190 #ifdef DEBUG
1190 1191 rfs_write_sync_misses++;
1191 1192 #endif
1192 1193 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1193 1194 }
1194 1195 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1195 1196 uio.uio_iov = iovp;
1196 1197 uio.uio_iovcnt = iovcnt;
1197 1198 uio.uio_segflg = UIO_SYSSPACE;
1198 1199 uio.uio_extflg = UIO_COPY_DEFAULT;
1199 1200 uio.uio_loffset = (offset_t)wa->wa_offset;
1200 1201 uio.uio_resid = wa->wa_count;
1201 1202 /*
1202 1203 * The limit is checked on the client. We
1203 1204 * should allow any size writes here.
1204 1205 */
1205 1206 uio.uio_llimit = curproc->p_fsz_ctl;
1206 1207 rlimit = uio.uio_llimit - wa->wa_offset;
1207 1208 if (rlimit < (rlim64_t)uio.uio_resid)
1208 1209 uio.uio_resid = (uint_t)rlimit;
1209 1210
1210 1211 /*
1211 1212 * For now we assume no append mode.
1212 1213 */
1213 1214 /*
1214 1215 * We're changing creds because VM may fault and we need
1215 1216 * the cred of the current thread to be used if quota
1216 1217 * checking is enabled.
1217 1218 */
1218 1219 savecred = curthread->t_cred;
1219 1220 curthread->t_cred = cr;
1220 1221 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1221 1222 curthread->t_cred = savecred;
1222 1223
1223 1224 if (iovp != iov)
1224 1225 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1225 1226 }
1226 1227
1227 1228 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1228 1229
1229 1230 if (!error) {
1230 1231 /*
1231 1232 * Get attributes again so we send the latest mod
1232 1233 * time to the client side for its cache.
1233 1234 */
1234 1235 va.va_mask = AT_ALL; /* now we want everything */
1235 1236
1236 1237 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1237 1238
1238 1239 /* check for overflows */
1239 1240 if (!error) {
1240 1241 acl_perm(vp, exi, &va, cr);
1241 1242 error = vattr_to_nattr(&va, &ns->ns_attr);
1242 1243 }
1243 1244 }
1244 1245
1245 1246 out:
1246 1247 if (in_crit)
1247 1248 nbl_end_crit(vp);
1248 1249 VN_RELE(vp);
1249 1250
1250 1251 /* check if a monitor detected a delegation conflict */
1251 1252 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1252 1253 /* mark as wouldblock so response is dropped */
1253 1254 curthread->t_flag |= T_WOULDBLOCK;
1254 1255 else
1255 1256 ns->ns_status = puterrno(error);
1256 1257
1257 1258 }
1258 1259
/*
 * One pending NFSv2 WRITE request queued on a write cluster.  Each
 * service thread parks one of these (stack-allocated, see nrpsp in
 * rfs_write()) on a cluster list and sleeps until the thread that
 * processes the cluster fills in ns->ns_status.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* response; ns_status flags completion */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of this request */
	bool_t ro;			/* export is read-only for this caller */
	kthread_t *thread;		/* thread blocked on this request */
	struct rfs_async_write *list;	/* next request, ordered by offset */
};
1268 1269
/*
 * A cluster of WRITE requests against a single file handle.  The
 * first writer for a handle creates one (stack-allocated, see nlpsp
 * in rfs_write()) and later processes every request queued on it.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* broadcast when the cluster completes */
	struct rfs_async_write *list;	/* requests, sorted by wa_offset */
	struct rfs_async_write_list *next;	/* next active cluster */
};
1275 1276
/*
 * NOTE(review): rfs_write() below operates on the per-zone state in
 * nfs_srv_t (async_write_head / async_write_lock / write_async), not
 * on these file-scope globals; these appear to be retained from the
 * pre-zoned implementation — TODO confirm whether they are still read
 * anywhere.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Maximum iovecs gathered into a single clustered VOP_WRITE */
#define	MAXCLIOVECS	42
/* Sentinel: request not yet serviced (0 would read as NFS_OK) */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
static int	rfs_write_hits = 0;	/* clusters that fit the stack iov[] */
static int	rfs_write_misses = 0;	/* clusters needing kmem_alloc'd iovs */
#endif
1287 1288
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Clustered write path: concurrent WRITE requests against the same
 * file handle are gathered into a "cluster" so contiguous requests
 * can be issued with a single VOP_WRITE.  The first thread to arrive
 * for a handle becomes the cluster owner and services every queued
 * request; later arrivals queue themselves and sleep on the cluster's
 * condition variable.  Falls back to rfs_write_sync() when clustering
 * is disabled for this zone's NFS server instance.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
	nsrv = nfs_get_srv();
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	/*
	 * Build our request entry on this thread's stack; it is linked
	 * onto the shared cluster list, so this thread must stay resident
	 * until the entry is unlinked (hence the TS_DONT_SWAP ASSERT
	 * below).
	 */
	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/*
		 * Sleep until the cluster owner services this request;
		 * it signals completion by overwriting ns_status.
		 */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/*
		 * Stale handle: unlink the cluster and fail every
		 * queued request with NFSERR_STALE.
		 */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		/*
		 * Unlink the cluster and propagate the error (and the
		 * drop-response flag) to every still-pending request.
		 */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop extending the run at the first gap,
			 * already-failed request, or end of list; lrp
			 * ends up pointing one past the last request in
			 * this contiguous run.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				/*
				 * Walk the mblk chain, clamping the final
				 * fragment so no more than wa_count bytes
				 * are mapped for this request.
				 */
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota * checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Wake every request still marked pending (e.g. those failed by
	 * the final flush) with the last error, then release the cluster.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1789 1790
1790 1791 void *
1791 1792 rfs_write_getfh(struct nfswriteargs *wa)
1792 1793 {
1793 1794 return (&wa->wa_fhandle);
1794 1795 }
1795 1796
1796 1797 /*
1797 1798 * Create a file.
1798 1799 * Creates a file with given attributes and returns those attributes
1799 1800 * and an fhandle for the new file.
1800 1801 */
1801 1802 void
1802 1803 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1803 1804 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1804 1805 {
1805 1806 int error;
1806 1807 int lookuperr;
1807 1808 int in_crit = 0;
1808 1809 struct vattr va;
1809 1810 vnode_t *vp;
1810 1811 vnode_t *realvp;
1811 1812 vnode_t *dvp;
1812 1813 char *name = args->ca_da.da_name;
1813 1814 vnode_t *tvp = NULL;
1814 1815 int mode;
1815 1816 int lookup_ok;
1816 1817 bool_t trunc;
1817 1818 struct sockaddr *ca;
1818 1819
1819 1820 /*
1820 1821 * Disallow NULL paths
1821 1822 */
1822 1823 if (name == NULL || *name == '\0') {
1823 1824 dr->dr_status = NFSERR_ACCES;
1824 1825 return;
1825 1826 }
1826 1827
1827 1828 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1828 1829 if (dvp == NULL) {
1829 1830 dr->dr_status = NFSERR_STALE;
1830 1831 return;
1831 1832 }
1832 1833
1833 1834 error = sattr_to_vattr(args->ca_sa, &va);
1834 1835 if (error) {
1835 1836 dr->dr_status = puterrno(error);
1836 1837 return;
1837 1838 }
1838 1839
1839 1840 /*
1840 1841 * Must specify the mode.
1841 1842 */
1842 1843 if (!(va.va_mask & AT_MODE)) {
1843 1844 VN_RELE(dvp);
1844 1845 dr->dr_status = NFSERR_INVAL;
1845 1846 return;
1846 1847 }
1847 1848
1848 1849 /*
1849 1850 * This is a completely gross hack to make mknod
1850 1851 * work over the wire until we can wack the protocol
1851 1852 */
1852 1853 if ((va.va_mode & IFMT) == IFCHR) {
1853 1854 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1854 1855 va.va_type = VFIFO; /* xtra kludge for named pipe */
1855 1856 else {
1856 1857 va.va_type = VCHR;
1857 1858 /*
1858 1859 * uncompress the received dev_t
1859 1860 * if the top half is zero indicating a request
1860 1861 * from an `older style' OS.
1861 1862 */
1862 1863 if ((va.va_size & 0xffff0000) == 0)
1863 1864 va.va_rdev = nfsv2_expdev(va.va_size);
1864 1865 else
1865 1866 va.va_rdev = (dev_t)va.va_size;
1866 1867 }
1867 1868 va.va_mask &= ~AT_SIZE;
1868 1869 } else if ((va.va_mode & IFMT) == IFBLK) {
1869 1870 va.va_type = VBLK;
1870 1871 /*
1871 1872 * uncompress the received dev_t
1872 1873 * if the top half is zero indicating a request
1873 1874 * from an `older style' OS.
1874 1875 */
1875 1876 if ((va.va_size & 0xffff0000) == 0)
1876 1877 va.va_rdev = nfsv2_expdev(va.va_size);
1877 1878 else
1878 1879 va.va_rdev = (dev_t)va.va_size;
1879 1880 va.va_mask &= ~AT_SIZE;
1880 1881 } else if ((va.va_mode & IFMT) == IFSOCK) {
1881 1882 va.va_type = VSOCK;
1882 1883 } else {
1883 1884 va.va_type = VREG;
1884 1885 }
1885 1886 va.va_mode &= ~IFMT;
1886 1887 va.va_mask |= AT_TYPE;
1887 1888
1888 1889 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1889 1890 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1890 1891 MAXPATHLEN);
1891 1892 if (name == NULL) {
1892 1893 dr->dr_status = puterrno(EINVAL);
1893 1894 return;
1894 1895 }
1895 1896
1896 1897 /*
1897 1898 * Why was the choice made to use VWRITE as the mode to the
1898 1899 * call to VOP_CREATE ? This results in a bug. When a client
1899 1900 * opens a file that already exists and is RDONLY, the second
1900 1901 * open fails with an EACESS because of the mode.
1901 1902 * bug ID 1054648.
1902 1903 */
1903 1904 lookup_ok = 0;
1904 1905 mode = VWRITE;
1905 1906 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1906 1907 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1907 1908 NULL, NULL, NULL);
1908 1909 if (!error) {
1909 1910 struct vattr at;
1910 1911
1911 1912 lookup_ok = 1;
1912 1913 at.va_mask = AT_MODE;
1913 1914 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1914 1915 if (!error)
1915 1916 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1916 1917 VN_RELE(tvp);
1917 1918 tvp = NULL;
1918 1919 }
1919 1920 }
1920 1921
1921 1922 if (!lookup_ok) {
1922 1923 if (rdonly(ro, dvp)) {
1923 1924 error = EROFS;
1924 1925 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1925 1926 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1926 1927 error = EPERM;
1927 1928 } else {
1928 1929 error = 0;
1929 1930 }
1930 1931 }
1931 1932
1932 1933 /*
1933 1934 * If file size is being modified on an already existing file
1934 1935 * make sure that there are no conflicting non-blocking mandatory
1935 1936 * locks in the region being manipulated. Return EACCES if there
1936 1937 * are conflicting locks.
1937 1938 */
1938 1939 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1939 1940 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1940 1941 NULL, NULL, NULL);
1941 1942
1942 1943 if (!lookuperr &&
1943 1944 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1944 1945 VN_RELE(tvp);
1945 1946 curthread->t_flag |= T_WOULDBLOCK;
1946 1947 goto out;
1947 1948 }
1948 1949
1949 1950 if (!lookuperr && nbl_need_check(tvp)) {
1950 1951 /*
1951 1952 * The file exists. Now check if it has any
1952 1953 * conflicting non-blocking mandatory locks
1953 1954 * in the region being changed.
1954 1955 */
1955 1956 struct vattr bva;
1956 1957 u_offset_t offset;
1957 1958 ssize_t length;
1958 1959
1959 1960 nbl_start_crit(tvp, RW_READER);
1960 1961 in_crit = 1;
1961 1962
1962 1963 bva.va_mask = AT_SIZE;
1963 1964 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1964 1965 if (!error) {
1965 1966 if (va.va_size < bva.va_size) {
1966 1967 offset = va.va_size;
1967 1968 length = bva.va_size - va.va_size;
1968 1969 } else {
1969 1970 offset = bva.va_size;
1970 1971 length = va.va_size - bva.va_size;
1971 1972 }
1972 1973 if (length) {
1973 1974 if (nbl_conflict(tvp, NBL_WRITE,
1974 1975 offset, length, 0, NULL)) {
1975 1976 error = EACCES;
1976 1977 }
1977 1978 }
1978 1979 }
1979 1980 if (error) {
1980 1981 nbl_end_crit(tvp);
1981 1982 VN_RELE(tvp);
1982 1983 in_crit = 0;
1983 1984 }
1984 1985 } else if (tvp != NULL) {
1985 1986 VN_RELE(tvp);
1986 1987 }
1987 1988 }
1988 1989
1989 1990 if (!error) {
1990 1991 /*
1991 1992 * If filesystem is shared with nosuid the remove any
1992 1993 * setuid/setgid bits on create.
1993 1994 */
1994 1995 if (va.va_type == VREG &&
1995 1996 exi->exi_export.ex_flags & EX_NOSUID)
1996 1997 va.va_mode &= ~(VSUID | VSGID);
1997 1998
1998 1999 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1999 2000 NULL, NULL);
2000 2001
2001 2002 if (!error) {
2002 2003
2003 2004 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2004 2005 trunc = TRUE;
2005 2006 else
2006 2007 trunc = FALSE;
2007 2008
2008 2009 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2009 2010 VN_RELE(vp);
2010 2011 curthread->t_flag |= T_WOULDBLOCK;
2011 2012 goto out;
2012 2013 }
2013 2014 va.va_mask = AT_ALL;
2014 2015
2015 2016 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2016 2017
2017 2018 /* check for overflows */
2018 2019 if (!error) {
2019 2020 acl_perm(vp, exi, &va, cr);
2020 2021 error = vattr_to_nattr(&va, &dr->dr_attr);
2021 2022 if (!error) {
2022 2023 error = makefh(&dr->dr_fhandle, vp,
2023 2024 exi);
2024 2025 }
2025 2026 }
2026 2027 /*
2027 2028 * Force modified metadata out to stable storage.
2028 2029 *
2029 2030 * if a underlying vp exists, pass it to VOP_FSYNC
2030 2031 */
2031 2032 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2032 2033 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2033 2034 else
2034 2035 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2035 2036 VN_RELE(vp);
2036 2037 }
2037 2038
2038 2039 if (in_crit) {
2039 2040 nbl_end_crit(tvp);
2040 2041 VN_RELE(tvp);
2041 2042 }
2042 2043 }
2043 2044
2044 2045 /*
2045 2046 * Force modified data and metadata out to stable storage.
2046 2047 */
2047 2048 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2048 2049
2049 2050 out:
2050 2051
2051 2052 VN_RELE(dvp);
2052 2053
2053 2054 dr->dr_status = puterrno(error);
2054 2055
2055 2056 if (name != args->ca_da.da_name)
2056 2057 kmem_free(name, MAXPATHLEN);
2057 2058 }
2058 2059 void *
2059 2060 rfs_create_getfh(struct nfscreatargs *args)
2060 2061 {
2061 2062 return (args->ca_da.da_fhandle);
2062 2063 }
2063 2064
2064 2065 /*
2065 2066 * Remove a file.
2066 2067 * Remove named file from parent directory.
2067 2068 */
2068 2069 /* ARGSUSED */
2069 2070 void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* directory containing the entry to remove */
	vnode_t *targvp;	/* the file being removed */
	int in_crit = 0;	/* nonzero while inside the NBL critical region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	/* Translate the directory file handle; takes a hold on vp. */
	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	/* Common exit: leave the critical region and drop both holds. */
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2149 2150
2150 2151 void *
2151 2152 rfs_remove_getfh(struct nfsdiropargs *da)
2152 2153 {
2153 2154 return (da->da_fhandle);
2154 2155 }
2155 2156
2156 2157 /*
2157 2158 * rename a file
2158 2159 * Give a file (from) a new name (to).
2159 2160 */
2160 2161 /* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;	/* nonzero while inside the NBL critical region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory's handle must belong to a known export and
	 * to the same export as the source: NFSv2 rename may not cross
	 * filesystem (export) boundaries.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/*
	 * Only the pointer identity of to_exi is used below, so its
	 * reference can be dropped immediately.
	 */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the cached vnode path up to date after a successful rename. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	/* Common exit: leave the critical region and drop all holds. */
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2299 2300 void *
2300 2301 rfs_rename_getfh(struct nfsrnmargs *args)
2301 2302 {
2302 2303 return (args->rna_from.da_fhandle);
2303 2304 }
2304 2305
2305 2306 /*
2306 2307 * Link to a file.
2307 2308 * Create a file (to) which is a hard link to the given file (from).
2308 2309 */
2309 2310 /* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing file to be linked to */
	vnode_t *tovp;		/* directory that receives the new name */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory's handle must belong to a known export and
	 * to the same export as the source: hard links may not cross
	 * filesystem (export) boundaries.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* Only the pointer identity of to_exi is used below. */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 * NOTE(review): fromvp is synced with FNODSYNC (only its link
	 * count changed) while the directory gets a full sync.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2385 2386 void *
2386 2387 rfs_link_getfh(struct nfslinkargs *args)
2387 2388 {
2388 2389 return (args->la_from);
2389 2390 }
2390 2391
2391 2392 /*
2392 2393 * Symbolicly link to a file.
2393 2394 * Create a file (to) with the given attributes which is a symbolic link
2394 2395 * to the given path name (to).
2395 2396 */
2396 2397 void
2397 2398 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2398 2399 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2399 2400 {
2400 2401 int error;
2401 2402 struct vattr va;
2402 2403 vnode_t *vp;
2403 2404 vnode_t *svp;
2404 2405 int lerror;
2405 2406 struct sockaddr *ca;
2406 2407 char *name = NULL;
2407 2408
2408 2409 /*
2409 2410 * Disallow NULL paths
2410 2411 */
2411 2412 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2412 2413 *status = NFSERR_ACCES;
2413 2414 return;
2414 2415 }
2415 2416
2416 2417 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2417 2418 if (vp == NULL) {
2418 2419 *status = NFSERR_STALE;
2419 2420 return;
2420 2421 }
2421 2422
2422 2423 if (rdonly(ro, vp)) {
2423 2424 VN_RELE(vp);
2424 2425 *status = NFSERR_ROFS;
2425 2426 return;
2426 2427 }
2427 2428
2428 2429 error = sattr_to_vattr(args->sla_sa, &va);
2429 2430 if (error) {
2430 2431 VN_RELE(vp);
2431 2432 *status = puterrno(error);
2432 2433 return;
2433 2434 }
2434 2435
2435 2436 if (!(va.va_mask & AT_MODE)) {
2436 2437 VN_RELE(vp);
2437 2438 *status = NFSERR_INVAL;
2438 2439 return;
2439 2440 }
2440 2441
2441 2442 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2442 2443 name = nfscmd_convname(ca, exi, args->sla_tnm,
2443 2444 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2444 2445
2445 2446 if (name == NULL) {
2446 2447 *status = NFSERR_ACCES;
2447 2448 return;
2448 2449 }
2449 2450
2450 2451 va.va_type = VLNK;
2451 2452 va.va_mask |= AT_TYPE;
2452 2453
2453 2454 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2454 2455
2455 2456 /*
2456 2457 * Force new data and metadata out to stable storage.
2457 2458 */
2458 2459 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2459 2460 NULL, cr, NULL, NULL, NULL);
2460 2461
2461 2462 if (!lerror) {
2462 2463 (void) VOP_FSYNC(svp, 0, cr, NULL);
2463 2464 VN_RELE(svp);
2464 2465 }
2465 2466
2466 2467 /*
2467 2468 * Force modified data and metadata out to stable storage.
2468 2469 */
2469 2470 (void) VOP_FSYNC(vp, 0, cr, NULL);
2470 2471
2471 2472 VN_RELE(vp);
2472 2473
2473 2474 *status = puterrno(error);
2474 2475 if (name != args->sla_tnm)
2475 2476 kmem_free(name, MAXPATHLEN);
2476 2477
2477 2478 }
2478 2479 void *
2479 2480 rfs_symlink_getfh(struct nfsslargs *args)
2480 2481 {
2481 2482 return (args->sla_from.da_fhandle);
2482 2483 }
2483 2484
2484 2485 /*
2485 2486 * Make a directory.
2486 2487 * Create a directory with the given name, parent directory, and attributes.
2487 2488 * Returns a file handle and attributes for the new directory.
2488 2489 */
2489 2490 /* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/* Translate the parent directory file handle; takes a hold on vp. */
	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The client must supply a mode for the new directory. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * vnode (vp), not the new directory (dvp), while
			 * va holds dvp's attributes -- confirm intended.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2571 2572 void *
2572 2573 rfs_mkdir_getfh(struct nfscreatargs *args)
2573 2574 {
2574 2575 return (args->ca_da.da_fhandle);
2575 2576 }
2576 2577
2577 2578 /*
2578 2579 * Remove a directory.
2579 2580 * Remove the given directory name from the given parent directory.
2580 2581 */
2581 2582 /* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;	/* parent directory of the one being removed */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	/* Translate the parent directory file handle; takes a hold on vp. */
	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.  The zone's root vnode fills that role here.
	 */
	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
2639 2640 void *
2640 2641 rfs_rmdir_getfh(struct nfsdiropargs *da)
2641 2642 {
2642 2643 return (da->da_fhandle);
2643 2644 }
2644 2645
2645 2646 /* ARGSUSED */
2646 2647 void
2647 2648 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2648 2649 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2649 2650 {
2650 2651 int error;
2651 2652 int iseof;
2652 2653 struct iovec iov;
2653 2654 struct uio uio;
2654 2655 vnode_t *vp;
2655 2656 char *ndata = NULL;
2656 2657 struct sockaddr *ca;
2657 2658 size_t nents;
2658 2659 int ret;
2659 2660
2660 2661 vp = nfs_fhtovp(&rda->rda_fh, exi);
2661 2662 if (vp == NULL) {
2662 2663 rd->rd_entries = NULL;
2663 2664 rd->rd_status = NFSERR_STALE;
2664 2665 return;
2665 2666 }
2666 2667
2667 2668 if (vp->v_type != VDIR) {
2668 2669 VN_RELE(vp);
2669 2670 rd->rd_entries = NULL;
2670 2671 rd->rd_status = NFSERR_NOTDIR;
2671 2672 return;
2672 2673 }
2673 2674
2674 2675 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2675 2676
2676 2677 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2677 2678
2678 2679 if (error) {
2679 2680 rd->rd_entries = NULL;
2680 2681 goto bad;
2681 2682 }
2682 2683
2683 2684 if (rda->rda_count == 0) {
2684 2685 rd->rd_entries = NULL;
2685 2686 rd->rd_size = 0;
2686 2687 rd->rd_eof = FALSE;
2687 2688 goto bad;
2688 2689 }
2689 2690
2690 2691 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2691 2692
2692 2693 /*
2693 2694 * Allocate data for entries. This will be freed by rfs_rddirfree.
2694 2695 */
2695 2696 rd->rd_bufsize = (uint_t)rda->rda_count;
2696 2697 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2697 2698
2698 2699 /*
2699 2700 * Set up io vector to read directory data
2700 2701 */
2701 2702 iov.iov_base = (caddr_t)rd->rd_entries;
2702 2703 iov.iov_len = rda->rda_count;
2703 2704 uio.uio_iov = &iov;
2704 2705 uio.uio_iovcnt = 1;
2705 2706 uio.uio_segflg = UIO_SYSSPACE;
2706 2707 uio.uio_extflg = UIO_COPY_CACHED;
2707 2708 uio.uio_loffset = (offset_t)rda->rda_offset;
2708 2709 uio.uio_resid = rda->rda_count;
2709 2710
2710 2711 /*
2711 2712 * read directory
2712 2713 */
2713 2714 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2714 2715
2715 2716 /*
2716 2717 * Clean up
2717 2718 */
2718 2719 if (!error) {
2719 2720 /*
2720 2721 * set size and eof
2721 2722 */
2722 2723 if (uio.uio_resid == rda->rda_count) {
2723 2724 rd->rd_size = 0;
2724 2725 rd->rd_eof = TRUE;
2725 2726 } else {
2726 2727 rd->rd_size = (uint32_t)(rda->rda_count -
2727 2728 uio.uio_resid);
2728 2729 rd->rd_eof = iseof ? TRUE : FALSE;
2729 2730 }
2730 2731 }
2731 2732
2732 2733 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2733 2734 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2734 2735 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2735 2736 rda->rda_count, &ndata);
2736 2737
2737 2738 if (ret != 0) {
2738 2739 size_t dropbytes;
2739 2740 /*
2740 2741 * We had to drop one or more entries in order to fit
2741 2742 * during the character conversion. We need to patch
2742 2743 * up the size and eof info.
2743 2744 */
2744 2745 if (rd->rd_eof)
2745 2746 rd->rd_eof = FALSE;
2746 2747 dropbytes = nfscmd_dropped_entrysize(
2747 2748 (struct dirent64 *)rd->rd_entries, nents, ret);
2748 2749 rd->rd_size -= dropbytes;
2749 2750 }
2750 2751 if (ndata == NULL) {
2751 2752 ndata = (char *)rd->rd_entries;
2752 2753 } else if (ndata != (char *)rd->rd_entries) {
2753 2754 kmem_free(rd->rd_entries, rd->rd_bufsize);
2754 2755 rd->rd_entries = (void *)ndata;
2755 2756 rd->rd_bufsize = rda->rda_count;
2756 2757 }
2757 2758
2758 2759 bad:
2759 2760 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2760 2761
2761 2762 #if 0 /* notyet */
2762 2763 /*
2763 2764 * Don't do this. It causes local disk writes when just
2764 2765 * reading the file and the overhead is deemed larger
2765 2766 * than the benefit.
2766 2767 */
2767 2768 /*
2768 2769 * Force modified metadata out to stable storage.
2769 2770 */
2770 2771 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2771 2772 #endif
2772 2773
2773 2774 VN_RELE(vp);
2774 2775
2775 2776 rd->rd_status = puterrno(error);
2776 2777
2777 2778 }
2778 2779 void *
2779 2780 rfs_readdir_getfh(struct nfsrddirargs *rda)
2780 2781 {
2781 2782 return (&rda->rda_fh);
2782 2783 }
2783 2784 void
2784 2785 rfs_rddirfree(struct nfsrddirres *rd)
2785 2786 {
2786 2787 if (rd->rd_entries != NULL)
2787 2788 kmem_free(rd->rd_entries, rd->rd_bufsize);
2788 2789 }
2789 2790
2790 2791 /* ARGSUSED */
2791 2792 void
2792 2793 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2793 2794 struct svc_req *req, cred_t *cr, bool_t ro)
2794 2795 {
2795 2796 int error;
2796 2797 struct statvfs64 sb;
2797 2798 vnode_t *vp;
2798 2799
2799 2800 vp = nfs_fhtovp(fh, exi);
2800 2801 if (vp == NULL) {
2801 2802 fs->fs_status = NFSERR_STALE;
2802 2803 return;
2803 2804 }
2804 2805
2805 2806 error = VFS_STATVFS(vp->v_vfsp, &sb);
2806 2807
2807 2808 if (!error) {
2808 2809 fs->fs_tsize = nfstsize();
2809 2810 fs->fs_bsize = sb.f_frsize;
2810 2811 fs->fs_blocks = sb.f_blocks;
2811 2812 fs->fs_bfree = sb.f_bfree;
2812 2813 fs->fs_bavail = sb.f_bavail;
2813 2814 }
2814 2815
2815 2816 VN_RELE(vp);
2816 2817
2817 2818 fs->fs_status = puterrno(error);
2818 2819
2819 2820 }
2820 2821 void *
2821 2822 rfs_statfs_getfh(fhandle_t *fh)
2822 2823 {
2823 2824 return (fh);
2824 2825 }
2825 2826
/*
 * Convert an over-the-wire NFSv2 sattr into a vattr, setting va_mask
 * bits only for fields the client actually supplied (the protocol uses
 * all-ones sentinel values to mean "don't change").  Returns 0, or
 * EOVERFLOW on a 32-bit kernel when a time value cannot be represented.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2892 2893
/*
 * Map vtype_t values (VNON .. VBAD) to the NFSv2 over-the-wire file
 * types.  Types with no NFSv2 representation map to 0.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2896 2897
2897 2898 /*
2898 2899 * check the following fields for overflow: nodeid, size, and time.
2899 2900 * There could be a problem when converting 64-bit LP64 fields
2900 2901 * into 32-bit ones. Return an error if there is an overflow.
2901 2902 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* -1 is the "unknown" sentinel; otherwise fold the type bits in. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map unknown/nobody ids to the over-the-wire conventions. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* NFSv2 carries microseconds on the wire; truncate nanoseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
3003 3004
3004 3005 /*
3005 3006 * acl v2 support: returns approximate permission.
3006 3007 * default: returns minimal permission (more restrictive)
3007 3008 * aclok: returns maximal permission (less restrictive)
3008 3009 * This routine changes the permissions that are alaredy in *va.
3009 3010 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3010 3011 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3011 3012 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;	/* CLASS_OBJ (mask) permission bits */
	mode_t grp_perm;	/* synthesized group permission bits */
	mode_t other_perm;	/* synthesized other permission bits */
	mode_t other_orig;	/* OTHER_OBJ permission bits as found */
	int error;

	/* dont care default acl */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/*
			 * non-trivial ACL
			 *
			 * NOTE(review): mask_perm and other_orig are only
			 * assigned when CLASS_OBJ / OTHER_OBJ entries are
			 * present; this relies on the filesystem returning
			 * a well-formed aclent ACL that always has both.
			 */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/* maximal permissions */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
3108 3109
void
rfs_srvrinit(void)
{
	/* One-time global NFSv2 server setup: allocate the FEM caller id. */
	nfs2_srv_caller_id = fs_new_caller_id();
}
3114 3115
void
rfs_srvrfini(void)
{
	/* Nothing to tear down; rfs_srvrinit() only obtained a caller id. */
}
3119 3120
3120 3121 /* ARGSUSED */
3121 3122 void
3122 3123 rfs_srv_zone_init(nfs_globals_t *ng)
3123 3124 {
3124 3125 nfs_srv_t *ns;
3125 3126
3126 3127 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3127 3128
3128 3129 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3129 3130 ns->write_async = 1;
3130 3131
3131 3132 ng->nfs_srv = ns;
3132 3133 }
3133 3134
3134 3135 /* ARGSUSED */
3135 3136 void
3136 3137 rfs_srv_zone_fini(nfs_globals_t *ng)
3137 3138 {
3138 3139 nfs_srv_t *ns = ng->nfs_srv;
3139 3140
3140 3141 ng->nfs_srv = NULL;
3141 3142
3142 3143 mutex_destroy(&ns->async_write_lock);
3143 3144 kmem_free(ns, sizeof (*ns));
3144 3145 }
3145 3146
3146 3147 static int
3147 3148 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3148 3149 {
3149 3150 struct clist *wcl;
3150 3151 int wlist_len;
3151 3152 uint32_t count = rr->rr_count;
3152 3153
3153 3154 wcl = ra->ra_wlist;
3154 3155
3155 3156 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3156 3157 return (FALSE);
3157 3158 }
3158 3159
3159 3160 wcl = ra->ra_wlist;
3160 3161 rr->rr_ok.rrok_wlist_len = wlist_len;
3161 3162 rr->rr_ok.rrok_wlist = wcl;
3162 3163
3163 3164 return (TRUE);
3164 3165 }
|
↓ open down ↓ |
2723 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX